## DataFrame Basics

In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
from spark_libs import spark_submit

# Maven coordinates of the extra Spark packages to load (here: the spark-csv reader).
packages = [
    "com.databricks:spark-csv_2.11:1.5.0",
]
# Per the cell output, this adds `--packages ...` to the PYSPARK_SUBMIT_ARGS
# environment variable.
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0 pyspark-shell


In [14]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

In [10]:
# Reuse the active Spark session if there is one; otherwise create it.
app_name = "intro-to-pyspark"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [15]:
# Register the remote CSV with the Spark context; SparkFiles.get() is then used
# to look the file up by name when loading it.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/food.csv"
spark.sparkContext.addFile(url)

# First read: only the header option is set, no schema is supplied.
df = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .load(SparkFiles.get("food.csv"))
)
# Peek at the first row.
df.head()

Row(food='pizza', price='0')

In [16]:
# Print the schema of the first read: no schema was supplied on load,
# and both columns come back typed as string.
df.printSchema()

root
 |-- food: string (nullable = true)
 |-- price: string (nullable = true)



In [17]:
# List the column names as a plain Python list of strings
df.columns

['food', 'price']

In [19]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# `food` has no mean/stddev (null); `price` gets numeric stats even though
# it is still a string column at this point.
df.describe().show()

+-------+-------+-----------------+
|summary|   food|            price|
+-------+-------+-----------------+
|  count|      3|                3|
|   mean|   null|7.333333333333333|
| stddev|   null|6.429100507328637|
|    min|chinese|                0|
|    max|  sushi|               12|
+-------+-------+-----------------+



In [20]:
# Import struct fields that we can use
from pyspark.sql.types import (
    StructField, 
    StringType, 
    IntegerType, 
    StructType
)

In [21]:
# One StructField per CSV column: (column name, data type, nullable flag).
schema = [
    StructField("food", StringType(), True),
    StructField("price", IntegerType(), True),
]
schema

[StructField(food,StringType,true), StructField(price,IntegerType,true)]

In [22]:
# Wrap the list of fields in a StructType; this is the schema object
# handed to the reader in the next step.
final = StructType(schema)
final

StructType(List(StructField(food,StringType,true),StructField(price,IntegerType,true)))

In [27]:
# Second read of the same file, this time with the explicit schema applied
# so that `price` is loaded as an integer instead of a string.
dataframe = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("sep", ",")
    .schema(final)
    .load(SparkFiles.get("food.csv"))
)
dataframe.show()


+-------+-----+
|   food|price|
+-------+-----+
|  pizza|    0|
|  sushi|   12|
|chinese|   10|
+-------+-----+



In [28]:
# Print the schema again to confirm the explicit types were applied
# (`price` is now integer rather than string).
dataframe.printSchema()

root
 |-- food: string (nullable = true)
 |-- price: integer (nullable = true)



### Accessing data

In [29]:
# Indexing by column name returns a Column expression, not the column's values.
dataframe['price']

Column<b'price'>

In [30]:
# Confirm the type: bracket access yields a pyspark.sql.column.Column.
type(dataframe['price'])

pyspark.sql.column.Column

In [None]:
# select() by column name returns a new DataFrame containing only that column
# (in contrast to bracket indexing, which returns a Column).
dataframe.select('price')

In [31]:
# Confirm the type: select() yields a DataFrame, not a Column.
type(dataframe.select('price'))

pyspark.sql.dataframe.DataFrame

In [32]:
# show() prints the rows of the single-column DataFrame as a text table.
dataframe.select('price').show()

+-----+
|price|
+-----+
|    0|
|   12|
|   10|
+-----+



### Manipulating Columns

In [33]:
# Add a new column `newprice` that is a copy of `price`.
# withColumn returns a new DataFrame; the result is only shown here, not assigned,
# so `dataframe` keeps its original two columns.
dataframe.withColumn('newprice', dataframe['price']).show()

+-------+-----+--------+
|   food|price|newprice|
+-------+-----+--------+
|  pizza|    0|       0|
|  sushi|   12|      12|
|chinese|   10|      10|
+-------+-----+--------+



In [34]:
# Rename the `price` column to `newerprice` (displayed only; the result is not assigned)
dataframe.withColumnRenamed('price','newerprice').show()

+-------+----------+
|   food|newerprice|
+-------+----------+
|  pizza|         0|
|  sushi|        12|
|chinese|        10|
+-------+----------+



In [35]:
# Add a `doubleprice` column holding price * 2
dataframe.withColumn('doubleprice',dataframe['price']*2).show()

+-------+-----+-----------+
|   food|price|doubleprice|
+-------+-----+-----------+
|  pizza|    0|          0|
|  sushi|   12|         24|
|chinese|   10|         20|
+-------+-----+-----------+



In [36]:
# Add an `add_one_dollar` column holding price + 1
dataframe.withColumn('add_one_dollar',dataframe['price']+1).show()

+-------+-----+--------------+
|   food|price|add_one_dollar|
+-------+-----+--------------+
|  pizza|    0|             1|
|  sushi|   12|            13|
|chinese|   10|            11|
+-------+-----+--------------+



In [37]:
# Halve the price: add a `half_price` column holding price / 2.
# Note the result is fractional (0.0, 6.0, 5.0) even though `price` is an integer column.
dataframe.withColumn('half_price',dataframe['price']/2).show()

+-------+-----+----------+
|   food|price|half_price|
+-------+-----+----------+
|  pizza|    0|       0.0|
|  sushi|   12|       6.0|
|chinese|   10|       5.0|
+-------+-----+----------+



In [38]:
# Collect a column into a Python list.
# The elements are Row objects (e.g. Row(price=0)), not bare values.
dataframe.select("price").collect()

[Row(price=0), Row(price=12), Row(price=10)]

### Converting a PySpark DataFrame to a pandas DataFrame

In [39]:
import pandas as pd
# Convert the Spark DataFrame into a pandas DataFrame.
pandas_df = dataframe.toPandas() 

In [40]:
# Preview the first rows of the converted pandas DataFrame.
pandas_df.head()

Unnamed: 0,food,price
0,pizza,0
1,sushi,12
2,chinese,10
