# Spark DataFrames

This IPython notebook introduces Spark DataFrames (compared with NumPy/pandas).

## Creating and working with a Spark DataFrame

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the SparkSession used throughout this notebook, naming the application
spark = (
    SparkSession.builder
    .appName("Spark DataFrame Sample")
    .getOrCreate()
)

In [4]:
import pandas as pd
# Limit how many rows pandas renders when a frame is displayed
pd.set_option("display.max_rows", 5)

In [5]:
# Load the sample CSV file into a pandas DataFrame: comma-separated, header in
# the first row, and an explicit dtype for two of the columns
column_types = {'SepalLength': float, 'Species': str}
data = pd.read_csv('data/text1.csv', sep=',', header=0, dtype=column_types)

# Bare last expression: the notebook renders the frame
data

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
...,...,...,...,...,...
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


## Creating a Spark DataFrame

In [6]:
# Create the Spark DataFrame from the pandas DataFrame `data`
df = spark.createDataFrame(data)
# Evaluating the bare name only displays a summary of the Spark DataFrame
# (its column names and types), not its rows
df

DataFrame[SepalLength: double, SepalWidth: double, PetalLength: double, PetalWidth: double, Species: string]

In [7]:
# show() prints the DataFrame's rows as a text table
df.show()

+-----------+----------+-----------+----------+-------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| setosa|
|        4.9|       3.0|        1.4|       0.2| setosa|
|        4.7|       3.2|        1.3|       0.2| setosa|
|        4.6|       3.1|        1.5|       0.2| setosa|
|        5.0|       3.6|        1.4|       0.2| setosa|
|        5.4|       3.9|        1.7|       0.4| setosa|
|        4.6|       3.4|        1.4|       0.3| setosa|
|        5.0|       3.4|        1.5|       0.2| setosa|
|        4.4|       2.9|        1.4|       0.2| setosa|
|        4.9|       3.1|        1.5|       0.1| setosa|
|        5.4|       3.7|        1.5|       0.2| setosa|
|        4.8|       3.4|        1.6|       0.2| setosa|
|        4.8|       3.0|        1.4|       0.1| setosa|
|        4.3|       3.0|        1.1|       0.1| setosa|
|        5.8|       4.0|        1.2|       0.2| 

## Querying a Spark DataFrame

In [8]:
# Convert the Spark DataFrame back into a pandas DataFrame
pandasdf = df.toPandas()

# Bare last expression: the notebook renders the resulting frame
pandasdf

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
...,...,...,...,...,...
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [9]:
# Select the 'Species' column together with a derived attribute
# (length * width, named 'area')
df.select(
    (df['SepalLength'] * df['SepalWidth']).alias('area'),
    df['Species'],
)
# nothing happens? what's wrong?
# Hint: the result is just another DataFrame (see its repr below); no rows are
# shown until an action such as collect() or show() is called on it

DataFrame[area: double, Species: string]

In [10]:
# collect() is what actually returns the rows; the [:10] slice is applied to the
# list that collect() returns, so only 10 entries are shown
df.select(
    (df['SepalLength'] * df['SepalWidth']).alias('area'),
    df['Species'],
).collect()[:10]

[Row(area=17.849999999999998, Species='setosa'),
 Row(area=14.700000000000001, Species='setosa'),
 Row(area=15.040000000000001, Species='setosa'),
 Row(area=14.26, Species='setosa'),
 Row(area=18.0, Species='setosa'),
 Row(area=21.060000000000002, Species='setosa'),
 Row(area=15.639999999999999, Species='setosa'),
 Row(area=17.0, Species='setosa'),
 Row(area=12.76, Species='setosa'),
 Row(area=15.190000000000001, Species='setosa')]

In [11]:
# Here limit(10) is part of the query itself, so only 10 entries are retrieved
(
    df.filter(df['Species'] == 'setosa')
    .limit(10)
    .collect()
)

[Row(SepalLength=5.1, SepalWidth=3.5, PetalLength=1.4, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=4.9, SepalWidth=3.0, PetalLength=1.4, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=4.7, SepalWidth=3.2, PetalLength=1.3, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=4.6, SepalWidth=3.1, PetalLength=1.5, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=5.0, SepalWidth=3.6, PetalLength=1.4, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=5.4, SepalWidth=3.9, PetalLength=1.7, PetalWidth=0.4, Species='setosa'),
 Row(SepalLength=4.6, SepalWidth=3.4, PetalLength=1.4, PetalWidth=0.3, Species='setosa'),
 Row(SepalLength=5.0, SepalWidth=3.4, PetalLength=1.5, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=4.4, SepalWidth=2.9, PetalLength=1.4, PetalWidth=0.2, Species='setosa'),
 Row(SepalLength=4.9, SepalWidth=3.1, PetalLength=1.5, PetalWidth=0.1, Species='setosa')]

## Grouping data in a Spark DataFrame

In [12]:
# Group the entries by the 'Species' column and count the rows of each group
(
    df.groupBy('Species')
    .count()
    .limit(3)
    .collect()
)

[Row(Species='setosa', count=50),
 Row(Species='versicolor', count=50),
 Row(Species='virginica', count=50)]

In [13]:
# Group and count as before, then sort the groups by 'Species'
(
    df.groupBy('Species')
    .count()
    .sort('Species')
    .limit(3)
    .collect()
)

[Row(Species='setosa', count=50),
 Row(Species='versicolor', count=50),
 Row(Species='virginica', count=50)]

## Manipulating a Spark DataFrame

In [14]:
# select() can compute on columns: keep every existing column ('*') and
# append the product of sepal length and width as 'SepalArea'
df.select(
    '*',
    (df['SepalLength'] * df['SepalWidth']).alias('SepalArea'),
).show()

+-----------+----------+-----------+----------+-------+------------------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|         SepalArea|
+-----------+----------+-----------+----------+-------+------------------+
|        5.1|       3.5|        1.4|       0.2| setosa|17.849999999999998|
|        4.9|       3.0|        1.4|       0.2| setosa|14.700000000000001|
|        4.7|       3.2|        1.3|       0.2| setosa|15.040000000000001|
|        4.6|       3.1|        1.5|       0.2| setosa|             14.26|
|        5.0|       3.6|        1.4|       0.2| setosa|              18.0|
|        5.4|       3.9|        1.7|       0.4| setosa|21.060000000000002|
|        4.6|       3.4|        1.4|       0.3| setosa|15.639999999999999|
|        5.0|       3.4|        1.5|       0.2| setosa|              17.0|
|        4.4|       2.9|        1.4|       0.2| setosa|             12.76|
|        4.9|       3.1|        1.5|       0.1| setosa|15.190000000000001|
|        5.4|       3.7| 

In [15]:
# withColumn() adds the computed column under the given name; the result is
# assigned back to df, so df carries 'SepalArea' from here on
df = df.withColumn(
    "SepalArea",
    df['SepalLength'] * df['SepalWidth'],
)
df.show()

+-----------+----------+-----------+----------+-------+------------------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|         SepalArea|
+-----------+----------+-----------+----------+-------+------------------+
|        5.1|       3.5|        1.4|       0.2| setosa|17.849999999999998|
|        4.9|       3.0|        1.4|       0.2| setosa|14.700000000000001|
|        4.7|       3.2|        1.3|       0.2| setosa|15.040000000000001|
|        4.6|       3.1|        1.5|       0.2| setosa|             14.26|
|        5.0|       3.6|        1.4|       0.2| setosa|              18.0|
|        5.4|       3.9|        1.7|       0.4| setosa|21.060000000000002|
|        4.6|       3.4|        1.4|       0.3| setosa|15.639999999999999|
|        5.0|       3.4|        1.5|       0.2| setosa|              17.0|
|        4.4|       2.9|        1.4|       0.2| setosa|             12.76|
|        4.9|       3.1|        1.5|       0.1| setosa|15.190000000000001|
|        5.4|       3.7| 

In [16]:
# Remove the 'SepalArea' attribute from the Spark DataFrame again.
# The column is named by string instead of via df.SepalArea: that attribute
# lookup raises AttributeError once the column is gone, so the cell would fail
# when it is run a second time, whereas drop() with a name that is not in the
# schema is a no-op.
df = df.drop('SepalArea')
df.show()

+-----------+----------+-----------+----------+-------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| setosa|
|        4.9|       3.0|        1.4|       0.2| setosa|
|        4.7|       3.2|        1.3|       0.2| setosa|
|        4.6|       3.1|        1.5|       0.2| setosa|
|        5.0|       3.6|        1.4|       0.2| setosa|
|        5.4|       3.9|        1.7|       0.4| setosa|
|        4.6|       3.4|        1.4|       0.3| setosa|
|        5.0|       3.4|        1.5|       0.2| setosa|
|        4.4|       2.9|        1.4|       0.2| setosa|
|        4.9|       3.1|        1.5|       0.1| setosa|
|        5.4|       3.7|        1.5|       0.2| setosa|
|        4.8|       3.4|        1.6|       0.2| setosa|
|        4.8|       3.0|        1.4|       0.1| setosa|
|        4.3|       3.0|        1.1|       0.1| setosa|
|        5.8|       4.0|        1.2|       0.2| 

In [17]:
# drop(), dropna() and dropDuplicates() exist in the API, but drop() removes
# COLUMNS; to sort out rows we keep only those that pass a filter()
too_short = df.filter(df['SepalLength'] < 4.5)
long_enough = df.filter(df['SepalLength'] >= 4.5)

print('overall number of entries: ', df.count())
print('remove entries with a SepalLength below 4.5: ', too_short.count())
print('resulting DataFrame (without the 4 entries): ', long_enough.count())
long_enough.show()
# df itself is not reassigned here; store the filtered result back to df if the
# removal should be persisted!

overall number of entries:  150
remove entries with a SepalLength below 4.5:  4
resulting DataFrame (without the 4 entries):  146
+-----------+----------+-----------+----------+-------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|Species|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| setosa|
|        4.9|       3.0|        1.4|       0.2| setosa|
|        4.7|       3.2|        1.3|       0.2| setosa|
|        4.6|       3.1|        1.5|       0.2| setosa|
|        5.0|       3.6|        1.4|       0.2| setosa|
|        5.4|       3.9|        1.7|       0.4| setosa|
|        4.6|       3.4|        1.4|       0.3| setosa|
|        5.0|       3.4|        1.5|       0.2| setosa|
|        4.9|       3.1|        1.5|       0.1| setosa|
|        5.4|       3.7|        1.5|       0.2| setosa|
|        4.8|       3.4|        1.6|       0.2| setosa|
|        4.8|       3.0|        1.4|       0.1| setosa|
|        5.8|       4.0|      

## Applying a custom function on a Spark DataFrame

In [18]:
import pyspark.sql.functions as f

# Relabel 'Species' with a conditional column expression: "setosa" gets one
# label, every value different from "setosa" gets the other
species = f.col("Species")
relabelled = (
    f.when(species == "setosa", "Some setosa species")
    .when(species != "setosa", "Something else")
)
# The result is only displayed; df is not reassigned
df.withColumn("Species", relabelled).show()
# NOTE(review): no otherwise() branch is given — confirm how rows matching
# neither condition (e.g. a missing Species) should be labelled

# Column expressions like this allow applying more complex transformations to
# the data, e.g. data correction algorithms or even classifiers...

+-----------+----------+-----------+----------+-------------------+
|SepalLength|SepalWidth|PetalLength|PetalWidth|            Species|
+-----------+----------+-----------+----------+-------------------+
|        5.1|       3.5|        1.4|       0.2|Some setosa species|
|        4.9|       3.0|        1.4|       0.2|Some setosa species|
|        4.7|       3.2|        1.3|       0.2|Some setosa species|
|        4.6|       3.1|        1.5|       0.2|Some setosa species|
|        5.0|       3.6|        1.4|       0.2|Some setosa species|
|        5.4|       3.9|        1.7|       0.4|Some setosa species|
|        4.6|       3.4|        1.4|       0.3|Some setosa species|
|        5.0|       3.4|        1.5|       0.2|Some setosa species|
|        4.4|       2.9|        1.4|       0.2|Some setosa species|
|        4.9|       3.1|        1.5|       0.1|Some setosa species|
|        5.4|       3.7|        1.5|       0.2|Some setosa species|
|        4.8|       3.4|        1.6|       0.2|S

## Calculating descriptive statistics about a Spark DataFrame

In [19]:
# descriptive statistics about the DataFrame: describe() returns them as a
# DataFrame with one row per statistic (count, mean, stddev, min, max),
# which show() then prints
df.describe().show()

+-------+------------------+------------------+------------------+------------------+---------+
|summary|       SepalLength|        SepalWidth|       PetalLength|        PetalWidth|  Species|
+-------+------------------+------------------+------------------+------------------+---------+
|  count|               150|               150|               150|               150|      150|
|   mean| 5.843333333333334|3.0573333333333332|3.7580000000000005|1.1993333333333331|     null|
| stddev|0.8280661279778632|0.4358662849366984| 1.765298233259466|0.7622376689603464|     null|
|    min|               4.3|               2.0|               1.0|               0.1|   setosa|
|    max|               7.9|               4.4|               6.9|               2.5|virginica|
+-------+------------------+------------------+------------------+------------------+---------+



In [20]:
# close the session properly (stops the SparkSession created at the top of the notebook)
spark.stop()