# Creating a Spark session

In [1]:
from pyspark.sql import *
from pyspark.sql import functions as f
from pyspark.sql.types import *

spark = SparkSession.builder.appName("SparkIntro").getOrCreate()

# Reading data
We specify that we want to interpret the first line as headers and that we want Spark to try to infer the types of the data.

In [2]:
df = spark.read.option('header', True).option('inferSchema', True).csv('data/titanic.csv')

# Inspecting the file
The first three things you should always do when working with data:
- Check that it looks ok (`show()`)
- Check that the schema is correct (`printSchema()`)
- See how big the dataset is (`count()`, unless it is huge!)

In [3]:
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|          Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------------+-------+-----+--------+
|          1|   false|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|       A/5 21171|   7.25| null|       S|
|          2|    true|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|        PC 17599|71.2833|  C85|       C|
|          3|    true|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|STON/O2. 3101282|  7.925| null|       S|
|          4|    true|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|          113803|   53.1| C123|       S|
|          5|   false|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|          373450|   8.05| null|       S|
|          6|   false|     3|    Moran, Mr. James|  male|null|    0|    0|      

In [4]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: boolean (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [5]:
df.count()

891

# Filtering dataframe

In [6]:
filtered_df = df.filter(f.col('survived') == True).filter(f.col('Cabin').isNotNull())

In [7]:
filtered_df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------+--------+-----------+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Ticket|    Fare|      Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------+--------+-----------+--------+
|          2|    true|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|  PC 17599| 71.2833|        C85|       C|
|          4|    true|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|    113803|    53.1|       C123|       S|
|         11|    true|     3|Sandstrom, Miss. ...|female| 4.0|    1|    1|   PP 9549|    16.7|         G6|       S|
|         12|    true|     1|Bonnell, Miss. El...|female|58.0|    0|    0|    113783|   26.55|       C103|       S|
|         22|    true|     2|Beesley, Mr. Lawr...|  male|34.0|    0|    0|    248698|    13.0|        D56|       S|
|         24|    true|     1|Sloper, Mr. Willi...|  male|28.0|    0|    

In [8]:
filtered_df.count()

136

# Sorting the dataframe
Only sort when you really need it and when the dataframe is small. It is an expensive operation.

In [34]:
df.orderBy('Name').show()

+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-----+--------+
|PassengerId|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|    Ticket|   Fare|Cabin|Embarked|
+-----------+--------+------+--------------------+------+----+-----+-----+----------+-------+-----+--------+
|        147|    true|     3|"Andersson, Mr. A...|  male|27.0|    0|    0|    350043| 7.7958| null|       S|
|        519|    true|     2|"Angle, Mrs. Will...|female|36.0|    1|    0|    226875|   26.0| null|       S|
|        291|    true|     1|"Barber, Miss. El...|female|26.0|    0|    0|     19877|  78.85| null|       S|
|        625|   false|     3|"Bowen, Mr. David...|  male|21.0|    0|    0|     54636|   16.1| null|       S|
|        508|    true|     1|"Bradley, Mr. Geo...|  male|null|    0|    0|    111427|  26.55| null|       S|
|        346|    true|     2|"Brown, Miss. Ame...|female|24.0|    0|    0|    248733|   13.0|  F33|       S|
|        209|    tr

# Creating a dataframe from data in memory
If you obtain data from other sources than a file on disk, e.g., a remote API, you need to create a dataframe manually in order to work with it in Spark.

In [9]:
deck = [
    ['S', 'lower deck'],
    ['C', 'lower deck'],
    ['Q', 'upper deck']
]

In [10]:
fields = [StructField('Embarked', StringType(), True), StructField('Level', StringType(), True)]
schema = StructType(fields)
deck_df = spark.createDataFrame(deck, schema)

In [11]:
deck_df.printSchema()

root
 |-- Embarked: string (nullable = true)
 |-- Level: string (nullable = true)



# Joining dataframes
Joins are the same thing as in Pandas and SQL.

Be careful with joins if you have a lot of data. More on joins next week.

In [12]:
level_df = df.join(deck_df, ['Embarked'], 'inner')

In [13]:
level_df.printSchema()

root
 |-- Embarked: string (nullable = true)
 |-- PassengerId: integer (nullable = true)
 |-- Survived: boolean (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Level: string (nullable = true)



In [33]:
level_df.select('PassengerId', 'Embarked', 'Level').show()

+-----------+--------+----------+
|PassengerId|Embarked|     Level|
+-----------+--------+----------+
|        889|       S|lower deck|
|        888|       S|lower deck|
|        887|       S|lower deck|
|        885|       S|lower deck|
|        884|       S|lower deck|
|        883|       S|lower deck|
|        882|       S|lower deck|
|        881|       S|lower deck|
|        879|       S|lower deck|
|        878|       S|lower deck|
|        877|       S|lower deck|
|        874|       S|lower deck|
|        873|       S|lower deck|
|        872|       S|lower deck|
|        871|       S|lower deck|
|        870|       S|lower deck|
|        869|       S|lower deck|
|        868|       S|lower deck|
|        866|       S|lower deck|
|        865|       S|lower deck|
+-----------+--------+----------+
only showing top 20 rows



# Computing aggregates
Group values and compute statistics.

Functions:

- avg
- count
- max
- min
- sum

In [14]:
# Total fare
level_df.select(f.sum('Fare')).show()

+------------------+
|         sum(Fare)|
+------------------+
|28533.949299999967|
+------------------+



In [15]:
# Total, average, max and min
level_df.select(f.sum('Fare'), f.avg('Fare'), f.max('Fare'), f.min('Fare')).show()

+------------------+-----------------+---------+---------+
|         sum(Fare)|        avg(Fare)|max(Fare)|min(Fare)|
+------------------+-----------------+---------+---------+
|28533.949299999967|32.09668087739029| 512.3292|      0.0|
+------------------+-----------------+---------+---------+



In [16]:
# ... with filter
level_df.filter(f.col('Fare') > 0.0).select(f.sum('Fare'), f.avg('Fare'), f.max('Fare'), f.min('Fare')).show()

+------------------+------------------+---------+---------+
|         sum(Fare)|         avg(Fare)|max(Fare)|min(Fare)|
+------------------+------------------+---------+---------+
|28533.949299999967|32.647539244851224| 512.3292|   4.0125|
+------------------+------------------+---------+---------+



In [17]:
# Total per passenger class

In [18]:
level_df.groupBy('Pclass').agg(f.sum('Fare')).show()

+------+------------------+
|Pclass|         sum(Fare)|
+------+------------------+
|     1|18017.412500000002|
|     3| 6714.695100000008|
|     2|3801.8417000000004|
+------+------------------+



# Adding and removing columns

In [19]:
df.drop('PassengerId').printSchema()

root
 |-- Survived: boolean (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [20]:
df.withColumn('RealFare', f.col('Fare')/(f.col('SibSp')+f.col('Parch')+1)).select('SibSp', 'Parch', 'Fare', 'RealFare').show()

+-----+-----+-------+------------------+
|SibSp|Parch|   Fare|          RealFare|
+-----+-----+-------+------------------+
|    1|    0|   7.25|             3.625|
|    1|    0|71.2833|          35.64165|
|    0|    0|  7.925|             7.925|
|    1|    0|   53.1|             26.55|
|    0|    0|   8.05|              8.05|
|    0|    0| 8.4583|            8.4583|
|    0|    0|51.8625|           51.8625|
|    3|    1| 21.075|             4.215|
|    0|    2|11.1333|            3.7111|
|    1|    0|30.0708|           15.0354|
|    1|    1|   16.7| 5.566666666666666|
|    0|    0|  26.55|             26.55|
|    0|    0|   8.05|              8.05|
|    1|    5| 31.275|4.4678571428571425|
|    0|    0| 7.8542|            7.8542|
|    0|    0|   16.0|              16.0|
|    4|    1| 29.125| 4.854166666666667|
|    0|    0|   13.0|              13.0|
|    1|    0|   18.0|               9.0|
|    0|    0|  7.225|             7.225|
+-----+-----+-------+------------------+
only showing top

# Custom functions
If you want to apply your own function to a column you need to register it as a UDF. Functions can not modify colums so you need to use it with `select()` or `withColumn()`.

Let's create a function that extracts the title of the passenger. Names are in the form `Lastname, Title. Firstname`.

In [30]:
def extract_title(name):
    name_parts = name.split(',')
    title_and_first = name_parts[1].split('.')
    return title_and_first[0]

In [31]:
extract_title_udf = f.udf(extract_title)

In [32]:
df.withColumn('Title', extract_title_udf(f.col('Name'))).select('Name', 'Title').show()

+--------------------+-------+
|                Name|  Title|
+--------------------+-------+
|Braund, Mr. Owen ...|     Mr|
|Cumings, Mrs. Joh...|    Mrs|
|Heikkinen, Miss. ...|   Miss|
|Futrelle, Mrs. Ja...|    Mrs|
|Allen, Mr. Willia...|     Mr|
|    Moran, Mr. James|     Mr|
|McCarthy, Mr. Tim...|     Mr|
|Palsson, Master. ...| Master|
|Johnson, Mrs. Osc...|    Mrs|
|Nasser, Mrs. Nich...|    Mrs|
|Sandstrom, Miss. ...|   Miss|
|Bonnell, Miss. El...|   Miss|
|Saundercock, Mr. ...|     Mr|
|Andersson, Mr. An...|     Mr|
|Vestrom, Miss. Hu...|   Miss|
|Hewlett, Mrs. (Ma...|    Mrs|
|Rice, Master. Eugene| Master|
|Williams, Mr. Cha...|     Mr|
|Vander Planke, Mr...|    Mrs|
|Masselmani, Mrs. ...|    Mrs|
+--------------------+-------+
only showing top 20 rows

