# Introduction

Every Spark application begins by creating a `SparkSession`.

Contents

* PySpark Dataframes
* Reading Datasets
* Checking data types of columns (Schema)
* Selecting columns and indexing
* Summary of dataset
* Adding and Dropping Columns

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session used by every later cell; getOrCreate() hands back an
# already-running session instead of starting a second one
spark = (
  SparkSession.builder
  .appName('FCC Intro')
  .getOrCreate()
)

In [3]:
# A bare last expression makes the notebook display the session object
spark

# Reading Dataframes

How to read files of various types into Spark DataFrames

In [5]:
# Reading a CSV: set the header option on the reader, then load the file
df_pyspark_csv = (
  spark.read
  .option('header', 'true')
  .csv('people.csv', inferSchema=True)
)

# Alternative - pass the settings as keyword arguments to csv() instead;
# this second read rebinds df_pyspark_csv
df_pyspark_csv = spark.read.csv(
  'people.csv',
  header=True,
  inferSchema=True,
)

In [6]:
# Evaluating the DataFrame variable on its own displays only the column
# names and types (see the output below), not the rows; use .show() for rows
df_pyspark_csv

DataFrame[name: string, age: int, job: string]

In [7]:
# Checking the schema: prints each column's name, inferred type and nullability
df_pyspark_csv.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)



In [8]:
# Select columns: pass one column name, or a list of names for several columns.
# select() returns a new DataFrame; show() prints its rows.
df_pyspark_csv.select('name').show()
df_pyspark_csv.select(['name', 'age']).show()

+-----+
| name|
+-----+
|Jorge|
|  Bob|
| John|
| null|
|Jacob|
|  Kim|
+-----+

+-----+---+
| name|age|
+-----+---+
|Jorge| 30|
|  Bob| 32|
| John| 32|
| null| 43|
|Jacob| 23|
|  Kim| 14|
+-----+---+



In [9]:
# Adding Columns
# withColumn returns a new DataFrame, so the result is assigned back to
# df_pyspark_csv to keep the derived 'double age' column for the calls below
df_pyspark_csv = df_pyspark_csv.withColumn('double age', df_pyspark_csv['age'] * 2)
df_pyspark_csv.show()

# Dropping Columns
# The result is only shown, not assigned: df_pyspark_csv still has 'double age'
# afterwards (it appears again in the rename output)
df_pyspark_csv.drop('double age').show()

# Renaming Columns
# Likewise only displayed; the 'name' column is not renamed in df_pyspark_csv itself
df_pyspark_csv.withColumnRenamed('name', 'rename').show()

+-----+---+---------+----------+
| name|age|      job|double age|
+-----+---+---------+----------+
|Jorge| 30|Developer|        60|
|  Bob| 32|Developer|        64|
| John| 32|     null|        64|
| null| 43|   Dancer|        86|
|Jacob| 23|    Pilot|        46|
|  Kim| 14|Marketing|        28|
+-----+---+---------+----------+

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
| John| 32|     null|
| null| 43|   Dancer|
|Jacob| 23|    Pilot|
|  Kim| 14|Marketing|
+-----+---+---------+

+------+---+---------+----------+
|rename|age|      job|double age|
+------+---+---------+----------+
| Jorge| 30|Developer|        60|
|   Bob| 32|Developer|        64|
|  John| 32|     null|        64|
|  null| 43|   Dancer|        86|
| Jacob| 23|    Pilot|        46|
|   Kim| 14|Marketing|        28|
+------+---+---------+----------+



# Handling Missing Values

* Dropping rows
* Parameters of the drop functions (`how`, `thresh`, `subset`)
* Handling missing values by imputing the mean, median, or mode

In [4]:
# Load people.csv into df_spark for the missing-value examples, then preview it
df_spark = spark.read.csv(
  'people.csv',
  header=True,
  inferSchema=True,
)
df_spark.show()

+-------+----+----------+
|   name| age|       job|
+-------+----+----------+
|  Jorge|  30| Developer|
|    Bob|  32| Developer|
|   John|  32|      null|
|   null|  43|    Dancer|
|  Jacob|  23|     Pilot|
|    Kim|  14| Marketing|
|Johnson|null|     Coder|
|  Frank|null|Consultant|
+-------+----+----------+



In [11]:
# Drop every row that contains at least one null value (the default, how='any')
df_spark.na.drop().show()
# Optional parameters of na.drop():
#   - how: 'any' (drop if at least one value is null) | 'all' (drop only if every value in the row is null)
#   - thresh: int (keep rows with at least this many non-null values; overrides how)
#   - subset: column name or list of names (only consider nulls in these columns)

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
|Jacob| 23|    Pilot|
|  Kim| 14|Marketing|
+-----+---+---------+



In [12]:
# Fill missing values with a constant
# NOTE: per the PySpark docs, a string fill value is applied to string columns
# only, so nulls in the integer 'age' column are not replaced by this call
df_spark.na.fill('Missing values').show()

+--------------+---+--------------+
|          name|age|           job|
+--------------+---+--------------+
|         Jorge| 30|     Developer|
|           Bob| 32|     Developer|
|          John| 32|Missing values|
|Missing values| 43|        Dancer|
|         Jacob| 23|         Pilot|
|           Kim| 14|     Marketing|
+--------------+---+--------------+



In [13]:
# Using Imputer to replace null values with the mean of the column
from pyspark.ml.feature import Imputer

# Configure everything in the constructor: read 'age' and write the filled
# values to a separate 'age_imputed' column
imputer = Imputer(
  strategy='mean',
  inputCols=['age'],
  outputCols=["age_imputed"],
)

In [17]:
# Fit the imputer on df_spark, apply the fitted model to the same DataFrame,
# and print the result (original columns plus 'age_imputed')
(
  imputer
  .fit(df_spark)
  .transform(df_spark)
  .show()
)

+-------+----+----------+-----------+
|   name| age|       job|age_imputed|
+-------+----+----------+-----------+
|  Jorge|  30| Developer|         30|
|    Bob|  32| Developer|         32|
|   John|  32|      null|         32|
|   null|  43|    Dancer|         43|
|  Jacob|  23|     Pilot|         23|
|    Kim|  14| Marketing|         14|
|Johnson|null|     Coder|         29|
|  Frank|null|Consultant|         29|
+-------+----+----------+-----------+



# PySpark DataFrames

* Filter Operation
* Logical operators (AND, OR, NOT)

In [23]:
# Filter Operations
# A condition can be given as a SQL expression string ...
df_spark.filter("age < 30").show()

# AND
# ... or built from Column expressions; each comparison needs its own
# parentheses because & and | bind more tightly than < and > in Python
df_spark.filter((df_spark['age'] < 30) & (df_spark['age'] > 20)).show()

# OR
# NOTE: these two ranges overlap, so every row with a non-null age matches
df_spark.filter((df_spark['age'] < 30) | (df_spark['age'] > 20)).show()

# NOT
# ~ negates the condition, keeping the rows whose age is 30 or more
df_spark.filter(~(df_spark['age'] < 30)).show()

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jacob| 23|    Pilot|
|  Kim| 14|Marketing|
+-----+---+---------+

+-----+---+-----+
| name|age|  job|
+-----+---+-----+
|Jacob| 23|Pilot|
+-----+---+-----+

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
| John| 32|     null|
| null| 43|   Dancer|
|Jacob| 23|    Pilot|
|  Kim| 14|Marketing|
+-----+---+---------+

+-----+---+---------+
| name|age|      job|
+-----+---+---------+
|Jorge| 30|Developer|
|  Bob| 32|Developer|
| John| 32|     null|
| null| 43|   Dancer|
+-----+---+---------+



In [19]:
# Filter first, then keep only the 'name' column of the matching rows
(
  df_spark
  .filter("age < 30")
  .select('name')
  .show()
)

+-----+
| name|
+-----+
|Jacob|
|  Kim|
+-----+



# Groupby and Aggregate Functions

In [26]:
# Group the rows by 'job' and count how many fall into each group
(
  df_spark
  .groupBy('job')
  .count()
  .show()
)

+----------+-----+
|       job|count|
+----------+-----+
|      null|    1|
|Consultant|    1|
| Developer|    2|
|     Coder|    1|
| Marketing|    1|
|    Dancer|    1|
|     Pilot|    1|
+----------+-----+



In [30]:
# Sum of 'age' per job; a job whose ages are all null shows a null sum
(
  df_spark
  .groupBy('job')
  .sum('age')
  .show()
)

+----------+--------+
|       job|sum(age)|
+----------+--------+
|      null|      32|
|Consultant|    null|
| Developer|      62|
|     Coder|    null|
| Marketing|      14|
|    Dancer|      43|
|     Pilot|      23|
+----------+--------+



In [14]:
# Aggregate over the whole DataFrame (no grouping): the dict maps a column
# name to the aggregate function to apply to it
df_spark.agg({'age': 'mean'}).show()

+--------+
|avg(age)|
+--------+
|    29.0|
+--------+

