### Pyspark advantages
- Run workloads 100x faster.
- Ease of use; write in java, scala, python, R
- Generality; Combine SQL, streaming and complex analytics.
- Runs everywhere; Hadoop, Kubernetes, standalone or cloud

### Creating venv and activate.
- Run env.sh passing in 1 param which is name of virtual env.
- In vscode bottom right select python version and set to virtual env.
- Run pip installs.

In [None]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession

### Basic Pyspark operations

In [4]:
# Initialise pyspark sessions
spark = SparkSession.builder.appName("MySparkSession").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/05 16:36:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [12]:
# Open csv as pyspark dataframe
df_pyspark = spark.read.csv("test1.csv")
df_pyspark.show()

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [13]:
# pyspark does not recognise the header row to column names, use read options
df_pyspark = spark.read.option("header", "true").csv("test1.csv")
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [14]:
# Pyspark has it's own df object type
print(type(df_pyspark))

# Show schema of df
df_pyspark.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- Name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



### Data Frame Operations

In [15]:
# In above print Schema all columns were noted as string, below will infer schema types and header in csv option
df_pyspark = spark.read.csv("test1.csv", header = True, inferSchema = True)
df_pyspark.show()
df_pyspark.printSchema()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [26]:
# Get column names
print(df_pyspark.columns)

# Get top 3 records in list format.
print(df_pyspark.head(3))

['Name', 'age', 'Experience', 'Salary']
[Row(Name='Krish', age=31, Experience=10, Salary=30000), Row(Name='Sudhanshu', age=30, Experience=8, Salary=25000), Row(Name='Sunny', age=29, Experience=4, Salary=20000)]


In [28]:
# Check data types
print(df_pyspark.dtypes)
df_pyspark.describe().show()

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]
+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  NULL|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  NULL| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



In [20]:
# Selecting a particular column
df_pyspark.select("Name").show()

# Selecting multiple column names
df_pyspark.select(["Name", "Experience"]).show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [29]:
# Adding new columns
df_pyspark = df_pyspark.withColumn("Experience After 2 Years", df_pyspark["Experience"] + 2)
df_pyspark.show()

# Drop columns
df_pyspark = df_pyspark.drop("Experience After 2 Years")
df_pyspark.show()

# Rename columns
df_pyspark.withColumnRenamed("Name", "Full Name").show()

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience After 2 Years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sudhanshu| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 23|         2| 18000|                       4|
+---------+---+----------+------+------------------------+

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+

+---------+---+----------+------+
|Full Name|age|Experience|Salary|


### Handling missing values

In [61]:
df_pyspark = spark.read.csv("test2.csv", header = True, inferSchema= True)
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [62]:
# Drop all rows where na values are present. 
df_pypspark.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [63]:
# Drop has a how arg, default to any, meaning if a row any column values as null/na then drop, or all the entire row has to be null/na
df_pypspark.na.drop(how="all").show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [64]:
# Drop has a threshold arg, specifies the limit of number of NON-NULL values for a row. i.e. if a row has at least 2 non nulls keep it, otherwise omit.
df_pypspark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [65]:
# Drop has a subset arg, will drop rows only if the null/na values are in the subset columns. 
df_pypspark.na.drop(subset = ["Age"]).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
|     NULL| 36|      NULL|  NULL|
+---------+---+----------+------+



In [66]:
# Fill missing values from particular columns with a specific value. Below places missing when na/null appears in experience or age column
df_pyspark.na.fill("missing", ["Experience", "age"]).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [67]:
# Using imputer to fill missing values with a strategy i.e. mean, median or mode.
from pyspark.ml.feature import Imputer

imp = Imputer(
    inputCols = ["age", "Experience", "Salary"],
    outputCols = ["{}_impd".format(col) for col in ["age", "Experience", "Salary"]]
).setStrategy("mean")

In [73]:
imp.fit(df_pyspark).transform(df_pyspark).show()

+---------+----+----------+------+--------+---------------+-----------+
|     Name| age|Experience|Salary|age_impd|Experience_impd|Salary_impd|
+---------+----+----------+------+--------+---------------+-----------+
|    Krish|  31|        10| 30000|      31|             10|      30000|
|Sudhanshu|  30|         8| 25000|      30|              8|      25000|
|    Sunny|  29|         4| 20000|      29|              4|      20000|
|     Paul|  24|         3| 20000|      24|              3|      20000|
|   Harsha|  21|         1| 15000|      21|              1|      15000|
|  Shubham|  23|         2| 18000|      23|              2|      18000|
|   Mahesh|NULL|      NULL| 40000|      28|              5|      40000|
|     NULL|  34|        10| 38000|      34|             10|      38000|
|     NULL|  36|      NULL|  NULL|      36|              5|      25750|
+---------+----+----------+------+--------+---------------+-----------+



### Filter operations