In [269]:
#%pip install pyspark

In [270]:
import pyspark

In [271]:
import pandas as pd
pd.read_csv('test1.csv')

Unnamed: 0,Name,Age,Experience
0,Krish,31,10
1,Sudhansh,30,8
2,Sunny,29,4


In [272]:
type(pd.read_csv('test1.csv'))

pandas.core.frame.DataFrame

In [273]:
from pyspark.sql import SparkSession

The next cell starts the Spark session, which lets us run Spark locally on a single node (the 'Master' node, or host).

In [274]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [275]:
spark

In [276]:
# Read the CSV without any options: the header line is treated as an ordinary
# data row and the columns get the default names _c0, _c1, _c2 (see output below).
df_pyspark = spark.read.csv('test1.csv')

In [277]:
df_pyspark.show()

+--------+---+----------+
|     _c0|_c1|       _c2|
+--------+---+----------+
|    Name|Age|Experience|
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



In [278]:
spark.read.option('header','true').csv('test1.csv')

DataFrame[Name: string, Age: string, Experience: string]

In [279]:
spark.read.option('header','true').csv('test1.csv').show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



In [280]:
# Re-read with header=true so the first line supplies the column names.
# No schema inference is requested, so every column is read as a string.
df_pyspark = spark.read.option('header','true').csv('test1.csv')

In [281]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [282]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)



This is a Spark SQL DataFrame, similar to a Pandas DataFrame (to convert it to a Pandas DataFrame, simply apply the .toPandas() method). Let's check whether we can read the first few rows.

In [283]:
df_pyspark.head(3)

[Row(Name='Krish', Age='31', Experience='10'),
 Row(Name='Sudhansh', Age='30', Experience='8'),
 Row(Name='Sunny', Age='29', Experience='4')]

In [284]:
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



The 'select( )' method must be used to identify a column name to show.

In [285]:
df_pyspark.select('Name')

DataFrame[Name: string]

In [286]:
df_pyspark.select('Name').show()

+--------+
|    Name|
+--------+
|   Krish|
|Sudhansh|
|   Sunny|
+--------+



In [287]:
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

Selecting more than one column to reveal the row entries.

In [288]:
df_pyspark.select(['Name','Experience']).show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|   Krish|        10|
|Sudhansh|         8|
|   Sunny|         4|
+--------+----------+



In [289]:
df_pyspark['Name']

Column<'Name'>

In [290]:
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'string'), ('Experience', 'string')]

In [291]:
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string]

In [292]:
df_pyspark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| Age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| null|30.0|7.333333333333333|
| stddev| null| 1.0|3.055050463303893|
|    min|Krish|  29|               10|
|    max|Sunny|  31|                8|
+-------+-----+----+-----------------+



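Obviously no numeric statistics (mean, stddev) can be computed for the string 'Name' variable. The min and max values for the 'Name' variable are determined by alphabetical (lexicographic) string ordering, which puts 'Krish' first and 'Sunny' last. Because 'Age' and 'Experience' were also read as strings, their min and max are compared as text too, which is why 'Experience' reports a min of '10' and a max of '8'.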

## Adding a Column

In [293]:
# withColumn returns a new DataFrame; the result is not assigned here, so
# df_pyspark itself is unchanged. 'Experience' is a string column, yet the
# derived column is reported as double (see the schema in the output).
df_pyspark.withColumn('Experience After 2 Years', df_pyspark['Experience']+2)

DataFrame[Name: string, Age: string, Experience: string, Experience After 2 Years: double]

In order for this 'withColumn' method to be reflected it must be assigned to a variable:

In [294]:
df_pyspark = df_pyspark.withColumn('Experience After 2 Years', df_pyspark['Experience']+2)

In [295]:
df_pyspark

DataFrame[Name: string, Age: string, Experience: string, Experience After 2 Years: double]

In [296]:
df_pyspark.show()

+--------+---+----------+------------------------+
|    Name|Age|Experience|Experience After 2 Years|
+--------+---+----------+------------------------+
|   Krish| 31|        10|                    12.0|
|Sudhansh| 30|         8|                    10.0|
|   Sunny| 29|         4|                     6.0|
+--------+---+----------+------------------------+



## Dropping Columns

In [297]:
df_pyspark.drop('Experience After 2 Years').show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



Once again, the result of the method must be assigned to a variable; to keep the DataFrame with the column dropped, assign it back to the df_pyspark variable:

In [298]:
# drop() returns a new DataFrame without the column; rebinding df_pyspark to
# that result is what makes the removal persist for the following cells.
df_pyspark = df_pyspark.drop('Experience After 2 Years')
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



## Re-naming a Column

In [299]:
# Display only: the renamed DataFrame is not assigned, so df_pyspark keeps
# its original 'Name' column in later cells.
df_pyspark.withColumnRenamed('Name', 'New Name').show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudhansh| 30|         8|
|   Sunny| 29|         4|
+--------+---+----------+



## PySpark Handling Missing Values
1. Dropping Columns
2. Dropping Rows
3. Various Parameter in Dropping Functionalities
4. Handling Missing Values by Mean, Median and Mode

In [300]:
# SparkSession was already imported and a session named 'Practise' was already
# built earlier in this notebook; these lines are repeated so this section can
# also be run on its own.
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists.
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [301]:
spark.read.csv('test2.csv', header=True, inferSchema=True)

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

To see the entire dataset.

In [302]:
spark.read.csv('test2.csv', header=True, inferSchema=True).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



Save the dataset as a dataframe variable.

In [303]:
# header=True takes the column names from the first line; inferSchema=True
# lets Spark detect the column types (Age/Experience/Salary become integers).
df_pyspark = spark.read.csv('test2.csv', header=True, inferSchema=True)

Note that spark.read.csv( ) already returns a DataFrame, so simply assigning its result to the 'df_pyspark' variable is enough; no extra conversion step (such as the .toDF() method) is required.

In [304]:
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



In [305]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [306]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



## Dropping the Columns (Again)

In [307]:
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



The original dataframe was never changed, so simply using the show( ) method on it again displays all of its columns. The 'Name' column will only be dropped permanently if the value of the expression is assigned. It could be assigned once again to 'df_pyspark', or to a completely different variable name such as 'df', or perhaps something more easily remembered like 'no_name', but the important point to remember is that drop( ) returns a new DataFrame and leaves the original untouched: until that result is assigned to a variable it is not kept!

In [308]:
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



## Dropping Specific Rows

This will drop any rows with Null values.

In [309]:
df_pyspark.na.drop().show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
+--------+---+----------+------+



### Drop Function

Looking at the arguments in the drop( ) function we have: 'how', 'thresh' and 'subset'. I can view these simply by placing the cursor at the function parentheses and typing Shift-Tab. This will show a little drop down comment bubble explaining configuration options for each function argument. (Actually, it's also important to note that key-value args always follow positional args in the order).

Hitting the '+' icon in the top right of the dropdown bubble expands the explanation options.

In [310]:
# how='any': drop a row as soon as it contains at least one null value
df_pyspark.na.drop(how='any').show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
+--------+---+----------+------+



In this example dataset the drop( ) method has removed 'any' instance which contains a value of null. If we set the 'how' arg to 'all', then it would only remove an instance or row in the dataset if all values were null within that instance or row. See below...

In [311]:
# how='all': drop a row only when every one of its values is null
# (no row in this dataset qualifies, so nothing is removed)
df_pyspark.na.drop(how='all').show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



So with how='all' none of the instances are removed this time, because no row in this dataset consists entirely of null values!

In [312]:
# thresh=2: keep only rows that have at least 2 non-null values;
# rows with fewer than 2 non-null values are dropped
df_pyspark.na.drop(thresh=2).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
+--------+----+----------+------+



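The 'thresh' (threshold) argument sets the minimum number of non-null values a row must contain in order to be kept: a row or instance is dropped only if it has fewer non-null values than the threshold specified! If set to two, the last row (which has only one non-null value, Age = 36) is dropped, while every row with at least 2 non-null values is kept. When 'thresh' is supplied it takes precedence over the 'how' argument.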

In [313]:
# subset: only the listed column(s) are checked for nulls when deciding
# which rows to drop; nulls in other columns are ignored
df_pyspark.na.drop(how='any', subset=['Experience']).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
|    null| 34|        10| 38000|
+--------+---+----------+------+



Only rows with a null value in the 'Experience' column are dropped; null values in the other columns (such as 'Name') are ignored. This is a form of dataset slicing in PySpark.

In [314]:
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



## Filling Missing Values

In [315]:
# A string fill value is only applied to string-typed columns (here 'Name');
# the integer columns keep their nulls, as the output below shows.
df_pyspark.fillna('Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|      Sudhansh|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



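At first glance this doesn't seem to be working! Why not? The fill method only changes the values in one column, the 'Name' column, rather than the null values in all columns. This is by design: a string replacement value is only applied to string-typed columns, and columns whose data type does not match the value are simply ignored.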

According to the PySpark documentation there are two main arguments to address which are the positional replacement 'value' (a string, number, or simply " ") for the Null items in your dataset and the 'subset' key-value argument which can be set to None, or a list [], or tuple () of 'column_name' values. It can represent only one column's values, or several column values if desired. 

In [316]:
# subset=None means every column is considered, but the string value is
# still only written into the string column 'Name' (see output).
df_pyspark.fillna(value='Missing', subset=None).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
| Missing|  34|        10| 38000|
| Missing|  36|      null|  null|
+--------+----+----------+------+



In [317]:
# Listing all four columns explicitly changes nothing: the integer columns in
# the subset do not match the string fill value and are left as null (see output).
df_pyspark.na.fill(value='Missing', subset=['Name','Age','Experience','Salary']).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
| Missing|  34|        10| 38000|
| Missing|  36|      null|  null|
+--------+----+----------+------+



Try setting fill(value=0) for all null values.

In [318]:
# An integer fill value is applied to the integer columns only; the nulls in
# the string column 'Name' remain (see output).
df_pyspark.na.fill(value=0).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
|  Mahesh|  0|         0| 40000|
|    null| 34|        10| 38000|
|    null| 36|         0|     0|
+--------+---+----------+------+



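This hasn't filled everything either (this time the Integer columns were filled but 'Name' was not), which tells me the behaviour depends on the different datatypes associated with each attribute or column. The 'Name' column is the only one which is a String; all the other columns are Integer. A fill value is only applied to columns of a matching type, so a string value only fills 'Name' and a numeric value only fills the Integer columns.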

In [319]:
# na.fill("") returns a new DataFrame, whereas .show() only prints and returns
# None. Store the filled DataFrame first and display it in a separate call, so
# that no variable ends up bound to None. A separate name is used so the raw
# df_pyspark stays intact. Note that "" is a string, so only the string column
# 'Name' is filled.
df_name_filled = df_pyspark.na.fill("")
df_name_filled.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|        |  34|        10| 38000|
|        |  36|      null|  null|
+--------+----+----------+------+



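Storing the result of the na.fill( ) method in a variable doesn't change this behaviour either: an empty string is still a string value, so only the 'Name' column is filled. (Also note that .show() returns None, so the DataFrame returned by na.fill( ) should be assigned on its own and .show() called on it separately; never assign the result of .show() to a variable.)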

In [320]:
# Start again from the raw CSV so the inferred (integer) schema is restored.
df_pyspark = spark.read.csv('test2.csv', header=True, inferSchema=True)

# A string fill value only reaches string columns, so convert each of the
# three integer columns to string, one after another.
for numeric_col in ('Age', 'Experience', 'Salary'):
    df_pyspark = df_pyspark.withColumn(numeric_col,
                                       df_pyspark[numeric_col].cast('string'))

# Every column is now a string, so every null can take the placeholder text.
df_pyspark = df_pyspark.fillna('Missing Value')

# Show the fully filled DataFrame.
df_pyspark.show()

+-------------+-------------+-------------+-------------+
|         Name|          Age|   Experience|       Salary|
+-------------+-------------+-------------+-------------+
|        Krish|           31|           10|        30000|
|     Sudhansh|           30|            8|        25000|
|        Sunny|           29|            4|        20000|
|         Paul|           24|            3|        20000|
|       Harsha|           21|            1|        15000|
|      Shubham|           23|            2|        18000|
|       Mahesh|Missing Value|Missing Value|        40000|
|Missing Value|           34|           10|        38000|
|Missing Value|           36|Missing Value|Missing Value|
+-------------+-------------+-------------+-------------+



In [321]:
# Shut down the Spark session now that the walkthrough is finished.
spark.stop()