In [1]:
#%pip install pyspark

In [2]:
import pyspark

In [3]:
import pandas as pd
pd.read_csv('test1.csv')

Unnamed: 0,Name,Age,Experience,Salary
0,Krish,31,10,30000
1,Sudhanshu,30,8,25000
2,Sunny,29,4,20000
3,Paul,24,3,20000
4,Harsha,21,1,15000
5,Shubham,23,2,18000


In [4]:
type(pd.read_csv('test1.csv'))

pandas.core.frame.DataFrame

In [5]:
from pyspark.sql import SparkSession

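`SparkSession` is the entry point to Spark's DataFrame API. Importing it does not start anything; the session is created in the next cell with `SparkSession.builder...getOrCreate()`. When no cluster master is configured, Spark typically runs in local mode, i.e. everything runs on this single machine (the host).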

In [6]:
spark = SparkSession.builder.appName('Practise').getOrCreate()

In [7]:
spark

In [8]:
df_pyspark = spark.read.csv('test1.csv')

In [9]:
df_pyspark.show()

+---------+---+----------+------+
|      _c0|_c1|       _c2|   _c3|
+---------+---+----------+------+
|     Name|Age|Experience|Salary|
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [10]:
spark.read.option('header','true').csv('test1.csv')

DataFrame[Name: string, Age: string, Experience: string, Salary: string]

In [11]:
spark.read.option('header','true').csv('test1.csv').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [12]:
df_pyspark = spark.read.option('header','true').csv('test1.csv')

In [13]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [14]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Experience: string (nullable = true)
 |-- Salary: string (nullable = true)



This is a Spark SQL DataFrame, similar to a pandas DataFrame (to convert it to a pandas DataFrame, simply apply the .toPandas() method). Note that every column has been read as a string, because the schema was not inferred. Let's check that we can read the first few rows.

In [15]:
df_pyspark.head(3)

[Row(Name='Krish', Age='31', Experience='10', Salary='30000'),
 Row(Name='Sudhanshu', Age='30', Experience='8', Salary='25000'),
 Row(Name='Sunny', Age='29', Experience='4', Salary='20000')]

In [16]:
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



The 'select( )' method must be used to identify a column name to show.

In [17]:
df_pyspark.select('Name')

DataFrame[Name: string]

In [18]:
df_pyspark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [19]:
type(df_pyspark.select('Name'))

pyspark.sql.dataframe.DataFrame

Selecting more than one column to reveal the row entries.

In [20]:
df_pyspark.select(['Name','Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [21]:
df_pyspark['Name']

Column<'Name'>

In [22]:
df_pyspark.dtypes

[('Name', 'string'),
 ('Age', 'string'),
 ('Experience', 'string'),
 ('Salary', 'string')]

In [23]:
df_pyspark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string, Salary: string]

In [24]:
df_pyspark.describe().show()

+-------+------+------------------+-----------------+------------------+
|summary|  Name|               Age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|                8|             30000|
+-------+------+------------------+-----------------+------------------+



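Obviously no mean or standard deviation can be computed for the string 'Name' variable. The min and max values for 'Name' are determined by alphabetical (lexicographic) ordering of the strings, which gives 'Harsha' as the lowest and 'Sunny' as the highest. The same string ordering explains why the max of 'Experience' is reported as 8 rather than 10: the columns were read as strings, and '8' sorts after '10'.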

## Adding a Column

In [25]:
df_pyspark.withColumn('Experience After 2 Years', df_pyspark['Experience']+2)

DataFrame[Name: string, Age: string, Experience: string, Salary: string, Experience After 2 Years: double]

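Spark DataFrames are immutable, so 'withColumn' returns a new DataFrame rather than modifying the existing one. In order for the new column to be kept, the result must be assigned to a variable. (The new column is a double because 'Experience' was read as a string and is implicitly cast for the addition.)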

In [26]:
df_pyspark = df_pyspark.withColumn('Experience After 2 Years', df_pyspark['Experience']+2)

In [27]:
df_pyspark

DataFrame[Name: string, Age: string, Experience: string, Salary: string, Experience After 2 Years: double]

In [28]:
df_pyspark.show()

+---------+---+----------+------+------------------------+
|     Name|Age|Experience|Salary|Experience After 2 Years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                    12.0|
|Sudhanshu| 30|         8| 25000|                    10.0|
|    Sunny| 29|         4| 20000|                     6.0|
|     Paul| 24|         3| 20000|                     5.0|
|   Harsha| 21|         1| 15000|                     3.0|
|  Shubham| 23|         2| 18000|                     4.0|
+---------+---+----------+------+------------------------+



## Dropping Columns

In [29]:
df_pyspark.drop('Experience After 2 Years').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



Once again, the result must be assigned to a variable for the change to persist, so to see that the column has been dropped, assign the result back to the df_pyspark variable:

In [30]:
df_pyspark = df_pyspark.drop('Experience After 2 Years')
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## Re-naming a Column

In [31]:
df_pyspark.withColumnRenamed('Name', 'New Name').show()

+---------+---+----------+------+
| New Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## PySpark Handling Missing Values
1. Dropping Columns
2. Dropping Rows
3. Various Parameter in Dropping Functionalities
4. Handling Missing Values by Mean, Median and Mode

In [32]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('Practise').getOrCreate()

In [33]:
spark.read.csv('test2.csv', header=True, inferSchema=True)

DataFrame[Name: string, Age: int, Experience: int, Salary: int]

To see the entire dataset.

In [34]:
spark.read.csv('test2.csv', header=True, inferSchema=True).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



Save the dataset as a dataframe variable.

In [35]:
df_pyspark = spark.read.csv('test2.csv', header=True, inferSchema=True)

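Note that `spark.read.csv(...)` already returns a DataFrame, so assigning it to the 'df_pyspark' variable is all that is required. The .toDF() method could also be applied before assigning, but it is only needed when the columns should be renamed.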

In [36]:
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



In [37]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [38]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



## Dropping the Columns (Again)

In [39]:
df_pyspark.drop('Name').show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



The original dataframe has not been changed: drop( ) returns a new DataFrame and leaves 'df_pyspark' as it was, so calling the show( ) method on it again still displays the 'Name' column. The 'Name' column will only be dropped permanently if the value of the expression is assigned. It could be assigned once again to 'df_pyspark', or to a completely different variable name such as 'df', or perhaps something more easily remembered like 'no_name', but the important point to remember is that until the result of the expression is assigned to a variable it is simply discarded!

In [40]:
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



## Dropping Specific Rows

This will drop any rows with Null values.

In [41]:
df_pyspark.na.drop().show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
+--------+---+----------+------+



### Drop Function

Looking at the arguments in the drop( ) function we have: 'how', 'thresh' and 'subset'. I can view these simply by placing the cursor at the function parentheses and typing Shift-Tab. This will show a little drop down comment bubble explaining configuration options for each function argument. (Actually, it's also important to note that key-value args always follow positional args in the order).

Hitting the '+' icon in the top right of the dropdown bubble expands the explanation options.

In [42]:
# how argument
df_pyspark.na.drop(how='any').show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
+--------+---+----------+------+



In this example dataset the drop( ) method has removed 'any' instance which contains a value of null. If we set the 'how' arg to 'all', then it would only remove an instance or row in the dataset if all values were null within that instance or row. See below...

In [43]:
# how argument set to 'all'
df_pyspark.na.drop(how='all').show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



So the how argument doesn't remove any of the instances this time!

In [44]:
# thresh
df_pyspark.na.drop(thresh=2).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
+--------+----+----------+------+



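The 'thresh' argument is the minimum number of non-null values a row must contain in order to be kept: any row with fewer non-null values than the threshold is dropped. If set to two, a row needs at least 2 non-null values to survive. Here the last row is dropped because it has only one non-null value (Age = 36), while the 'Mahesh' row is kept because it has two (Name and Salary).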

In [45]:
# subset
df_pyspark.na.drop(how='any', subset=['Experience']).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
|    null| 34|        10| 38000|
+--------+---+----------+------+



Only rows with a null value in the 'Experience' column are dropped; null values in the other columns are ignored (note the row with a missing Name is kept). This is a form of dataset slicing in PySpark.

In [46]:
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|    null|  34|        10| 38000|
|    null|  36|      null|  null|
+--------+----+----------+------+



## Filling Missing Values

In [47]:
df_pyspark.na.fill('Missing Values').show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|         Krish|  31|        10| 30000|
|      Sudhansh|  30|         8| 25000|
|         Sunny|  29|         4| 20000|
|          Paul|  24|         3| 20000|
|        Harsha|  21|         1| 15000|
|       Shubham|  23|         2| 18000|
|        Mahesh|null|      null| 40000|
|Missing Values|  34|        10| 38000|
|Missing Values|  36|      null|  null|
+--------------+----+----------+------+



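At first glance this doesn't seem to be working: the fill method has only changed the values in one column, the 'Name' column, rather than the null values in all columns. Why? This is in fact the expected behaviour: na.fill( ) only replaces nulls in columns whose data type matches the type of the replacement value, so a string value only fills string columns and the integer columns are left untouched.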

According to the PySpark documentation there are two main arguments to address which are the positional replacement 'value' (a string, number, or simply " ") for the Null items in your dataset and the 'subset' key-value argument which can be set to None, or a list [], or tuple () of 'column_name' values. It can represent only one column's values, or several column values if desired. 

In [48]:
df_pyspark.na.fill(value='Missing', subset=None).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
| Missing|  34|        10| 38000|
| Missing|  36|      null|  null|
+--------+----+----------+------+



In [49]:
df_pyspark.na.fill(value='Missing', subset=['Name','Age','Experience','Salary']).show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
| Missing|  34|        10| 38000|
| Missing|  36|      null|  null|
+--------+----+----------+------+



Try setting fill(value=0) for all null values.

In [50]:
df_pyspark.na.fill(value=0).show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
|   Krish| 31|        10| 30000|
|Sudhansh| 30|         8| 25000|
|   Sunny| 29|         4| 20000|
|    Paul| 24|         3| 20000|
|  Harsha| 21|         1| 15000|
| Shubham| 23|         2| 18000|
|  Mahesh|  0|         0| 40000|
|    null| 34|        10| 38000|
|    null| 36|         0|     0|
+--------+---+----------+------+



This hasn't filled every column either, which confirms that the behaviour comes from the different data types associated with each column. The integer value 0 fills only the integer columns (Age, Experience and Salary), while 'Name', the only String column, keeps its nulls. The fill value's type must match the column's type, so no single value can fill every column while the data types are mixed.

In [51]:
# `show()` only prints the DataFrame and returns None, so the filled
# DataFrame is assigned first and displayed in a separate statement.
# Chaining `.show()` onto the assignment would bind `df_pyspark` to None.
# An empty-string fill value only affects string columns (here: 'Name').
df_pyspark = df_pyspark.na.fill("")
df_pyspark.show()

+--------+----+----------+------+
|    Name| Age|Experience|Salary|
+--------+----+----------+------+
|   Krish|  31|        10| 30000|
|Sudhansh|  30|         8| 25000|
|   Sunny|  29|         4| 20000|
|    Paul|  24|         3| 20000|
|  Harsha|  21|         1| 15000|
| Shubham|  23|         2| 18000|
|  Mahesh|null|      null| 40000|
|        |  34|        10| 38000|
|        |  36|      null|  null|
+--------+----+----------+------+



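Be careful when storing the result of na.fill( ) in a variable: show( ) only prints the DataFrame and returns None, so an assignment that ends in .show( ) stores None rather than the filled DataFrame. Assign the result of na.fill( ) first, then call show( ) on the variable. Also note that an empty string, like any string value, only fills the string columns.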

In [52]:
# Reload the raw CSV so this cell starts from a freshly inferred schema
df_pyspark = spark.read.csv('test2.csv', header=True, inferSchema=True)

# Convert each integer column to string, one after another, so that a text
# placeholder can be stored in it
for numeric_col in ('Age', 'Experience', 'Salary'):
    df_pyspark = df_pyspark.withColumn(numeric_col, df_pyspark[numeric_col].cast('string'))

# Every column is now a string column, so one string value fills all nulls
df_pyspark = df_pyspark.fillna('Missing Value')

# Render the filled DataFrame
df_pyspark.show()

+-------------+-------------+-------------+-------------+
|         Name|          Age|   Experience|       Salary|
+-------------+-------------+-------------+-------------+
|        Krish|           31|           10|        30000|
|     Sudhansh|           30|            8|        25000|
|        Sunny|           29|            4|        20000|
|         Paul|           24|            3|        20000|
|       Harsha|           21|            1|        15000|
|      Shubham|           23|            2|        18000|
|       Mahesh|Missing Value|Missing Value|        40000|
|Missing Value|           34|           10|        38000|
|Missing Value|           36|Missing Value|Missing Value|
+-------------+-------------+-------------+-------------+



In [53]:
df_pyspark.na.fill('Missing Value', ['Experience','Age']).show()

+-------------+-------------+-------------+-------------+
|         Name|          Age|   Experience|       Salary|
+-------------+-------------+-------------+-------------+
|        Krish|           31|           10|        30000|
|     Sudhansh|           30|            8|        25000|
|        Sunny|           29|            4|        20000|
|         Paul|           24|            3|        20000|
|       Harsha|           21|            1|        15000|
|      Shubham|           23|            2|        18000|
|       Mahesh|Missing Value|Missing Value|        40000|
|Missing Value|           34|           10|        38000|
|Missing Value|           36|Missing Value|Missing Value|
+-------------+-------------+-------------+-------------+



In [54]:
df_pyspark.show()

+-------------+-------------+-------------+-------------+
|         Name|          Age|   Experience|       Salary|
+-------------+-------------+-------------+-------------+
|        Krish|           31|           10|        30000|
|     Sudhansh|           30|            8|        25000|
|        Sunny|           29|            4|        20000|
|         Paul|           24|            3|        20000|
|       Harsha|           21|            1|        15000|
|      Shubham|           23|            2|        18000|
|       Mahesh|Missing Value|Missing Value|        40000|
|Missing Value|           34|           10|        38000|
|Missing Value|           36|Missing Value|Missing Value|
+-------------+-------------+-------------+-------------+



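Replace the Null values with the mean values for a particular column or for each column. This involves imputing the mean values. The columns are first cast back to integer, which turns the 'Missing Value' placeholders back into nulls (a non-numeric string casts to null). In this example I will impute the Null values in the 'Age', 'Experience' and 'Salary' columns.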

In [55]:
# Turn the three string columns back into integers, in the same order as before.
# The printed output of the imputer cells shows the 'Missing Value'
# placeholders as null again after this cast.
for text_col in ('Age', 'Experience', 'Salary'):
    df_pyspark = df_pyspark.withColumn(text_col, df_pyspark[text_col].cast('integer'))

In [56]:
from pyspark.ml.feature import Imputer

In [57]:
imputer = Imputer(
    inputCols=['Age','Experience','Salary'],
    outputCols=["{}_imputed".format(c) for c in ['Age','Experience','Salary']]).setStrategy("mean")

In [58]:
# Add imputation cols to df
imputer.fit(df_pyspark).transform(df_pyspark).show()

+-------------+----+----------+------+-----------+------------------+--------------+
|         Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------------+----+----------+------+-----------+------------------+--------------+
|        Krish|  31|        10| 30000|         31|                10|         30000|
|     Sudhansh|  30|         8| 25000|         30|                 8|         25000|
|        Sunny|  29|         4| 20000|         29|                 4|         20000|
|         Paul|  24|         3| 20000|         24|                 3|         20000|
|       Harsha|  21|         1| 15000|         21|                 1|         15000|
|      Shubham|  23|         2| 18000|         23|                 2|         18000|
|       Mahesh|null|      null| 40000|         28|                 5|         40000|
|Missing Value|  34|        10| 38000|         34|                10|         38000|
|Missing Value|  36|      null|  null|         36|               

The same can be done with median values also. Note, the imputed values are created in entirely new columns with the average values.

In [59]:
# Columns to impute; each gets a companion "<name>_imputed" output column
cols_to_impute = ['Age','Experience','Salary']
imputed_names = ["{}_imputed".format(col_name) for col_name in cols_to_impute]

# Configure the imputer to use the median (setStrategy returns the imputer itself)
imputer = Imputer(inputCols=cols_to_impute, outputCols=imputed_names)
imputer = imputer.setStrategy("median")

# Fit on the DataFrame, then add the imputed columns and display the result
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+-------------+----+----------+------+-----------+------------------+--------------+
|         Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+-------------+----+----------+------+-----------+------------------+--------------+
|        Krish|  31|        10| 30000|         31|                10|         30000|
|     Sudhansh|  30|         8| 25000|         30|                 8|         25000|
|        Sunny|  29|         4| 20000|         29|                 4|         20000|
|         Paul|  24|         3| 20000|         24|                 3|         20000|
|       Harsha|  21|         1| 15000|         21|                 1|         15000|
|      Shubham|  23|         2| 18000|         23|                 2|         18000|
|       Mahesh|null|      null| 40000|         29|                 4|         40000|
|Missing Value|  34|        10| 38000|         34|                10|         38000|
|Missing Value|  36|      null|  null|         36|               

## PySpark DataFrames
1. Filter Operations
2. &, |, == (And, Or, Equals)
3. ~ (Not)

In [60]:
from pyspark.sql import SparkSession

In [61]:
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [62]:
df_pyspark = spark.read.csv('test1.csv', header=True, inferSchema=True)
df_pyspark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



There are a number of filters which can be applied using Pandas dataframes also. 

## Filter Operations

In [63]:
# salary of people less than or equal to 20000
df_pyspark.filter("Salary<=20000").show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [64]:
df_pyspark.filter("Salary<=20000").select(['Name','Age']).show()

+-------+---+
|   Name|Age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [65]:
df_pyspark.filter(df_pyspark['Salary']<=20000).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



### The AND operator
An important point to note here is that each comparison being evaluated must be contained within its own parentheses (brackets), because in Python `&` binds more tightly than comparison operators such as `<=`.

In [66]:
df_pyspark.filter((df_pyspark['Salary']<=20000) & (df_pyspark['Salary']>=18000)).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



### The OR operator

In [67]:
# NOTE: every non-null Salary is either <= 20000 or >= 15000, so this OR
# condition is true for all rows and the full table is returned.
df_pyspark.filter((df_pyspark['Salary']<=20000) | (df_pyspark['Salary']>=15000)).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



### The NOT operator

In [68]:
df_pyspark.filter(~(df_pyspark['Salary']<=20000)).show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
+---------+---+----------+------+



## PySpark GroupBy and Aggregate Functions

In [69]:
from pyspark.sql import SparkSession

In [70]:
spark = SparkSession.builder.appName('Agg').getOrCreate()

In [71]:
spark

In [72]:
df_pyspark = spark.read.csv('test3.csv', header=True, inferSchema=True)

In [73]:
df_pyspark.show()

+---------+------------+------+
|     Name| Departments|Salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [74]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)



So 'Name' is a string, 'Departments' is a string and 'Salary' is an integer data type.

### Groupby
#### Grouping by Name to Aggregate Salary
Grouping by Name, then applying aggregate functions (sum, max, min, mean) to the Salary.

In [75]:
df_pyspark.groupBy('Name')

<pyspark.sql.group.GroupedData at 0x27668833820>

GroupBy and Aggregate functions work together, so once groupBy( ) is applied, then an agg( ) function can follow.

In [76]:
df_pyspark.groupBy('Name').sum()

DataFrame[Name: string, sum(Salary): bigint]

This time a SQL dataframe is returned.

Let's show this dataframe.

In [77]:
df_pyspark.groupBy('Name').sum().show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



Press dot or period and tab...this will provide a whole list of possible functions.

In [78]:
df_pyspark.groupBy('Departments').sum().show()

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [79]:
df_pyspark.groupBy('Departments').mean().show()

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [80]:
df_pyspark.groupBy('Departments').count().show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



Try applying a direct aggregate function

In [81]:
df_pyspark.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



In [82]:
df_pyspark.groupBy('Name').max().show()

+---------+-----------+
|     Name|max(Salary)|
+---------+-----------+
|Sudhanshu|      20000|
|    Sunny|      10000|
|    Krish|      10000|
|   Mahesh|       4000|
+---------+-----------+



And also, to see the minimum value with respect to Name:

In [83]:
df_pyspark.groupBy('Name').min().show()

+---------+-----------+
|     Name|min(Salary)|
+---------+-----------+
|Sudhanshu|       5000|
|    Sunny|       2000|
|    Krish|       4000|
|   Mahesh|       3000|
+---------+-----------+



In [84]:
df_pyspark.groupBy('Name').avg().show()

+---------+------------------+
|     Name|       avg(Salary)|
+---------+------------------+
|Sudhanshu|11666.666666666666|
|    Sunny|            6000.0|
|    Krish| 6333.333333333333|
|   Mahesh|            3500.0|
+---------+------------------+



## Machine Learning with DataFrame API and RDD API's
### Examples of PySpark ML
Using a Linear Regression example.

In [85]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Missing').getOrCreate()

In [86]:
# read the dataset
training = spark.read.csv('test1.csv', header=True, inferSchema=True)

In [87]:
training.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [88]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [89]:
training.columns

['Name', 'Age', 'Experience', 'Salary']

In sklearn we have a dataset which we would perform a train-test split on, providing us with independent and dependent variables. In PySpark it's slightly different.

['Age','Experience'] ----> New Feature ----> Independent Feature

Grouping the independent variable features together into a single vector column is performed using the 'VectorAssembler' transformer.

In [90]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols=["Age", "Experience"], outputCol="Independent Features")

In [91]:
output = featureassembler.transform(training)

In [92]:
output.show()

+---------+---+----------+------+--------------------+
|     Name|Age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



Note the 'Independent Features' column has 2 values in each row listing Age and Experience. This is a type of feature engineering.

In [93]:
output.columns

['Name', 'Age', 'Experience', 'Salary', 'Independent Features']

In [94]:
finalized_data = output.select("Independent Features","Salary")

In [95]:
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [96]:
from pyspark.ml.regression import LinearRegression

# Train/test split. A fixed seed makes the split -- and therefore the fitted
# coefficients and the evaluation metrics in the following cells --
# reproducible across kernel restarts.
# NOTE: the weights are probabilities, not exact proportions; with only six
# rows the test set can be very small (or even empty), so check its size
# before trusting the metrics.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Configure the estimator: features come from the assembled vector column,
# and 'Salary' is the label to predict.
regressor = LinearRegression(featuresCol="Independent Features", labelCol="Salary")

# `regressor` is deliberately rebound to the fitted model, because later
# cells read `regressor.coefficients`, `regressor.intercept` and call
# `regressor.evaluate(...)`.
regressor = regressor.fit(train_data)

In [97]:
# Coefficients
regressor.coefficients

DenseVector([-323.2867, 1696.8066])

In [98]:
# Intercepts
regressor.intercept

22295.299605312008

In [99]:
# Prediction
pred_results = regressor.evaluate(test_data)

In [100]:
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [21.0,1.0]| 15000|17203.085755292603|
+--------------------+------+------------------+



In [101]:
pred_results.meanAbsoluteError, pred_results.meanSquaredError

(2203.0857552926027, 4853586.845173177)

## Multi-Linear Regression Problem
There are several features included in the independent values. In Databricks, you can start on the Databricks icon top left then click on 'Import & Explore Data' in the middle. Or alternatively, go to the 'Data' icon on the left menu and select the 'tips.csv' file and upload it.

You'll be presented with a choice of 'Create Table with UI' or 'Create Table in Notebook'. The file will be uploaded into the 'DBFS' file system, which is one of the tabs at the top. The file path will look something like: '/FileStore/tables/tips.csv'.

In [None]:
spark.stop()