### Handling Missing Values
* Dropping columns
* Dropping rows
* Parameters of the row-dropping functions (`how`, `thresh`, `subset`)
* Filling missing values with a constant
* Imputing missing values with the mean, median, or mode

In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

In [3]:
# Obtain the Spark session for this notebook: reuse a running one if it
# exists, otherwise start a new session under the given application name.
spark = (
    SparkSession.builder
    .appName('handling_missing_values')
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [4]:
# Load the sample dataset. The first line of the file supplies the column
# names, and Spark is asked to infer each column's type from the data.
df = spark.read.csv(
    'test_dataset_02.csv',
    inferSchema=True,
    header=True,
)

# Print the raw rows so the null cells are visible before any cleaning.
df.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Tarek|  23|         5| 30000|
| Forid|  24|         8| 25000|
| Ridoy|  24|        10| 20000|
| Imran|  25|         7| 20000|
|Saiful|  27|         4| 15000|
|  Mitu|null|         2| 18000|
|  null|  27|      null| 48000|
|  Ritu|  24|        10| 38000|
|  null|  27|      null|  null|
+------+----+----------+------+



In [5]:
# Drop a whole column. drop() returns a new DataFrame without 'Name';
# `df` itself still has the column in the cells that follow.
df_without_name = df.drop('Name')
df_without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  23|         5| 30000|
|  24|         8| 25000|
|  24|        10| 20000|
|  25|         7| 20000|
|  27|         4| 15000|
|null|         2| 18000|
|  27|      null| 48000|
|  24|        10| 38000|
|  27|      null|  null|
+----+----------+------+



In [6]:
# Drop every row that has a null in at least one column
# (this is what na.drop() does when called without arguments).
(
    df.na
    .drop()
    .show()
)

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Tarek| 23|         5| 30000|
| Forid| 24|         8| 25000|
| Ridoy| 24|        10| 20000|
| Imran| 25|         7| 20000|
|Saiful| 27|         4| 15000|
|  Ritu| 24|        10| 38000|
+------+---+----------+------+



In [7]:
# how='any': drop a row as soon as any one of its fields is null.
# The result is identical to the argument-less na.drop() call shown earlier.
rows_without_nulls = df.na.drop(how='any')
rows_without_nulls.show()

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Tarek| 23|         5| 30000|
| Forid| 24|         8| 25000|
| Ridoy| 24|        10| 20000|
| Imran| 25|         7| 20000|
|Saiful| 27|         4| 15000|
|  Ritu| 24|        10| 38000|
+------+---+----------+------+



In [8]:
# how='all': drop a row only when every one of its fields is null.
# No row in this dataset is entirely null, so all rows are kept.
(
    df.na
    .drop(how='all')
    .show()
)

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Tarek|  23|         5| 30000|
| Forid|  24|         8| 25000|
| Ridoy|  24|        10| 20000|
| Imran|  25|         7| 20000|
|Saiful|  27|         4| 15000|
|  Mitu|null|         2| 18000|
|  null|  27|      null| 48000|
|  Ritu|  24|        10| 38000|
|  null|  27|      null|  null|
+------+----+----------+------+



In [9]:
# thresh=3: keep only rows that have at least 3 non-null values
# (rows with fewer than 3 non-null values are dropped).
# `how` is deliberately not passed: when `thresh` is given it overrides `how`,
# so how='any' would be silently ignored and only mislead the reader.
df.na.drop(thresh=3).show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Tarek|  23|         5| 30000|
| Forid|  24|         8| 25000|
| Ridoy|  24|        10| 20000|
| Imran|  25|         7| 20000|
|Saiful|  27|         4| 15000|
|  Mitu|null|         2| 18000|
|  Ritu|  24|        10| 38000|
+------+----+----------+------+



In [10]:
# subset: only the listed columns are checked for nulls.
# Rows with a null 'Experience' are dropped; nulls in other columns
# (e.g. the missing Age for Mitu) do not cause a row to be removed.
rows_with_experience = df.na.drop(subset=['Experience'], how='any')
rows_with_experience.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Tarek|  23|         5| 30000|
| Forid|  24|         8| 25000|
| Ridoy|  24|        10| 20000|
| Imran|  25|         7| 20000|
|Saiful|  27|         4| 15000|
|  Mitu|null|         2| 18000|
|  Ritu|  24|        10| 38000|
+------+----+----------+------+



In [13]:
# Show the original data first, for comparison with the filled result.
df.show()

# Replace missing values with the constant 0. A numeric fill value only
# touches numeric columns: the string column 'Name' keeps its nulls.
zero_filled = df.na.fill(0)
zero_filled.show()

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Tarek|  23|         5| 30000|
| Forid|  24|         8| 25000|
| Ridoy|  24|        10| 20000|
| Imran|  25|         7| 20000|
|Saiful|  27|         4| 15000|
|  Mitu|null|         2| 18000|
|  null|  27|      null| 48000|
|  Ritu|  24|        10| 38000|
|  null|  27|      null|  null|
+------+----+----------+------+

+------+---+----------+------+
|  Name|Age|Experience|Salary|
+------+---+----------+------+
| Tarek| 23|         5| 30000|
| Forid| 24|         8| 25000|
| Ridoy| 24|        10| 20000|
| Imran| 25|         7| 20000|
|Saiful| 27|         4| 15000|
|  Mitu|  0|         2| 18000|
|  null| 27|         0| 48000|
|  Ritu| 24|        10| 38000|
|  null| 27|         0|     0|
+------+---+----------+------+



In [16]:
# Fill nulls with 0 in the 'Experience' column only; nulls in every
# other column (Age, Salary, Name) are left as they are.
(
    df.na
    .fill(value=0, subset=['Experience'])
    .show()
)

+------+----+----------+------+
|  Name| Age|Experience|Salary|
+------+----+----------+------+
| Tarek|  23|         5| 30000|
| Forid|  24|         8| 25000|
| Ridoy|  24|        10| 20000|
| Imran|  25|         7| 20000|
|Saiful|  27|         4| 15000|
|  Mitu|null|         2| 18000|
|  null|  27|         0| 48000|
|  Ritu|  24|        10| 38000|
|  null|  27|         0|  null|
+------+----+----------+------+



In [18]:
# Columns to impute. Defined once so the input and output column lists
# below are always derived from the same source and cannot drift apart.
numeric_cols = ['Age', 'Experience', 'Salary']

# Configure (but do not yet fit) an imputer that replaces missing values
# with the column mean. Each input column gets a companion
# '<column>_imputed' output column, so the original columns are preserved.
imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=["{}_imputed".format(c) for c in numeric_cols],
    strategy='mean',
)

In [19]:
# Fit the imputer on df (this computes the per-column fill values), then
# apply it to the same DataFrame to append the '*_imputed' columns.
# NOTE: in the printed result the filled cells are whole numbers, e.g.
# Experience_imputed shows 6 although the mean of the known values is
# 46/7 (about 6.57) - presumably the result is cast back to the integer
# input column type; confirm before relying on the precision.
imputer_model = imputer.fit(df)
imputer_model.transform(df).show()

+------+----+----------+------+-----------+------------------+--------------+
|  Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+------+----+----------+------+-----------+------------------+--------------+
| Tarek|  23|         5| 30000|         23|                 5|         30000|
| Forid|  24|         8| 25000|         24|                 8|         25000|
| Ridoy|  24|        10| 20000|         24|                10|         20000|
| Imran|  25|         7| 20000|         25|                 7|         20000|
|Saiful|  27|         4| 15000|         27|                 4|         15000|
|  Mitu|null|         2| 18000|         25|                 2|         18000|
|  null|  27|      null| 48000|         27|                 6|         48000|
|  Ritu|  24|        10| 38000|         24|                10|         38000|
|  null|  27|      null|  null|         27|                 6|         26750|
+------+----+----------+------+-----------+------------------+--