### PySpark: Handling Missing Values
* Dropping Columns
* Dropping Rows
* Parameters of the row-dropping function (`how`, `thresh`, `subset`)
* Handling Missing values by Mean, Median And Mode

In [1]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [2]:
# Load the CSV, treating the first line as column names and letting Spark
# infer each column's type from the data.
df_pyspark = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Print the inferred schema (column names, types and nullability).
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [4]:
# Display the loaded rows; the last three rows contain null values.
df_pyspark.show()

+-------+----+----------+------+
|   name| age|experience|salary|
+-------+----+----------+------+
|  alpha|  45|        25| 30000|
|   beta|  35|        15| 25000|
|  gamma|  25|         5| 20000|
|  delta|  29|         3| 20000|
|epsilon|  21|         2| 15000|
|   zeta|  23|         1| 18000|
|    eta|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [5]:
## Drop a column
# drop() returns a new DataFrame without the given column. The column name is
# written exactly as it appears in the schema ('name') so the cell does not
# depend on Spark's case-insensitive column resolution; drop() silently does
# nothing when the name cannot be resolved.
df_pyspark.drop('name').show()

+----+----------+------+
| age|experience|salary|
+----+----------+------+
|  45|        25| 30000|
|  35|        15| 25000|
|  25|         5| 20000|
|  29|         3| 20000|
|  21|         2| 15000|
|  23|         1| 18000|
|null|      null| 40000|
|  34|        10| 38000|
|  36|      null|  null|
+----+----------+------+



In [6]:
# The result of drop() was not assigned back, so df_pyspark still has the
# 'name' column.
df_pyspark.show()

+-------+----+----------+------+
|   name| age|experience|salary|
+-------+----+----------+------+
|  alpha|  45|        25| 30000|
|   beta|  35|        15| 25000|
|  gamma|  25|         5| 20000|
|  delta|  29|         3| 20000|
|epsilon|  21|         2| 15000|
|   zeta|  23|         1| 18000|
|    eta|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [7]:
### By default, na.drop() removes every row that contains at least one null value
df_pyspark.na.drop().show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  alpha| 45|        25| 30000|
|   beta| 35|        15| 25000|
|  gamma| 25|         5| 20000|
|  delta| 29|         3| 20000|
|epsilon| 21|         2| 15000|
|   zeta| 23|         1| 18000|
+-------+---+----------+------+



In [8]:
### how="any" (the default) drops a row if any of its values is null;
### how="all" drops a row only if all of its values are null
df_pyspark.na.drop(how="any").show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  alpha| 45|        25| 30000|
|   beta| 35|        15| 25000|
|  gamma| 25|         5| 20000|
|  delta| 29|         3| 20000|
|epsilon| 21|         2| 15000|
|   zeta| 23|         1| 18000|
+-------+---+----------+------+



In [9]:
## Threshold
# thresh=3 keeps only the rows that have at least 3 non-null values.
# thresh takes precedence over `how`, so `how` is left out here to avoid
# suggesting that both conditions are applied.
df_pyspark.na.drop(thresh=3).show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  alpha| 45|        25| 30000|
|   beta| 35|        15| 25000|
|  gamma| 25|         5| 20000|
|  delta| 29|         3| 20000|
|epsilon| 21|         2| 15000|
|   zeta| 23|         1| 18000|
|   null| 34|        10| 38000|
+-------+---+----------+------+



In [10]:
## Subset
# Only the listed columns are checked for nulls: rows with a null 'age' are
# dropped, while nulls in the other columns are ignored. The column name is
# written exactly as in the schema so the cell does not rely on Spark's
# case-insensitive column resolution.
df_pyspark.na.drop(how="any",subset=['age']).show()

+-------+---+----------+------+
|   name|age|experience|salary|
+-------+---+----------+------+
|  alpha| 45|        25| 30000|
|   beta| 35|        15| 25000|
|  gamma| 25|         5| 20000|
|  delta| 29|         3| 20000|
|epsilon| 21|         2| 15000|
|   zeta| 23|         1| 18000|
|   null| 34|        10| 38000|
|   null| 36|      null|  null|
+-------+---+----------+------+



In [11]:
### Filling the Missing Value
# NOTE: a string fill value is only applied to string columns. 'experience'
# and 'age' are integer columns, so this call leaves their nulls in place
# (see the output below). To actually fill them, pass a numeric value instead,
# e.g. df_pyspark.na.fill(0, ['experience', 'age']).
df_pyspark.na.fill('Missing Values',['Experience','age']).show()

+-------+----+----------+------+
|   name| age|experience|salary|
+-------+----+----------+------+
|  alpha|  45|        25| 30000|
|   beta|  35|        15| 25000|
|  gamma|  25|         5| 20000|
|  delta|  29|         3| 20000|
|epsilon|  21|         2| 15000|
|   zeta|  23|         1| 18000|
|    eta|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [12]:
# The result of na.fill() was not assigned back, so df_pyspark is unchanged.
df_pyspark.show()

+-------+----+----------+------+
|   name| age|experience|salary|
+-------+----+----------+------+
|  alpha|  45|        25| 30000|
|   beta|  35|        15| 25000|
|  gamma|  25|         5| 20000|
|  delta|  29|         3| 20000|
|epsilon|  21|         2| 15000|
|   zeta|  23|         1| 18000|
|    eta|null|      null| 40000|
|   null|  34|        10| 38000|
|   null|  36|      null|  null|
+-------+----+----------+------+



In [13]:
# Print the schema again: age, experience and salary are integer columns.
df_pyspark.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [18]:
from pyspark.ml.feature import Imputer

# Columns whose nulls are replaced by the column median; each imputed result
# is written to a new "<column>_imputed" column.
numeric_cols = ['age', 'experience', 'salary']

imputer = Imputer(
    inputCols=numeric_cols,
    outputCols=["{}_imputed".format(col_name) for col_name in numeric_cols],
    strategy="median",
)

In [19]:
# Fit the imputer on the data (computes the per-column medians), then append
# the *_imputed columns to the DataFrame and display the result.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+-------+----+----------+------+-----------+------------------+--------------+
|   name| age|experience|salary|age_imputed|experience_imputed|salary_imputed|
+-------+----+----------+------+-----------+------------------+--------------+
|  alpha|  45|        25| 30000|         45|                25|         30000|
|   beta|  35|        15| 25000|         35|                15|         25000|
|  gamma|  25|         5| 20000|         25|                 5|         20000|
|  delta|  29|         3| 20000|         29|                 3|         20000|
|epsilon|  21|         2| 15000|         21|                 2|         15000|
|   zeta|  23|         1| 18000|         23|                 1|         18000|
|    eta|null|      null| 40000|         29|                 5|         40000|
|   null|  34|        10| 38000|         34|                10|         38000|
|   null|  36|      null|  null|         36|                 5|         20000|
+-------+----+----------+------+-----------+--------