### PySpark: Handling Missing Values

* Dropping columns
* Dropping rows
* Parameters of the drop functions (`how`, `thresh`, `subset`)
* Handling Missing values by Mean, Median and Mode

In [1]:
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session (or reuse one that is already running) used by every later cell.
spark = (
    SparkSession.builder
    .appName('DataFrame Handling Missing Values')
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [4]:
# Read the CSV: header=True takes the column names from the first row,
# inferSchema=True lets Spark infer each column's type from the data.
df = spark.read.csv(
    'test2.csv',
    header=True,
    inferSchema=True,
)

In [5]:
# Print the loaded rows as a text table; NULL marks a missing value.
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [6]:
# Dropping columns
# drop() returns the result as a new DataFrame (bound to df2 here) without the 'Name' column.
df2 = df.drop('Name')
df2.show()

+----+----------+------+
| age|Experience|Salary|
+----+----------+------+
|  31|        10| 30000|
|  30|         8| 25000|
|  29|         4| 20000|
|  24|         3| 20000|
|  21|         1| 15000|
|  23|         2| 18000|
|NULL|      NULL| 40000|
|  34|        10| 38000|
|  36|      NULL|  NULL|
+----+----------+------+



In [7]:
# Show df again: it still has the 'Name' column, so the drop above did not modify it.
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [8]:
# DataFrame.na exposes the helpers for handling missing/null values.
# With no arguments, drop() removes every row that contains at least one null
# (the default is how='any'); pass how/thresh/subset to change that.
df.na.drop().show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [9]:
# ### df.na.drop()

# Signature:
# df.na.drop(
#     how: str = 'any',
#     thresh: Optional[int] = None,
#     subset: Union[str, Tuple[str, ...], List[str], NoneType] = None,
# ) -> pyspark.sql.dataframe.DataFrame
# Docstring:
# Returns a new :class:`DataFrame` omitting rows with null values.
# :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.

# .. versionadded:: 1.3.1

# .. versionchanged:: 3.4.0
#     Supports Spark Connect.

# Parameters
# ----------
# how : str, optional
#     'any' or 'all'.
#     If 'any', drop a row if it contains any nulls.
#     If 'all', drop a row only if all its values are null.
# thresh: int, optional
#     default None
#     If specified, drop rows that have less than `thresh` non-null values.
#     This overwrites the `how` parameter.
# subset : str, tuple or list, optional
#     optional list of column names to consider.

# Returns
# -------
# :class:`DataFrame`
#     DataFrame with null only rows excluded.

# Examples
# --------
# >>> from pyspark.sql import Row
# >>> df = spark.createDataFrame([
# ...     Row(age=10, height=80, name="Alice"),
# ...     Row(age=5, height=None, name="Bob"),
# ...     Row(age=None, height=None, name="Tom"),
# ...     Row(age=None, height=None, name=None),
# ... ])
# >>> df.na.drop().show()
# +---+------+-----+
# |age|height| name|
# +---+------+-----+
# | 10|    80|Alice|
# +---+------+-----+
# File:      /usr/local/spark/python/pyspark/sql/dataframe.py
# Type:      method

In [10]:
# how='any': drop a row if any of its columns is null (same result as the default call).
df.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [11]:
# how='all': drop a row only when every column in it is null.
# No row in this data is entirely null, so all nine rows are kept.
df.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [12]:
# thresh: keep a row only if it has at least `thresh` non-null values; otherwise drop it.
# The 'thresh' parameter overrides 'how', so how='any' has no effect in this call.
# With thresh=2 only the last row (a single non-null value) is dropped.
df.na.drop(how='any', thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [13]:
# subset: only the listed columns are checked for nulls.
# Rows with a null Experience are dropped; nulls in other columns (e.g. Name) are ignored.
df.na.drop(how='any', subset=['Experience']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [14]:
# Fill missing values
# A string fill value is applied to the Name column only: age, Experience and Salary
# still show NULL in the output below.
df.na.fill('Missing Value').show()

+-------------+----+----------+------+
|         Name| age|Experience|Salary|
+-------------+----+----------+------+
|        Krish|  31|        10| 30000|
|    Sudhanshu|  30|         8| 25000|
|        Sunny|  29|         4| 20000|
|         Paul|  24|         3| 20000|
|       Harsha|  21|         1| 15000|
|      Shubham|  23|         2| 18000|
|       Mahesh|NULL|      NULL| 40000|
|Missing Value|  34|        10| 38000|
|Missing Value|  36|      NULL|  NULL|
+-------------+----+----------+------+



In [15]:
# Per-column fill values: a placeholder string for Name and 0 for each numeric column.
df.na.fill(
    {
        'Name': 'Missing Value',
        'age': 0,
        'Experience': 0,
        'Salary': 0,
    }
).show()

+-------------+---+----------+------+
|         Name|age|Experience|Salary|
+-------------+---+----------+------+
|        Krish| 31|        10| 30000|
|    Sudhanshu| 30|         8| 25000|
|        Sunny| 29|         4| 20000|
|         Paul| 24|         3| 20000|
|       Harsha| 21|         1| 15000|
|      Shubham| 23|         2| 18000|
|       Mahesh|  0|         0| 40000|
|Missing Value| 34|        10| 38000|
|Missing Value| 36|         0|     0|
+-------------+---+----------+------+



In [16]:
# df itself is unchanged: the fill() results above were only displayed, never assigned.
df.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [17]:
# Replace NULL values based on mean or median of that column

In [18]:
# Mean
# Columns to impute. Each result is written to a separate "<column>_imputed"
# column so the original values stay visible for comparison.
numeric_cols = ['age', 'Experience', 'Salary']

# Deriving outputCols from the same list keeps input and output columns aligned.
imputer = Imputer(
    strategy="mean",
    inputCols=numeric_cols,
    outputCols=[f"{c}_imputed" for c in numeric_cols],
)

In [19]:
# fit() learns the fill value for each input column; transform() returns the rows
# with the extra *_imputed columns. The result is only displayed, not assigned back to df.

imputer.fit(df).transform(df).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         28|                 5|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+---------+----+----------+------+-----------+------------------+--------------+

In [20]:
# Median
# Rebinds `imputer` to a median-based Imputer over the same three columns;
# results again go to "<column>_imputed" columns.
imputer = Imputer(
    strategy='median',
    inputCols=['age', 'Experience', 'Salary'],
    outputCols=[f"{name}_imputed" for name in ['age', 'Experience', 'Salary']],
)

In [21]:
# Fit the median imputer on df and display the rows with the *_imputed columns appended.
imputer.fit(df).transform(df).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         20000|
+---------+----+----------+------+-----------+------------------+--------------+