In [16]:
import pyspark
from pyspark.sql import SparkSession

In [17]:
# Start (or reuse) the Spark session used by every later cell in this notebook.
spark = (
    SparkSession.builder
    .appName('Part2')
    .getOrCreate()
)

In [18]:
# Show the session object as this cell's result.
spark

In [6]:
# Load the sample CSV: the first line supplies the column names and Spark
# infers each column's type from the data.
df_pyspark = spark.read.csv(
    'data_new.csv',
    header=True,
    inferSchema=True,
)

In [7]:
# Print the loaded rows as a text table.
df_pyspark.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
|null|null|      null|  null|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+



In [8]:
# Drop a feature (column): drop() returns a new DataFrame without 'Name';
# df_pyspark itself is left unchanged.
without_name = df_pyspark.drop('Name')
without_name.show()

+----+----------+------+
| Age|Experience|Salary|
+----+----------+------+
|  23|         2|300000|
|  34|        10| 40000|
|null|      null|  null|
|  45|        20| 45000|
|null|      null|120000|
|  56|        10|380000|
|  36|      null|  null|
+----+----------+------+



In [9]:
# Drop every row that contains at least one null value.
# DataFrame.dropna() is an alias of DataFrame.na.drop().
df_pyspark.dropna().show()


+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
| Tom| 23|         2|300000|
| Yom| 34|        10| 40000|
| Lom| 45|        20| 45000|
+----+---+----------+------+



In [10]:
# The `how` parameter of na.drop():
# how='all' drops a row only when every column in that row is null.

df_pyspark.show()

df_pyspark.na.drop(how='all').show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
|null|null|      null|  null|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+



In [11]:
# how='any' drops a row as soon as it contains at least one null value.
# This gives the same result as calling na.drop() with no arguments.

df_pyspark.show()

df_pyspark.na.drop(how='any').show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
|null|null|      null|  null|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
| Tom| 23|         2|300000|
| Yom| 34|        10| 40000|
| Lom| 45|        20| 45000|
+----+---+----------+------+



In [12]:
# The `thresh` parameter of na.drop():
# a row is kept only if it has at least `thresh` non-null values; rows with
# fewer are dropped. With thresh=3, a row with three or four non-null values
# stays and a row with two or fewer is removed.
# When thresh is given it overrides `how`, so `how` is not passed here.

df_pyspark.show()

df_pyspark.na.drop(thresh=3).show()


+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
|null|null|      null|  null|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
| Tom| 23|         2|300000|
| Yom| 34|        10| 40000|
| Lom| 45|        20| 45000|
|null| 56|        10|380000|
+----+---+----------+------+



In [13]:
# The `subset` parameter of na.drop(): only the listed columns are checked for
# nulls. Here a row is dropped whenever its 'Experience' value is null.

df_pyspark.show()

rows_with_experience = df_pyspark.na.drop(how='any', subset=['Experience'])
rows_with_experience.show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
|null|null|      null|  null|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+

+----+---+----------+------+
|Name|Age|Experience|Salary|
+----+---+----------+------+
| Tom| 23|         2|300000|
| Yom| 34|        10| 40000|
| Lom| 45|        20| 45000|
|null| 56|        10|380000|
+----+---+----------+------+



In [14]:
# Handling missing values: replace nulls with a placeholder.
# The fill value is a string, so only the string column (Name) is filled;
# the numeric columns keep their nulls.
placeholder_filled = df_pyspark.na.fill('Missing values')
placeholder_filled.show()

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|           Tom|  23|         2|300000|
|           Yom|  34|        10| 40000|
|Missing values|null|      null|  null|
|           Lom|  45|        20| 45000|
|           Som|null|      null|120000|
|Missing values|  56|        10|380000|
|Missing values|  36|      null|  null|
+--------------+----+----------+------+



In [20]:
# Handling missing values in one particular column: restrict the fill to 'Name'.
df_pyspark.show()
df_pyspark.na.fill('Missing values', subset=['Name']).show()

+----+----+----------+------+
|Name| Age|Experience|Salary|
+----+----+----------+------+
| Tom|  23|         2|300000|
| Yom|  34|        10| 40000|
|null|null|      null|  null|
| Lom|  45|        20| 45000|
| Som|null|      null|120000|
|null|  56|        10|380000|
|null|  36|      null|  null|
+----+----+----------+------+

+--------------+----+----------+------+
|          Name| Age|Experience|Salary|
+--------------+----+----------+------+
|           Tom|  23|         2|300000|
|           Yom|  34|        10| 40000|
|Missing values|null|      null|  null|
|           Lom|  45|        20| 45000|
|           Som|null|      null|120000|
|Missing values|  56|        10|380000|
|Missing values|  36|      null|  null|
+--------------+----+----------+------+



In [15]:
# Fill each column with its own replacement value (column name -> fill value).
fill_values = {
    'Name': 'Not availablle',
    'Age': 0,
    'Experience': 0,
    'Salary': 0,
}
df_pyspark.na.fill(fill_values).show()

+--------------+---+----------+------+
|          Name|Age|Experience|Salary|
+--------------+---+----------+------+
|           Tom| 23|         2|300000|
|           Yom| 34|        10| 40000|
|Not availablle|  0|         0|     0|
|           Lom| 45|        20| 45000|
|           Som|  0|         0|120000|
|Not availablle| 56|        10|380000|
|Not availablle| 36|         0|     0|
+--------------+---+----------+------+



In [21]:
# Use the Imputer class to replace null values with the column mean.
from pyspark.ml.feature import Imputer

# Columns to impute; each one gets a sibling "<column>_imputed" output column.
numeric_cols = ['Age', 'Experience', 'Salary']
imputed_cols = ["{}_imputed".format(c) for c in numeric_cols]

# Pass "median" instead of "mean" to setStrategy to impute with the column median.
imputer = Imputer(inputCols=numeric_cols, outputCols=imputed_cols).setStrategy("mean")

In [22]:
# Fit the imputer on the data, then append the *_imputed columns and show the result.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+----+----+----------+------+-----------+------------------+--------------+
|Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+----+----+----------+------+-----------+------------------+--------------+
| Tom|  23|         2|300000|         23|                 2|        300000|
| Yom|  34|        10| 40000|         34|                10|         40000|
|null|null|      null|  null|         38|                10|        177000|
| Lom|  45|        20| 45000|         45|                20|         45000|
| Som|null|      null|120000|         38|                10|        120000|
|null|  56|        10|380000|         56|                10|        380000|
|null|  36|      null|  null|         36|                10|        177000|
+----+----+----------+------+-----------+------------------+--------------+

