In [39]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name 'Practice'.
spark = (
    SparkSession.builder
    .appName('Practice')
    .getOrCreate()
)

In [40]:
# Treat the first CSV line as column names and let Spark infer each column's
# type from the data instead of reading everything as strings.
csv_reader = spark.read.options(header=True, inferSchema=True)
df_ps = csv_reader.csv('work/test2.csv')
df_ps.show()

+-----+----+----------+------+
| name| age|experience|salary|
+-----+----+----------+------+
|krish|  31|        10|    20|
| sudj|  30|         8|    30|
|  sun|  19|         4|    40|
|  pau|null|      null|    50|
+-----+----+----------+------+



In [41]:
# drop() returns a new DataFrame without the 'age' column; df_ps is not modified.
without_age = df_ps.drop('age')
without_age.show()

+-----+----------+------+
| name|experience|salary|
+-----+----------+------+
|krish|        10|    20|
| sudj|         8|    30|
|  sun|         4|    40|
|  pau|      null|    50|
+-----+----------+------+



In [42]:
# With default arguments, every row that contains at least one null is removed
# (dropna() is the DataFrame-level alias of na.drop()).
df_ps.dropna().show()

+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|krish| 31|        10|    20|
| sudj| 30|         8|    30|
|  sun| 19|         4|    40|
+-----+---+----------+------+



In [43]:
# how="all" removes a row only when every one of its columns is null, so a
# partially filled row (such as 'pau', which still has name and salary) is kept.
df_ps.dropna(how="all").show()

+-----+----+----------+------+
| name| age|experience|salary|
+-----+----+----------+------+
|krish|  31|        10|    20|
| sudj|  30|         8|    30|
|  sun|  19|         4|    40|
|  pau|null|      null|    50|
+-----+----+----------+------+



In [44]:
# thresh=3 keeps only rows that have at least 3 non-null values.
# When thresh is supplied it overrides `how`, so `how` is deliberately omitted
# here to avoid suggesting that it has any effect.
# 'pau' has just 2 non-null values (name, salary) and is therefore dropped.
df_ps.na.drop(thresh=3).show()

+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|krish| 31|        10|    20|
| sudj| 30|         8|    30|
|  sun| 19|         4|    40|
+-----+---+----------+------+



In [45]:
# Only the 'experience' column is inspected for nulls; nulls in any other
# column do not cause a row to be removed.
df_ps.dropna(how="any", subset=['experience']).show()

+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|krish| 31|        10|    20|
| sudj| 30|         8|    30|
|  sun| 19|         4|    40|
+-----+---+----------+------+



In [46]:
# Fill missing values with 0, restricted to the listed columns.
# The column type matters: the fill value is applied only to columns whose
# data type matches it, and non-matching columns in the subset are ignored.
df_ps.fillna(0, subset=['age', 'experience']).show()


+-----+---+----------+------+
| name|age|experience|salary|
+-----+---+----------+------+
|krish| 31|        10|    20|
| sudj| 30|         8|    30|
|  sun| 19|         4|    40|
|  pau|  0|         0|    50|
+-----+---+----------+------+



In [47]:
# Change the type of the numeric columns to double.
from pyspark.sql.types import DoubleType

numeric_cols = ['age', 'experience', 'salary']

# Cast everything in a single select() rather than calling withColumn() in a
# loop: each withColumn() adds another projection to the query plan, whereas
# one select() produces the same columns, names and order in a single step.
# Columns not listed in numeric_cols (e.g. 'name') are passed through as-is.
df_ps = df_ps.select([
    df_ps[c].cast(DoubleType()).alias(c) if c in numeric_cols else df_ps[c]
    for c in df_ps.columns
])
df_ps.show()

+-----+----+----------+------+
| name| age|experience|salary|
+-----+----+----------+------+
|krish|31.0|      10.0|  20.0|
| sudj|30.0|       8.0|  30.0|
|  sun|19.0|       4.0|  40.0|
|  pau|null|      null|  50.0|
+-----+----+----------+------+



In [48]:
from pyspark.ml.feature import Imputer

# Columns whose nulls should be imputed; each one gets a companion
# "<column>_imputed" output column so the original values stay visible.
impute_cols = ['age', 'experience', 'salary']
imputed_names = ["{}_imputed".format(name) for name in impute_cols]

# Missing entries are replaced by the mean of the column's non-missing values.
imputer = Imputer(
    strategy="mean",
    inputCols=impute_cols,
    outputCols=imputed_names,
)

In [49]:
# fit() computes the per-column replacement values from df_ps; transform()
# then appends the *_imputed columns to the same frame.
imputer_model = imputer.fit(df_ps)
imputer_model.transform(df_ps).show()

+-----+----+----------+------+------------------+------------------+--------------+
| name| age|experience|salary|       age_imputed|experience_imputed|salary_imputed|
+-----+----+----------+------+------------------+------------------+--------------+
|krish|31.0|      10.0|  20.0|              31.0|              10.0|          20.0|
| sudj|30.0|       8.0|  30.0|              30.0|               8.0|          30.0|
|  sun|19.0|       4.0|  40.0|              19.0|               4.0|          40.0|
|  pau|null|      null|  50.0|26.666666666666668| 7.333333333333333|          50.0|
+-----+----+----------+------+------------------+------------------+--------------+

