# Spark DataFrames - Missing Data

In [0]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName('miss')
    .getOrCreate()
)

In [0]:
# Load the sample CSV: take column names from the header row and let Spark infer column types
df = spark.read.options(header=True, inferSchema=True).csv('/FileStore/tables/ContainsNull.csv')

In [0]:
df.show() # Preview the raw data; Name and Sales both contain nulls

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
df.na.drop(thresh=2).show() # Keep only rows with at least 2 non-null values (emp2 has just 1, so it is dropped)

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
df.na.drop(subset=['Sales']).show() # Drop rows where Sales is missing

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
# Fill in missing values
df.na.fill('FILL VALUE').show() # String value: only nulls in string columns (Name) are filled; Sales stays null

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|FILL VALUE| null|
|emp3|FILL VALUE|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [0]:
df.na.fill(0).show() # Numeric value: only nulls in numeric columns (Sales) are filled; Name stays null

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
df.na.fill('No Name', subset=['Name']).show() # Best practice: always declare subset so the target columns are explicit

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [0]:
# Fill values with mean of column
from pyspark.sql.functions import mean

In [0]:
# Aggregate the whole frame down to one row holding the mean of Sales
mean_val = df.agg(mean('Sales')).collect()

In [0]:
mean_val # collect() returned a list of Row objects; here a single Row with one field

Out[14]: [Row(avg(Sales)=400.5)]

In [0]:
mean_sales = mean_val[0][0] # First (only) Row, then its first (only) field: the mean itself

In [0]:
# Replace missing Sales values with the column mean
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [0]:
# Same mean-fill as a single line: compute the mean of Sales inline and fill with it
df.na.fill(df.agg(mean('Sales')).first()[0], subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+

