In [1]:
import pandas as pd
import pyspark
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session used by the rest of the notebook
# (getOrCreate returns the existing session when one is already active).
spark = (
    SparkSession.builder
    .appName('Demo_missing_value')
    .getOrCreate()
)

In [3]:
# Echo the session object as this cell's output.
spark

In [5]:
# Load train.csv into a DataFrame: the first line is treated as the header
# and column types are inferred from the data instead of defaulting to string.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv('train.csv')

In [7]:
# Show the data without the row_id column. The result of drop() is only
# displayed, never assigned, so df keeps all of its columns.
columns_to_drop = ['row_id']
df.drop(*columns_to_drop).show()

+-----+--------------+-------+-------------------+---------------------+------+
|cfips|        county|  state| first_day_of_month|microbusiness_density|active|
+-----+--------------+-------+-------------------+---------------------+------+
| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|
| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|
| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|
| 1001|Autauga County|Alabama|2019-11-01 00:00:00|            2.9932332|  1243|
| 1001|Autauga County|Alabama|2019-12-01 00:00:00|            2.9932332|  1243|
| 1001|Autauga County|Alabama|2020-01-01 00:00:00|              2.96909|  1242|
| 1001|Autauga County|Alabama|2020-02-01 00:00:00|            2.9093256|  1217|
| 1001|Autauga County|Alabama|2020-03-01 00:00:00|            2.9332314|  1227|
| 1001|Autauga County|Alabama|2020-04-01 00:00:00|            3.0001674|  1255|
| 1001|Autauga County|Alabama|2020-05-01

In [8]:
# Keep only rows that have at least 2 non-null values; rows with fewer than
# 2 non-null values are dropped. `thresh` takes precedence over `how`, so
# passing how="any" here had no effect and is omitted to avoid confusion.
df.na.drop(thresh=2).show()

+---------------+-----+--------------+-------+-------------------+---------------------+------+
|         row_id|cfips|        county|  state| first_day_of_month|microbusiness_density|active|
+---------------+-----+--------------+-------+-------------------+---------------------+------+
|1001_2019-08-01| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|
|1001_2019-09-01| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|
|1001_2019-10-01| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|
|1001_2019-11-01| 1001|Autauga County|Alabama|2019-11-01 00:00:00|            2.9932332|  1243|
|1001_2019-12-01| 1001|Autauga County|Alabama|2019-12-01 00:00:00|            2.9932332|  1243|
|1001_2020-01-01| 1001|Autauga County|Alabama|2020-01-01 00:00:00|              2.96909|  1242|
|1001_2020-02-01| 1001|Autauga County|Alabama|2020-02-01 00:00:00|            2.9093256|  1217|
|1001_2020-03-01| 1001|Autauga County|Al

In [None]:
# Drop every row whose value is null in any of the selected columns
# (only 'county' is checked here; nulls in other columns are ignored).
required_columns = ['county']
df.dropna(how="any", subset=required_columns).show()

In [9]:
# Replace nulls with the placeholder text 'Missing Values'.
# NOTE: per the PySpark docs a string fill value is applied to string columns
# only, so nulls in numeric/timestamp columns are left as they are.
df.fillna('Missing Values').show()

+---------------+-----+--------------+-------+-------------------+---------------------+------+
|         row_id|cfips|        county|  state| first_day_of_month|microbusiness_density|active|
+---------------+-----+--------------+-------+-------------------+---------------------+------+
|1001_2019-08-01| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|
|1001_2019-09-01| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|
|1001_2019-10-01| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|
|1001_2019-11-01| 1001|Autauga County|Alabama|2019-11-01 00:00:00|            2.9932332|  1243|
|1001_2019-12-01| 1001|Autauga County|Alabama|2019-12-01 00:00:00|            2.9932332|  1243|
|1001_2020-01-01| 1001|Autauga County|Alabama|2020-01-01 00:00:00|              2.96909|  1242|
|1001_2020-02-01| 1001|Autauga County|Alabama|2020-02-01 00:00:00|            2.9093256|  1217|
|1001_2020-03-01| 1001|Autauga County|Al

In [12]:
# Impute missing values with the mean of each column.
# The column list is defined once so inputCols and outputCols cannot drift
# apart; each input column gets a companion "<name>_imputed" output column,
# so the original columns are kept next to the imputed ones.
# NOTE(review): 'cfips' looks like an identifier code rather than a
# measurement -- confirm that mean-imputing it is intended.
impute_cols = ['cfips', 'active', 'microbusiness_density']

imputer = Imputer(
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
).setStrategy("mean")

In [13]:
# Impute missing values with the mean: fit the imputer on df, then apply
# the fitted model to the same frame and display the result.
imputer_model = imputer.fit(df)
imputer_model.transform(df).show()

+---------------+-----+--------------+-------+-------------------+---------------------+------+-------------+--------------+-----------------------------+
|         row_id|cfips|        county|  state| first_day_of_month|microbusiness_density|active|cfips_imputed|active_imputed|microbusiness_density_imputed|
+---------------+-----+--------------+-------+-------------------+---------------------+------+-------------+--------------+-----------------------------+
|1001_2019-08-01| 1001|Autauga County|Alabama|2019-08-01 00:00:00|            3.0076818|  1249|         1001|          1249|                    3.0076818|
|1001_2019-09-01| 1001|Autauga County|Alabama|2019-09-01 00:00:00|            2.8848701|  1198|         1001|          1198|                    2.8848701|
|1001_2019-10-01| 1001|Autauga County|Alabama|2019-10-01 00:00:00|            3.0558431|  1269|         1001|          1269|                    3.0558431|
|1001_2019-11-01| 1001|Autauga County|Alabama|2019-11-01 00:00:00|    