## Dealing with Missing Data

In [3]:
from pyspark.sql import SparkSession

In [4]:
# Start (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName('missing_values')
    .getOrCreate()
)

In [5]:
# Load the sample CSV. header=True takes the column names from the first line;
# inferSchema=True asks Spark to infer each column's type from the data.
df = spark.read.csv('ContainsNull.csv', header=True, inferSchema=True)

In [6]:
# Preview the data: both Name and Sales contain nulls.
df.show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [8]:
# Check the inferred column types (Id and Name are strings, Sales is a double).
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [14]:
# `thresh` counts NON-null values: thresh=2 keeps only rows that have at least
# 2 non-null values, i.e. it drops rows with fewer than `thresh` non-nulls.
df.na.drop(thresh=2).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [15]:
# how='all' drops a row only when every column in it is null. Each row here has
# a non-null Id, so nothing is removed.
df.na.drop(how='all').show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John| null|
|emp2| null| null|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [17]:
# subset=['Sales'] restricts the null check to the Sales column: a row is
# dropped only if its Sales value is null.
df.na.drop(subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [21]:
# Re-check the column types before filling: na.fill() only fills columns whose
# type matches the type of the fill value.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sales: double (nullable = true)



In [22]:
# A string fill value is applied only to string columns, so the nulls in the
# numeric Sales column are left as they are.
df.na.fill('fill_value').show()

+----+----------+-----+
|  Id|      Name|Sales|
+----+----------+-----+
|emp1|      John| null|
|emp2|fill_value| null|
|emp3|fill_value|345.0|
|emp4|     Cindy|456.0|
+----+----------+-----+



In [24]:
# A numeric fill value is applied only to numeric columns, so the nulls in the
# string Name column are left as they are.
df.na.fill(0).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|  0.0|
|emp2| null|  0.0|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [26]:
# Prefer naming the target columns via `subset` so the intent is explicit.
df.na.fill('No Name', subset=['Name']).show()

+----+-------+-----+
|  Id|   Name|Sales|
+----+-------+-----+
|emp1|   John| null|
|emp2|No Name| null|
|emp3|No Name|345.0|
|emp4|  Cindy|456.0|
+----+-------+-----+



In [27]:
# Fill missing values with the mean of the column

In [29]:
from pyspark.sql.functions import mean

In [30]:
# collect() returns a list of Row objects; this aggregate yields a single row
# holding the mean of the non-null Sales values.
mean_val = df.select(mean(df['Sales'])).collect()

In [33]:
# First row, first column of the collected result: the mean as a plain number.
mean_sales = mean_val[0][0]

In [34]:
# Replace null Sales values with the column mean. The target column is passed
# by keyword (`subset=`) so it is obvious which argument is the fill value and
# which is the column list.
df.na.fill(mean_sales, subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



In [45]:
# The same mean-fill collapsed into a single (hard to read) expression
df.na.fill(df.select(mean('Sales')).first()[0], subset=['Sales']).show()

+----+-----+-----+
|  Id| Name|Sales|
+----+-----+-----+
|emp1| John|400.5|
|emp2| null|400.5|
|emp3| null|345.0|
|emp4|Cindy|456.0|
+----+-----+-----+



## Dates and Timestamps

In [46]:
# NOTE: this rebinds `df`. From here on it refers to the stock data, not the
# ContainsNull.csv frame used in the previous section.
df = spark.read.csv('appl_stock.csv', header=True, inferSchema=True)

In [49]:
# Peek at the first row to see the available columns and sample values.
df.head(1)

[Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)]

In [52]:
# Show just the trading date and the opening price.
df.select('Date', 'Open').show()

+----------+------------------+
|      Date|              Open|
+----------+------------------+
|2010-01-04|        213.429998|
|2010-01-05|        214.599998|
|2010-01-06|        214.379993|
|2010-01-07|            211.75|
|2010-01-08|        210.299994|
|2010-01-11|212.79999700000002|
|2010-01-12|209.18999499999998|
|2010-01-13|        207.870005|
|2010-01-14|210.11000299999998|
|2010-01-15|210.92999500000002|
|2010-01-19|        208.330002|
|2010-01-20|        214.910006|
|2010-01-21|        212.079994|
|2010-01-22|206.78000600000001|
|2010-01-25|202.51000200000001|
|2010-01-26|205.95000100000001|
|2010-01-27|        206.849995|
|2010-01-28|        204.930004|
|2010-01-29|        201.079996|
|2010-02-01|192.36999699999998|
+----------+------------------+
only showing top 20 rows



In [53]:
from pyspark.sql.functions import (dayofmonth, hour, dayofyear,
                                   weekofyear, month, year, format_number,
                                   date_format)

In [55]:
# Extract the day of the month from each date. The column is named 'Date', but
# 'date' still resolves to it because Spark matches column names
# case-insensitively by default.
df.select(dayofmonth(df['date'])).show()

+----------------+
|dayofmonth(date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
|              11|
|              12|
|              13|
|              14|
|              15|
|              19|
|              20|
|              21|
|              22|
|              25|
|              26|
|              27|
|              28|
|              29|
|               1|
+----------------+
only showing top 20 rows



In [56]:
# The Date values carry no time of day (e.g. '2010-01-04'), so hour() yields 0
# for every row.
df.select(hour(df['date'])).show()

+----------+
|hour(date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



In [57]:
# Extract the month number from each date.
df.select(month(df['date'])).show()

+-----------+
|month(date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          2|
+-----------+
only showing top 20 rows



In [61]:
# Goal: average closing price per year.
# Step 1: derive a Year column from Date so the rows can be grouped by it.
new_df = df.withColumn('Year', year(df['Date']))

In [75]:
# Step 2: average only the Close column per year instead of averaging every
# numeric column and discarding the rest. The result has the columns
# 'Year' and 'avg(Close)'.
result = new_df.groupBy('Year').mean('Close')

In [77]:
# Step 3: give the aggregate column a readable name.
new = result.withColumnRenamed('avg(Close)','Average Closing Price')

In [84]:
# format_number(col, 2) renders the average with 2 decimal places. Nothing
# sorts the rows here, so the years come back in arbitrary order.
new.select(['Year',format_number('Average Closing Price',2).alias('Avg Close')]).show()

+----+---------+
|Year|Avg Close|
+----+---------+
|2015|   120.04|
|2013|   472.63|
|2014|   295.40|
|2012|   576.05|
|2016|   104.60|
|2010|   259.84|
|2011|   364.00|
+----+---------+



In [85]:
# Same table, sorted chronologically by Year.
(new
 .select(['Year', format_number('Average Closing Price', 2).alias('Avg Close')])
 .orderBy('Year')
 .show())

+----+---------+
|Year|Avg Close|
+----+---------+
|2010|   259.84|
|2011|   364.00|
|2012|   576.05|
|2013|   472.63|
|2014|   295.40|
|2015|   120.04|
|2016|   104.60|
+----+---------+



In [86]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()