## Read table and cache the sorted version

In [2]:
pageviewsDF = spark.read.csv("pageviews_short.tsv", sep="\t", inferSchema=True, header=True)
pageviews_ordered_df = pageviewsDF.orderBy("timestamp", pageviewsDF.site.desc())

In [7]:
pageviews_ordered_df.registerTempTable("pageviews_ordered")

In [8]:
sqlContext.cacheTable("pageviews_ordered")

In [9]:
pageviews_ordered_df.count()

3499999

_Check storage tab to make sure it's cached with the correct name_

In [1]:
pageviews_ordered_df.unpersist()

NameError: name 'pageviews_ordered_df' is not defined

In [3]:
sqlContext.setConf("spark.sql.shuffle.partitions", "8")

In [4]:
pageviews_ordered_df.registerTempTable("pageviews_ordered")
sqlContext.cacheTable("pageviews_ordered")
pageviews_ordered_df.count()

3499999

_There should be 8 partitions._

In [13]:
pageviews_ordered_df.show(5)

+--------------------+-------+--------+
|           timestamp|   site|requests|
+--------------------+-------+--------+
|2015-03-16 00:00:...| mobile|    1628|
|2015-03-16 00:00:...|desktop|    2343|
|2015-03-16 00:00:...| mobile|    1636|
|2015-03-16 00:00:...|desktop|    2382|
|2015-03-16 00:00:...| mobile|    1619|
+--------------------+-------+--------+
only showing top 5 rows



In [14]:
pageviews_ordered_df.count() # this should run very fast

3499999

#### Q-1) How many total incoming requests were to the mobile site vs the desktop site?

In [5]:
pageviews_ordered_df.schema

StructType(List(StructField(timestamp,TimestampType,true),StructField(site,StringType,true),StructField(requests,IntegerType,true)))

In [6]:
from pyspark.sql import functions as F

In [7]:
pageviews_ordered_df.filter(pageviews_ordered_df.site == "mobile").select(F.sum(pageviews_ordered_df.requests)).show()

+-------------+
|sum(requests)|
+-------------+
|   2241257072|
+-------------+



In [8]:
pageviews_ordered_df.filter(pageviews_ordered_df.site == "desktop").select(F.sum(pageviews_ordered_df.requests)).show()

+-------------+
|sum(requests)|
+-------------+
|   4244792112|
+-------------+



#### Q-2) What is the start and end range of time for the pageviews data? How many days of data is in the DataFrame?

In [9]:
pageviews_ordered_df.select(pageviews_ordered_df.timestamp).show(1)

+--------------------+
|           timestamp|
+--------------------+
|2015-03-16 00:00:...|
+--------------------+
only showing top 1 row



In [16]:
pageviews_range = pageviews_ordered_df.select(F.min(pageviews_ordered_df.timestamp).alias('min'),
                                              F.max(pageviews_ordered_df.timestamp).alias('max'))

In [18]:
pageviews_range.select(F.datediff(pageviews_range.max, pageviews_range.min)).show()

+------------------+
|datediff(max, min)|
+------------------+
|                21|
+------------------+



#### Q-3) Can you figure out how to check which months of 2015 the data covers

In [21]:
pageviews_ordered_df.select(F.month('timestamp').alias('month')).distinct().show()

+-----+
|month|
+-----+
|    3|
|    4|
+-----+



#### Q-4) Why is there so much more traffic on Monday vs. other days of the week?

In [24]:
pageviews_ordered_df.groupby(F.date_format('timestamp', 'EEEE').alias('day')).sum('requests').show()

+---------+-------------+
|      day|sum(requests)|
+---------+-------------+
|   Friday|    828664178|
|   Monday|   1257641406|
|   Sunday|    858268863|
|Wednesday|    885201551|
| Thursday|    858325690|
| Saturday|    806108605|
|  Tuesday|    991838891|
+---------+-------------+



In [36]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

ImportError: No module named pandas

In [29]:
pageviews_by_day = pageviews_ordered_df.groupby(F.dayofyear('timestamp').alias('day')).sum('requests')

In [39]:
pageviews_by_day.select(F.min('sum(requests)')).show()

+------------------+
|min(sum(requests))|
+------------------+
|         214848933|
+------------------+



In [35]:
plt.bar(pageviews_by_day.select('day').toPandas(), pageviews_by_day.select('sum(requests)').toPandas())

ImportError: No module named pandas