# Introduction
In this notebook, I perform an analysis similar to the one in the Time Series Analysis notebook. Here, however, I analyze how the traffic data changes depending on the day of the week, particularly for weekends versus weekdays.

In [1]:
from pyspark.sql import SparkSession, SQLContext
import pyspark.sql.types as T
import pyspark
import pyspark.sql.functions as F
import seaborn as sns
import glob
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Global matplotlib defaults for every figure in this notebook:
# axes titles at size 40, axis labels at size 15.
plt.rcParams.update({
    "axes.titlesize": 40,
    "axes.labelsize": 15,
})

# Part 1: Reading in the Data

In [3]:
# Collect every CSV file that sits in the Uber trip-data directory.
data_dir = "../data/uber/uber-trip-data/"
csv_pattern = os.path.join(data_dir, "*.csv")
uber_files = glob.glob(csv_pattern)
uber_files

['../data/uber/uber-trip-data/uber-raw-data-jun14.csv',
 '../data/uber/uber-trip-data/uber-raw-data-may14.csv',
 '../data/uber/uber-trip-data/taxi-zone-lookup.csv',
 '../data/uber/uber-trip-data/uber-raw-data-jul14.csv',
 '../data/uber/uber-trip-data/uber-raw-data-sep14.csv',
 '../data/uber/uber-trip-data/uber-raw-data-apr14.csv',
 '../data/uber/uber-trip-data/uber-raw-data-aug14.csv']

In [4]:
# Drop the taxi-zone lookup file: it is not one of the monthly uber-raw-data files.
# Filtering on the base name (rather than list.remove with the full path) keeps this
# cell safe to re-run -- list.remove raises ValueError once the entry is gone -- and
# does not depend on the exact path string that glob returned.
uber_files = [
    path for path in uber_files
    if os.path.basename(path) != "taxi-zone-lookup.csv"
]

In [5]:
# Start (or reuse) a local Spark session for this notebook.
# The app name is what identifies this job in the Spark UI, so it names the analysis.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Uber Weekday Analysis")
    .getOrCreate()
)

22/05/09 11:08:35 WARN Utils: Your hostname, DSGPU05 resolves to a loopback address: 127.0.1.1; using 10.10.11.64 instead (on interface eno1)
22/05/09 11:08:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 11:08:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/09 11:08:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [6]:
# Load all remaining trip files into one DataFrame; the first line of each file is the header.
df = spark.read.option("header", True).csv(uber_files)

In [7]:
# Preview the first 20 rows (the default row count for show()).
df.show(n=20)

+----------------+-------+--------+------+
|       Date/Time|    Lat|     Lon|  Base|
+----------------+-------+--------+------+
|9/1/2014 0:01:00|40.2201|-74.0021|B02512|
|9/1/2014 0:01:00|  40.75|-74.0027|B02512|
|9/1/2014 0:03:00|40.7559|-73.9864|B02512|
|9/1/2014 0:06:00| 40.745|-73.9889|B02512|
|9/1/2014 0:11:00|40.8145|-73.9444|B02512|
|9/1/2014 0:12:00|40.6735|-73.9918|B02512|
|9/1/2014 0:15:00|40.7471|-73.6472|B02512|
|9/1/2014 0:16:00|40.6613|-74.2691|B02512|
|9/1/2014 0:32:00|40.3745|-73.9999|B02512|
|9/1/2014 0:33:00|40.7633|-73.9773|B02512|
|9/1/2014 0:33:00|40.7467|-73.6131|B02512|
|9/1/2014 0:37:00|40.8105|  -73.96|B02512|
|9/1/2014 0:38:00| 40.679|-74.0111|B02512|
|9/1/2014 0:39:00|40.4023|-73.9839|B02512|
|9/1/2014 0:48:00|40.7378|-74.0395|B02512|
|9/1/2014 0:48:00|40.7214|-73.9884|B02512|
|9/1/2014 0:49:00|40.8646|-73.9081|B02512|
|9/1/2014 1:08:00|40.7398|-74.0061|B02512|
|9/1/2014 1:17:00|40.6793|-74.0116|B02512|
|9/1/2014 1:19:00|40.7328|-73.9875|B02512|
+----------

In [8]:
# Parse the raw "M/d/yyyy H:mm:ss" strings into calendar dates (the time of day is dropped).
# to_date accepts the format directly, so no intermediate to_timestamp call is needed.
# The dtype guard makes this cell safe to re-run: once the column has been converted,
# its values no longer match the pattern, so parsing them again would not yield valid dates.
if dict(df.dtypes)["Date/Time"] == "string":
    df = df.withColumn("Date/Time", F.to_date(F.col("Date/Time"), "M/d/yyyy H:mm:ss"))

# Part 2: Adding a Column for Weekdays

In [None]:
# withColumn needs both a column name and a column expression.
# F.dayofweek returns the day of the week as an integer: 1 = Sunday, ..., 7 = Saturday,
# so weekends are the values 1 and 7.
df = df.withColumn("DayOfWeek", F.dayofweek(F.col("Date/Time")))