### spark session and libraries

In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql.functions import col, udf
from IPython.display import display
from datetime import datetime
from pyspark.sql.types import FloatType, BooleanType, IntegerType
from time import mktime
from datetime import datetime
import pandas as pd

# Spark configuration for this notebook: use the Kryo serializer and turn on
# the spark.rdd.compress option.
spark_conf = (
    SparkConf()
    .set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .set('spark.rdd.compress', 'true')
)

# Create the Hive-enabled "pager" session, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("pager")
    .config(conf=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

# ETL

### create dataframe and drop low level features
Save the processed file as Parquet: because it is a columnar format, groupby operations on it run faster.

In [2]:
# Read the raw 311 service-request CSV from HDFS; the first line is the header
# and column types are inferred by Spark.
df = spark.read.csv('hdfs://ip-172-31-51-46.ec2.internal/pager/311_service_requests.csv', header=True,
                    inferSchema=True, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None)

# Strip spaces from every column name (e.g. 'Created Date' -> 'CreatedDate')
# and lower-case the *values* of every string column. Column names keep their
# original capitalisation.
# The loop variable is deliberately not called `col`: that name is imported
# from pyspark.sql.functions and must stay callable for the rest of the notebook.
for column_name, column_dtype in df.dtypes:
    compact_name = column_name.replace(" ", "")
    df = df.withColumnRenamed(column_name, compact_name)
    if column_dtype == 'string':
        df = df.withColumn(compact_name, functions.lower(df[compact_name]))

# Rows missing either timestamp cannot yield a resolution time, so drop them
# before the date-parsing UDFs run.
df = df.dropna(how='any', subset=['CreatedDate', 'ClosedDate'])

# UDF functions

# Timestamp layout of the CreatedDate / ClosedDate strings (12-hour clock with AM/PM).
_TIMESTAMP_FORMAT = '%m/%d/%Y %I:%M:%S %p'

# Month number (1-12) of a timestamp string.
get_month_func = udf(lambda x: datetime.strptime(x, _TIMESTAMP_FORMAT).month, IntegerType())

# Four-digit year of a timestamp string.
get_year_func = udf(lambda x: datetime.strptime(x, _TIMESTAMP_FORMAT).year, IntegerType())

# Elapsed time in hours from timestamp string `y` to timestamp string `x`
# (i.e. x - y); negative when `x` is earlier than `y`.
time_difference_func = udf(
    lambda x, y: (datetime.strptime(x, _TIMESTAMP_FORMAT) -
                  datetime.strptime(y, _TIMESTAMP_FORMAT)).total_seconds() / 3600,
    FloatType())

# True when the school-name value names an actual school.
# NOTE: the previous version returned True for x == "Unspecified". String
# columns are lower-cased earlier in this cell, so that comparison could never
# match and the flag was always False; the test was also inverted with respect
# to the flag's name ("Unspecified" means no school). The comparison is now
# case-insensitive, and null/empty values count as "no school".
is_school_func = udf(lambda x: bool(x) and x.lower() != "unspecified", BooleanType())


# Derive the analysis columns from the raw timestamp / school-name columns.
df = df.withColumn('Month', get_month_func(df['CreatedDate']))
df = df.withColumn('Year', get_year_func(df['CreatedDate']))
# TimeTaken = ClosedDate - CreatedDate, as computed by time_difference_func.
df = df.withColumn('TimeTaken', time_difference_func(df['ClosedDate'], df['CreatedDate']))
df = df.withColumn('SchoolZone', is_school_func(df['SchoolName']))

# Keep only requests with a positive resolution time. DataFrames are immutable,
# so the filtered result has to be assigned back -- previously the result of
# filter() was discarded and the filter had no effect.
df = df.filter(df['TimeTaken'] > 0)

# Select the columns required by the downstream analysis.
df = df[['UniqueKey', 'Month', 'Year', 'TimeTaken', 'Agency', 'ComplaintType', 'Descriptor', 'LocationType',
         'Incidentzip', 'AddressType', 'City', 'FacilityType', 'Borough', 'Status', 'SchoolZone', 'CreatedDate',
         'ClosedDate']]

# Persist the processed data as parquet, replacing any previous output.
df.write.parquet('hdfs://ip-172-31-51-46.ec2.internal/pager/parquet/311_data', mode='overwrite')

# Exploratory Analysis

 For each selected column, group the data on that column paired with the Time Taken column
 
 For each grouping use aggregate for count and mean
 
 Convert the grouped data into pandas dataframes and write into excel sheets
 
 Perform exploratory analysis on the means of the grouped data by joining all the pandas dataframes formed
 
 Check for variance in the means
 
  (i) A large variance/stddev implies that the feature is an important driver, because it means that the
      categories within the feature are diverse and have widely varying effects

In [8]:
# Reload the processed 311 data from the parquet output on HDFS.
df = spark.read.format("parquet").load("hdfs://ip-172-31-51-46.ec2.internal/pager/parquet/311_data")

In [4]:
# Per-feature Spark aggregates: {feature: {'counts': DataFrame, 'mean_time_taken': DataFrame}}
grouped_counts_dict = {}
# Per-feature describe() summary of the group means of TimeTaken.
pandas_df_dict = {}

select_columns = ['Agency', 'ComplaintType', 'LocationType', 'AddressType', 'City', 'FacilityType', 'Borough',
                  'Incidentzip', 'Status', 'SchoolZone']

# Create a Pandas Excel writer using XlsxWriter as the engine. The `with` block
# saves and closes the workbook on exit; without an explicit save/close the
# file is not reliably written to disk.
with pd.ExcelWriter('./pager_analysis.xlsx', engine='xlsxwriter') as writer:
    # The loop variable is deliberately not called `col`, so the `col` function
    # imported from pyspark.sql.functions is not overwritten.
    for feature in select_columns:
        grouped = df.select([feature, 'TimeTaken']).groupBy(feature)
        grouped_counts_dict[feature] = {
            'counts': grouped.count(),
            'mean_time_taken': grouped.mean('TimeTaken'),
        }

        pandas_df_count = grouped_counts_dict[feature]['counts'].toPandas()
        pandas_df_mean = grouped_counts_dict[feature]['mean_time_taken'].toPandas()

        # One row per category: its request count and its mean TimeTaken.
        pandas_df = pandas_df_count.join(pandas_df_mean.set_index(feature), on=feature)
        # Summary statistics of the per-category means for this feature.
        pandas_df_dict[feature] = pandas_df[['avg(TimeTaken)']].describe()

        pandas_df.to_excel(writer, sheet_name=feature)

    # Put the per-feature summaries side by side: one row per statistic, one
    # column per feature.
    exploratory_analysis = pd.concat(
        [pandas_df_dict[feature] for feature in select_columns], axis=1).reset_index()
    exploratory_analysis.columns = ['TimeTaken'] + select_columns
    exploratory_analysis.to_excel(writer, sheet_name='exploratory_analysis')

display(exploratory_analysis)