### spark session and libraries

In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql.functions import col, udf
from IPython.display import display
from datetime import datetime
from pyspark.sql.types import FloatType, BooleanType, IntegerType
from time import mktime
from datetime import datetime
import pandas as pd

# Spark configuration: use the Kryo serializer and turn on RDD compression.
spark_conf = SparkConf().setAll([
    ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
    ('spark.rdd.compress', 'true'),
])

# Build (or reuse) the Hive-enabled session named "pager" with the settings above.
spark = (
    SparkSession.builder
    .appName("pager")
    .config(conf=spark_conf)
    .enableHiveSupport()
    .getOrCreate()
)

# ETL

### create dataframe and drop low level features
Save the processed file as Parquet, since it is a columnar format and makes group-by operations faster.

In [None]:
# Load the raw 311 service-request CSV; the header row supplies the column
# names and Spark infers the column types.
df = spark.read.csv('hdfs://ip-172-31-51-46.ec2.internal/pager/311_service_requests.csv', header=True,
                   inferSchema=True, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None)

# Rename columns: remove spaces from every column name.
# The loop variable is deliberately not named `col`, which would shadow the
# `col` function imported from pyspark.sql.functions.
for column_name in df.columns:
    if ' ' in column_name:
        df = df.withColumnRenamed(column_name, column_name.replace(" ", ""))


# Keep only rows that have both timestamps; the UDFs below cannot parse nulls.
df = df.dropna(how='any', subset=['CreatedDate', 'ClosedDate'])

# Layout of the CreatedDate / ClosedDate strings parsed by the UDFs below.
TIMESTAMP_FORMAT = '%m/%d/%Y %I:%M:%S %p'

# UDF functions
# Month and year of a timestamp string.
get_month_func = udf(lambda x: datetime.strptime(x, TIMESTAMP_FORMAT).month, IntegerType())
get_year_func = udf(lambda x: datetime.strptime(x, TIMESTAMP_FORMAT).year, IntegerType())
# Difference (first argument minus second) between two timestamp strings, in hours.
time_difference_func = udf(lambda x, y: (datetime.strptime(x, TIMESTAMP_FORMAT) -
                                         datetime.strptime(y, TIMESTAMP_FORMAT)).total_seconds() / 3600,
                           FloatType())
# NOTE(review): this flags rows whose school name is "Unspecified" as True,
# which looks inverted for a column called SchoolZone -- confirm the intent.
is_school_func = udf(lambda x: x == "Unspecified", BooleanType())


# Create new columns and select required columns

df = df.withColumn('Month', get_month_func(df['CreatedDate']))
# Fix: Year was previously computed with get_month_func and so held the month.
df = df.withColumn('Year', get_year_func(df['CreatedDate']))
# Hours elapsed from creation to closure of the request.
df = df.withColumn('TimeTaken', time_difference_func(df['ClosedDate'], df['CreatedDate']))
df = df.withColumn('SchoolZone', is_school_func(df['SchoolName']))
df = df[['UniqueKey', 'Month', 'Year', 'TimeTaken', 'Agency', 'ComplaintType', 'LocationType', 'Incidentzip',
         'AddressType', 'City', 'FacilityType', 'Borough', 'Status', 'SchoolZone', 'CreatedDate',
         'ClosedDate']]

# Persist the reduced data set as Parquet, replacing any previous output.
df.write.parquet('hdfs://ip-172-31-51-46.ec2.internal/pager/parquet/311_data', mode='overwrite')

# Exploratory Analysis

 Group the data by each selected column, using the (selected column, Time Taken) column pair
 
 For each grouping use aggregate for count and mean
 
 Convert the grouped data into pandas dataframes and write into excel sheets
 
 Perform exploratory analysis on the means of the grouped data by joining all the pandas dataframes formed
 
 Check for variance in the means
 
  (i) Large variance implies that the feature is an important driver, because it means that the
      categories within that feature are diverse and have widely varying effects

In [5]:
# Reload the processed data set from the Parquet location written by the ETL step.
df = spark.read.parquet("hdfs://ip-172-31-51-46.ec2.internal/pager/parquet/311_data")

# Per-column Spark results (counts / mean time taken) and per-column pandas
# summaries of the group means.
grouped_counts_dict = {}
pandas_df_dict = {}

select_columns = ['Agency', 'ComplaintType', 'LocationType', 'AddressType', 'City', 'FacilityType', 'Borough',
                  'Status', 'SchoolZone']

# Column name Spark gives to the result of GroupedData.mean('TimeTaken').
# Fix: the previous lookup used 'avg(Time Taken)', which does not exist because
# the column is named 'TimeTaken' (no space), so the lookup raised KeyError.
mean_column = 'avg(TimeTaken)'

# Create a Pandas Excel writer using XlsxWriter as the engine.
# NOTE(review): nothing in this cell closes `writer`; the workbook is only
# written to disk once the writer is closed -- confirm a later cell does so.
writer = pd.ExcelWriter('./pager_analysis.xlsx', engine='xlsxwriter')

# The loop variable is deliberately not named `col`, which would shadow the
# `col` function imported from pyspark.sql.functions.
for column in select_columns:
    grouped = df.select([column, 'TimeTaken']).groupBy(column)
    grouped_counts_dict[column] = {
        'counts': grouped.count(),
        'mean_time_taken': grouped.mean('TimeTaken'),
    }

    pandas_df_count = grouped_counts_dict[column]['counts'].toPandas()
    pandas_df_mean = grouped_counts_dict[column]['mean_time_taken'].toPandas()

    # One row per category: its row count joined with its mean time taken.
    pandas_df = pandas_df_count.join(pandas_df_mean.set_index(column), on=column)
    # Summary statistics (count, mean, std, quartiles, ...) of the category means.
    pandas_df_dict[column] = pandas_df[[mean_column]].describe()

    pandas_df.to_excel(writer, sheet_name=column)

# Side-by-side summary of the group means: one column per selected feature,
# with the statistic name in the first column.
exploratory_analysis = pd.concat([pandas_df_dict[column] for column in select_columns], axis=1).reset_index()
exploratory_analysis.columns = ['TimeTaken'] + select_columns
exploratory_analysis.to_excel(writer, sheet_name='exploratory_analysis')

display(exploratory_analysis)

Unnamed: 0,Time Taken,Agency,Complaint Type,Location Type,Address Type,City,Facility Type,Borough,Status,School Name
0,count,29.0,287.0,154.0,6.0,2327.0,6.0,549.0,22.0,3713.0
1,mean,-240899.0,-29602.362875,-66703.513744,-62.621543,92.564891,-119119.589022,295.882093,-180390.380575,743.26702
2,std,434119.6,149789.827246,238563.583714,364.725634,13995.783103,293709.568884,693.739331,372479.377116,3642.604292
3,min,-1028939.0,-986899.875,-977421.773707,-754.161199,-645431.919074,-718644.858181,-271.246716,-991193.293689,-122694.422104
4,25%,-14788.68,3.21018,3.917154,-90.911113,322.388555,-43.495131,64.76667,-3271.359866,128.763882
5,50%,85.17787,129.801978,186.296327,38.834693,371.320557,-7.003123,124.0,32.418155,297.744579
6,75%,307.7275,482.746302,620.864969,160.169901,404.72907,311.472515,235.449997,180.69197,950.647997
7,max,1539.2,16222.077081,2198.201251,234.307302,8398.497027,3579.358369,8135.866699,6093.748832,32159.117188


## Do the top features change over time