### spark session and libraries

In [13]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions
from pyspark.sql.functions import col, udf
from IPython.display import display
from datetime import datetime
from pyspark.sql.types import FloatType, BooleanType, IntegerType
from time import mktime
from datetime import datetime
import pandas as pd

spark_conf = SparkConf() \
    .setAll([
         ['spark.serializer','org.apache.spark.serializer.KryoSerializer'],
         ['spark.rdd.compress','true'],   
    ])

spark = SparkSession \
    .builder \
    .appName("pager") \
    .config(conf=spark_conf) \
    .enableHiveSupport() \
    .getOrCreate()

# ETL

### create dataframe and drop low level features
save the processed file as parquet since it is a columnar format we can perfrom groupby operations faster.

In [14]:
# read 311_service_requests data from hdfs
df = spark.read.csv('hdfs://ip-172-31-51-46.ec2.internal/pager/311_service_requests.csv', header=True,
                   inferSchema=True, ignoreLeadingWhiteSpace=None, ignoreTrailingWhiteSpace=None)

# rename columns and convert to lower case
for col, dtype in df.dtypes:
    new_col = col.replace(" ", "")
    df = df.withColumnRenamed(col, new_col)
    if dtype == 'string':
        df = df.withColumn(new_col, functions.lower(df[new_col]))

df = df.withColumnRenamed('XCoordinate(StatePlane)', 'XCoordinateStatePlane')
df = df.withColumnRenamed('YCoordinate(StatePlane)', 'YCoordinateStatePlane')



df = df.dropna(how='any', subset=['CreatedDate','ClosedDate'])

# UDF fuctions
get_month_func = udf(lambda x: datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p').month, IntegerType())
get_year_func = udf(lambda x: datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p').year, IntegerType())
time_difference_func = udf(lambda x, y: (datetime.strptime(x, '%m/%d/%Y %I:%M:%S %p') - 
                                         datetime.strptime(y, '%m/%d/%Y %I:%M:%S %p')).total_seconds()/3600, 
                           FloatType())
is_school_func = udf(lambda x: True if x == "unspecified" else False, BooleanType())


# create new columns and select required columns
df = df.withColumn('Month', get_month_func(df['CreatedDate']))
df = df.withColumn('Year', get_year_func(df['CreatedDate']))
df = df.withColumn('TimeTaken', time_difference_func(df['ClosedDate'], df['CreatedDate']))
df = df.withColumn('SchoolZone', is_school_func(df['SchoolName']))

# filters
df = df.filter(df['TimeTaken']>0)

df.write.parquet('hdfs://ip-172-31-51-46.ec2.internal/pager/parquet/311_data', mode='overwrite')

High level features identified by looking at subsample of the data set

['UniqueKey', 'Month', 'Year', 'TimeTaken', 'Agency', 'ComplaintType', 'Descriptor', 'LocationType',
         'Incidentzip', 'AddressType', 'City', 'FacilityType', 'Borough', 'Status', 'SchoolZone', 'CreatedDate',
         'ClosedDate']

# Exploratory Analysis

Exploratory Analysis on selected high level features 

['Agency', 'ComplaintType', 'LocationType', 'AddressType', 'City', 'FacilityType', 'Borough', 
                  'Status', 'SchoolZone']

 Group based on the each of the selected column for selected column + Time Taken column pair
 
 For each grouping use aggregate for count and mean
 
 Convert the grouped data into pandas dataframes and write into excel sheets
 
 Perfrom exploratory analysis means of grouped data by joining all the pandas dataframes formed
 
 Check for variance in the means
 
  (i) Large variance/stddev implies that the feature is important driver because it means that in each 
      feature the category are diverse and have extreme varying effects

In [7]:
df = spark.read.parquet("hdfs://ip-172-31-51-46.ec2.internal/pager/parquet/311_data")

In [16]:
grouped_counts_dict = {}
pandas_df_dict = {}

select_columns = ['Agency', 'ComplaintType', 'LocationType', 'AddressType', 'City', 'FacilityType', 'Borough', 
                  'Status', 'SchoolZone']

# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('./pager_analysis.xlsx', engine='xlsxwriter')

for col in select_columns:
    group = [col] + ['TimeTaken']
    grouped = df.select(group).groupBy(col)
    grouped_counts_dict[col] = {}
    grouped_counts_dict[col]['counts'] = grouped.count()
    grouped_counts_dict[col]['mean_time_taken'] = grouped.mean('TimeTaken')

    pandas_df_count = grouped_counts_dict[col]['counts'].toPandas()
    pandas_df_mean = grouped_counts_dict[col]['mean_time_taken'].toPandas()
    
    pandas_df = pandas_df_count.join(pandas_df_mean.set_index(col), on=col)
    pandas_df_dict[col] = pandas_df[['avg(TimeTaken)']].describe()
    
    pandas_df.to_excel(writer, sheet_name=col)

exploratory_analysis = pd.concat([pandas_df_dict[col] for col in select_columns], axis=1)
exploratory_analysis.columns = select_columns
exploratory_analysis.to_excel(writer, sheet_name='exploratory_analysis')

display(exploratory_analysis)

Unnamed: 0,Agency,ComplaintType,LocationType,AddressType,City,FacilityType,Borough,Status,SchoolZone
count,22.0,279.0,147.0,6.0,2281.0,6.0,549.0,19.0,2.0
mean,502.816256,666.591178,544.995441,373.093974,468.864287,1054.828844,300.156621,582.443329,383.900277
std,611.327217,1557.538318,542.851709,168.564998,520.700964,1330.409651,692.846707,1582.51255,40.821188
min,0.014705,0.011983,0.011976,210.045546,0.001667,4.48383,0.233333,0.095556,355.035338
25%,94.940681,45.155536,130.759632,249.373963,340.496832,149.095611,66.73333,25.814167,369.467808
50%,254.354355,203.892079,413.680753,324.680185,371.578339,772.227826,129.25,108.078149,383.900277
75%,591.492468,575.287516,698.05667,464.460444,412.73056,1137.52471,241.016663,269.685307,398.332747
max,2099.378765,17367.164875,2198.201251,643.832178,8398.497027,3579.358369,8135.866699,6906.248676,412.765216


## Do the top features change over time

Group based on Year/Month and the column

Aggregate on the TimeTaken column (mean, counts)

convert the result into pandas dataframe and write to xlsx file

use the excel file to check if the mean is changing with year/month (time)

### Yearly

In [17]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('./pager_yearly_analysis.xlsx', engine='xlsxwriter')

for col in select_columns:
    group = [col] + ['TimeTaken', 'Year']
    grouped = df.select(group).groupBy(['Year'] + [col])
    grouped_counts_dict[col] = {}
    grouped_counts_dict[col]['counts'] = grouped.count()
    grouped_counts_dict[col]['mean_time_taken'] = grouped.mean('TimeTaken')

    pandas_df_count = grouped_counts_dict[col]['counts'].toPandas()
    pandas_df_mean = grouped_counts_dict[col]['mean_time_taken'].toPandas()
    
    pandas_df = pandas_df_count.join(pandas_df_mean.set_index(['Year'] + [col]), 
                                     on=['Year'] + [col]).sort_values(by=col).reset_index(drop=True)
    pandas_df.to_excel(writer, sheet_name=col)

### Monthly

In [18]:
# Create a Pandas Excel writer using XlsxWriter as the engine.
writer = pd.ExcelWriter('./pager_monthly_analysis.xlsx', engine='xlsxwriter')

for col in select_columns:
    group = [col] + ['TimeTaken', 'Month']
    grouped = df.select(group).groupBy(['Month'] + [col])
    grouped_counts_dict[col] = {}
    grouped_counts_dict[col]['counts'] = grouped.count()
    grouped_counts_dict[col]['mean_time_taken'] = grouped.mean('TimeTaken')

    pandas_df_count = grouped_counts_dict[col]['counts'].toPandas()
    pandas_df_mean = grouped_counts_dict[col]['mean_time_taken'].toPandas()
    
    pandas_df = pandas_df_count.join(pandas_df_mean.set_index(['Month'] + [col]), 
                                     on=['Month'] + [col]).sort_values(by=col).reset_index(drop=True)
    pandas_df.to_excel(writer, sheet_name=col)

In [8]:
from pyspark.ml.feature import Binarizer, Bucketizer

# provide 5 split points to generate 4 buckets
bucketizer = Bucketizer(splits=[0, 10, 60, 180, 1440, 4320, 7200, 14400, 28800, 57600, 15151675], 
                        inputCol='TimeTaken', outputCol='TimeTakenDiscrete')

# pipeline stages
from pyspark.ml import Pipeline
stages = [bucketizer]
pipeline = Pipeline(stages=stages)

# fit the pipeline model and transform the data
df = pipeline.fit(df).transform(df)
df[['Agency', 'TimeTaken', 'TimeTakenDiscrete']].show(n=100)

+------+-----------+-----------------+
|Agency|  TimeTaken|TimeTakenDiscrete|
+------+-----------+-----------------+
|  dsny|   72.63333|              2.0|
|  nypd|  0.7077778|              0.0|
|  nypd|      5.515|              0.0|
|  nypd|  1.4108334|              0.0|
|  nypd|  3.3466666|              0.0|
|  nypd| 0.53194445|              0.0|
|  nypd|  2.8741667|              0.0|
|  nypd|     0.1325|              0.0|
|  nypd|  0.4786111|              0.0|
|  nypd|  3.4405556|              0.0|
|  nypd|  1.8877778|              0.0|
|  nypd|  0.6402778|              0.0|
|  nypd|  6.2727776|              0.0|
|  nypd|     5.7475|              0.0|
|  nypd|  1.0736111|              0.0|
|  nypd| 0.61972225|              0.0|
|  nypd|  3.5530555|              0.0|
|  nypd|  5.9477777|              0.0|
|   dep| 0.16666667|              0.0|
|  nypd|  0.5472222|              0.0|
|  nypd|  2.0083334|              0.0|
|  nypd|   5.207778|              0.0|
|  nypd|  1.8655555|     

In [3]:
df[['TimeTaken']].describe().show()

+-------+------------------+
|summary|         TimeTaken|
+-------+------------------+
|  count|          15151674|
|   mean|355.43198445079247|
| stddev| 2803.091941861275|
|    min|      2.7777778E-4|
|    max|         8855068.0|
+-------+------------------+

