## Filename: model

#### Use the .randomSplit method to split the 311 data into training and test sets.
#### Create a classification model to predict whether a case will be late or not (i.e. predict case_late).
#### Experiment with different combinations of features and different classification algorithms.

In [1]:
# set up environment and start spark session
%matplotlib inline
import pyspark
import pyspark.ml
from pyspark.sql.functions import *

# Reuse the active Spark session if one exists; otherwise create a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Load the three 311 CSV extracts (cases, sources, departments) into Spark
# dataframes. Every file has a header row and its column types are inferred.
csv_options = {'header': True, 'inferSchema': True}
df_cases = spark.read.csv('./sa311/case.csv', **csv_options)
df_source = spark.read.csv('./sa311/source.csv', **csv_options)
df_dept = spark.read.csv('./sa311/dept.csv', **csv_options)

In [3]:
# Attach department and source attributes to every case. Left joins keep
# cases that have no matching department or source row.
df = (
    df_cases
    .join(df_dept, on='dept_division', how='left')
    .join(df_source, on='source_id', how='left')
)

In [4]:
# Sanity check: print the joined frame's column names and inferred types.
df.printSchema()

root
 |-- source_id: string (nullable = true)
 |-- dept_division: string (nullable = true)
 |-- case_id: integer (nullable = true)
 |-- case_opened_date: string (nullable = true)
 |-- case_closed_date: string (nullable = true)
 |-- SLA_due_date: string (nullable = true)
 |-- case_late: string (nullable = true)
 |-- num_days_late: double (nullable = true)
 |-- case_closed: string (nullable = true)
 |-- service_request_type: string (nullable = true)
 |-- SLA_days: double (nullable = true)
 |-- case_status: string (nullable = true)
 |-- request_address: string (nullable = true)
 |-- council_district: integer (nullable = true)
 |-- dept_name: string (nullable = true)
 |-- standardized_dept_name: string (nullable = true)
 |-- dept_subject_to_SLA: string (nullable = true)
 |-- source_username: string (nullable = true)



In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType

# Clean the data a bit:
#   * make sure the two numeric columns are doubles;
#   * derive dept_lower: dept_name lower-cased, with every character outside
#     [a-z0-9] replaced by '_'. Rows with a null dept_name stay null here.
# All column functions go through the explicit `F` namespace instead of the
# wildcard-imported names, so the cell does not depend on `import *`.
df = (
    df.withColumn('num_days_late', F.col('num_days_late').cast(DoubleType()))
    .withColumn('SLA_days', F.col('SLA_days').cast(DoubleType()))
    .withColumn('dept_lower', F.lower(F.col('dept_name')))
    .withColumn(
        'dept_lower',
        F.regexp_replace(F.col('dept_lower'), r'[^a-z0-9]', '_'),
    )
)

In [None]:
# Count rows per department label; a null bucket shows how many cases have
# no department attached.
dept_counts = df.groupBy('dept_lower').count()
dept_counts.show()

+--------------------+------+
|          dept_lower| count|
+--------------------+------+
|trans___cap_impro...| 97841|
|        city_council|    34|
|animal_care_services|119362|
|                null|   198|
|development_services|  1397|
|code_enforcement_...|321984|
|solid_waste_manag...|286287|
|parks_and_recreation| 19964|
|        metro_health|  5313|
|    customer_service|  2889|
+--------------------+------+



In [None]:
# Replace missing department labels with an explicit placeholder category.
df = df.fillna('none_associated', subset=['dept_lower'])

In [None]:
# Re-run the per-department counts to confirm the null bucket is gone.
dept_counts_filled = df.groupBy('dept_lower').count()
dept_counts_filled.show()

+--------------------+------+
|          dept_lower| count|
+--------------------+------+
|     none_associated|   198|
|trans___cap_impro...| 97841|
|        city_council|    34|
|animal_care_services|119362|
|development_services|  1397|
|code_enforcement_...|321984|
|solid_waste_manag...|286287|
|parks_and_recreation| 19964|
|        metro_health|  5313|
|    customer_service|  2889|
+--------------------+------+



In [None]:
# 70/30 train/test split; the fixed seed keeps the split reproducible.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=123)

In [None]:
from pyspark.ml.feature import RFormula
# Model case_late from the department label alone. RFormula builds the
# 'features' vector and the numeric 'label' column from the formula.
rf = RFormula(formula='case_late ~ dept_lower')
train_formula_model = rf.fit(train)
df_model = train_formula_model.transform(train).select('features', 'label')

In [None]:
# Peek at the first few encoded training rows.
df_model.show(n=5)

+-------------+-----+
|     features|label|
+-------------+-----+
|(9,[6],[1.0])|  0.0|
|(9,[6],[1.0])|  0.0|
|(9,[1],[1.0])|  0.0|
|(9,[1],[1.0])|  0.0|
|(9,[1],[1.0])|  0.0|
+-------------+-----+
only showing top 5 rows



In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression with default hyper-parameters, fitted on the encoded
# training rows (df_model holds the 'features' and 'label' columns).
lr = LogisticRegression()
lr_fit = lr.fit(df_model)
# Display the fitted model.
lr_fit

In [None]:
# Score the training rows and flag, row by row, whether the predicted class
# equals the true label.
scored_train = lr_fit.transform(df_model)
is_correct = col('label') == col('prediction')
scored_train.withColumn('we_got_it_right', is_correct).show()

In [None]:
# Summary of the fitted model; its metrics are computed on the data the
# model was trained on.
training_summary = lr_fit.summary


In [None]:
# Area under the ROC curve on the training data.
training_summary.areaUnderROC


In [None]:
# Accuracy on the training data.
training_summary.accuracy


In [None]:
# Encode the held-out rows with the RFormula fitted on the TRAINING split, so
# the feature positions and label indices match what lr_fit was trained on.
# Fitting the formula on test could assign different indices.
df_model = rf.fit(train).transform(test).select('features', 'label')
# Evaluate the model trained on `train` against the test rows. Fitting a new
# model on the test split would only produce in-sample metrics for a
# different model, not an out-of-sample check of lr_fit.
lr_fit_test = lr_fit.evaluate(df_model)
lr_fit_test

In [None]:
# lr_fit.summary describes the training data only, so it cannot be used as a
# test metric. Instead, encode the held-out split with the formula fitted on
# train and evaluate the trained model on it.
test_features = rf.fit(train).transform(test).select('features', 'label')
test_summary = lr_fit.evaluate(test_features)
# Area under the ROC curve on the test split.
test_summary.areaUnderROC
