In [1]:
import os
import boto3

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark
from pyspark.sql.functions import *

# Execution role returned by the SageMaker SDK; not referenced again in the cells shown here.
role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars.
# The jar list is fetched once and reused to build the driver classpath.
jars = sagemaker_pyspark.classpath_jars()
classpath = ":".join(jars)

# Local-mode session ("local[*]") with the SageMaker jars on the driver classpath.
# See the SageMaker Spark GitHub repository to learn how to connect to EMR from a notebook instance.
spark = (
    SparkSession.builder
    .config("spark.driver.extraClassPath", classpath)
    .master("local[*]")
    .getOrCreate()
)

spark

In [60]:
import os
import pandas as pd
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql.functions import col, udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [3]:
# Load the extract; the first row of the file supplies the column names.
# No schema or inferSchema option is passed, so columns arrive as strings
# (see the printSchema() output further down).
cdc = spark.read.csv(path='output.csv', header=True)

In [4]:
# Remove the alternative age encodings and the ICD-10 code column;
# age_recode_52 is the age column that remains in the schema.
age_and_icd_columns = [
    'detail_age_type',
    'detail_age',
    'age_substitution_flag',
    'age_recode_27',
    'age_recode_12',
    'infant_age_recode_22',
    'icd_code_10th_revision',
]
cdc = cdc.drop(*age_and_icd_columns)

In [5]:
# Discard condition slots 5 through 20 of both the record-axis and the
# entity-axis columns (slots 1-4 are dropped in a later cell).
# The 32 names are generated instead of hand-listed so no slot can be
# mistyped or left out.
dropped_condition_columns = [
    '{}_condition_{}'.format(axis, slot)
    for axis in ('record', 'entity')
    for slot in range(5, 21)
]
cdc = cdc.drop(*dropped_condition_columns)

In [6]:
# Print the remaining columns and their types after the two drop cells above.
cdc.printSchema()

root
 |-- resident_status: string (nullable = true)
 |-- education_1989_revision: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- education_reporting_flag: string (nullable = true)
 |-- month_of_death: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- place_of_death_and_decedents_status: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- day_of_week_of_death: string (nullable = true)
 |-- current_data_year: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- manner_of_death: string (nullable = true)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = true)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = true)
 |-- 358_cause_recode: string (nullable = true)
 |-- 113_cause_recode: string (nullable = true)
 |-- 130_infant_cause_recode: 

In [7]:
# Replace missing values with a fixed code per column, in a single fillna call.
# NOTE(review): 'Place_of_death_and_decedents_status' is capitalised differently from the
# lower-case name printed by printSchema(); this presumably relies on Spark's default
# case-insensitive column resolution — confirm.
missing_value_codes = {
    'place_of_injury_for_causes_w00_y34_except_y06_and_y07_': '10',
    '130_infant_cause_recode': '000',
    'activity_code': '10',
    'manner_of_death': '8',
    'Place_of_death_and_decedents_status': '7',
}
cdc = cdc.fillna(missing_value_codes)

In [8]:
# Collapse the disposition codes R, E, D and U into 'O' with one character-class
# substitution instead of four separate single-letter passes. 'O' is not in the
# class, so the result is the same as applying the four replacements in sequence.
cdc = cdc.withColumn(
    'method_of_disposition',
    regexp_replace('method_of_disposition', '[REDU]', 'O'),
)

# Recode place-of-death '9' as '7', the same code used to fill missing values
# for this column in the previous cell.
cdc = cdc.withColumn(
    'Place_of_death_and_decedents_status',
    regexp_replace('Place_of_death_and_decedents_status', '9', '7'),
)

In [9]:
#cdc = cdc.withColumn('Place_of_death_and_decedents_status', 
 #                    when(cdc['Place_of_death_and_decedents_status']== 9 , 7).otherwise(cdc['Place_of_death_and_decedents_status']))

In [10]:
#cdc.select('Place_of_death_and_decedents_status').distinct().show()

In [11]:
#cdc.select('place_of_injury_for_causes_w00_y34_except_y06_and_y07_').distinct().show()


In [12]:
#cdc.select("education_1989_revision").distinct().show()

In [13]:
# Map the two-digit 1989-revision education codes onto the single-digit codes
# that are used in the next cell to back-fill education_2003_revision.
#
# Each pattern is anchored (^...$) so only a value that is exactly one of the
# listed codes is rewritten; any other value passes through unchanged rather
# than having a matching substring replaced. Nulls stay null.
education_recode_rules = [
    ('^0[0-8]$', '1'),        # 00-08
    ('^(09|10|11)$', '2'),    # 09-11
    ('^12$', '3'),
    ('^1[34]$', '4'),         # 13-14
    ('^15$', '5'),
    ('^16$', '6'),
    ('^17$', '7'),
    ('^99$', '9'),
]

# Compose all substitutions into one column expression so the DataFrame gets a
# single projection instead of one per code.
education_recoded = cdc['education_1989_revision']
for code_pattern, code_2003 in education_recode_rules:
    education_recoded = regexp_replace(education_recoded, code_pattern, code_2003)

cdc = cdc.withColumn('education_1989_revision', education_recoded)

In [14]:
# Prefer the 2003-revision value; where it is null, fall back to the recoded
# 1989-revision value.
education_merged = coalesce(cdc['education_2003_revision'], cdc['education_1989_revision'])
cdc = cdc.withColumn("education_2003_revision", education_merged)

In [15]:
# Drop rows where education is still null after the back-fill above.
has_education = cdc['education_2003_revision'].isNotNull()
cdc = cdc.where(has_education)

In [16]:
#cdc.groupBy('education_2003_revision').count().orderBy('count', ascending=False).show()

In [17]:
#cdc.printSchema()


In [18]:
# Remove the 1989 education column (already merged into the 2003 column), the
# reporting/imputation flags, hispanic_origin, and condition slots 1-4 on both axes.
redundant_columns = [
    'education_1989_revision',
    'education_reporting_flag',
    'bridged_race_flag',
    'race_imputation_flag',
    'hispanic_origin',
    'entity_condition_1',
    'entity_condition_2',
    'entity_condition_3',
    'entity_condition_4',
    'record_condition_1',
    'record_condition_2',
    'record_condition_3',
    'record_condition_4',
]
cdc = cdc.drop(*redundant_columns)

In [19]:
# The two condition-count columns are cast to double; they are later passed to
# the VectorAssembler as numeric features alongside the one-hot vectors.
for count_column in ('number_of_entity_axis_conditions', 'number_of_record_axis_conditions'):
    cdc = cdc.withColumn(count_column, cdc[count_column].cast(DoubleType()))

In [43]:
# Remove rows whose disposition is 'O' — the bucket that R, E, D and U were
# merged into earlier. method_of_disposition is the column indexed as the
# model label below.
cdc = cdc.filter(cdc.method_of_disposition != 'O')

### Building prediction model

In [45]:
# Every column is treated as categorical except the two numeric counts and the
# target column itself.
cols = cdc.columns
non_categorical = (
    'number_of_entity_axis_conditions',
    'number_of_record_axis_conditions',
    'method_of_disposition',
)
categorical_cols = list(cols)
for excluded_name in non_categorical:
    # list.remove raises ValueError if the column is unexpectedly absent.
    categorical_cols.remove(excluded_name)

In [46]:
# Display the column names that will be string-indexed and one-hot encoded.
categorical_cols

['resident_status',
 'education_2003_revision',
 'month_of_death',
 'sex',
 'age_recode_52',
 'Place_of_death_and_decedents_status',
 'marital_status',
 'day_of_week_of_death',
 'current_data_year',
 'injury_at_work',
 'manner_of_death',
 'autopsy',
 'activity_code',
 'place_of_injury_for_causes_w00_y34_except_y06_and_y07_',
 '358_cause_recode',
 '113_cause_recode',
 '130_infant_cause_recode',
 '39_cause_recode',
 'race',
 'race_recode_3',
 'race_recode_5',
 'hispanic_originrace_recode']

In [47]:
# Build one StringIndexer -> OneHotEncoder pair per categorical column.
# The loop variable is deliberately not named `col`: at notebook scope that
# would overwrite the pyspark.sql.functions.col helper imported at the top
# and leave it bound to a plain string after the loop.
stages = []
for column_name in categorical_cols:
    string_indexer = StringIndexer(inputCol=column_name, outputCol=column_name + "Index")
    encoder = OneHotEncoder(inputCol=column_name + "Index", outputCol=column_name + "classVec")

    stages.extend([string_indexer, encoder])

In [48]:
# Index the target column into a numeric "label" column and add it to the pipeline.
# Re-running this cell appends the stage again, so rebuild `stages` first.
label_string_index = StringIndexer(
    inputCol="method_of_disposition",
    outputCol="label",
)
stages.append(label_string_index)

In [49]:
# Assemble the one-hot vectors plus the two numeric counts into a single
# "features" vector column.
numeric_cols = ['number_of_entity_axis_conditions', 'number_of_record_axis_conditions']
encoded_categorical_cols = [name + "classVec" for name in categorical_cols]
assemblerInputs = encoded_categorical_cols + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

In [50]:
# Fit the feature-preparation stages (indexers, encoders, assembler) on the
# whole frame and apply them, producing the "label" and "features" columns.
partialPipeline = Pipeline(stages=stages)
pipelineModel = partialPipeline.fit(cdc)
preppedDataDF = pipelineModel.transform(cdc)

In [51]:
# Fit a default-parameter logistic regression on the full prepared frame (no hold-out).
# NOTE(review): `lrModel` is reassigned further down by a fit on the training split,
# so this model is only used by the display() call in the next cell — confirm this
# extra fit is still needed.
lrModel = LogisticRegression().fit(preppedDataDF)

In [52]:
# NOTE(review): passing (model, DataFrame, "ROC") to display() looks like the Databricks
# notebook plotting form; the recorded output of this cell is only the repr of each
# argument, not a ROC curve — confirm intent.
display(lrModel, preppedDataDF, "ROC")

LogisticRegression_42ca9a8439510cbd247f

DataFrame[resident_status: string, education_2003_revision: string, month_of_death: string, sex: string, age_recode_52: string, Place_of_death_and_decedents_status: string, marital_status: string, day_of_week_of_death: string, current_data_year: string, injury_at_work: string, manner_of_death: string, method_of_disposition: string, autopsy: string, activity_code: string, place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string, 358_cause_recode: string, 113_cause_recode: string, 130_infant_cause_recode: string, 39_cause_recode: string, number_of_entity_axis_conditions: double, number_of_record_axis_conditions: double, race: string, race_recode_3: string, race_recode_5: string, hispanic_originrace_recode: string, resident_statusIndex: double, resident_statusclassVec: vector, education_2003_revisionIndex: double, education_2003_revisionclassVec: vector, month_of_deathIndex: double, month_of_deathclassVec: vector, sexIndex: double, sexclassVec: vector, age_recode_52Index: double, ag

'ROC'

In [53]:
# Keep the label, the assembled feature vector and the original columns;
# the intermediate *Index / *classVec columns are left out.
selectedcols = ["label", "features", *cols]
dataset = preppedDataDF.select(*selectedcols)
display(dataset)

DataFrame[label: double, features: vector, resident_status: string, education_2003_revision: string, month_of_death: string, sex: string, age_recode_52: string, Place_of_death_and_decedents_status: string, marital_status: string, day_of_week_of_death: string, current_data_year: string, injury_at_work: string, manner_of_death: string, method_of_disposition: string, autopsy: string, activity_code: string, place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string, 358_cause_recode: string, 113_cause_recode: string, 130_infant_cause_recode: string, 39_cause_recode: string, number_of_entity_axis_conditions: double, number_of_record_axis_conditions: double, race: string, race_recode_3: string, race_recode_5: string, hispanic_originrace_recode: string]

In [54]:
# Split with weights 0.7 / 0.3 into training and test sets, passing a fixed seed.
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

In [55]:
# Estimator for the evaluated model: explicit label/feature columns and a
# maximum of 10 iterations.
lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

In [56]:
# Train on the training split only; this rebinds `lrModel`, replacing the model
# fitted earlier on the full frame.
lrModel = lr.fit(trainingData)

In [57]:
# Score the held-out split. The cells below read the rawPrediction, probability
# and prediction columns from the result.
predictions = lrModel.transform(testData)

In [59]:
# Preview the label, predicted class and class probabilities (show() prints the top 20 rows).
predictions.select("label", "prediction", "probability").show()

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|  0.0|       0.0|[0.67957164662236...|
|  0.0|       0.0|[0.79846153815456...|
|  0.0|       0.0|[0.64433925705476...|
|  0.0|       0.0|[0.68394603112795...|
|  0.0|       0.0|[0.57817610983773...|
|  0.0|       0.0|[0.77178165921019...|
|  0.0|       0.0|[0.62977576937925...|
|  0.0|       0.0|[0.76398302577695...|
|  0.0|       0.0|[0.67848321650588...|
|  0.0|       0.0|[0.68490593304098...|
|  0.0|       0.0|[0.62599127192123...|
|  0.0|       0.0|[0.86648160293934...|
|  0.0|       0.0|[0.57080531891056...|
|  0.0|       0.0|[0.69794662955588...|
|  0.0|       0.0|[0.71991159023037...|
|  0.0|       0.0|[0.69481376375091...|
|  0.0|       0.0|[0.62761785391637...|
|  0.0|       0.0|[0.67737025083674...|
|  0.0|       0.0|[0.69620002244455...|
|  0.0|       0.0|[0.86348551518537...|
+-----+----------+--------------------+
only showing top 20 rows



In [61]:
# Evaluate the test predictions from the model's rawPrediction column.
# No metricName is passed, so the evaluator's default metric is used; the next
# cell reports it as 'areaUnderROC'.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.6818197121020741

In [62]:
# Show which metric the value returned by evaluate() above corresponds to.
evaluator.getMetricName()

'areaUnderROC'

In [64]:
# Build the confusion-matrix counts with one aggregation instead of four
# separate filter-and-count jobs over `predictions`.
confusion_counts = {
    (row['label'], row['prediction']): row['count']
    for row in predictions.groupBy('label', 'prediction').count().collect()
}

# A (label, prediction) pair that never occurs produces no group, so default to 0.
tp = confusion_counts.get((1.0, 1.0), 0)
tn = confusion_counts.get((0.0, 0.0), 0)
fp = confusion_counts.get((0.0, 1.0), 0)
# NOTE(review): `fn` is also the alias used for pyspark.sql.functions in the import cell.
# The name is kept because the recall cell reads it, but the module alias is no longer
# reachable through `fn` after this line.
fn = confusion_counts.get((1.0, 0.0), 0)

In [67]:
# Percentage of test rows whose predicted class equals the label.
total_predictions = tp + tn + fp + fn
accuracy = (tp + tn) * 100 / total_predictions
accuracy

63.83987205636665

In [68]:
# Of the rows predicted as class 1, the fraction whose label is 1.
# Raises ZeroDivisionError if no row was predicted as class 1.
precision = tp / (tp + fp)
precision

0.6175657058859233

In [69]:
# Of the rows whose label is 1, the fraction predicted as class 1.
# Raises ZeroDivisionError if the test set has no rows with label 1.
recall = tp / (tp + fn)
recall

0.4969072444187992

In [71]:
# Harmonic mean of precision and recall for class 1.
f1_score = 2 * precision * recall / (precision + recall)
f1_score

0.5507049284155684

In [72]:
# Persist the model trained on the training split.
# write().overwrite() is used instead of save() so that re-running the notebook
# replaces an existing 'logistic_model.model' directory rather than raising
# because the path already exists.
lrModel.write().overwrite().save('logistic_model.model')