In [1]:
import os
import boto3

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark
from pyspark.sql.functions import *

role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars
jars = sagemaker_pyspark.classpath_jars()

classpath = ":".join(sagemaker_pyspark.classpath_jars())

# See the SageMaker Spark Github to learn how to connect to EMR from a notebook instance
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath)\
    .master("local[*]").getOrCreate()
    
spark

In [40]:
import os
import pandas as pd
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql.functions import col, udf
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

In [3]:
cdc = spark.read.csv('output.csv', header = True)

In [4]:
cdc = cdc.drop('detail_age_type','detail_age', 'age_substitution_flag','age_recode_27', 'age_recode_12', 'infant_age_recode_22', 'icd_code_10th_revision')

In [5]:
cdc = cdc.drop('record_condition_20' , 'entity_condition_20', 'entity_condition_19', 'entity_condition_18', 'entity_condition_17', 'record_condition_19', 'record_condition_18', 'record_condition_16', 'record_condition_17','record_condition_15','record_condition_14','record_condition_13','record_condition_12','record_condition_11','record_condition_10','record_condition_9','record_condition_8','record_condition_7' , 'record_condition_6', 'record_condition_5', 'entity_condition_16', 'entity_condition_15','entity_condition_14', 'entity_condition_13', 'entity_condition_12', 'entity_condition_11', 'entity_condition_10', 'entity_condition_9','entity_condition_8', 'entity_condition_7', 'entity_condition_6', 'entity_condition_5')

In [6]:
cdc.printSchema()

root
 |-- resident_status: string (nullable = true)
 |-- education_1989_revision: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- education_reporting_flag: string (nullable = true)
 |-- month_of_death: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- place_of_death_and_decedents_status: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- day_of_week_of_death: string (nullable = true)
 |-- current_data_year: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- manner_of_death: string (nullable = true)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = true)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = true)
 |-- 358_cause_recode: string (nullable = true)
 |-- 113_cause_recode: string (nullable = true)
 |-- 130_infant_cause_recode: 

In [7]:
cdc = cdc.fillna({'place_of_injury_for_causes_w00_y34_except_y06_and_y07_' : '10'})
cdc = cdc.fillna({'130_infant_cause_recode' : '000'})
cdc = cdc.fillna({'activity_code' : '10'})
cdc = cdc.fillna({'manner_of_death' : '8'})
cdc = cdc.fillna({'Place_of_death_and_decedents_status': '7'})

In [8]:
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'R' , 'O'))
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'E' , 'O'))
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'D' , 'O'))
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'U' , 'O'))
cdc = cdc.withColumn('Place_of_death_and_decedents_status', regexp_replace('Place_of_death_and_decedents_status', '9' , '7'))

In [9]:
#cdc = cdc.withColumn('Place_of_death_and_decedents_status', 
 #                    when(cdc['Place_of_death_and_decedents_status']== 9 , 7).otherwise(cdc['Place_of_death_and_decedents_status']))

In [10]:
#cdc.select('Place_of_death_and_decedents_status').distinct().show()

In [11]:
#cdc.select('place_of_injury_for_causes_w00_y34_except_y06_and_y07_').distinct().show()


In [12]:
#cdc.select("education_1989_revision").distinct().show()

In [13]:
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '00' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '01' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '02' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '03' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '04' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '05' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '06' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '07' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '08' , '1'))

cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '09' , '2'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '10' , '2'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '11' , '2'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '12' , '3'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '13' , '4'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '14' , '4'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '15' , '5'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '16' , '6'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '17' , '7'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '99' , '9'))

In [14]:
cdc = cdc.withColumn("education_2003_revision", coalesce(cdc.education_2003_revision,cdc.education_1989_revision))

In [15]:
cdc = cdc.filter(cdc.education_2003_revision.isNotNull())

In [16]:
#cdc.groupBy('education_2003_revision').count().orderBy('count', ascending=False).show()

In [17]:
#cdc.printSchema()


In [18]:
cdc = cdc.drop('education_1989_revision', 'education_reporting_flag','bridged_race_flag', 'race_imputation_flag', 
               'hispanic_origin','entity_condition_1','entity_condition_2','entity_condition_3','entity_condition_4',
              'record_condition_1','record_condition_2','record_condition_3','record_condition_4')

In [19]:
cdc = cdc.withColumn('number_of_entity_axis_conditions', cdc['number_of_entity_axis_conditions'].cast(DoubleType()))
cdc = cdc.withColumn('number_of_record_axis_conditions', cdc['number_of_record_axis_conditions'].cast(DoubleType()))

In [20]:
cdc = cdc.filter(cdc.method_of_disposition != 'O')

### Building prediction model

In [21]:
cols = cdc.columns
categorical_cols = cols.copy()
categorical_cols.remove('number_of_entity_axis_conditions')
categorical_cols.remove('number_of_record_axis_conditions')
categorical_cols.remove('method_of_disposition')

In [22]:
categorical_cols

['resident_status',
 'education_2003_revision',
 'month_of_death',
 'sex',
 'age_recode_52',
 'Place_of_death_and_decedents_status',
 'marital_status',
 'day_of_week_of_death',
 'current_data_year',
 'injury_at_work',
 'manner_of_death',
 'autopsy',
 'activity_code',
 'place_of_injury_for_causes_w00_y34_except_y06_and_y07_',
 '358_cause_recode',
 '113_cause_recode',
 '130_infant_cause_recode',
 '39_cause_recode',
 'race',
 'race_recode_3',
 'race_recode_5',
 'hispanic_originrace_recode']

In [23]:
stages = []
for col in categorical_cols:
    string_indexer = StringIndexer(inputCol=col, outputCol=col + "Index")
    encoder = OneHotEncoder(inputCol=col + "Index", outputCol=col + "classVec")
    
    stages += [string_indexer, encoder]

In [24]:
label_string_index = StringIndexer(inputCol="method_of_disposition", outputCol="label")
stages += [label_string_index]

In [25]:
numeric_cols = ['number_of_entity_axis_conditions', 'number_of_record_axis_conditions']
assemblerInputs = [c + "classVec" for c in categorical_cols] + numeric_cols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [27]:
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(cdc)
preppedDataDF = pipelineModel.transform(cdc)

In [28]:
selectedcols = ["label", "features"] + cols
dataset = preppedDataDF.select(selectedcols)
display(dataset)

DataFrame[label: double, features: vector, resident_status: string, education_2003_revision: string, month_of_death: string, sex: string, age_recode_52: string, Place_of_death_and_decedents_status: string, marital_status: string, day_of_week_of_death: string, current_data_year: string, injury_at_work: string, manner_of_death: string, method_of_disposition: string, autopsy: string, activity_code: string, place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string, 358_cause_recode: string, 113_cause_recode: string, 130_infant_cause_recode: string, 39_cause_recode: string, number_of_entity_axis_conditions: double, number_of_record_axis_conditions: double, race: string, race_recode_3: string, race_recode_5: string, hispanic_originrace_recode: string]

In [29]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)

In [41]:
glr = GeneralizedLinearRegression(family="binomial", link="logit", maxIter=10, regParam=0.1)

In [42]:
lrModel = glr.fit(trainingData)

In [99]:
meta = [f.metadata 
    for f in preppedDataDF.schema.fields 
    if f.name == 'features'][0]
features_name_ind = meta['ml_attr']['attrs']['binary'] + meta['ml_attr']['attrs']['numeric']
features_name_ind

[{'idx': 0, 'name': 'resident_statusclassVec_1'},
 {'idx': 1, 'name': 'resident_statusclassVec_2'},
 {'idx': 2, 'name': 'resident_statusclassVec_3'},
 {'idx': 3, 'name': 'education_2003_revisionclassVec_3'},
 {'idx': 4, 'name': 'education_2003_revisionclassVec_1'},
 {'idx': 5, 'name': 'education_2003_revisionclassVec_4'},
 {'idx': 6, 'name': 'education_2003_revisionclassVec_2'},
 {'idx': 7, 'name': 'education_2003_revisionclassVec_6'},
 {'idx': 8, 'name': 'education_2003_revisionclassVec_5'},
 {'idx': 9, 'name': 'education_2003_revisionclassVec_7'},
 {'idx': 10, 'name': 'education_2003_revisionclassVec_9'},
 {'idx': 11, 'name': 'month_of_deathclassVec_01'},
 {'idx': 12, 'name': 'month_of_deathclassVec_12'},
 {'idx': 13, 'name': 'month_of_deathclassVec_03'},
 {'idx': 14, 'name': 'month_of_deathclassVec_11'},
 {'idx': 15, 'name': 'month_of_deathclassVec_10'},
 {'idx': 16, 'name': 'month_of_deathclassVec_02'},
 {'idx': 17, 'name': 'month_of_deathclassVec_04'},
 {'idx': 18, 'name': 'month_

In [120]:
features_name_ind[0]['name']

'resident_statusclassVec_1'

In [123]:
feature_names = ['intercept']
for i in features_name_ind:
    feature_names.append(i['name'])

['intercept', 'resident_statusclassVec_1', 'resident_statusclassVec_2', 'resident_statusclassVec_3', 'education_2003_revisionclassVec_3', 'education_2003_revisionclassVec_1', 'education_2003_revisionclassVec_4', 'education_2003_revisionclassVec_2', 'education_2003_revisionclassVec_6', 'education_2003_revisionclassVec_5', 'education_2003_revisionclassVec_7', 'education_2003_revisionclassVec_9', 'month_of_deathclassVec_01', 'month_of_deathclassVec_12', 'month_of_deathclassVec_03', 'month_of_deathclassVec_11', 'month_of_deathclassVec_10', 'month_of_deathclassVec_02', 'month_of_deathclassVec_04', 'month_of_deathclassVec_05', 'month_of_deathclassVec_07', 'month_of_deathclassVec_08', 'month_of_deathclassVec_09', 'sexclassVec_M', 'age_recode_52classVec_43', 'age_recode_52classVec_42', 'age_recode_52classVec_41', 'age_recode_52classVec_44', 'age_recode_52classVec_40', 'age_recode_52classVec_39', 'age_recode_52classVec_38', 'age_recode_52classVec_37', 'age_recode_52classVec_36', 'age_recode_52c

In [106]:
pvalues = lrModel.summary.pValues
len(pvalues)

783

In [142]:
intercept = lrModel.intercept
coefficients = list(lrModel.coefficients)
coefficients.insert(0, intercept)
coefficients

[-0.39246637221373254,
 0.046720960993353766,
 -0.08716687962191344,
 0.03302791352535616,
 -0.024990141739336287,
 -0.3087845238912354,
 0.10017297347643474,
 -0.08762353146512557,
 0.14495346964456915,
 0.0530049552829441,
 0.07923801108325626,
 -0.016477756960348776,
 -0.014634109147210873,
 0.015224379794424153,
 -0.007481315556300118,
 0.009461788372710809,
 0.004255151607741193,
 -0.010459919805591958,
 -0.006848515052451526,
 -0.0023860448916742942,
 0.001552725496770674,
 0.005163985511986896,
 0.004121763633382317,
 0.0934284189784431,
 -0.15998548755595826,
 -0.09888053472835225,
 -0.022831890536473467,
 -0.17558613753604166,
 0.05544871445333076,
 0.11033598769549097,
 0.13899043235341627,
 0.14327322928571076,
 0.11656769567801095,
 -0.1205949740916935,
 0.06599416006097132,
 0.03067575914437255,
 0.008055309530822395,
 -0.0035216083605450663,
 -0.035202416774891755,
 -0.013085861501510051,
 -0.02257809945421992,
 -0.02010953168809902,
 0.017723805605672847,
 0.010855998086

In [143]:
df_values = [('feature_names', feature_names), ('coefficients', coefficients),('p_values', pvalues)]
inference_df = pd.DataFrame.from_items(df_values)

In [144]:
inference_df.head(20)

Unnamed: 0,feature_names,coefficients,p_values
0,intercept,-0.392466,0.0
1,resident_statusclassVec_1,0.046721,0.0
2,resident_statusclassVec_2,-0.087167,0.0
3,resident_statusclassVec_3,0.033028,0.0
4,education_2003_revisionclassVec_3,-0.02499,0.0
5,education_2003_revisionclassVec_1,-0.308785,0.0
6,education_2003_revisionclassVec_4,0.100173,0.0
7,education_2003_revisionclassVec_2,-0.087624,0.0
8,education_2003_revisionclassVec_6,0.144953,0.0
9,education_2003_revisionclassVec_5,0.053005,0.0


In [145]:
inference_df.to_csv('inference_df.csv')

In [57]:
predictions = lrModel.transform(testData)

In [73]:
predictions.select("label", "prediction", "probability", 'resident_status').show()

KeyboardInterrupt: 

<bound method HasInputCols.getInputCols of VectorAssembler_441aac334f4bd0abed85>

In [59]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.6818197121020716

In [62]:
evaluator.getMetricName()

'areaUnderROC'

In [64]:
tp = selected.where(selected["label"] == 1).where(selected["prediction"] == 1).count()
tn = selected.where(selected["label"] == 0).where(selected["prediction"] == 0).count()
fp = selected.where(selected["label"] == 0).where(selected["prediction"] == 1).count()
fn = selected.where(selected["label"] == 1).where(selected["prediction"] == 0).count()

In [67]:
accuracy = (tp + tn) * 100 / (tp + tn + fp + fn)
accuracy

63.83987205636665

In [68]:
precision = tp / (tp + fp)
precision

0.6175657058859233

In [69]:
recall = tp / (tp + fn)
recall

0.4969072444187992

In [71]:
f1_score = 2 * precision * recall / (precision + recall)
f1_score

0.5507049284155684

In [72]:
lrModel.save('logistic_model.model')

In [53]:
lrmodel = LogisticRegression.load(path='')

Py4JJavaError: An error occurred while calling o1145.load.
: java.lang.NoSuchMethodException: org.apache.spark.ml.classification.LogisticRegressionModel.<init>(java.lang.String)
	at java.lang.Class.getConstructor0(Class.java:3082)
	at java.lang.Class.getConstructor(Class.java:1825)
	at org.apache.spark.ml.util.DefaultParamsReader.load(ReadWrite.scala:328)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)


In [28]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

In [33]:
rfModel = rf.fit(trainingData)

In [34]:
predictions = rfModel.transform(testData)

In [35]:
predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- resident_status: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- month_of_death: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- Place_of_death_and_decedents_status: string (nullable = false)
 |-- marital_status: string (nullable = true)
 |-- day_of_week_of_death: string (nullable = true)
 |-- current_data_year: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- manner_of_death: string (nullable = false)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = false)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = false)
 |-- 358_cause_recode: string (nullable = true)
 |-- 113_cause_recode: string (nullable = true)
 |-- 130_infant_cause_recode: string (nullable = false)
 |--

In [36]:
selected = predictions.select("label", "prediction", "probability")

In [37]:
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.6585139971498309

In [43]:
tp = selected.where(selected["label"] == 1).where(selected["prediction"] == 1).count()

In [None]:
tn = selected.where(selected["label"] == 0).where(selected["prediction"] == 0).count()
fp = selected.where(selected["label"] == 0).where(selected["prediction"] == 1).count()
fn = selected.where(selected["label"] == 1).where(selected["prediction"] == 0).count()

In [46]:
accuracy = (tp + tn) * 100 / (tp + tn + fp + fn)
accuracy

59.852210662949055

In [47]:
precision = tp / (tp + fp)
precision

0.6760139560824394

In [48]:
recall = tp / (tp + fn)
recall

0.19241678982932794

In [49]:
f1_score = 2 * precision * recall / (precision + recall)
f1_score

0.29956662847683896