In [1]:
import os
import boto3

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark
from pyspark.sql.functions import *
                                                                        
role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars
jars = sagemaker_pyspark.classpath_jars()


classpath = ":".join(sagemaker_pyspark.classpath_jars())

# See the SageMaker Spark Github to learn how to connect to EMR from a notebook instance
spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath)\
    .master("local[*]").getOrCreate()
    
spark

In [2]:
bucket = 'mrinal-ml-sagemaker'
import os
import pandas as pd
import numpy as np
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as fn
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.mllib.util import MLUtils

In [3]:
cdc = spark.read.csv('output.csv', header = True)

In [4]:
cdc = cdc.drop('detail_age_type','detail_age', 'age_substitution_flag','age_recode_27', 'age_recode_12', 'infant_age_recode_22', 'icd_code_10th_revision')

In [5]:
cdc = cdc.drop('record_condition_20' , 'entity_condition_20', 'entity_condition_19', 'entity_condition_18', 'entity_condition_17', 'record_condition_19', 'record_condition_18', 'record_condition_16', 'record_condition_17','record_condition_15','record_condition_14','record_condition_13','record_condition_12','record_condition_11','record_condition_10','record_condition_9','record_condition_8','record_condition_7' , 'record_condition_6', 'record_condition_5', 'entity_condition_16', 'entity_condition_15','entity_condition_14', 'entity_condition_13', 'entity_condition_12', 'entity_condition_11', 'entity_condition_10', 'entity_condition_9','entity_condition_8', 'entity_condition_7', 'entity_condition_6', 'entity_condition_5')

In [6]:
cdc.printSchema()

root
 |-- resident_status: string (nullable = true)
 |-- education_1989_revision: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- education_reporting_flag: string (nullable = true)
 |-- month_of_death: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- place_of_death_and_decedents_status: string (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- day_of_week_of_death: string (nullable = true)
 |-- current_data_year: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- manner_of_death: string (nullable = true)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = true)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = true)
 |-- 358_cause_recode: string (nullable = true)
 |-- 113_cause_recode: string (nullable = true)
 |-- 130_infant_cause_recode: 

In [7]:
cdc = cdc.fillna({'place_of_injury_for_causes_w00_y34_except_y06_and_y07_' : '10'})
cdc = cdc.fillna({'130_infant_cause_recode' : '000'})
cdc = cdc.fillna({'activity_code' : '10'})
cdc = cdc.fillna({'manner_of_death' : '8'})
cdc = cdc.fillna({'Place_of_death_and_decedents_status': '7'})

In [8]:
cdc = cdc.fillna({'record_condition_1': 'Other'})
cdc = cdc.fillna({'record_condition_2': 'Other'})
cdc = cdc.fillna({'record_condition_3': 'Other'})
cdc = cdc.fillna({'record_condition_4': 'Other'})

cdc = cdc.fillna({'entity_condition_1': 'Other'})
cdc = cdc.fillna({'entity_condition_2': 'Other'})
cdc = cdc.fillna({'entity_condition_3': 'Other'})
cdc = cdc.fillna({'entity_condition_4': 'Other'})

In [9]:
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'R' , 'O'))
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'E' , 'O'))
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'D' , 'O'))
cdc = cdc.withColumn('method_of_disposition', regexp_replace('method_of_disposition', 'U' , 'O'))
cdc = cdc.withColumn('Place_of_death_and_decedents_status', regexp_replace('Place_of_death_and_decedents_status', '9' , '7'))

In [10]:
#cdc = cdc.withColumn('Place_of_death_and_decedents_status', 
 #                    when(cdc['Place_of_death_and_decedents_status']== 9 , 7).otherwise(cdc['Place_of_death_and_decedents_status']))

In [11]:
#cdc.groupby('record_condition_1').count().show()

In [12]:
#cdc.select('place_of_injury_for_causes_w00_y34_except_y06_and_y07_').distinct().show()


In [13]:
#cdc.select("education_1989_revision").distinct().show()

In [14]:
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '00' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '01' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '02' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '03' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '04' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '05' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '06' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '07' , '1'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '08' , '1'))

cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '09' , '2'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '10' , '2'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '11' , '2'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '12' , '3'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '13' , '4'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '14' , '4'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '15' , '5'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '16' , '6'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '17' , '7'))
cdc = cdc.withColumn('education_1989_revision', regexp_replace('education_1989_revision', '99' , '9'))



In [15]:
cdc = cdc.withColumn("education_2003_revision", coalesce(cdc.education_2003_revision,cdc.education_1989_revision))

In [16]:
cdc = cdc.filter(cdc.education_2003_revision.isNotNull())

In [20]:
cdc.groupBy('manner_of_death').count().orderBy('count', ascending=False).show()

+---------------+--------+
|manner_of_death|   count|
+---------------+--------+
|              7|19914162|
|              8| 5656790|
|              1| 1350268|
|              2|  423361|
|              3|  200528|
|              5|  119756|
|              4|   55808|
+---------------+--------+



In [18]:
cdc.printSchema()

root
 |-- resident_status: string (nullable = true)
 |-- education_1989_revision: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- education_reporting_flag: string (nullable = true)
 |-- month_of_death: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- Place_of_death_and_decedents_status: string (nullable = false)
 |-- marital_status: string (nullable = true)
 |-- day_of_week_of_death: string (nullable = true)
 |-- current_data_year: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- manner_of_death: string (nullable = false)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = false)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = false)
 |-- 358_cause_recode: string (nullable = true)
 |-- 113_cause_recode: string (nullable = true)
 |-- 130_infant_cause_reco

In [21]:
cdc = cdc.drop('education_1989_revision', 'education_reporting_flag','bridged_race_flag', 'race_imputation_flag', 
               'hispanic_origin')

## Selecting data for firearms

In [28]:
new_cdc = cdc.filter((cdc['358_cause_recode'] == '429') | (cdc['358_cause_recode'] == '435') | (cdc['358_cause_recode'] == '446')
                       | (cdc['358_cause_recode'] == '450') | (cdc['358_cause_recode'] == '451') | (cdc['358_cause_recode'] == '407'))
new_cdc = new_cdc.drop('number_of_entity_axis_conditions','number_of_record_axis_conditions', '39_cause_recode',
                      '130_infant_cause_recode','113_cause_recode',  'month_of_death','current_data_year','day_of_week_of_death')

In [30]:
new_cdc.groupBy('manner_of_death').count().orderBy('count', ascending=False).show()

+---------------+------+
|manner_of_death| count|
+---------------+------+
|              2|213092|
|              3|135714|
|              1|  4196|
|              5|  2745|
|              4|  1125|
|              7|   755|
|              8|   621|
+---------------+------+



In [31]:
from pyspark.sql.functions import lit
new_cdc = new_cdc.withColumn("suicide", lit(0))
new_cdc.show()

+---------------+-----------------------+---+-------------+-----------------------------------+--------------+--------------+---------------+---------------------+-------+-------------+------------------------------------------------------+----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+----+-------------+-------------+--------------------------+-------+
|resident_status|education_2003_revision|sex|age_recode_52|Place_of_death_and_decedents_status|marital_status|injury_at_work|manner_of_death|method_of_disposition|autopsy|activity_code|place_of_injury_for_causes_w00_y34_except_y06_and_y07_|358_cause_recode|entity_condition_1|entity_condition_2|entity_condition_3|entity_condition_4|record_condition_1|record_condition_2|record_condition_3|record_condition_4|race|race_recode_3|race_recode_5|hispanic_originrace_recode|suicide|
+---------------+-----------------------+---

In [33]:
new_cdc = new_cdc.withColumn('suicide', 
                     when(new_cdc['manner_of_death']== '2' , 1).otherwise(new_cdc['suicide']))

In [34]:
new_cdc.groupBy('suicide').count().orderBy('count', ascending=False).show()

+-------+------+
|suicide| count|
+-------+------+
|      1|213092|
|      0|145156|
+-------+------+



In [35]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

In [36]:
#from pyspark.sql.types import DoubleType
#new_cdc = new_cdc.withColumn("number_of_entity_axis_conditions", new_cdc["number_of_entity_axis_conditions"].cast(DoubleType()))
#new_cdc = cdc.withColumn("suicide", new_cdc["suicide"].cast(StringType())

In [73]:
new_cdc.printSchema()

root
 |-- resident_status: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- Place_of_death_and_decedents_status: string (nullable = false)
 |-- marital_status: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = false)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = false)
 |-- race: string (nullable = true)
 |-- race_recode_3: string (nullable = true)
 |-- race_recode_5: string (nullable = true)
 |-- hispanic_originrace_recode: string (nullable = true)
 |-- suicide: string (nullable = false)



In [51]:
new_cdc = new_cdc.drop('manner_of_death','entity_condition_1','entity_condition_2','entity_condition_3','entity_condition_4',
                       'record_condition_1','record_condition_2','record_condition_3','record_condition_4','358_cause_recode')

In [48]:
new_cdc.select('entity_condition_1').distinct().count()

491

In [53]:
#new_cdc.groupby('358_cause_recode').count().orderBy('count').show()

In [54]:
#cdc_pd = new_cdc.toPandas()

In [55]:
#cdc_pd.info()

In [56]:
#from xgboost import XGBClassifier
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import accuracy_score

In [57]:
#X = cdc_pd.drop(['358_cause_recode'], axis = 1)
#Y = cdc_pd['358_cause_recode']

In [58]:
#from sklearn.preprocessing import LabelEncoder
#labelencoder = LabelEncoder()
#x[:, 13] = labelencoder.fit_transform(x[:, 0])

In [59]:
#X.values[:5]

In [60]:
'''X.values[:,12] = labelencoder.fit_transform(X.values[:, 12])
X.values[:,13]= labelencoder.fit_transform(X.values[:, 13])
X.values[:,14]= labelencoder.fit_transform(X.values[:, 14])
X.values[:,15]= labelencoder.fit_transform(X.values[:, 15])
X.values[:,16]= labelencoder.fit_transform(X.values[:, 16])
X.values[:,17]= labelencoder.fit_transform(X.values[:, 17])
X.values[:,18]= labelencoder.fit_transform(X.values[:, 18])
X.values[:,19]= labelencoder.fit_transform(X.values[:, 19])'''

'X.values[:,12] = labelencoder.fit_transform(X.values[:, 12])\nX.values[:,13]= labelencoder.fit_transform(X.values[:, 13])\nX.values[:,14]= labelencoder.fit_transform(X.values[:, 14])\nX.values[:,15]= labelencoder.fit_transform(X.values[:, 15])\nX.values[:,16]= labelencoder.fit_transform(X.values[:, 16])\nX.values[:,17]= labelencoder.fit_transform(X.values[:, 17])\nX.values[:,18]= labelencoder.fit_transform(X.values[:, 18])\nX.values[:,19]= labelencoder.fit_transform(X.values[:, 19])'

In [61]:
'''X.values[:,2]= labelencoder.fit_transform(X.values[:, 2])
X.values[:,5]= labelencoder.fit_transform(X.values[:, 5])
X.values[:,6]= labelencoder.fit_transform(X.values[:, 6])
X.values[:,9]= labelencoder.fit_transform(X.values[:, 9])
X.values[:,19]= labelencoder.fit_transform(X.values[:, 19])'''

'X.values[:,2]= labelencoder.fit_transform(X.values[:, 2])\nX.values[:,5]= labelencoder.fit_transform(X.values[:, 5])\nX.values[:,6]= labelencoder.fit_transform(X.values[:, 6])\nX.values[:,9]= labelencoder.fit_transform(X.values[:, 9])\nX.values[:,19]= labelencoder.fit_transform(X.values[:, 19])'

In [62]:
#X.values[:,8]= labelencoder.fit_transform(X.values[:, 8])

In [63]:
# split data into train and test sets
#X = X.astype(float)
#seed = 7
#test_size = 0.33
#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [64]:
# fit model no training data
#model = XGBClassifier()
#model.fit(X_train, y_train)

In [65]:
# make predictions for test data
#y_pred = model.predict(X_test)
#predictions = [round(value) for value in y_pred]
# evaluate predictions
#accuracy = accuracy_score(y_test, predictions)
#print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [66]:
#print(model.feature_importances_)

In [67]:
#from matplotlib import pyplot
#pyplot.bar(range(len(model.feature_importances_)), model.feature_importances_)
#pyplot.show()


In [71]:
new_cdc = new_cdc.withColumn("suicide", new_cdc["suicide"].cast(StringType()))

# Use stringindexer

In [76]:
indexer = StringIndexer(inputCol="resident_status", outputCol="resident_statusIndex").fit(new_dc)
new_df = indexer.transform(new_dc)
indexer = StringIndexer(inputCol="education_2003_revision", outputCol="education_2003_revisionIndex").fit(new_df)
new_df = indexer.transform(new_df)
indexer = StringIndexer(inputCol="sex", outputCol="sexIndex").fit(new_df)
new_df =indexer.transform(new_df)
indexer = StringIndexer(inputCol="age_recode_52", outputCol="age_recode_52Index").fit(new_df)
new_df =indexer.transform(new_df)
indexer = StringIndexer(inputCol="Place_of_death_and_decedents_status", outputCol="Place_of_death_and_decedents_statusIndex").fit(new_cdc)
new_df = indexer.transform(new_cdc)
indexer = StringIndexer(inputCol="marital_status", outputCol="marital_statusIndex").fit(new_df)
new_df = indexer.transform(new_df)
indexer = StringIndexer(inputCol="injury_at_work", outputCol="injury_at_workIndex").fit(new_df)
new_df =indexer.transform(new_df)
indexer = StringIndexer(inputCol="autopsy", outputCol="autopsyIndex").fit(new_df)
new_df =indexer.transform(new_df)
indexer = StringIndexer(inputCol="method_of_disposition", outputCol="method_of_dispositionIndex").fit(new_df)
new_df =indexer.transform(new_df)
indexer = StringIndexer(inputCol="activity_code", outputCol="activity_codeIndex").fit(new_df)
new_df = indexer.transform(new_df)
indexer = StringIndexer(inputCol="place_of_injury_for_causes_w00_y34_except_y06_and_y07_", outputCol="place_of_injury_for_causes_w00_y34_except_y06_and_y07_Index").fit(new_df)
new_df = indexer.transform(new_df)
indexer = StringIndexer(inputCol="race", outputCol="raceIndex").fit(new_df)
new_df =indexer.transform(new_df)
indexer = StringIndexer(inputCol="race_recode_3", outputCol="race_recode_3Index").fit(new_df)
new_df =indexer.transform(new_df)

'indexer = StringIndexer(inputCol="Place_of_death_and_decedents_status", outputCol="Place_of_death_and_decedents_statusIndex").fit(new_cdc)\nnew_df = indexer.transform(new_cdc)\nindexer = StringIndexer(inputCol="marital_status", outputCol="marital_statusIndex").fit(new_df)\nnew_df = indexer.transform(new_df)\nindexer = StringIndexer(inputCol="injury_at_work", outputCol="injury_at_workIndex").fit(new_df)\nnew_df =indexer.transform(new_df)\nindexer = StringIndexer(inputCol="autopsy", outputCol="autopsyIndex").fit(new_df)\nnew_df =indexer.transform(new_df)\nindexer = StringIndexer(inputCol="method_of_disposition", outputCol="method_of_dispositionIndex").fit(new_df)\nnew_df =indexer.transform(new_df)\nindexer = StringIndexer(inputCol="activity_code", outputCol="activity_codeIndex").fit(new_df)\nnew_df = indexer.transform(new_df)\nindexer = StringIndexer(inputCol="place_of_injury_for_causes_w00_y34_except_y06_and_y07_", outputCol="place_of_injury_for_causes_w00_y34_except_y06_and_y07_Index"

In [78]:
indexer = OneHotEncoder(inputCol="resident_statusIndex", outputCol="resident_statusVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="education_2003_revisionIndex", outputCol="education_2003_revisionVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="sexIndex", outputCol="sexVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="age_recode_52Index", outputCol="age_recode_52Vec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="Place_of_death_and_decedents_statusIndex", outputCol="Place_of_death_and_decedents_statusVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="marital_statusIndex", outputCol="marital_statusVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="injury_at_workIndex", outputCol="injury_at_workVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="autopsyIndex", outputCol="autopsyVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="method_of_dispositionIndex", outputCol="method_of_dispositionVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="activity_codeIndex", outputCol="activity_codeVec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="place_of_injury_for_causes_w00_y34_except_y06_and_y07_Index", outputCol="place_of_injury_for_causes_w00_y34_except_y06_and_y07_Vec")
new_df = indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="raceIndex", outputCol="raceVec")
new_df =indexer.transform(new_df)
indexer = OneHotEncoder(inputCol="race_recode_3Index", outputCol="race_recode_3Vec")
new_df =indexer.transform(new_df)

In [105]:
new_df.columns

['resident_status',
 'education_2003_revision',
 'sex',
 'age_recode_52',
 'Place_of_death_and_decedents_status',
 'marital_status',
 'injury_at_work',
 'method_of_disposition',
 'autopsy',
 'activity_code',
 'place_of_injury_for_causes_w00_y34_except_y06_and_y07_',
 'race',
 'race_recode_3',
 'race_recode_5',
 'hispanic_originrace_recode',
 'suicide',
 'Place_of_death_and_decedents_statusIndex',
 'marital_statusIndex',
 'injury_at_workIndex',
 'autopsyIndex',
 'method_of_dispositionIndex',
 'activity_codeIndex',
 'place_of_injury_for_causes_w00_y34_except_y06_and_y07_Index',
 'raceIndex',
 'race_recode_3Index',
 'resident_statusIndex',
 'education_2003_revisionIndex',
 'sexIndex',
 'age_recode_52Index',
 'resident_statusVec',
 'education_2003_revisionVec',
 'sexVec',
 'age_recode_52Vec',
 'Place_of_death_and_decedents_statusVec',
 'marital_statusVec',
 'injury_at_workVec',
 'autopsyVec',
 'method_of_dispositionVec',
 'activity_codeVec',
 'place_of_injury_for_causes_w00_y34_except_y06_

In [29]:
new_df1 = new_df.drop('resident_status','education_2003_revision','month_of_death','sex','age_recode_52','Place_of_death_and_decedents_status',
                     'marital_status','day_of_week_of_death','current_data_year','injury_at_work','manner_of_death',
                     'autopsy','activity_code','activity_code','place_of_injury_for_causes_w00_y34_except_y06_and_y07_','358_cause_recode',
                     '113_cause_recode','130_infant_cause_recode','39_cause_recode','race','race_recode_3','race_recode_5','hispanic_originrace_recode')

In [118]:
new_df.printSchema()

root
 |-- resident_status: string (nullable = true)
 |-- education_2003_revision: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- age_recode_52: string (nullable = true)
 |-- Place_of_death_and_decedents_status: string (nullable = false)
 |-- marital_status: string (nullable = true)
 |-- injury_at_work: string (nullable = true)
 |-- method_of_disposition: string (nullable = true)
 |-- autopsy: string (nullable = true)
 |-- activity_code: string (nullable = false)
 |-- place_of_injury_for_causes_w00_y34_except_y06_and_y07_: string (nullable = false)
 |-- race: string (nullable = true)
 |-- race_recode_3: string (nullable = true)
 |-- race_recode_5: string (nullable = true)
 |-- hispanic_originrace_recode: string (nullable = true)
 |-- suicide: integer (nullable = false)
 |-- Place_of_death_and_decedents_statusIndex: double (nullable = true)
 |-- marital_statusIndex: double (nullable = true)
 |-- injury_at_workIndex: double (nullable = true)
 |-- autopsyIndex: double (nu

In [31]:
new_df1 = new_df1.drop('resident_statusIndex','education_2003_revisionIndex','month_of_deathIndex','sexIndex','age_recode_52Index',
                       'Place_of_death_and_decedents_statusIndex',
                     'marital_statusIndex','day_of_week_of_deathIndex','current_data_yearIndex','injury_at_workIndex',
                       'manner_of_deathIndex',
                     'autopsyIndex','activity_codeIndex','activity_codeIndex',
                       'place_of_injury_for_causes_w00_y34_except_y06_and_y07_Index','358_cause_recodeIndex',
                     '113_cause_recodeIndex','130_infant_cause_recodeIndex','39_cause_recodeIndex','raceIndex',
                       'race_recode_3Index','race_recode_5Index','hispanic_originrace_recodeIndex')

In [32]:
#cols = new_df1.columns
#ategorical_cols = cols.copy()
#categorical_cols.remove('number_of_entity_axis_conditions')
#categorical_cols.remove('number_of_record_axis_conditions')
#categorical_cols.remove('suicide')

In [80]:
#numeric_cols = ['number_of_entity_axis_conditions', 'number_of_record_axis_conditions']
assemblerInputs = ['resident_statusVec',
 'education_2003_revisionVec',
 'sexVec',
 'age_recode_52Vec',
 'Place_of_death_and_decedents_statusVec',
 'marital_statusVec',
 'injury_at_workVec',
 'autopsyVec',
 'method_of_dispositionVec',
 'activity_codeVec',
 'place_of_injury_for_causes_w00_y34_except_y06_and_y07_Vec',
 'raceVec',
 'race_recode_3Vec']

In [81]:
assemblerInputs

['resident_statusVec',
 'education_2003_revisionVec',
 'sexVec',
 'age_recode_52Vec',
 'Place_of_death_and_decedents_statusVec',
 'marital_statusVec',
 'injury_at_workVec',
 'autopsyVec',
 'method_of_dispositionVec',
 'activity_codeVec',
 'place_of_injury_for_causes_w00_y34_except_y06_and_y07_Vec',
 'raceVec',
 'race_recode_3Vec']

In [35]:
#unwanted = ['number_of_entity_axis_conditions','number_of_record_axis_conditions', '39_cause_recodeVec',
 #                     '130_infant_cause_recodeVec','113_cause_recodeVec',  'month_of_deathVec','current_data_yearVec']

#assemblerInputs = [e for e in assemblerInputs if e not in unwanted]

In [84]:
label_string_index = StringIndexer(inputCol="suicide", outputCol="label").fit(new_df)
new_df = label_string_index.transform(new_df)

In [85]:
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
new_df = assembler.transform(new_df)  

In [86]:
training, test = new_df.randomSplit([0.7, 0.3], seed = 11)

# Run training algorithm to build the model
#lrModel = LogisticRegression().fit(training)

In [92]:
#from pyspark.ml.regression import GeneralizedLinearRegression
#glr = GeneralizedLinearRegression(family="binomial", link="logit", maxIter=10, regParam=0.3)

# Fit the model
#model = glr.fit(training)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
'''#print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
#print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()'''

Coefficients: [-0.042131078906297574,0.029149959227714243,0.006130124572996133,0.0013326199289400683,0.1961748425496155,-0.09215205622578627,-0.16255611665755132,0.13478432156231868,-0.06875327157424822,0.016703961313749377,-0.04894281345983757,-0.10979358864372828,0.1754544814271855,0.14659387087042025,0.11393806820772051,-0.10644488394501556,-0.05071589950693075,0.0600249266372091,0.004603456425884648,0.14536016406552182,-0.12583079374186365,-0.11234130686226547,-0.0823225522161746,-0.06557095292968328,-0.053932430924862385,-0.028583351563239694,-0.0006969609780372501,0.018552655327639035,-0.09219899138718816,-0.005778384125953754,-0.01859108177578447,-0.0030741326094232925,0.03862758611460665,0.02813038140897255,-0.0018946624038258999,-0.01733395994053962,-0.008096321243014265,-0.0044181237366271225,0.013727174511025078,0.005036653274217063,0.0010424721516789277,-0.015763562660550495,-0.005226405641104987,0.0003709284888385528,0.0005376522422949167,0.00015430169267307363,0.000213793

'#print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))\n#print("T Values: " + str(summary.tValues))\nprint("P Values: " + str(summary.pValues))\nprint("Dispersion: " + str(summary.dispersion))\nprint("Null Deviance: " + str(summary.nullDeviance))\nprint("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))\nprint("Deviance: " + str(summary.deviance))\nprint("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))\nprint("AIC: " + str(summary.aic))\nprint("Deviance Residuals: ")\nsummary.residuals().show()'

In [120]:
meta = [f.metadata 
    for f in new_df.schema.fields 
    if f.name == 'features'][0]
features_name_ind = meta['ml_attr']['attrs']['binary']
features_name_ind

[{'idx': 0, 'name': 'resident_statusVec_1'},
 {'idx': 1, 'name': 'resident_statusVec_2'},
 {'idx': 2, 'name': 'resident_statusVec_3'},
 {'idx': 3, 'name': 'education_2003_revisionVec_3'},
 {'idx': 4, 'name': 'education_2003_revisionVec_2'},
 {'idx': 5, 'name': 'education_2003_revisionVec_4'},
 {'idx': 6, 'name': 'education_2003_revisionVec_6'},
 {'idx': 7, 'name': 'education_2003_revisionVec_1'},
 {'idx': 8, 'name': 'education_2003_revisionVec_5'},
 {'idx': 9, 'name': 'education_2003_revisionVec_9'},
 {'idx': 10, 'name': 'education_2003_revisionVec_7'},
 {'idx': 11, 'name': 'sexVec_M'},
 {'idx': 12, 'name': 'age_recode_52Vec_30'},
 {'idx': 13, 'name': 'age_recode_52Vec_31'},
 {'idx': 14, 'name': 'age_recode_52Vec_32'},
 {'idx': 15, 'name': 'age_recode_52Vec_36'},
 {'idx': 16, 'name': 'age_recode_52Vec_35'},
 {'idx': 17, 'name': 'age_recode_52Vec_33'},
 {'idx': 18, 'name': 'age_recode_52Vec_34'},
 {'idx': 19, 'name': 'age_recode_52Vec_29'},
 {'idx': 20, 'name': 'age_recode_52Vec_37'},
 

In [133]:
feature_names = []
for i in model.featureImportances.indices:
    feature_names.append(features_name_ind[i]['name'])

In [135]:
len(feature_names)

62

In [146]:
df_values = [('feature_names', feature_names), ('variable_imp', model.featureImportances.values)]
inference_df = pd.DataFrame.from_items(df_values)
inference_df.sort_values(by = 'variable_imp', ascending= False)

Unnamed: 0,feature_names,variable_imp
54,raceVec_02,0.209822
53,raceVec_01,0.152422
61,race_recode_3Vec_3,0.100893
45,place_of_injury_for_causes_w00_y34_except_y06_...,0.096556
60,race_recode_3Vec_1,0.073787
39,autopsyVec_N,0.072708
47,place_of_injury_for_causes_w00_y34_except_y06_...,0.065462
30,Place_of_death_and_decedents_statusVec_2,0.041618
38,autopsyVec_Y,0.038952
28,Place_of_death_and_decedents_statusVec_4,0.032348


In [97]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

# Train model.  This also runs the indexers.
model = rf.fit(training)

# Make predictions.
predictions = model.transform(test)


In [103]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#accuracy = evaluator.evaluate(predictions)
#print("Test Error = %g" % (1.0 - accuracy))

evaluator = BinaryClassificationEvaluator(
    labelCol="label", rawPredictionCol="prediction")

auc = evaluator.evaluate(predictions)
print("AUC= %g" % (auc))

#rfModel = model.stages[2]
#print()  # summary only


Exception ignored in: <object repr() failed>
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pyspark/ml/wrapper.py", line 105, in __del__
    SparkContext._active_spark_context._gateway.detach(self._java_obj)
AttributeError: 'MulticlassClassificationEvaluator' object has no attribute '_java_obj'


AUC= 0.793773


In [110]:
selected = predictions.select("label", "prediction", "probability")

In [130]:
model.featureImportances[4]

0.0038043378849072227

In [None]:
meta = [f.metadata 
    for f in preppedDataDF.schema.fields 
    if f.name == 'features'][0]
features_name_ind = meta['ml_attr']['attrs']['binary']
features_name_ind

In [111]:
tp = selected.where(selected["label"] == 1).where(selected["prediction"] == 1).count()
tn = selected.where(selected["label"] == 0).where(selected["prediction"] == 0).count()
fp = selected.where(selected["label"] == 0).where(selected["prediction"] == 1).count()
fn = selected.where(selected["label"] == 1).where(selected["prediction"] == 0).count()

In [112]:
accuracy = (tp + tn) * 100 / (tp + tn + fp + fn)
accuracy

82.18496735053026

In [113]:
precision = tp / (tp + fp)
precision

0.8780351313539562

In [114]:
recall = tp / (tp + fn)
recall

0.6484513121943379

In [115]:
f1_score = 2 * precision * recall / (precision + recall)
f1_score

0.7459784991679654

In [136]:
len(model.featureImportances.indices)

62

In [106]:
model.featureImportances

SparseVector(104, {0: 0.001, 1: 0.0005, 2: 0.0, 4: 0.0038, 5: 0.0001, 6: 0.0013, 7: 0.0008, 8: 0.0001, 9: 0.0, 10: 0.0, 11: 0.0034, 12: 0.0027, 13: 0.0006, 14: 0.0002, 15: 0.0001, 16: 0.0001, 18: 0.0, 19: 0.0073, 20: 0.0026, 21: 0.0042, 22: 0.0008, 23: 0.0005, 24: 0.0004, 25: 0.0005, 26: 0.0, 27: 0.0, 29: 0.0001, 30: 0.0, 56: 0.0323, 57: 0.0007, 58: 0.0416, 59: 0.0001, 62: 0.0178, 63: 0.0124, 64: 0.0071, 65: 0.0004, 66: 0.0, 67: 0.0002, 68: 0.039, 69: 0.0727, 70: 0.0143, 71: 0.0006, 72: 0.0047, 73: 0.0143, 77: 0.0, 79: 0.0966, 80: 0.0, 81: 0.0655, 82: 0.0006, 83: 0.0, 84: 0.0104, 86: 0.0, 88: 0.0, 89: 0.1524, 90: 0.2098, 91: 0.0003, 92: 0.0, 94: 0.0, 97: 0.0, 99: 0.0, 102: 0.0738, 103: 0.1009})