In [63]:
import pandas as pd
import numpy as np
from collections import defaultdict

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# The author treats the warnings this notebook would raise as false positives; note
# that this setting is global, so genuine chained-assignment mistakes are hidden too.
pd.options.mode.chained_assignment = None

# run models notebook -- similar to import
%run models.ipynb

Step 1: Read in the data from file. My data was in CSV format, and I used pandas to hold it in a DataFrame. I also shuffled the samples and extracted the names of the targets (drug names) that I will use in training.

In [86]:
# Load the raw accidental-drug-death records from the CSV export.
dataframe = pd.read_csv("Accidental_Drug_Related_Deaths__2012-2017.csv")

# Shorten the verbose column name so it reads like the other drug columns.
# (Reassigning instead of inplace=True keeps the cell free of hidden mutation.)
dataframe = dataframe.rename(columns={'Morphine (not heroin)': 'Morphine'})

# The training targets are selected by position: columns 15 through 26 (12 columns).
# NOTE(review): positional slicing silently picks the wrong columns if the CSV
# layout changes -- confirm against the file header when the data is refreshed.
drug_names = dataframe.columns[15:27]

# Shuffle the rows. A fixed seed makes the shuffle -- and therefore the later
# train/test split -- reproducible under Restart Kernel -> Run All.
SHUFFLE_SEED = 42
dataframe = dataframe.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

dataframe.head()

Unnamed: 0,CaseNumber,Date,Sex,Race,Age,Residence City,Residence State,Residence County,Death City,Death State,...,Benzodiazepine,Methadone,Amphet,Tramad,Morphine (not heroin),Other,Any Opioid,MannerofDeath,AmendedMannerofDeath,DeathLoc
0,17-465,06/05/2017,Male,White,34.0,NORTH GROSVENORDALE,CT,WINDHAM,NORTH GROSVENORDALE,CTCCTTCT,...,,,,,,,,Accident,Acute Intoxication From the Combined Effects o...,"North Grosvenordale, CT\n(41.985644, -71.899438)"
1,15-18455,11/18/2015,Male,"Hispanic, White",49.0,HARTFORD,CT,HARTFORD,HARTFORD,CT,...,,,,,,,Y,Accident,,"HARTFORD, CT\n(41.765775, -72.673356)"
2,16-2504,06/28/2016,Male,White,44.0,BRISTOL,CT,HARTFORD,BRISTOL,CT,...,,,,,,,,Accident,,"BRISTOL, CT\n(41.673037, -72.945791)"
3,17-296,04/10/2017,Male,White,59.0,WATERBURY,CT,NEW HAVEN,WATERBURY,CTCCTTCT,...,Y,,,Y,,,,Accident,Acute Intoxication due to the Combined Effects...,"Waterbury, CT\n(41.554261, -73.043069)"
4,16-3012,09/17/2016,Male,White,28.0,OXFORD,,NEW HAVEN,WATERBURY,,...,Y,,,,,,,Accident,,"WATERBURY, CT\n(41.554261, -73.043069)"


Step 2: Clean up the dataset. This could involve removing NaNs, dropping records with missing information, dropping columns you know for sure you don't need, etc.

In [87]:
# sanitize data -- drop records that have NA in any of the feature columns
dataframe = dataframe.dropna(subset=["Sex", "Race", "Age"])
# drop problematic columns; errors="ignore" keeps this cell re-runnable once the
# columns are already gone (the in-place drop raised KeyError on a second run)
dataframe = dataframe.drop(columns=["Other", "Any Opioid"], errors="ignore")
# replace NA's in the drug columns with N
dataframe[drug_names] = dataframe[drug_names].fillna(value="N")
# replace y's with Y's (this matches any cell in the frame that equals "y",
# not only the drug columns)
dataframe = dataframe.replace(to_replace="y", value="Y")
# drop records whose drug columns still contain anything other than Y or N
dataframe = dataframe[dataframe[drug_names].isin(["Y", "N"]).all(axis=1)]

dataframe.head()

Unnamed: 0,CaseNumber,Date,Sex,Race,Age,Residence City,Residence State,Residence County,Death City,Death State,...,EtOH,Hydrocodone,Benzodiazepine,Methadone,Amphet,Tramad,Morphine (not heroin),MannerofDeath,AmendedMannerofDeath,DeathLoc
0,17-465,06/05/2017,Male,White,34.0,NORTH GROSVENORDALE,CT,WINDHAM,NORTH GROSVENORDALE,CTCCTTCT,...,N,N,N,N,N,N,N,Accident,Acute Intoxication From the Combined Effects o...,"North Grosvenordale, CT\n(41.985644, -71.899438)"
1,15-18455,11/18/2015,Male,"Hispanic, White",49.0,HARTFORD,CT,HARTFORD,HARTFORD,CT,...,N,N,N,N,N,N,N,Accident,,"HARTFORD, CT\n(41.765775, -72.673356)"
2,16-2504,06/28/2016,Male,White,44.0,BRISTOL,CT,HARTFORD,BRISTOL,CT,...,N,N,N,N,N,N,N,Accident,,"BRISTOL, CT\n(41.673037, -72.945791)"
3,17-296,04/10/2017,Male,White,59.0,WATERBURY,CT,NEW HAVEN,WATERBURY,CTCCTTCT,...,Y,N,Y,N,N,Y,N,Accident,Acute Intoxication due to the Combined Effects...,"Waterbury, CT\n(41.554261, -73.043069)"
4,16-3012,09/17/2016,Male,White,28.0,OXFORD,,NEW HAVEN,WATERBURY,,...,N,N,Y,N,N,N,N,Accident,,"WATERBURY, CT\n(41.554261, -73.043069)"


Step 3: Select the features from the dataset to train/test on. For some of my categorical data, a single sample could belong to several categories of one feature (for example, a Race value of "Hispanic, White"). I therefore took the extra step of splitting each sample's value for such a feature into a list of categories.

In [88]:
# pull out the three demographic columns that serve as model inputs
feature_columns = ["Sex", "Race", "Age"]
features = select_features(feature_columns, dataframe)

# categorical handling: a sample may list several races separated by ", ",
# so each sample's race attribute is converted into its own list
race_column = features["Race"]
features["Race"] = process_categorical_data(race_column, delimiter=", ")

# categorical handling: each sample's gender attribute is likewise converted
# into its own list (using the helper's default delimiter)
sex_column = features["Sex"]
features["Sex"] = process_categorical_data(sex_column)

Step 4: Convert the data to numerical form. Since I had categorical data, I encoded it using one-hot encoding. I also saved the encoding scheme (the vocabularies) so that I can convert my results back to their respective categories.

In [89]:
# encode the categorical features; each call also returns the vocabulary
# that was used, which is kept for decoding results later
encoded_race, race_vocab = encode_feature(features["Race"])
encoded_gender, gender_vocab = encode_feature(features["Sex"])

# normalize age (the helper takes and returns a name -> column mapping)
normalized = normalize_features({"Age": features["Age"]})
features["Age"] = normalized['Age']

# concatenate the feature tensors into one dataset: gender, then age, then race
feature_blocks = [encoded_gender, features["Age"], encoded_race]
data = integrate_features(feature_blocks)
data[:5]

array([[1.        , 0.        , 0.2739726 , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.47945204, 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.41095892, 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.6164383 , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [1.        , 0.        , 0.19178082, 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]], dtype=float32)

Step 5: Select targets from data to train/test on. Here I use the drug names I extracted earlier.

In [92]:
# the targets are the drug columns whose names were captured when the data was loaded
labels = select_labels(drug_names, dataframe)
# preview the first five rows of the label frame
labels.head(5)

Unnamed: 0,Heroin,Cocaine,Fentanyl,Oxycodone,Oxymorphone,EtOH,Hydrocodone,Benzodiazepine,Methadone,Amphet,Tramad,Morphine (not heroin)
0,Y,N,Y,N,N,N,N,N,N,N,N,N
1,Y,Y,Y,N,N,N,N,N,N,N,N,N
2,Y,Y,Y,N,N,N,N,N,N,N,N,N
3,Y,N,N,N,N,Y,N,Y,N,N,Y,N
4,N,N,Y,N,N,N,N,Y,N,N,N,N


Step 6: Convert the target data to numerical form. Since a person could die from multiple drugs, I used multi-hot encoding.

In [93]:
# 'Y' maps to 1 and 'N' to 0; because this is a defaultdict(int), looking up
# any other key by subscript yields 0 as well
flag_to_bit = {'Y': 1, 'N': 0}
binary_values = defaultdict(int, flag_to_bit)

# build the multi-hot target matrix from the Y/N label columns
death = multihot_binarycolumns(labels, binary_values=binary_values)
death

array([[1., 0., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Step 7: Split data into train and test sets.

In [94]:
# split data into train and test: the first 80% of the rows are used for
# training and the remaining rows are held out for testing
training_num = int(0.8 * data.shape[0])
training_input, test_input = data[:training_num], data[training_num:]
training_output, test_output = death[:training_num], death[training_num:]

# width of the input and output layers, taken from the encoded arrays
num_features = data.shape[1]
num_labels = death.shape[1]

# show how many samples landed in each partition
training_input.shape[0], test_input.shape[0]

(3216, 805)