Step 1: Read in data from file. My data was in csv format, and I used pandas to house the data in a dataframe. I also shuffle the samples in the data and extract the names of the targets (drug names) I will use in training.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python.data import Dataset
from collections import defaultdict
from functools import reduce
from matplotlib import pyplot as plt

# load the raw records into a pandas dataframe
dataframe = pd.read_csv("Accidental_Drug_Related_Deaths__2012-2017.csv")

# the target (drug) columns are picked by position: columns 15 through 26
drug_names = dataframe.columns[15:27]

# shuffle data: sample every row in random order, then renumber the index 0..n-1
# (no random_state is given, so the order differs from run to run)
dataframe = dataframe.sample(frac=1).reset_index(drop=True)

# preview the first rows
dataframe.head()

Unnamed: 0,CaseNumber,Date,Sex,Race,Age,Residence City,Residence State,Residence County,Death City,Death State,...,Benzodiazepine,Methadone,Amphet,Tramad,Morphine (not heroin),Other,Any Opioid,MannerofDeath,AmendedMannerofDeath,DeathLoc
0,15-10607,06/28/2015,Female,White,55.0,SAUGUS,MA,ESSEX,MASHANTUCKET,CT,...,,,,,,,,Accident,,"MASHANTUCKET, CT\n(41.471499, -71.956392)"
1,14-12929,08/27/2014,Male,White,26.0,DURHAM,,,MIDDLETOWN,,...,,,,,,,,Accident,,"MIDDLETOWN, CT\n(41.544654, -72.651713)"
2,17-913,11/21/2017,Female,White,26.0,Somers,CT,Tolland,Somers,CT,...,,,,,,,,Accident,,"Somers, CT\n(41.985524, -72.447503)"
3,17-91,02/02/2017,Male,White,64.0,MILFORD,CT,NEW HAVEN,MILFORD,CTCCTTCT,...,,,,,,,,Accident,Acute Oxycodone Intoxication,"Milford, CT\n(41.224276, -73.057564)"
4,13-5240,04/02/2013,Female,White,45.0,CHESHIRE,,NEW HAVEN,CHESHIRE,,...,,,,,,,,Accident,,"CHESHIRE, CT\n(41.498834, -72.901448)"


Step 2: Select features from dataset to train/test on. For some of my categorical data, a sample could belong to multiple categories for a feature. Thus, I did the extra step of separating each category for a sample's feature into a list of categories.

In [2]:
def select_features(features, dataframe):
    """
    Extracts selected features from data
    :param features: list of column names 
    :param dataframe: the dataframe to select features from
    :return: map of selected features to numpy arrays
    :raises KeyError: if a requested column is not in the dataframe
    """

    feature_frame = {}

    # add features to feature_frame
    for feature in features:
        try:
            column = dataframe[feature]
        except KeyError:
            # only a missing column is reported this way; any other
            # failure propagates untouched instead of being mislabelled
            print("feature not in dataframe")
            raise

        # convert values to numpy array
        feature_frame[feature] = np.array(column)

    return feature_frame

# sanitize data -- drop records with NA in feature columns.
# Dropping rows leaves gaps in the index, so it is renumbered 0..n-1 here:
# that keeps each row's index label equal to its position, which is what any
# positional (numpy) array derived from this dataframe is aligned to.
dataframe = dataframe.dropna(subset=["Sex","Race","Age"]).reset_index(drop=True)

features = select_features(["Sex","Race","Age"],dataframe)

def process_categorical_data(feature,delimiter=None):
    """
    convert categorical data to usable format
    :param feature: iterable of raw values (e.g. a pandas series or numpy array)
    :param delimiter: to separate multiple values for a sample's feature;
                      None splits on runs of whitespace (str.split default)
    :return: list with one list of categories per sample
    """

    values = []
    for value in feature:
        try:
            values.append(value.split(delimiter))
        except AttributeError:
            # non-string entries (e.g. numbers) have no split(): keep the
            # value as the sample's single category. Other errors, such as
            # an invalid delimiter, are real bugs and are allowed to surface.
            values.append([value])

    return values

# special treatment for categorical variables -- a sample may list several races
# separated by ", "; turn each sample's value into a list of its race categories
features["Race"] = process_categorical_data(features["Race"],delimiter=", ")

# same treatment for gender; no delimiter is passed, so each value is split on
# whitespace and a one-word value becomes a one-item list
features["Sex"] = process_categorical_data(features["Sex"])

Step 3: Convert data to numerical form. Since I had categorical data, I encoded them using one-hot encoding. I also saved the encoding scheme for converting my results back to their respective categories

In [3]:
def encode_feature(feature):
    """
    Multihot-encode categorical data
    :param feature: list of samples, each sample being a list of categories
    :return: float32 numpy array with one row per sample and one column per
             distinct category, and the vocab mapping category -> column
    """

    num_samples = len(feature)

    # first pass: assign each new category the next free column and remember
    # which columns every sample switches on
    vocab = {}
    columns_per_sample = []
    for sample in feature:
        columns = []
        for category in sample:
            if category not in vocab:
                vocab[category] = len(vocab)
            columns.append(vocab[category])
        columns_per_sample.append(columns)

    # second pass: fill the 1's into a zero matrix
    num_data = np.zeros((num_samples, len(vocab)), dtype=np.float32)
    for row, columns in enumerate(columns_per_sample):
        for column in columns:
            num_data[row, column] = 1

    return num_data, vocab

# multihot-encode race and gender, keeping each vocab to decode results later
encoded_race, race_vocab = encode_feature(features["Race"])
encoded_gender, gender_vocab = encode_feature(features["Sex"])

# lay the feature blocks side by side (gender columns, age column, race columns)
# and store the combined matrix as float32
data = np.column_stack((encoded_gender, features["Age"], encoded_race)).astype(np.float32)

Step 4: Select targets from data to train/test on. Here I use the drug names I extracted earlier.

In [4]:
def select_labels(targets,dataframe):
    """
    Extracts selected labels from data
    :param targets: list of target column names
    :param dataframe: the dataframe to select targets from
    :return: dataframe of target data
    :raises KeyError: if a requested target column is not in the dataframe
    """

    target_frame = pd.DataFrame()

    # add targets to target_frame
    for target in targets:
        try:
            column = dataframe[target]
        except KeyError:
            # only a missing column is reported this way; any other
            # failure propagates untouched instead of being mislabelled
            print("feature not in dataframe")
            raise
        target_frame[target] = column

    return target_frame

# pull the target columns (the drug names extracted in step 1) out of the dataframe
labels = select_labels(drug_names,dataframe)
# preview the first rows
labels.head()

Unnamed: 0,Heroin,Cocaine,Fentanyl,Oxycodone,Oxymorphone,EtOH,Hydrocodone,Benzodiazepine,Methadone,Amphet,Tramad,Morphine (not heroin)
0,,,,,,,,,,,,
1,Y,,,,,,,,,,,
2,,,Y,,,,,,,,,
3,,,,Y,,,,,,,,
4,Y,Y,,,,,,,,,,


Step 5: Convert target data to numerical form. Since people could die from multiple drugs, I used multihot encoding.

In [5]:
def multihot_binarycolumns(labels,new_column_name,binary_values):
    """
    Multihot encoding of multiple columns with binary values
    :param labels: dataframe to be encoded
    :param new_column_name: name of encoded column (accepted for
                            compatibility; not used by the encoding)
    :param binary_values: mapping from the dataset's binary values to a 1 or 0;
                          it is indexed with [], so it must either contain
                          every value that occurs or supply a default
                          (e.g. a defaultdict)
    :return: float32 numpy array with the same shape as labels
    """

    # create encoded array
    data = np.zeros(labels.shape, dtype=np.float32)

    # Fill row by row using the row's POSITION. The dataframe's index labels
    # are deliberately ignored: after rows have been dropped they no longer
    # match positions, and using them as array indices misplaces rows or
    # runs past the end of the array.
    for position, row in enumerate(labels.itertuples(index=False, name=None)):
        data[position] = [binary_values[value] for value in row]

    return data

# convert 'Y' to 1 and otherwise to 0
# (defaultdict(int) returns 0 for any key that is not listed, so every value
# other than 'Y' maps to 0)
binary_values = defaultdict(int,{
    'Y': 1,
    np.nan:0
})

# multihot-encode the target columns into a float array, one row per sample
death = multihot_binarycolumns(labels,"death", binary_values=binary_values)
# display the encoded targets
death

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Step 6: Construct pipeline to feed data into model. I used a batch system where I created an iterator to return the next batch of data as needed by the model.

In [6]:
def create_batches(features,targets,batch_size = None,num_epochs = None):
    """
    Build an initializable iterator over shuffled (features, targets) batches
    :param features: tensor of features
    :param targets: tensor of targets
    :param batch_size: desired size of batches
    :param num_epochs: number of epochs (accepted but not used by this pipeline)
    :return: batch iterator
    """
    # NOTE: an earlier variant of the pipeline, kept for reference only:
    #   Dataset.from_tensor_slices((features,targets)).shuffle(32,reshuffle_each_iteration=True).repeat(count=num_epochs)

    # slice the tensors into samples, shuffle with a 1000-element buffer,
    # then group the samples into batches
    pipeline = Dataset.from_tensor_slices((features,targets))
    pipeline = pipeline.shuffle(1000)
    pipeline = pipeline.batch(batch_size)

    # the caller runs the iterator's initializer before drawing batches
    return pipeline.make_initializable_iterator()

Step 7: Create the model. In tensorflow, a Graph contains the components of the model (the optimizer, the weights, the loss function, etc.) in the form of "Tensors" and "Operations". Later on, these objects will be activated in a separate step for training.

In [7]:
def create_linear_classifier(training_data, test_data, optimizer_name, learning_rate = 0.1, batch_size=None):
    """
    creates a linear model: one weight matrix plus biases, with an independent
    sigmoid output per label
    :param training_data: numpy array of numerical features, one row per sample
    :param test_data: numpy array of numerical targets, one row per sample
                      (despite its name this holds the labels, not held-out data)
    :param optimizer_name: name of optimizer
    :param learning_rate: optimizer's learning rate
    :param batch_size: size of batches
    :return: tensorflow graph of net and dict of relevant graph variables with
             keys 'loss', 'back prop', 'accuracy' and 'iterator'
    """

    num_features = training_data.shape[1]
    num_labels = test_data.shape[1]

    # construct graph
    graph = tf.Graph()

    # construct map of graph vars to run in session -- loss, optimizer, accuracy, iterator
    graph_vars = {}

    # invoke tensorflow dataflow context
    with graph.as_default():

        # the first 70% of the rows are used for training; the remaining 30%
        # is not wired into this graph
        train_num = int(training_data.shape[0]*.7)
        x_train = tf.constant(training_data[:train_num,:])
        y_train = tf.constant(test_data[:train_num,:])

        # create iterator
        iterator = create_batches(x_train,y_train,batch_size=batch_size)
        batch_x_train, batch_y_train = iterator.get_next()

        # set up weights, biases, logits
        weights = tf.Variable(tf.truncated_normal([num_features,num_labels]))
        biases = tf.Variable(tf.zeros([num_labels]))
        logits = tf.add(tf.matmul(batch_x_train,weights), biases)

        # apply sigmoid to logits and get prediction
        train_prediction = tf.sigmoid(logits)

        # Binary cross entropy summed over the batch and over the labels.
        # Each label is an independent yes/no output, so the loss needs both
        # the y*log(p) and the (1-y)*log(1-p) term; with only the first term
        # nothing penalises predicting every label as present. Computing it
        # from the logits is numerically stable, so no clipping is required.
        loss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=batch_y_train, logits=logits),
            name="loss")
        graph_vars['loss']=loss

        # optimizer + backpropagation
        optimizer = select_optimizer(optimizer_name,learning_rate)
        back_propagation = optimizer.minimize(loss)
        graph_vars['back prop']=back_propagation

        # accuracy = fraction of individual label decisions that are correct
        # after thresholding at 0.5 (the same definition create_vanilla_nn
        # uses); an argmax comparison is not meaningful for multi-hot targets
        equality = tf.equal(tf.round(train_prediction),batch_y_train)
        accuracy = tf.reduce_mean(tf.cast(equality,tf.float32),name="accuracy")
        graph_vars['accuracy']=accuracy

        graph_vars['iterator'] = iterator

    return graph, graph_vars

In [8]:
def create_vanilla_nn(training_data, test_data, optimizer_name, learning_rate = 0.1, batch_size=None, layers = None):
    """
    creates a vanilla (aka fully connected feed forward) neural network
    :param training_data: numpy array of numerical features, one row per sample
    :param test_data: numpy array of numerical targets, one row per sample
                      (despite its name this holds the labels, not held-out data)
    :param optimizer_name: name of optimizer
    :param learning_rate: optimizer's learning rate
    :param batch_size: size of batches
    :param layers: list of tuples for each layer (num hidden nodes, name of activation func). None = linear classifier 
    :return: tensorflow graph of net and dict of relevant graph variables with
             keys 'loss', 'back prop', 'accuracy' and 'iterator'
    """

    # no hidden layers requested: build the linear classifier and hand its
    # (graph, graph_vars) result back to the caller
    if layers is None:
        return create_linear_classifier(training_data,test_data,optimizer_name,learning_rate,batch_size)

    num_features = training_data.shape[1]
    num_labels = test_data.shape[1]

    # construct graph
    graph = tf.Graph()

    # construct map of graph vars to run in session -- loss, optimizer, accuracy, iterator
    graph_vars = {}

    # invoke tensorflow dataflow context
    with graph.as_default():

        # the first 70% of the rows are used for training; the remaining 30%
        # is not wired into this graph
        train_num = int(training_data.shape[0]*.7)
        x_train = tf.constant(training_data[:train_num,:])
        y_train = tf.constant(test_data[:train_num,:])

        # set up batching
        iterator = create_batches(x_train,y_train,batch_size=batch_size)
        batch_x_train, batch_y_train = iterator.get_next()

        # list of (weight tensor, activation func) tuples, one per hidden layer
        hidden_layers = []

        # number of rows the next weight matrix needs (= width of previous layer)
        prev_rows = num_features

        # create hidden layers
        for layer in layers:

            # extract number of nodes in hidden layer and activation func
            num_nodes = layer[0]
            activation_func = select_activation_func(layer[1])

            # create layer
            hidden = tf.Variable(tf.truncated_normal([prev_rows,num_nodes]))
            hidden_layers.append((hidden,activation_func))

            # update shape for next layer
            prev_rows = num_nodes

        # input to final layer
        input_final = forward_pass(batch_x_train,hidden_layers)

        # final layer
        final = tf.Variable(tf.truncated_normal([prev_rows,num_labels]))

        # set up biases and logits
        biases = tf.Variable(tf.zeros([num_labels]))
        logits = tf.add(tf.matmul(input_final,final), biases)

        # apply sigmoid to logits and get prediction
        train_prediction = tf.sigmoid(logits)

        # Binary cross entropy summed over the batch and over the labels.
        # Each label is an independent yes/no output, so the loss needs both
        # the y*log(p) and the (1-y)*log(1-p) term; with only the first term
        # nothing penalises predicting every label as present. Computing it
        # from the logits is numerically stable, so no clipping is required.
        loss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=batch_y_train, logits=logits),
            name="loss")
        graph_vars['loss'] = loss

        # optimizer + backpropagation
        optimizer = select_optimizer(optimizer_name,learning_rate)
        back_propagation = optimizer.minimize(loss)
        graph_vars['back prop'] = back_propagation

        # accuracy = fraction of individual label decisions that are correct
        # after thresholding the sigmoid output at 0.5
        equality = tf.equal(tf.round(train_prediction),batch_y_train)
        accuracy = tf.reduce_mean(tf.cast(equality,tf.float32),name="accuracy")
        graph_vars['accuracy'] = accuracy

        graph_vars['iterator'] = iterator

    return graph, graph_vars

In [9]:
def forward_pass(input, layers):
    """
    Push the input through every hidden layer in order
    :param input: input data to the first hidden layer
    :param layers: hidden layers, each indexable as (weights, activation func)
    :return: a tensor -- the output of the last hidden layer, or the input
             itself when there are no layers
    """

    activations = input
    for layer in layers:
        weights, activation_func = layer[0], layer[1]
        # matrix multiply by the layer's weights, then apply its activation
        pre_activation = tf.matmul(activations, weights)
        activations = activation_func(pre_activation)
    return activations

In [10]:
def select_optimizer(name,learning_rate = 0.1):
    """
    Select user specified optimizer
    :param name: name of optimizer (case-insensitive): "gd", "adam",
                 "adagrad" or "adadelta"
    :param learning_rate: optimizer's learning rate
    :return: optimizer object
    :raises ValueError: if name is not one of the supported optimizers
    """

    name = name.lower()

    # supported names mapped to the tf.train class that implements them
    optimizer_classes = {
        "gd" : "GradientDescentOptimizer",
        "adam" : "AdamOptimizer",
        "adagrad" : "AdagradOptimizer",
        "adadelta": "AdadeltaOptimizer",
    }

    # explicit check instead of assert, which is skipped when Python runs with -O
    if name not in optimizer_classes:
        raise ValueError(
            "unknown optimizer {!r}; expected one of {}".format(name, sorted(optimizer_classes)))

    # only the requested optimizer is instantiated
    optimizer_class = getattr(tf.train, optimizer_classes[name])
    return optimizer_class(learning_rate=learning_rate)

def select_activation_func(name):
    """
    Selects user specified activation func
    :param name: name of activation func (case-insensitive): "sigmoid",
                 "relu" or "tanh"
    :return: activation func
    :raises ValueError: if name is not one of the supported activation funcs
    """

    name = name.lower()

    # explicit check instead of assert, which is skipped when Python runs with -O
    activations = ("sigmoid", "relu", "tanh")
    if name not in activations:
        raise ValueError(
            "unknown activation func {!r}; expected one of {}".format(name, list(activations)))

    return {
        "sigmoid" : tf.sigmoid,
        "relu" : tf.nn.relu,
        "tanh" : tf.tanh
    }[name]

Step 8: Train the model. In low level tensorflow, training the model (which is a contained in a Graph) occurs in a Session. Here I also plot the accuracy and error of the model.

In [1]:
def train_model(model,model_vars, epochs = 1):
    """
    train model and plot accuracy
    :param model: graph object containing net
    :param model_vars: map of tensors/ops to run in session; the keys 'loss',
                       'back prop', 'accuracy' and 'iterator' are read
    :param epochs: number of epochs to train
    :return: None -- the per-epoch average accuracy and error are plotted
    """

    # lists of average accuracy and error for each epoch
    accuracy_points = []
    error_points = []

    # start session (the with-block closes it on exit)
    with tf.Session(graph=model) as session:
        tf.global_variables_initializer().run()

        # op that (re)starts the batch iterator at the beginning of the data
        init_iterator = model_vars['iterator'].initializer

        # session variables
        sess_vars = [model_vars['loss'],model_vars['back prop'],model_vars['accuracy']]
        for epoch in range(epochs):
            session.run(init_iterator)
            epoch_acc = 0.0
            epoch_err = 0.0
            denom = 0

            # training-- get batch until no batches left
            while True:
                try:
                    l, _, acc = session.run(sess_vars)
                except tf.errors.OutOfRangeError:
                    # the iterator is exhausted: this epoch is finished.
                    # Any other error is a real failure and is not swallowed.
                    break
                epoch_acc += acc
                epoch_err += l
                denom += 1

            if denom:
                accuracy_points.append(epoch_acc/denom)
                error_points.append(epoch_err/denom)
            else:
                # the epoch produced no batches: record NaN rather than divide by zero
                accuracy_points.append(float("nan"))
                error_points.append(float("nan"))

    x_values = np.arange(1,epochs+1)

    # plot average accuracy (left) and average error (right) per epoch
    plt.subplot(121)
    plt.plot(x_values, accuracy_points, 'b-')
    plt.title("Accuracy")
    plt.ylim(0.0,1.0)
    plt.grid(True)

    plt.subplot(122)
    plt.plot(x_values, error_points, 'r-')
    plt.title('Error')
    plt.grid(True)

    plt.subplots_adjust(top=0.92, bottom=0.08, left=0.05, right=0.95, hspace=0.25,
                wspace=0.5)

    plt.show()