In [152]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *


### Sampling on patients

In [2]:
path = "../../data/"
patients = pd.read_csv(f"{path}PATIENTS.csv")

In [4]:
#ensuring every patient is unique
print(f"{len(patients.SUBJECT_ID.unique())} unique patients in {len(patients)} rows")
#sampling random patients
patients_sample = patients.sample(n = 1000, random_state= 1)

46520 unique patients in 46520 rows


In [5]:
# note_events = pd.read_csv("../data/NOTEEVENTS.csv")
# #sample and save for these 1000 patients
# note_events_sample = pd.merge(patients_sample.SUBJECT_ID,note_events)
# note_events_sample.to_csv("../data/NOTEEVENTS_SAMPLE.csv")

In [6]:
### Incomplete: Using AWS glue, for now I will do stuff locally
# import boto3
# Creating the low level functional client
# client = boto3.client(
#     'glue',
#     aws_access_key_id = '',
#     aws_secret_access_key = '',
#     region_name = 'us-east-1'
# )
# clientResponse = client.get_table(DatabaseName="mimiciii",Name="admissions")

In [7]:
def convert_icd9(icd9_object):
    """
    :param icd9_object: ICD-9 code (Pandas/Numpy object).
    :return: extracted main digits of ICD-9 code
    """
    icd9_str = str(icd9_object)

    if icd9_str[0] == 'E': #if code starts with E
        converted = icd9_str[:4]
    else: #if they start with V or numeric
        converted = icd9_str[:3]

    return converted

def build_codemap(dataset):
    """
    :return: Dict of code map {main-digits of ICD9: unique feature ID}
    """
    # TODO: We build a code map using ONLY train data. Think about how to construct validation/test sets using this.
    df_digits = dataset['FEATURE'].unique()
    codemap = {}
    for i in range(0,len(df_digits)):
        codemap[df_digits[i]] = i


    return codemap

In [8]:
#read and etl on diagnoses
diagnoses = pd.read_csv(f"{path}DIAGNOSES_ICD.csv.gz")
# sample for the patients
diagnoses = pd.merge(patients_sample.SUBJECT_ID,diagnoses)
diagnoses = diagnoses[['SUBJECT_ID','ICD9_CODE']]
diagnoses['VALUE'] = 1
diagnoses["ICD9_CODE"] = diagnoses["ICD9_CODE"].apply(convert_icd9)
diagnoses.drop_duplicates(inplace = True)

In [9]:
diagnoses

Unnamed: 0,SUBJECT_ID,ICD9_CODE,VALUE
0,4074,038,1
1,4074,785,1
2,4074,578,1
3,4074,427,1
4,4074,428,1
...,...,...,...
13691,27594,300,1
13692,27594,287,1
13693,27594,372,1
13694,27594,787,1


In [10]:
admissions = pd.read_csv(f"{path}ADMISSIONS.csv.gz")
admissions = pd.merge(patients_sample.SUBJECT_ID,admissions)

#read and etl on lab_results
lab_results = pd.read_csv(f"{path}LABEVENTS_SAMPLE.csv")
# lab_results = pd.read_csv(f"{path}LABEVENTS.csv.gz")
lab_results = pd.merge(patients_sample.SUBJECT_ID,lab_results)
# lab_results.to_csv("LABEVENTS_SAMPLE.csv")

In [11]:
#roughly 20% of these items have null in the HADM ID
lab_results = lab_results[['SUBJECT_ID','ITEMID','VALUE']]
#making sure lab_results has different item_id than diagnostics. appending a code 200 to the itemIDs
lab_results['ITEMID'] = lab_results['ITEMID']+20000000

In [12]:
#take average value for lab_results
lab_results = lab_results.dropna()
#keep only numeric values, removing things like 'not done'
def is_a_number(x):
    if x in [' ','.']:
        return False
    for char in x.strip():
        if (not char in ['.',' ','1','2','3','4','5','6','7','8','9','0']):
            return False
    return True
lab_results = lab_results.loc[lab_results['VALUE'].apply(is_a_number),:]
lab_results['VALUE'] = pd.to_numeric(lab_results['VALUE'])

#keep average of values (can use more sophisticated methods later on)
lab_results = lab_results.groupby(['SUBJECT_ID','ITEMID']).mean().reset_index()
lab_results

Unnamed: 0,SUBJECT_ID,ITEMID,VALUE
0,138,20050802,1.947368
1,138,20050804,27.076923
2,138,20050806,104.666667
3,138,20050808,1.134286
4,138,20050809,136.187500
...,...,...,...
54497,99928,20051301,7.633333
54498,99928,20051478,1000.000000
54499,99928,20051484,150.000000
54500,99928,20051491,5.000000


In [160]:
#merging diagnosis and lab_results
lab_results.columns = ['SUBJECT_ID','FEATURE','VALUE']
diagnoses.columns = ['SUBJECT_ID','FEATURE','VALUE']
features = pd.concat([lab_results,diagnoses])


In [161]:
#get rid of missing values
features = features.dropna()

#apply codemap to features
codemap = build_codemap(features)
features['FEATURE'] = features['FEATURE'].map(codemap)
features['VALUE'] = features.VALUE.round(2)
display(features[0:5])

Unnamed: 0,SUBJECT_ID,FEATURE,VALUE
0,138,0,1.95
1,138,1,27.08
2,138,2,104.67
3,138,3,1.13
4,138,4,136.19


In [110]:
#create mortality table
mortality = patients_sample
mortality['DEAD'] = 1- mortality['DOD_SSN'].isna()
mortality.index = mortality.SUBJECT_ID
mortality = mortality['DEAD']

display(mortality[0:5])
mortality = mortality.to_dict()

SUBJECT_ID
4074     1
90889    0
72753    0
64908    0
70273    0
Name: DEAD, dtype: int64

### Saving the data prior to model training
The data will be saved in an SVM_light format for reproducibility

In [157]:
#turn patient_features into an svm_light format
features['F2V'] = list(zip(features.FEATURE,features.VALUE))
features_lists = features.groupby(['SUBJECT_ID'])['F2V'].apply(list)
# features_lists = pd.DataFrame(features_lists).reset_index()
features_lists = features_lists.to_dict()


In [131]:
 # function to help output data
def create_svmlite(patient_features, mortality, type):
    patient_ids = list(patient_features.keys())
    patient_ids.sort()
    d1 = ""
    for id in patient_ids:
        patient_features[id].sort()
        features = ''
        for feature in patient_features[id]:
            features += f" {str(int(feature[0]))}:" + "{:.6f}".format(feature[1])
        if type == 1: d1 += f"{mortality[id]}{features} \n"
        if type == 2: d1 += f"{int(id)} {mortality[id]}{features} \n"
    # print(d1)

    return d1
# create_svmlite(features_lists,mortality,2)
deliverable1 = open(f"{path}svm_light/features.train", 'wb')
deliverable1.write(bytes((create_svmlite(features_lists, mortality, 1)), 'UTF-8'))

846746

### Model Training

In [150]:
# input: Y_pred,Y_true
# output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_pred, Y_true):
    # TODO: Calculate the above mentioned metrics
    acc = accuracy_score(Y_pred, Y_true)
    auc_ = roc_auc_score(Y_pred, Y_true)
    precision = precision_score(Y_pred, Y_true)
    recall = recall_score(Y_pred, Y_true)
    f1score = f1_score(Y_pred, Y_true)
    # NOTE: It is important to provide the output in the same order
    return acc, auc_, precision, recall, f1score
# input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName, Y_pred, Y_true):
    print("______________________________________________")
    print(("Classifier: " + classifierName))
    acc, auc_, precision, recall, f1score = classification_metrics(Y_pred, Y_true)
    print(("Accuracy: " + str(acc)))
    print(("AUC: " + str(auc_)))
    print(("Precision: " + str(precision)))
    print(("Recall: " + str(recall)))
    print(("F1-score: " + str(f1score)))
    print("______________________________________________")
    print("")

# input: X_train, Y_train and X_test
# output: Y_pred
def logistic_regression_pred(X_train, Y_train, X_test):
    # TODO: train a logistic regression classifier using X_train and Y_train. Use this to predict labels of X_test
    # use default params for the classifier
    log_model = LogisticRegression(random_state=1)
    log_model.fit(X_train, Y_train)
    Y_pred = log_model.predict(X_test)
    return Y_pred
X_train, Y_train = load_svmlight_file(f"{path}svm_light/features.train", n_features=3190)

from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X_train,test_size = 0.2,random_state = 1)
Y_train, Y_test = train_test_split(Y_train,test_size = 0.2,random_state = 1)

In [154]:
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
# from sklearn.tree import DecisionTreeClassifier
display_metrics("Logistic Regression", logistic_regression_pred(X_train, Y_train, X_test), Y_test)
# display_metrics("SVM", svm_pred(X_train, Y_train, X_test), Y_test)
# display_metrics("Decision Tree", decisionTree_pred(X_train, Y_train, X_test), Y_test)


______________________________________________
Classifier: Logistic Regression
Accuracy: 0.77
AUC: 0.7250000000000001
Precision: 0.4482758620689655
Recall: 0.65
F1-score: 0.5306122448979592
______________________________________________



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


NameError: name 'svm_pred' is not defined