In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *


ModuleNotFoundError: No module named 'numpy'

## Overview
The baseline model is trained using the following features:

1) Diagnosis (value 1 for every diagnosis)

2) Lab_results (value is the mean of recorded values, scaled by maximum

In [None]:
# CONSTANTS
MIMIC_BUCKET = ''
PATIENTS_S3_PATH = f's3://{MIMIC_BUCKET}/PATIENTS.csv.gz'

In [455]:
### Sampling on patients
import utils

patients = pd.read_csv(f"{path}PATIENTS.csv")
#ensuring every patient is unique
print(f"{len(patients.SUBJECT_ID.unique())} unique patients in {len(patients)} rows")
#sampling random patients
patients_sample = patients.sample(n = 10000, random_state= 1)

46520 unique patients in 46520 rows


In [456]:
# note_events = pd.read_csv("../data/NOTEEVENTS.csv")
# #sample and save for these 1000 patients
# note_events_sample = pd.merge(patients_sample.SUBJECT_ID,note_events)
# note_events_sample.to_csv("../data/NOTEEVENTS_SAMPLE.csv")

In [457]:
### Incomplete: Using AWS glue, for now I will do stuff locally
# import boto3
# Creating the low level functional client
# client = boto3.client(
#     'glue',
#     aws_access_key_id = '',
#     aws_secret_access_key = '',
#     region_name = 'us-east-1'
# )
# clientResponse = client.get_table(DatabaseName="mimiciii",Name="admissions")

In [458]:
def convert_icd9(icd9_object):
    """
    :param icd9_object: ICD-9 code (Pandas/Numpy object).
    :return: extracted main digits of ICD-9 code
    """
    icd9_str = str(icd9_object)

    if icd9_str[0] == 'E': #if code starts with E
        converted = icd9_str[:4]
    else: #if they start with V or numeric
        converted = icd9_str[:3]

    return converted

def build_codemap(dataset):
    """
    :return: Dict of code map {main-digits of ICD9: unique feature ID}
    """
    # TODO: We build a code map using ONLY train data. Think about how to construct validation/test sets using this.
    df_digits = dataset['FEATURE'].unique()
    codemap = {}
    for i in range(0,len(df_digits)):
        codemap[df_digits[i]] = i


    return codemap

### Adding diagnoses data

In [459]:
#read and etl on diagnoses
diagnoses = pd.read_csv(f"{path}DIAGNOSES_ICD.csv.gz")
# sample for the patients
diagnoses = pd.merge(patients_sample.SUBJECT_ID,diagnoses)
diagnoses = diagnoses[['SUBJECT_ID','ICD9_CODE']]
diagnoses['VALUE'] = 1
diagnoses["ICD9_CODE"] = diagnoses["ICD9_CODE"].apply(convert_icd9)
diagnoses.drop_duplicates(inplace = True)

In [460]:
diagnoses

Unnamed: 0,SUBJECT_ID,ICD9_CODE,VALUE
0,4074,038,1
1,4074,785,1
2,4074,578,1
3,4074,427,1
4,4074,428,1
...,...,...,...
139065,62212,790,1
139066,62212,781,1
139067,62212,338,1
139068,62212,V15,1


### Lab Results

In [461]:

#read and etl on lab_results
# lab_results = pd.read_csv(f"{path}LABEVENTS_SAMPLE.csv")
lab_results = pd.read_csv(f"{path}LABEVENTS.csv.gz")
lab_results = pd.merge(patients_sample.SUBJECT_ID,lab_results)
# lab_results.to_csv("LABEVENTS_SAMPLE.csv")

In [462]:
#roughly 20% of these items have null in the HADM ID
lab_results = lab_results[['SUBJECT_ID','ITEMID','VALUE']]
#making sure lab_results has different item_id than diagnostics. appending a code 200 to the itemIDs
lab_results['ITEMID'] = lab_results['ITEMID']+20000000

In [463]:
#take average value for lab_results
lab_results = lab_results.dropna()
#keep only numeric values, removing things like 'not done'
def is_a_number(x):
    try:
        float(x.strip())
        return True
    except:
        return False
#     if x in [' ','.']:
#         return False
#     for char in x.strip():
#         if (not char in ['.',' ','1','2','3','4','5','6','7','8','9','0']):
#             return False
    return True
lab_results = lab_results.loc[lab_results['VALUE'].apply(is_a_number),:]
lab_results['VALUE'] = pd.to_numeric(lab_results['VALUE'])

#keep average of values (can use more sophisticated methods later on)
lab_results = lab_results.groupby(['SUBJECT_ID','ITEMID']).mean().reset_index()
lab_results

Unnamed: 0,SUBJECT_ID,ITEMID,VALUE
0,11,20050802,2.000
1,11,20050804,25.000
2,11,20050806,105.000
3,11,20050808,1.090
4,11,20050809,94.000
...,...,...,...
555449,99995,20051491,5.750
555450,99995,20051492,30.000
555451,99995,20051493,13.000
555452,99995,20051498,1.019


### Microbiology Events

In [464]:
### reading in data similar to lab_results, like microbiology events
#NOTE: Does not seem to help at all
mb_events = pd.read_csv(f"{path}MICROBIOLOGYEVENTS.csv.gz")
mb_events = pd.merge(patients_sample.SUBJECT_ID,mb_events)
mb_events = mb_events[['SUBJECT_ID','AB_ITEMID','INTERPRETATION']]
mb_events.dropna(inplace= True)

#transform seperate boolean values for Resistant, intermediate and Sensitive. 
mb_events.loc[mb_events.INTERPRETATION == 'R','AB_ITEMID'] += 31000000
mb_events.loc[mb_events.INTERPRETATION == 'S','AB_ITEMID'] += 32000000
mb_events.loc[mb_events.INTERPRETATION == 'I','AB_ITEMID'] += 33000000

mb_events = mb_events.loc[mb_events.INTERPRETATION != 'P']
mb_events.INTERPRETATION = 1
mb_events.drop_duplicates(inplace = True)


### Admissions

In [465]:
#ETL on admissions
#feature is the admission type and value is the time spent
admissions = pd.read_csv(f"{path}ADMISSIONS.csv.gz")
admissions = pd.merge(patients_sample.SUBJECT_ID,admissions)

admissions['TIME_SPENT'] = pd.to_timedelta(pd.to_datetime(admissions.DISCHTIME)-  pd.to_datetime(admissions.ADMITTIME)).dt.total_seconds()/3600
admissions['FEATURE'] = admissions.ADMISSION_TYPE.apply(lambda x: 40000000 + ord(x[0])*100+ord(x[1]))
admissions = admissions[['SUBJECT_ID','FEATURE','TIME_SPENT']]

admissions = admissions.groupby(['SUBJECT_ID','FEATURE']).aggregate('sum').reset_index()


In [466]:
admissions

Unnamed: 0,SUBJECT_ID,FEATURE,TIME_SPENT
0,11,40006977,612.700000
1,22,40006977,27.466667
2,26,40006977,167.733333
3,27,40007869,64.483333
4,39,40007869,234.100000
...,...,...,...
10510,99928,40006977,45.883333
10511,99935,40006977,5.016667
10512,99944,40006977,116.516667
10513,99985,40006977,398.300000


## Putting it all together
This step will
- merge all the features together
- give them unique feature_ids by building a codemape
- scale the features by their maximums

In [467]:
#merging diagnosis and lab_results
lab_results.columns = ['SUBJECT_ID','FEATURE','VALUE']
diagnoses.columns = ['SUBJECT_ID','FEATURE','VALUE']
mb_events.columns = ['SUBJECT_ID','FEATURE','VALUE']
admissions.columns = ['SUBJECT_ID','FEATURE','VALUE']

features = pd.concat([lab_results,diagnoses,mb_events,admissions])

#get rid of missing values
features = features.dropna()

#apply codemap to features
codemap = build_codemap(features)
features['FEATURE'] = features['FEATURE'].map(codemap)
features['VALUE'] = features.VALUE.round(6)

#scale features
max_features = features.groupby(['FEATURE']).aggregate('max').reset_index()
features = features.merge(max_features,on = 'FEATURE', suffixes=['', '_MAX'])
features['VALUE'] = features['VALUE']/features['VALUE_MAX']
features = features[['SUBJECT_ID','FEATURE','VALUE']]
features = features.dropna()
display(features[0:5])

Unnamed: 0,SUBJECT_ID,FEATURE,VALUE
0,11,0,0.075472
1,22,0,0.0
2,39,0,-0.10566
3,71,0,-0.245283
4,87,0,-0.160377


In [468]:
#create mortality table
mortality = patients_sample.copy()
mortality['DEAD'] = 1- mortality['DOD_SSN'].isna()
mortality.index = mortality.SUBJECT_ID
mortality = mortality['DEAD']

display(mortality[0:5])
mortality = mortality.to_dict()

SUBJECT_ID
4074     1
90889    0
72753    0
64908    0
70273    0
Name: DEAD, dtype: int64

### Saving the data prior to model training
The data will be saved in an SVM_light format for reproducibility

In [469]:
#turn patient_features into an svm_light format
features['F2V'] = list(zip(features.FEATURE,features.VALUE))
features_lists = features.groupby(['SUBJECT_ID'])['F2V'].apply(list)
# features_lists = pd.DataFrame(features_lists).reset_index()
features_lists = features_lists.to_dict()


In [470]:
 # function to help output data
def create_svmlite(patient_features, mortality, type):
    patient_ids = list(patient_features.keys())
    patient_ids.sort()
    d1 = ""
    for id in patient_ids:
        patient_features[id].sort()
        features = ''
        for feature in patient_features[id]:
            features += f" {str(int(feature[0]))}:" + "{:.6f}".format(feature[1])
        if type == 1: d1 += f"{mortality[id]}{features} \n"
        if type == 2: d1 += f"{int(id)} {mortality[id]}{features} \n"
    # print(d1)

    return d1
# create_svmlite(features_lists,mortality,2)
deliverable1 = open(f"{path}svm_light/features.train", 'wb')
deliverable1.write(bytes((create_svmlite(features_lists, mortality, 1)), 'UTF-8'))

8759915

### Model Training

In [474]:
# input: Y_pred,Y_true
# output: accuracy, auc, precision, recall, f1-score
def classification_metrics(Y_pred, Y_true):
    # TODO: Calculate the above mentioned metrics
    acc = accuracy_score(Y_pred, Y_true)
    auc_ = roc_auc_score(Y_pred, Y_true)
    precision = precision_score(Y_pred, Y_true)
    recall = recall_score(Y_pred, Y_true)
    f1score = f1_score(Y_pred, Y_true)
    # NOTE: It is important to provide the output in the same order
    return acc, auc_, precision, recall, f1score
# input: Name of classifier, predicted labels, actual labels
def display_metrics(classifierName, Y_pred, Y_true):
    print("______________________________________________")
    print(("Classifier: " + classifierName))
    acc, auc_, precision, recall, f1score = classification_metrics(Y_pred, Y_true)
    print(("Accuracy: " + str(acc)))
    print(("AUC: " + str(auc_)))
    print(("Precision: " + str(precision)))
    print(("Recall: " + str(recall)))
    print(("F1-score: " + str(f1score)))
    print("______________________________________________")
    print("")

# input: X_train, Y_train and X_test
# output: Y_pred
def logistic_regression_pred(X_train, Y_train, X_test):
    log_model = LogisticRegression(random_state=1)
    log_model.fit(X_train, Y_train)
    Y_pred = log_model.predict(X_test)
    return Y_pred

def decisionTree_pred(X_train, Y_train, X_test):
    model = DecisionTreeClassifier(nrandom_state=1)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    return Y_pred

X_train, Y_train = load_svmlight_file(f"{path}svm_light/features.train", n_features=3190)

from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(X_train,test_size = 0.2,random_state = 1)
Y_train, Y_test = train_test_split(Y_train,test_size = 0.2,random_state = 1)

### Results

In [475]:
from sklearn.linear_model import LogisticRegression
# from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
display_metrics("Logistic Regression", logistic_regression_pred(X_train, Y_train, X_test), Y_test)
# display_metrics("SVM", svm_pred(X_train, Y_train, X_test), Y_test)
display_metrics("Decision Tree", decisionTree_pred(X_train, Y_train, X_test), Y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


______________________________________________
Classifier: Logistic Regression
Accuracy: 0.8025
AUC: 0.7647675196112008
Precision: 0.5783132530120482
Recall: 0.691358024691358
F1-score: 0.6298031865042173
______________________________________________

______________________________________________
Classifier: Decision Tree
Accuracy: 0.7325
AUC: 0.6748667998667999
Precision: 0.53184165232358
Recall: 0.5402097902097902
F1-score: 0.5359930615784909
______________________________________________

