In [373]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score

In [3]:
def lower_col_names(df):
    '''
    lowercase the column names of a pd df
    Input: pd dataframe
    Output: lower - a pd dataframe with lowercase columns 
    '''
    df.columns= df.columns.str.strip().str.lower()
    
    return df
    



In [4]:
# Configure Target Variables:
data_dir = "data/merged_data"
data_dir_rob = data_dir + "/roberta_los_read"
data_dir_bc = data_dir + "/bc_los_read"



In [138]:
ls data/merged_data

[34mbc_512_los_read[m[m/  [34mbc_512_read[m[m/      [34mroberta_los_read[m[m/ [34mroberta_read[m[m/


In [5]:
from os import listdir
from os.path import isfile, join
onlyfiles = [f for f in listdir(data_dir_rob) if isfile(join(data_dir_rob, f))]
onlyfiles.sort()

In [6]:
onlyfiles

['roberta_los_read_x_test.pkl',
 'roberta_los_read_x_train.pkl',
 'roberta_los_read_x_val.pkl',
 'roberta_los_read_y_test.pkl',
 'roberta_los_read_y_train.pkl',
 'roberta_los_read_y_val.pkl']

In [160]:
x_test_unclean = pd.read_pickle(data_dir_rob + "/" + onlyfiles[0])
x_train_unclean = pd.read_pickle(data_dir_rob + "/" + onlyfiles[1])
x_val_unclean = pd.read_pickle(data_dir_rob + "/" + onlyfiles[2])
y_test_unclean = pd.read_pickle(data_dir_rob + "/" + onlyfiles[3])
y_train_unclean = pd.read_pickle(data_dir_rob + "/" + onlyfiles[4])
y_val_unclean = pd.read_pickle(data_dir_rob + "/" + onlyfiles[5])

In [224]:
def process_x(df, emb_name):
    '''
    Function to preprocess x dataset
    Input: 
        df (dataframe) - x_train, dev, or test
        emb_name (string) - colname in df that accesses embeddings
    
    Output: df (dataframe) - the same dataset just processed 
    '''    
    #Drop diagnosis
    df = df.drop(["diagnosis", "icd9_code"], axis=1, inplace = False)
    
    #Go get embedding series of list
    embeddings = pd.Series(df[emb_name])

    #dop embeddings
    df = df.drop(emb_name, axis=1, inplace = False)
    
    
    #Convert embeddings to a usable df:
    emb_df = pd.DataFrame.from_dict(dict(zip(embeddings.index, embeddings.values))).T
    emb_df.columns = emb_df.columns.astype(str)


    #concatenate embeddings and df
    df_full = pd.concat([df, emb_df], axis =1)
    
    #get new 
    df_full = lower_col_names(df_full)
    
    
    return df_full
    

In [179]:
def process_y(y_val, col):
    '''
    Input: 
        y_val (df) - one of y_train, y_dev, y_test
        col (string) - name of string of column to use to forcast
    Output: clean_y_val
    '''
    
    return y_val[col]

In [322]:
x_train = process_x(x_train_unclean, "roberta")
x_val = process_x(x_val_unclean, "roberta")
x_test = process_x(x_test_unclean, "roberta")
x_train_long = pd.concat([x_train, x_val], axis = 0)


In [250]:
y_train = process_y(y_train_unclean, "readmit")
y_val = process_y(y_val_unclean, "readmit")
y_test = process_y(y_test_unclean, "readmit")
y_train_long = pd.concat([y_train, y_val], axis= 0)



In [289]:
#Running sklearn log reg:

#Hyper parameters to tune: 
penalty = ["l2", "none"]

C_options = [10, 1, 0.1, 0.01]
param_dict = {"penalty":penalty, "C":C_options}
cv = KFold(n_splits=10)



#Logistic Regression:
lr = LogisticRegression(random_state=0,  max_iter=1000)
lr_grid = GridSearchCV(estimator = lr,
                       param_grid = param_dict,
                       cv = cv,
                       scoring = "average_precision")
grid_res = lr_grid.fit(x_train, y_train)





In [341]:
print("Best: %f using %s" % (grid_res.best_score_, grid_res.best_params_))

Best: 0.054130 using {'C': 1000, 'penalty': 'l2'}


In [377]:
lr_fin_r = LogisticRegression(random_state=0,  max_iter=1000, C = 1000, penalty = 'l2')
lr_fin_r.fit(x_val, y_val)
preds = lr_fin_r.predict(x_test)

20

In [329]:
average_precision_score(np.array(preds), np.array(y_test))

0.0024412280096944025

In [378]:
accuracy_score(np.array(preds), np.array(y_test))

0.9579011592434411

In [304]:
#Get no word data:
#WORDS DO BETTER THAN NO WORDS: 
x_train.shape

(34416, 767)

In [356]:
x_train_no_word = x_train.iloc[:, 0:(767-512)]
x_val_no_word = x_val.iloc[:, 0:(767-512)]
x_test_no_word = x_test.iloc[:, 0:(767-512)]

In [379]:
lr = LogisticRegression(random_state=0,  max_iter=1000)


lr_fin_r.fit(x_val_no_word, y_val)
preds_no_word = lr_fin_r.predict(x_test_no_word)

In [380]:
average_precision_score(np.array(preds_no_word), np.array(y_test))

  recall = tps / tps[-1]


nan

In [381]:
accuracy_score(np.array(preds_no_word), np.array(y_test))

0.9591214154972544

In [239]:
!ls data/merged_data

[34mbc_512_los_read[m[m  [34mbc_512_read[m[m      [34mroberta_los_read[m[m [34mroberta_read[m[m


In [None]:
#Bioclinical BERT Runs:

In [336]:
data_dir_b = data_dir + "/bc_512_los_read"
onlyfiles_b = [f for f in listdir(data_dir_b) if isfile(join(data_dir_b, f))]
onlyfiles_b.sort()
onlyfiles_b

['bc_512_los_read_x_test.pkl',
 'bc_512_los_read_x_train.pkl',
 'bc_512_los_read_x_val.pkl',
 'bc_512_los_read_y_test.pkl',
 'bc_512_los_read_y_train.pkl',
 'bc_512_los_read_y_val.pkl']

In [337]:
x_test_unclean_b = pd.read_pickle(data_dir_b + "/" + onlyfiles_b[0])
x_train_unclean_b = pd.read_pickle(data_dir_b + "/" + onlyfiles_b[1])
x_val_unclean_b = pd.read_pickle(data_dir_b + "/" + onlyfiles_b[2])
y_test_unclean_b = pd.read_pickle(data_dir_b + "/" + onlyfiles_b[3])
y_train_unclean_b = pd.read_pickle(data_dir_b + "/" + onlyfiles_b[4])
y_val_unclean_b = pd.read_pickle(data_dir_b + "/" + onlyfiles_b[5])

In [338]:
x_train_b = process_x(x_train_unclean_b, "bc_512")
x_val_b = process_x(x_val_unclean_b, "bc_512")
x_test_b = process_x(x_test_unclean_b, "bc_512")
x_train_long_b = pd.concat([x_train_b, x_val_b], axis = 0)



In [255]:
x_train_long_b.shape

(45889, 767)

In [346]:
#Logistic Regression:
lr_long = LogisticRegression(random_state=0,  max_iter=5000)
lr_grid_b = GridSearchCV(estimator = lr_long,
                       param_grid = param_dict,
                       cv = cv,
                       scoring = "average_precision")
grid_res_b = lr_grid_b.fit(x_train_b, y_train)








In [347]:
print("Best: %f using %s" % (grid_res_b.best_score_, grid_res_b.best_params_))

Best: 0.060113 using {'C': 1000, 'penalty': 'none'}


In [384]:
lr_fin_b = LogisticRegression(random_state=0,  max_iter=5000, penalty = 'none')
lr_fin_b.fit(x_val_b, y_val)
preds_b = lr_fin_b.predict(x_test_b)

In [386]:
preds_b.sum()

26

In [387]:
average_precision_score(np.array(preds_b), np.array(y_test))

0.0024198980424563923

In [388]:
accuracy_score(np.array(preds_b), np.array(y_test))

0.9572038699555478

In [389]:
x_train_b.shape

(34416, 767)

In [359]:
x_train_b_no_word = x_train_b.iloc[:, 0:(767-512)]
x_val_b_no_word = x_val_b.iloc[:, 0:(767-512)]
x_test_b_no_word = x_test_b.iloc[:, 0:(767-512)]

In [361]:
x_train_b_no_word

Unnamed: 0,subject_id,hadm_id,is_discharge,is_nursing,is_other,unnamed: 0,tsurg,med,unnamed: 5,cmed,...,ethnicity_patient declined to answer,ethnicity_portuguese,ethnicity_south american,ethnicity_unable to obtain,ethnicity_unknown/not specified,ethnicity_white,ethnicity_white - brazilian,ethnicity_white - eastern european,ethnicity_white - other european,ethnicity_white - russian
6249,8477,154902,1,0,0,10135,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
38563,81212,130156,1,0,0,51868,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
36134,94049,183737,1,0,0,55875,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
53051,49434,146834,0,1,0,41817,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
21180,16590,151831,1,0,0,19796,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34058,6133,111487,1,0,0,7339,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
46834,56854,109829,1,0,0,44220,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
17909,7513,138956,1,0,0,8993,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
42301,66058,134558,1,0,0,47159,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [362]:
lr_grid_b = GridSearchCV(estimator = lr_long,
                       param_grid = param_dict,
                       cv = cv,
                       scoring = "average_precision")
grid_res_b = lr_grid_b.fit(x_train_b_no_word, y_train)





In [363]:
print("Best: %f using %s" % (grid_res_b.best_score_, grid_res_b.best_params_))

Best: 0.045394 using {'C': 1000, 'penalty': 'l2'}


In [369]:
lr_fin_b_no_word = LogisticRegression(random_state=0,  max_iter=5000, penalty = 'l2', C=1000)
lr_fin_b_no_word.fit(x_val_b_no_word, y_val)
preds_b_no_word = lr_fin_b_no_word.predict(x_test_b_no_word)

In [375]:
average_precision_score(np.array(preds_b_no_word), np.array(y_test))

  recall = tps / tps[-1]


nan

In [376]:
accuracy_score(np.array(preds_b_no_word), np.array(y_test))

0.9591214154972544

In [385]:
preds_b_no_word.sum()

0

In [128]:
#Read:




labels = pd.read_pickle(data_dir + 'labels_final_df.pkl')
labels = lower_col_names(labels)

non_text = pd.read_csv(data_dir + "non_text_features.csv", index_col=False)

if "Unnamed: 0" in non_text.columns:
    non_text.drop("Unnamed: 0", axis =1, inplace=True)
    
    
nt_labels = pd.merge(non_text, labels, on = ["subject_id", "hadm_id"])

In [85]:
labels

Unnamed: 0,subject_id,hadm_id,readmit,stay_length_sec,icd9_code
0,17,194023,0,31800,7455
1,17,161087,0,1740,4239
2,21,109451,0,14280,41071
3,21,111970,0,19080,0388
4,23,152223,0,42900,41401
...,...,...,...,...,...
58924,99985,176670,0,51480,0389
58925,99991,151118,0,13500,56211
58926,99992,197084,0,85980,9999
58927,99995,137810,0,18900,4414


In [129]:
X = nt_labels.drop(["readmit", "stay_length_sec", "icd9_code", "diagnosis"], axis = 1)
Y = nt_labels["readmit"]


X_train = X.loc[0:46200, ]
X_test = X.loc[46201:57749, ]
Y_train = Y.loc[0:46200, ]
Y_test = Y.loc[46201:57749, ]

In [69]:
for col in X_train.columns:
    if "NEWBORN" in col:
        print(col)

admission_type_NEWBORN


In [203]:
tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0])

NameError: name 'confusion_matrix' is not defined

In [202]:
#Running sklearn log reg:

#Hyper parameters to tune: 
penalty = ["l2", "none"]
C_options = [1000, 100, 1, 0.1, 0.001, 0.0001]

param_dict = {"penalty":penalty, "C":C_options}

#Logistic Regression:
lr = LogisticRegression(random_state=0)
lr_grid = GridSearchCV(estimator = lr,
                       param_grid = param_dict,
                       cv = cv,
                       scoring = "auc")
cv = KFold(n_splits=10)
grid_res = lr_grid.fit(x_train, y_train)

ValueError: 'auc' is not a valid scoring value. Use sorted(sklearn.metrics.SCORERS.keys()) to get valid options.

In [133]:
print("Best: %f using %s" % (grid_res.best_score_, grid_res.best_params_))


Best: 0.959114 using {'C': 1000, 'penalty': 'l2'}


In [126]:
X_train2 = X_train.drop("diagnosis", axis=1)
lr.fit(X_train2, Y_train)

#X_train2["is_female"].unique()

LogisticRegression(random_state=0)

NameError: name 'nt_labels' is not defined

Unnamed: 0,subject_id,hadm_id,is_discharge,is_nursing,is_other,roberta,icd9_code,unnamed: 0,tsurg,med,...,ethnicity_patient declined to answer,ethnicity_portuguese,ethnicity_south american,ethnicity_unable to obtain,ethnicity_unknown/not specified,ethnicity_white,ethnicity_white - brazilian,ethnicity_white - eastern european,ethnicity_white - other european,ethnicity_white - russian
19692,28114,108195,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",9352,33585,0,1,...,0,0,0,0,0,1,0,0,0,0
4046,82512,169761,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",4241,52341,0,0,...,0,0,0,0,0,1,0,0,0,0
39366,90414,199046,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",99666,54700,0,1,...,0,0,0,0,0,1,0,0,0,0
7578,30575,107242,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",51881,36383,0,1,...,0,0,0,0,0,1,0,0,0,0
41771,14080,181637,1,0,0,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",85226,16839,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,23959,131919,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",44023,28615,0,0,...,0,0,0,0,0,1,0,0,0,0
47766,97232,189495,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",41071,56911,0,0,...,0,0,0,0,1,0,0,0,0,0
16576,7510,143474,1,0,0,"[0, 49329, 9167, 12478, 10566, 35, 646, 12606,...",V3001,8989,0,0,...,0,0,0,0,0,1,0,0,0,0
24121,16030,194963,1,0,0,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",80502,19112,0,0,...,0,0,0,0,0,0,0,0,0,0
