In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,roc_curve,roc_auc_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
# One call is enough: each sns.set() resets the whole theme, so an earlier
# style="white" call would be overwritten by this one anyway.
sns.set(style="whitegrid", color_codes=True)

In [None]:
# Load the structured ICU table and discard the identifier columns.
adult_icu_raw = (
    pd.read_csv("/home/pulkitmathur1800/csc2548_ml4h/adult_icu")
    .drop(columns=['subject_id', 'hadm_id', 'icustay_id'])
)

# A column counts as a 0/1 flag when every non-missing value is 0 or 1.
# Flag columns are left as-is; every other column is standardised.
bool_cols = [
    col for col in adult_icu_raw.columns
    if np.isin(adult_icu_raw[col].dropna().unique(), [0, 1]).all()
]
non_bool_cols = [col for col in adult_icu_raw.columns if col not in bool_cols]

# Fit the scaler on the training rows only (train == 1), then apply it to every
# row. Fitting on the full table would let test-row means/variances leak into
# the features the model is later evaluated on.
train_rows = adult_icu_raw['train'] == 1
scaler = preprocessing.StandardScaler()
scaler.fit(adult_icu_raw.loc[train_rows, non_bool_cols])

adult_icu_scaled = pd.DataFrame(
    scaler.transform(adult_icu_raw[non_bool_cols]),
    columns=non_bool_cols,
    index=adult_icu_raw.index,  # keep the join below aligned on the original index
)
adult_icu_scaled = adult_icu_scaled.join(adult_icu_raw[bool_cols])
adult_icu_scaled.head()

In [None]:
# Bar chart of how many rows fall under each `mort_icu` value.
sns.countplot(data=adult_icu_scaled, x='mort_icu', palette='hls')
plt.show()

In [None]:
# Split on the `train` flag column: rows with train == 1 go to the training
# set, rows with train == 0 go to the test set.
icu_train_mask = adult_icu_scaled['train'] == 1
icu_test_mask = adult_icu_scaled['train'] == 0

# Features: everything except the label (the `train` flag is removed per split).
X = adult_icu_scaled.drop(columns=['mort_icu'])
X_train = X.loc[icu_train_mask].drop(columns=['train'])
X_test = X.loc[icu_test_mask].drop(columns=['train'])

# Labels: the `mort_icu` column flattened to a NumPy array.
Y = adult_icu_scaled[['mort_icu', 'train']]
Y_train = Y.loc[icu_train_mask].drop(columns=['train']).to_numpy().squeeze()
Y_test = Y.loc[icu_test_mask].drop(columns=['train']).to_numpy().squeeze()

X_train.head()

In [None]:
# L2-regularised logistic regression fitted on the structured training split,
# then evaluated on the test split with a confusion matrix.
classifier = LogisticRegression(penalty="l2", C=1, random_state=0).fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
matrix = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(matrix)

In [None]:
# Mean accuracy on the rows the model was fitted on (not a held-out estimate).
classifier.score(X=X_train, y=Y_train)

In [None]:
# roc_curve expects (y_true, y_score): ground-truth labels first, then a
# continuous score for the positive class. Use the predicted probability of
# class 1 rather than the thresholded 0/1 predictions, which would collapse the
# curve to a single operating point.
Y_score = classifier.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, Y_score)

plt.figure()
plt.plot(fpr, tpr, color='red', lw=2, label='ROC curve')
plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc='lower right')  # without a legend the line labels are never shown
plt.show()

In [None]:
# roc_auc_score expects (y_true, y_score): true labels first, then the
# predicted probability of the positive class (not the hard 0/1 predictions).
roc_auc_score(Y_test, classifier.predict_proba(X_test)[:, 1])

In [None]:
# Load the clinical-notes table and discard the identifier columns.
adult_notes_raw = (
    pd.read_csv("/home/pulkitmathur1800/csc2548_ml4h/adult_notes")
    .drop(columns=['subject_id', 'hadm_id', 'icustay_id'])
)
print(adult_notes_raw.shape)
adult_notes_raw.head()

In [None]:
# Bar chart of how many note rows fall under each `mort_icu` value.
sns.countplot(data=adult_notes_raw, x='mort_icu', palette='hls')
plt.show()

In [None]:
# Strip punctuation by tokenising each note on runs of word characters.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Drop rows with no note text at the DataFrame level. Calling
# dropna(inplace=True) on the selected column does not remove rows from the
# frame, so missing notes would still be handed to the tokenizer.
# The index is reset so that it stays 0..n-1 for later label-based lookups.
adult_notes_raw = adult_notes_raw.dropna(subset=['chartext']).reset_index(drop=True)
adult_notes_raw['chartext'] = adult_notes_raw['chartext'].apply(tokenizer.tokenize)
adult_notes_raw.head()

In [None]:
# Remove English stop words from each note's token list.
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Drop rows with no note text at the DataFrame level; dropna(inplace=True) on
# the selected column leaves the frame itself unchanged.
adult_notes_raw = adult_notes_raw.dropna(subset=['chartext']).reset_index(drop=True)

# The NLTK stop-word list is lowercase, so compare the lowercased token;
# otherwise capitalised stop words ("The", "And", ...) slip through.
adult_notes_raw['chartext'] = adult_notes_raw['chartext'].apply(
    lambda tokens: [tok for tok in tokens if tok.lower() not in stop_words]
)

In [None]:
# Drop rows with no note text at the DataFrame level; dropna(inplace=True) on
# the selected column leaves the frame itself unchanged.
adult_notes_raw = adult_notes_raw.dropna(subset=['chartext']).reset_index(drop=True)

# Collapse each token list back into a single space-separated string.
adult_notes_raw['chartext'] = adult_notes_raw['chartext'].apply(
    lambda tokens: ' '.join(map(str, tokens))
)
adult_notes_raw.head()

In [None]:
#train-test-split
# Remove rows without note text once, before features and labels are derived,
# so X and Y always stay row-aligned. (dropna(inplace=True) on a column selected
# from a slice does not remove rows from that slice, and dropping from X alone
# would desynchronise it from Y.)
notes_with_text = adult_notes_raw.dropna(subset=['chartext'])
notes_train_mask = notes_with_text['train'] == 1
notes_test_mask = notes_with_text['train'] == 0

# Features: everything except the label (the `train` flag is removed per split).
X2 = notes_with_text.drop(columns=['mort_icu'])
X_train_notes = X2.loc[notes_train_mask].drop(columns=['train'])
X_test_notes = X2.loc[notes_test_mask].drop(columns=['train'])

# Labels: `mort_icu` as 1-D NumPy arrays.
Y2 = notes_with_text[['mort_icu', 'train']]
Y_train_notes = Y2.loc[notes_train_mask, 'mort_icu'].to_numpy()
Y_test_notes = Y2.loc[notes_test_mask, 'mort_icu'].to_numpy()


In [None]:
# Show the processed text of the row labelled 0.
adult_notes_raw.loc[0, 'chartext']

In [None]:
# Keep only the note text as NumPy arrays of documents for the vectorizer.
# NumPy is imported as `np` in this notebook; the bare name `numpy` is not
# defined and would raise NameError.
X_train_notes = np.array(X_train_notes['chartext'])
X_test_notes = np.array(X_test_notes['chartext'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training notes and encode them
# as a TF-IDF matrix in one step.
v = TfidfVectorizer()
x = v.fit_transform(raw_documents=X_train_notes)

In [None]:
# L1-regularised logistic regression on the TF-IDF note features.
# solver="liblinear" is set explicitly because it supports the L1 penalty;
# the default solver in recent scikit-learn releases (lbfgs) rejects it.
notes_classifier = LogisticRegression(random_state=0, penalty="l1", C=1, solver="liblinear")

# Train on the note features and note labels: `x` is the TF-IDF matrix of the
# training notes and `v` is the vectorizer fitted on them. The structured ICU
# matrices (X_train / Y_train / X_test / Y_test) belong to the other model.
notes_classifier.fit(x, Y_train_notes)

# Encode the test notes with the already-fitted vectorizer before predicting.
Y_pred = notes_classifier.predict(v.transform(X_test_notes))
matrix = confusion_matrix(Y_test_notes, Y_pred)
print(matrix)