In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from scipy import stats

# Data preparation

In [2]:
tot_df=pd.read_csv('25100_dat.csv')

In [3]:
def feature_transform(tot_df):
    corpus_meds=tot_df['meds_str'].values
    corpus_reacts=tot_df['reacts_str'].values
    
    c_med = CountVectorizer()
    med_feats = c_med.fit_transform(corpus_meds).toarray()
    
    c_r=CountVectorizer()
    reacts_feats=c_r.fit_transform(corpus_reacts).toarray()
    
    feats=np.concatenate([med_feats,reacts_feats],axis=1)
    
    return feats,c_med.vocabulary_, c_r.vocabulary_
    
feats,med_vocab,reacts_vocab=feature_transform(tot_df)

In [4]:
targets=tot_df[['death']].fillna(0)
print(targets)
np.mean(targets)
print(targets[targets.death==1].shape)

       death
0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
...      ...
25095    0.0
25096    0.0
25097    0.0
25098    0.0
25099    0.0

[25100 rows x 1 columns]
(2162, 1)


In [5]:
# Separate majority and minority classes
df_majority = targets[targets.death==0]
df_minority = targets[targets.death==1]
 
#downsample
downsampled = resample(df_majority, 
                                 replace=True,     # sample with replacement
                                 n_samples=2162,    # to match majority class
                                 random_state=123) 

In [6]:
print(downsampled[downsampled.death==0])
downsampled_inds=downsampled[downsampled.death==0].index
death_inds=df_minority.index
print(downsampled_inds)
print(death_inds)

       death
21779    0.0
17071    0.0
19281    0.0
23595    0.0
16713    0.0
...      ...
5936     0.0
7182     0.0
20052    0.0
18979    0.0
11709    0.0

[2162 rows x 1 columns]
Int64Index([21779, 17071, 19281, 23595, 16713,  8326, 14528, 24302,   100,
            24751,
            ...
            19620, 21122,  5357, 22892,  6362,  5936,  7182, 20052, 18979,
            11709],
           dtype='int64', length=2162)
Int64Index([    1,    16,    19,    37,   209,   231,   244,   262,   269,
              272,
            ...
            24988, 24995, 25002, 25009, 25014, 25015, 25020, 25035, 25053,
            25074],
           dtype='int64', length=2162)


In [7]:
feats_no_death=feats[downsampled_inds,:]
feats_death=feats[death_inds,:]
feats=np.concatenate([feats_no_death,feats_death],axis=0)

In [8]:
targets=np.concatenate([downsampled.values,df_minority.values],axis=0)
print(targets.shape)


(4324, 1)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(feats,targets,test_size=0.33)

# Logistic Regression

In [10]:
pcs=PCA(n_components=1000)
X_train_pcs=pcs.fit_transform(X_train)
X_test_pcs=pcs.transform(X_test)
print('Explained variance: ',pcs.explained_variance_ratio_.sum())

Explained variance:  0.9580995275138602


In [11]:
clf = LogisticRegression(random_state=0).fit(X_train_pcs, y_train)


  y = column_or_1d(y, warn=True)


In [12]:
pred=clf.predict(X_test_pcs)
accuracy_score(pred,y_test)

0.8913805185704274

In [13]:
confusion_matrix(y_test,pred,labels=[1,0])

array([[621, 100],
       [ 55, 651]])

In [14]:
r2_score(y_test, pred)

0.5654740622286485

# Naive Bayes

In [15]:
clf=MultinomialNB().fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [16]:
pred=clf.predict(X_test)
accuracy_score(pred,y_test)

0.8794674141555712

In [17]:
confusion_matrix(y_test,pred,labels=[1,0])

array([[664,  57],
       [115, 591]])

In [18]:
#print(med_vocab)
#print(reacts_vocab)

tot_vocab={}
for j in med_vocab.keys():
    tot_vocab[med_vocab[j]]=j
for i in reacts_vocab.keys():
    tot_vocab[reacts_vocab[i]+len(med_vocab.keys())]=i
#print(tot_vocab)

In [19]:
print(clf.feature_log_prob_.shape)
print(len(tot_vocab.keys()))
probs=clf.feature_log_prob_
top_death_class=probs[1,:].argsort()[-100:][::-1]
print(probs[0,top_death_class])
for j in top_death_class:
    print(tot_vocab[j])

(2, 11094)
11094
[ -8.16844592  -6.68036887  -6.53083713  -5.97122135  -7.62944942
  -6.81851921  -7.06983364  -4.54220204  -5.49923556  -7.34176735
  -5.8516762   -6.89548025  -7.28114273  -4.72986101  -5.53964509
  -6.02001151  -5.34367145  -6.97886186  -9.42120889  -6.37668645
  -6.81851921  -7.34176735  -6.85625953  -7.9171315   -7.9171315
  -5.77055065  -8.03491453  -7.9171315   -7.40630587  -6.33016644
  -6.93630224  -8.3225966   -6.35315596  -6.02001151  -8.72806171
  -6.00348221  -5.98722169  -6.78215156  -7.9171315   -5.90966345
  -7.22398432  -6.12537203  -6.28571468  -8.50491816  -7.9171315
  -7.7164608   -7.22398432  -8.03491453  -6.89548025  -5.23155415
  -8.3225966   -6.22253577  -8.72806171  -8.72806171  -8.03491453
  -8.3225966   -8.50491816  -7.1186238   -8.03491453  -6.53083713
  -6.71315869  -6.30769358  -6.42547662  -9.42120889  -6.45079443
  -6.42547662  -6.02001151  -9.01574378  -7.06983364  -9.01574378
  -5.69551547  -6.74706024  -7.7164608   -9.42120889  -9.0157

# K-fold crossvalidation

In [20]:
kf = KFold(n_splits=10,shuffle=True)

acc_lst_lr=[]
acc_lst_nb=[]
for train_index, test_index in kf.split(feats):
    
    X_train, X_test = feats[train_index], feats[test_index]
    y_train, y_test = targets[train_index].reshape(-1,), targets[test_index].reshape(-1,)
    pcs=PCA(n_components=1000)
    X_train_pcs=pcs.fit_transform(X_train)
    X_test_pcs=pcs.transform(X_test)
    clf = LogisticRegression(random_state=0).fit(X_train_pcs, y_train)
    pred=clf.predict(X_test_pcs)
    acc_lst_lr.append(accuracy_score(pred,y_test))
    clf=MultinomialNB().fit(X_train,y_train)
    pred=clf.predict(X_test)
    acc_lst_nb.append(accuracy_score(pred,y_test))

print('Logistic regression k-fold accuracies:', acc_lst_lr)
print('Naive Bayes k-fold accuracies:', acc_lst_nb)
    

Logistic regression k-fold accuracies: [0.8706697459584296, 0.8960739030023095, 0.8937644341801386, 0.9076212471131639, 0.8935185185185185, 0.9027777777777778, 0.9027777777777778, 0.8888888888888888, 0.9097222222222222, 0.9097222222222222]
Naive Bayes k-fold accuracies: [0.8429561200923787, 0.8568129330254042, 0.8683602771362586, 0.8637413394919169, 0.8842592592592593, 0.8773148148148148, 0.8680555555555556, 0.8773148148148148, 0.8657407407407407, 0.8726851851851852]


In [21]:
print('Logistic regression accuracy 95% confidence interval:', np.mean(acc_lst_lr)-2.262*(np.std(acc_lst_lr,ddof=1)/np.sqrt(10)),np.mean(acc_lst_lr)+2.262*(np.std(acc_lst_lr,ddof=1)/np.sqrt(10)))
print('Naive Bayes accuracy 95% confidence interval:', np.mean(acc_lst_nb)-2.262*(np.std(acc_lst_nb,ddof=1)/np.sqrt(10)),np.mean(acc_lst_nb)+2.262*(np.std(acc_lst_nb,ddof=1)/np.sqrt(10)))

Logistic regression accuracy 95% confidence interval: 0.8890140110708556 0.906093336461434
Naive Bayes accuracy 95% confidence interval: 0.8593455233160673 0.8761026847071983


The following t-test for testing between the differences of two mean accuracies is not valid. The samples are not independent in k-fold cross-validation. The following procedure is know to lead to a lot of Type I errors (incorrectly rejecting the null). However, we can the p-value as an approximation.

In [22]:
#Statistical test to determine whether there is a significant difference in classification accuracies for logistic
#regression and naive bayes
#Calculated according to a blog post https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
var_lr = np.array(acc_lst_lr).var(ddof=1)
var_nb = np.array(acc_lst_nb).var(ddof=1)
#std deviation
s = np.sqrt((var_lr + var_nb)/2)
## Calculate the t-statistics
N=10
t = (np.array(acc_lst_lr).mean() - np.array(acc_lst_nb).mean())/(s*np.sqrt(2/N))

## Compare with the critical t-value
#Degrees of freedom
df = 2*N - 2

#p-value after comparison with the t 
p = 2*(1 - stats.t.cdf(t,df=df))

t2, p2 = stats.ttest_ind(np.array(acc_lst_lr),np.array(acc_lst_nb))
print(p,p2)

2.3751506029290326e-05 2.3751506029263742e-05


In [23]:
print('P-value of the t-test between classification accuracies', p)

P-value of the t-test between classification accuracies 2.3751506029290326e-05
