In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from scipy import stats

# Data preparation

In [2]:
tot_df=pd.read_csv('25100_dat.csv')

In [3]:
def feature_transform(tot_df):
    corpus_meds=tot_df['meds_str'].values
    corpus_reacts=tot_df['reacts_str'].values
    
    c_med = CountVectorizer(stop_words=['death'])
    med_feats = c_med.fit_transform(corpus_meds).toarray()
    
    c_r=CountVectorizer(stop_words=['death'])
    reacts_feats=c_r.fit_transform(corpus_reacts).toarray()
    
    feats=np.concatenate([med_feats,reacts_feats],axis=1)
    
    return feats,c_med.vocabulary_, c_r.vocabulary_
    
feats,med_vocab,reacts_vocab=feature_transform(tot_df)

In [4]:
targets=tot_df[['death']].fillna(0)
print(targets)
np.mean(targets)
print(targets[targets.death==1].shape)

       death
0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
...      ...
25095    0.0
25096    0.0
25097    0.0
25098    0.0
25099    0.0

[25100 rows x 1 columns]
(2162, 1)


In [5]:
# Separate majority and minority classes
df_majority = targets[targets.death==0]
df_minority = targets[targets.death==1]
 
#downsample
downsampled = resample(df_majority, 
                                 replace=True,     # sample with replacement
                                 n_samples=2162,    # to match majority class
                                 random_state=123) 

In [6]:
print(downsampled[downsampled.death==0])
downsampled_inds=downsampled[downsampled.death==0].index
death_inds=df_minority.index
print(downsampled_inds)
print(death_inds)

       death
21779    0.0
17071    0.0
19281    0.0
23595    0.0
16713    0.0
...      ...
5936     0.0
7182     0.0
20052    0.0
18979    0.0
11709    0.0

[2162 rows x 1 columns]
Int64Index([21779, 17071, 19281, 23595, 16713,  8326, 14528, 24302,   100,
            24751,
            ...
            19620, 21122,  5357, 22892,  6362,  5936,  7182, 20052, 18979,
            11709],
           dtype='int64', length=2162)
Int64Index([    1,    16,    19,    37,   209,   231,   244,   262,   269,
              272,
            ...
            24988, 24995, 25002, 25009, 25014, 25015, 25020, 25035, 25053,
            25074],
           dtype='int64', length=2162)


In [7]:
feats_no_death=feats[downsampled_inds,:]
feats_death=feats[death_inds,:]
feats=np.concatenate([feats_no_death,feats_death],axis=0)

In [8]:
targets=np.concatenate([downsampled.values,df_minority.values],axis=0)
print(targets.shape)


(4324, 1)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(feats,targets,test_size=0.33)

# Logistic Regression

In [10]:
pcs=PCA(n_components=1000)
X_train_pcs=pcs.fit_transform(X_train)
X_test_pcs=pcs.transform(X_test)
print('Explained variance: ',pcs.explained_variance_ratio_.sum())

Explained variance:  0.9593066485215316


In [11]:
clf = LogisticRegression(random_state=0).fit(X_train_pcs, y_train)


  y = column_or_1d(y, warn=True)


In [12]:
pred=clf.predict(X_test_pcs)
accuracy_score(pred,y_test)

0.8002803083391731

In [13]:
confusion_matrix(y_test,pred,labels=[1,0])

array([[584, 123],
       [162, 558]])

In [14]:
r2_score(y_test, pred)

0.20105492692126337

# Naive Bayes

In [15]:
clf=MultinomialNB().fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [16]:
pred=clf.predict(X_test)
accuracy_score(pred,y_test)

0.8065872459705676

In [17]:
confusion_matrix(y_test,pred,labels=[1,0])

array([[586, 121],
       [155, 565]])

In [18]:
#print(med_vocab)
#print(reacts_vocab)

tot_vocab={}
for j in med_vocab.keys():
    tot_vocab[med_vocab[j]]=j
for i in reacts_vocab.keys():
    tot_vocab[reacts_vocab[i]+len(med_vocab.keys())]=i
#print(tot_vocab)

In [19]:
print(clf.feature_log_prob_.shape)
print(len(tot_vocab.keys()))
probs=clf.feature_log_prob_
top_death_class=probs[1,:].argsort()[-100:][::-1]
print(probs[0,top_death_class])
for j in top_death_class:
    print(tot_vocab[j])

(2, 11093)
11093
[ -6.61159291  -6.49718255  -7.71020519  -6.10076728  -6.77589596
  -7.16366149  -5.94921738  -4.82998581  -5.39857027  -6.64236456
  -7.16366149  -6.88922464  -4.55902438  -5.50293028  -7.33551174
  -7.01705801  -5.45414012  -7.06357803  -9.00948818  -7.91087589
  -7.91087589  -8.16219032  -5.91844573  -6.52458153  -5.87399396
  -7.21772871  -6.88922464  -7.62319382  -8.49866255  -5.96496574
  -8.316341    -6.88922464  -6.64236456  -6.15685675  -7.71020519
  -6.27945907  -7.11236819  -7.06357803  -8.316341    -8.49866255
  -7.71020519  -5.88859276  -5.96496574  -8.16219032  -6.47051431
  -7.21772871  -6.23689946  -7.33551174  -7.71020519  -7.33551174
  -9.00948818  -9.00948818  -9.00948818  -8.02865893  -6.52458153
  -6.77589596  -6.03056302  -6.15685675  -9.41495329  -9.41495329
 -10.10810047  -5.20282569  -6.8122636   -7.91087589  -7.01705801
  -8.72180611  -6.37043085  -6.88922464  -8.02865893  -7.06357803
  -6.88922464  -6.44453882  -8.72180611  -6.97260625  -7.16

# K-fold crossvalidation

In [None]:
kf = KFold(n_splits=10,shuffle=True)

acc_lst_lr=[]
acc_lst_nb=[]
for train_index, test_index in kf.split(feats):
    
    X_train, X_test = feats[train_index], feats[test_index]
    y_train, y_test = targets[train_index].reshape(-1,), targets[test_index].reshape(-1,)
    pcs=PCA(n_components=1000)
    X_train_pcs=pcs.fit_transform(X_train)
    X_test_pcs=pcs.transform(X_test)
    clf = LogisticRegression(random_state=0).fit(X_train_pcs, y_train)
    pred=clf.predict(X_test_pcs)
    acc_lst_lr.append(accuracy_score(pred,y_test))
    clf=MultinomialNB().fit(X_train,y_train)
    pred=clf.predict(X_test)
    acc_lst_nb.append(accuracy_score(pred,y_test))

print('Logistic regression k-fold accuracies:', acc_lst_lr)
print('Naive Bayes k-fold accuracies:', acc_lst_nb)
    

In [None]:
print('Logistic regression accuracy 95% confidence interval:', np.mean(acc_lst_lr)-2.262*(np.std(acc_lst_lr,ddof=1)/np.sqrt(10)),np.mean(acc_lst_lr)+2.262*(np.std(acc_lst_lr,ddof=1)/np.sqrt(10)))
print('Naive Bayes accuracy 95% confidence interval:', np.mean(acc_lst_nb)-2.262*(np.std(acc_lst_nb,ddof=1)/np.sqrt(10)),np.mean(acc_lst_nb)+2.262*(np.std(acc_lst_nb,ddof=1)/np.sqrt(10)))

The following t-test for testing between the differences of two mean accuracies is not valid. The samples are not independent in k-fold cross-validation. The following procedure is know to lead to a lot of Type I errors (incorrectly rejecting the null). However, we can the p-value as an approximation.

In [None]:
#Statistical test to determine whether there is a significant difference in classification accuracies for logistic
#regression and naive bayes
#Calculated according to a blog post https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
var_lr = np.array(acc_lst_lr).var(ddof=1)
var_nb = np.array(acc_lst_nb).var(ddof=1)
#std deviation
s = np.sqrt((var_lr + var_nb)/2)
## Calculate the t-statistics
N=10
t = (np.array(acc_lst_lr).mean() - np.array(acc_lst_nb).mean())/(s*np.sqrt(2/N))

## Compare with the critical t-value
#Degrees of freedom
df = 2*N - 2

#p-value after comparison with the t 
p = 2*(1 - stats.t.cdf(t,df=df))

t2, p2 = stats.ttest_ind(np.array(acc_lst_lr),np.array(acc_lst_nb))
print(p,p2)

In [None]:
print('P-value of the t-test between classification accuracies', p)