In [103]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Data preparation

In [196]:
tot_df=pd.read_csv('25100_dat.csv')

In [206]:
def feature_transform(tot_df):
    corpus_meds=tot_df['meds_str'].values
    corpus_reacts=tot_df['reacts_str'].values
    
    c_med = CountVectorizer()
    med_feats = c_med.fit_transform(corpus_meds).toarray()
    
    c_r=CountVectorizer()
    reacts_feats=c_r.fit_transform(corpus_reacts).toarray()
    
    feats=np.concatenate([med_feats,reacts_feats],axis=1)
    
    return feats,c_med.vocabulary_, c_r.vocabulary_
    
feats,med_vocab,reacts_vocab=feature_transform(tot_df)

In [86]:
targets=tot_df[['death']].fillna(0)
print(targets)
np.mean(targets)
print(targets[targets.death==1].shape)

       death
0        0.0
1        1.0
2        0.0
3        0.0
4        0.0
...      ...
25095    0.0
25096    0.0
25097    0.0
25098    0.0
25099    0.0

[25100 rows x 1 columns]
(2162, 1)


In [87]:
# Separate majority and minority classes
df_majority = targets[targets.death==0]
df_minority = targets[targets.death==1]
 
#downsample
downsampled = resample(df_majority, 
                                 replace=True,     # sample with replacement
                                 n_samples=2162,    # to match majority class
                                 random_state=123) 

In [88]:
print(downsampled[downsampled.death==0])
downsampled_inds=downsampled[downsampled.death==0].index
death_inds=df_minority.index
print(downsampled_inds)
print(death_inds)

       death
21779    0.0
17071    0.0
19281    0.0
23595    0.0
16713    0.0
...      ...
5936     0.0
7182     0.0
20052    0.0
18979    0.0
11709    0.0

[2162 rows x 1 columns]
Int64Index([21779, 17071, 19281, 23595, 16713,  8326, 14528, 24302,   100,
            24751,
            ...
            19620, 21122,  5357, 22892,  6362,  5936,  7182, 20052, 18979,
            11709],
           dtype='int64', length=2162)
Int64Index([    1,    16,    19,    37,   209,   231,   244,   262,   269,
              272,
            ...
            24988, 24995, 25002, 25009, 25014, 25015, 25020, 25035, 25053,
            25074],
           dtype='int64', length=2162)


In [89]:
feats_no_death=feats[downsampled_inds,:]
feats_death=feats[death_inds,:]
feats=np.concatenate([feats_no_death,feats_death],axis=0)

In [90]:
targets=np.concatenate([downsampled.values,df_minority.values],axis=0)
print(targets.shape)


(4324, 1)


In [91]:
X_train, X_test, y_train, y_test = train_test_split(feats,targets,test_size=0.33)

# Logistic Regression

In [140]:
pcs=PCA(n_components=1000)
X_train_pcs=pcs.fit_transform(X_train)
X_test_pcs=pcs.transform(X_test)
print('Explained variance: ',pcs.explained_variance_ratio_.sum())

Explained variance:  0.9570552004058386


In [141]:
clf = LogisticRegression(random_state=0).fit(X_train_pcs, y_train)


  y = column_or_1d(y, warn=True)


In [142]:
pred=clf.predict(X_test_pcs)
accuracy_score(pred,y_test)

0.8857743517869656

In [143]:
confusion_matrix(y_test,pred,labels=[1,0])

array([[624, 101],
       [ 62, 640]])

# Naive Bayes

In [145]:
clf=MultinomialNB().fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


In [146]:
pred=clf.predict(X_test)
accuracy_score(pred,y_test)

0.8612473721093202

In [147]:
confusion_matrix(y_test,pred,labels=[1,0])

array([[662,  63],
       [135, 567]])

In [186]:
neg_class_prob_sorted = clf.feature_log_prob_.argsort()
print(neg_class_prob_sorted.shape)
top_10_most_important_words=neg_class_prob_sorted[:10].flatten()
print(top_10_most_important_words)

(1, 11094)
[   0 6608 6609 ... 8387 9008 8657]


In [209]:
meds=pd.DataFrame(med_vocab)
reacts=pd.DataFrame(reacts_vocab)
inv_map = {v: k for k, v in c_med.vocabulary_.items()}
print(len(reacts_vocab.keys()))

ValueError: If using all scalar values, you must pass an index

In [185]:
for j in range(0,10):
    drug=inv_map[top_10_most_important_words[j]]
    print(drug)

000
sectral
sedacid
sedatives
sediel
sedorrhoide
seed
seasonale
seguril
seishoku
