In [2]:
from __future__ import division
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

from difflib import SequenceMatcher

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
#import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

#for validating your classification model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

import re
import requests
from lxml import html
import time

In [3]:
df=pd.read_csv('prostate_canc.csv')
df.head()

Unnamed: 0,indi_pt,drugname,age,gndr_cod,wt,reporter_country,de,lt,ho,ds,ca,ri,ot,pt,target,pathway
0,prostate cancer metastatic,ibandronic acid,,,,country not specified,0.0,0.0,0.0,0.0,0.0,0.0,0.0,gastrointestinal injury,Farnesyl pyrophosphate synthase,Ibandronate Action Pathway
1,prostate cancer metastatic,ibandronic acid,,,,country not specified,0.0,0.0,0.0,0.0,0.0,0.0,0.0,gastrointestinal injury,Hydroxylapatite,Ibandronate Action Pathway
2,prostate cancer metastatic,ibandronic acid,,,,country not specified,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lymphocele,Farnesyl pyrophosphate synthase,Ibandronate Action Pathway
3,prostate cancer metastatic,ibandronic acid,,,,country not specified,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lymphocele,Hydroxylapatite,Ibandronate Action Pathway
4,biopsy prostate,metronidazole,,1.0,68.2,gb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,diarrhoea,DNA,


In [4]:
df['pt'].value_counts()

no reaction                             4048
hypotension                             2305
anuria                                  2051
road traffic accident                   2023
blood pressure systolic decreased       2012
haemodialysis                           2012
fatigue                                  942
blood creatinine increased               896
aspartate aminotransferase increased     780
cerebral ischaemia                       698
asthenia                                 653
nausea                                   618
dyspnoea                                 617
pain                                     566
anaemia                                  558
pyrexia                                  507
dizziness                                477
malignant neoplasm progression           469
diarrhoea                                465
prostate cancer                          463
decreased appetite                       462
osteonecrosis of jaw                     459
off label 

## Survivability Model

In [5]:
df_surv=df.drop(['reporter_country', 'target', 'pathway','pt'], axis=1)
df_surv.head()

Unnamed: 0,indi_pt,drugname,age,gndr_cod,wt,de,lt,ho,ds,ca,ri,ot
0,prostate cancer metastatic,ibandronic acid,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,prostate cancer metastatic,ibandronic acid,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,prostate cancer metastatic,ibandronic acid,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,prostate cancer metastatic,ibandronic acid,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,biopsy prostate,metronidazole,,1.0,68.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
len(df_surv)

77733

In [7]:
df_surv=df_surv.dropna()
len(df_surv)

15712

In [8]:
df_surv['indi_pt'].value_counts()

prostate cancer                       11624
prostate cancer metastatic             2829
prostate infection                      435
hormone-refractory prostate cancer      204
prostate cancer stage iv                154
prostate cancer recurrent               132
prostate examination abnormal           120
prostatectomy                            61
prostate induration                      39
prostate cancer stage ii                 27
biopsy prostate                          23
transurethral prostatectomy              18
radical prostatectomy                     9
prostate cancer stage 0                   9
enlarged prostate                         9
prostate cancer stage iii                 7
neoplasm prostate                         6
prostate tenderness                       6
Name: indi_pt, dtype: int64

In [10]:
df_surv=df_surv[df_surv['indi_pt'].str.contains('prostate cancer')]
df_surv['indi_pt'].value_counts()

prostate cancer                       11624
prostate cancer metastatic             2829
hormone-refractory prostate cancer      204
prostate cancer stage iv                154
prostate cancer recurrent               132
prostate cancer stage ii                 27
prostate cancer stage 0                   9
prostate cancer stage iii                 7
Name: indi_pt, dtype: int64

In [11]:
dummy_col = pd.get_dummies(df_surv[['drugname']])
df_surv=df_surv.drop(['drugname','indi_pt'] , axis=1)
df_surv = pd.concat([df_surv, dummy_col], axis=1)
df_surv.head()

Unnamed: 0,age,gndr_cod,wt,de,lt,ho,ds,ca,ri,ot,...,drugname_xgeva,drugname_xofigo,drugname_xtandi,drugname_zemplar,drugname_zoladex,drugname_zoladex la,drugname_zoledronic acid,drugname_zometa,drugname_zonegran,drugname_zytiga
28,50.58,1.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
29,50.58,1.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
30,50.58,1.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
31,50.58,1.0,100.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
34,60.43,1.0,78.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df_surv.corr()['de'].sort_values()

ho                                       -0.197731
ot                                       -0.192944
drugname_magnesium                       -0.110389
drugname_proscar                         -0.089669
ds                                       -0.088038
drugname_thalidomide                     -0.084107
drugname_fosamax                         -0.080584
drugname_lupron                          -0.066489
drugname_provenge                        -0.064170
drugname_flutamide                       -0.059967
drugname_alendronate sodium              -0.052091
drugname_prednisone                      -0.050613
drugname_zoladex                         -0.049054
drugname_odyne                           -0.046768
drugname_casodex                         -0.043930
drugname_revlimid                        -0.042458
drugname_paclitaxel                      -0.042045
drugname_gleevec                         -0.041418
drugname_imatinib mesylate               -0.041418
drugname_aredia                

In [13]:
y=df_surv['de']
X=df_surv.drop(['de','lt','ho','ds','ca','ri','ot'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
clf1 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1)    #building 100 decision trees
clf1.fit(X_train, y_train)
print ("Death model")
print ("Random Forest")
print (metrics.accuracy_score(y_test, clf1.predict(X_test)))
print (metrics.confusion_matrix(y_test, clf1.predict(X_test)))
print (metrics.classification_report(y_test, clf1.predict(X_test)))
print (metrics.roc_auc_score(y_test, clf1.predict(X_test)))

Death model
Random Forest
0.9613075383589059
[[1154   38]
 [  20  287]]
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98      1192
         1.0       0.88      0.93      0.91       307

   micro avg       0.96      0.96      0.96      1499
   macro avg       0.93      0.95      0.94      1499
weighted avg       0.96      0.96      0.96      1499

0.9514871127822836


In [14]:
clf = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1)    #building 100 decision trees
clf.fit(X, y)
scores = cross_val_score(clf, X, y, scoring='accuracy', cv=10)
print (scores)
print (scores.mean())

[0.79533333 0.59133333 0.83       0.84045394 0.89786382 0.61348465
 0.78304406 0.54873164 0.52536716 0.83244326]
0.7258055184690699


In [15]:
lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train, y_train)
print ("Death model")
print ("Logistic Regression")
print (metrics.accuracy_score(y_test, lr.predict(X_test)))
print (metrics.confusion_matrix(y_test, lr.predict(X_test)))
print (metrics.classification_report(y_test, lr.predict(X_test)))
print (metrics.roc_auc_score(y_test, lr.predict(X_test)))

Death model
Logistic Regression
0.6891260840560374
[[780 412]
 [ 54 253]]
              precision    recall  f1-score   support

         0.0       0.94      0.65      0.77      1192
         1.0       0.38      0.82      0.52       307

   micro avg       0.69      0.69      0.69      1499
   macro avg       0.66      0.74      0.65      1499
weighted avg       0.82      0.69      0.72      1499

0.7392333253175349


In [16]:
lr = LogisticRegression(class_weight='balanced')
lr.fit(X, y)
scores = cross_val_score(lr, X, y, scoring='accuracy', cv=5)
print (scores)
print (scores.mean())

[0.66677785 0.81354236 0.56304203 0.39986649 0.79806409]
0.6482585631186256


In [24]:
df_surv['ri'].value_counts()

0.0    14930
1.0       56
Name: ri, dtype: int64

In [28]:
y=df_surv['de']
X=df_surv.drop(['de','lt','ho','ds','ca','ri','ot'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
clf1 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, oob_score=True, random_state=0)    #building 100 decision trees
clf1.fit(X_train, y_train)
print ("Death model")
print ("oob score:", clf1.oob_score_)
print (metrics.accuracy_score(y_test, clf1.predict(X_test)))

print (metrics.confusion_matrix(y_test, clf1.predict(X_test)))
print ("=======================================================")
y=df_surv['lt']
X=df_surv.drop(['de','lt','ho','ds','ca','ri','ot'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
clf2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, oob_score=True, random_state=0)    #building 100 decision trees
clf2.fit(X_train, y_train)
print ("Life threating model")
print ("oob score:", clf2.oob_score_)
print (metrics.accuracy_score(y_test, clf2.predict(X_test)))

print (metrics.confusion_matrix(y_test, clf2.predict(X_test)))
print ("=======================================================")
y=df_surv['ho']
X=df_surv.drop(['de','lt','ho','ds','ca','ri','ot'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
clf3 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, oob_score=True, random_state=0)    #building 100 decision trees
clf3.fit(X_train, y_train)
print ("Hospitalization model")
print ("oob score:", clf3.oob_score_)
print (metrics.accuracy_score(y_test, clf3.predict(X_test)))

print (metrics.confusion_matrix(y_test, clf3.predict(X_test)))
print ("=======================================================")
y=df_surv['ds']
X=df_surv.drop(['de','lt','ho','ds','ca','ri','ot'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
clf4 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, oob_score=True, random_state=0)    #building 100 decision trees
clf4.fit(X_train, y_train)
print ("Disability model")
print ("oob score:", clf4.oob_score_)
print (metrics.accuracy_score(y_test, clf4.predict(X_test)))

print (metrics.confusion_matrix(y_test, clf4.predict(X_test)))
print ("=======================================================")
y=df_surv['ot']
X=df_surv.drop(['de','lt','ho','ds','ca','ri','ot'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0, stratify=y)
clf5 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, oob_score=True, random_state=0)    #building 100 decision trees
clf5.fit(X_train, y_train)
print ("Other Serious issues model")
print ("oob score:", clf5.oob_score_)
print (metrics.accuracy_score(y_test, clf5.predict(X_test)))

print (metrics.confusion_matrix(y_test, clf5.predict(X_test)))
print ("=======================================================")
X_test=X_test.reset_index(drop=True)

Death model
oob score: 0.9673018462222881
0.9613075383589059
[[1154   38]
 [  20  287]]
Life threating model
oob score: 0.9887298880403351
0.9879919946631087
[[1425   13]
 [   5   56]]
Hospitalization model
oob score: 0.9615926447690368
0.9639759839893263
[[1068   38]
 [  16  377]]
Disability model
oob score: 0.9848001779491362
0.981320880587058
[[1420   28]
 [   0   51]]
Other Serious issues model
oob score: 0.9513605694372359
0.9506337558372249
[[961  59]
 [ 15 464]]


In [29]:
X_test.head()

Unnamed: 0,age,gndr_cod,wt,drugname_abiraterone,drugname_abiraterone acetate,drugname_accutane,drugname_actiq,drugname_afinitor,drugname_alendronate sodium,drugname_alimta,...,drugname_xgeva,drugname_xofigo,drugname_xtandi,drugname_zemplar,drugname_zoladex,drugname_zoladex la,drugname_zoledronic acid,drugname_zometa,drugname_zonegran,drugname_zytiga
0,66.0,1.0,74.376,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,62.0,1.0,84.368112,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,64.0,1.0,70.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,68.0,1.0,91.609,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,78.0,1.0,89.357624,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
X_test.shape[1]

135

In [36]:
X_test.loc[0].shape

(135,)

In [38]:
def drug_recommender(age,wt,gndr):
    

    x=np.zeros(X_test.loc[0].shape)
    out=pd.DataFrame(columns={'drug','score'})
    min_prob=10
    min_prob_i=3
    for i in range(3,135):
        x = np.zeros(x.shape)
        x[0]=age
        x[1]=wt
        x[2]=gndr
        x[i]=1
        pred=10*clf1.predict_proba(x.reshape(1, -1))[0][1]+3*clf2.predict_proba(x.reshape(1, -1))[0][1]+2*clf3.predict_proba(x.reshape(1, -1))[0][1]+1.5*clf4.predict_proba(x.reshape(1, -1))[0][1]+clf5.predict_proba(x.reshape(1, -1))[0][1]
        prob=pred
        if prob <= min_prob:
            min_prob=prob
            min_prob_i=i
        out=out.append(pd.DataFrame({'drug':[X_test.columns[i]],'score':[pred]}))
    out=out.sort_values(['score'])
    print ("Recommended Drug:" ,X_test.columns[min_prob_i].replace("drugname_", ""))
    print ("==============================================")
    return (out.head())

In [39]:
drug_recommender(65,90,1)

Recommended Drug: zometa


Unnamed: 0,drug,score
0,drugname_zometa,1.637917
0,drugname_flutamide,2.160597
0,drugname_zoladex la,2.366662
0,drugname_odyne,2.515057
0,drugname_lupron,2.55823


## Reaction Prediction Models

In [54]:
df_reac=df[['indi_pt','age','wt','gndr_cod','pt','target', 'pathway']]
df_reac=df_reac.dropna()
df_reac.head()

Unnamed: 0,indi_pt,age,wt,gndr_cod,pt,target,pathway
28,prostate cancer metastatic,50.58,100.0,1.0,convulsion,Sodium channel protein type 10 subunit alpha,Bupivacaine Action Pathway
29,prostate cancer metastatic,50.58,100.0,1.0,convulsion,Prostaglandin E2 receptor EP1 subtype,Bupivacaine Action Pathway
30,prostate cancer metastatic,50.58,100.0,1.0,hepatic failure,Sodium channel protein type 10 subunit alpha,Bupivacaine Action Pathway
31,prostate cancer metastatic,50.58,100.0,1.0,hepatic failure,Prostaglandin E2 receptor EP1 subtype,Bupivacaine Action Pathway
34,prostate cancer metastatic,60.43,78.0,1.0,no reaction,Sodium channel protein type 10 subunit alpha,Bupivacaine Action Pathway


In [55]:
len(df_reac)

9450

In [56]:
df_reac['pt'].value_counts()

cerebral ischaemia                       694
no reaction                              426
asthenia                                 179
anaemia                                  118
dyspnoea                                 115
oedema peripheral                        107
pain                                     104
general physical health deterioration    101
dehydration                              100
malignant neoplasm progression            95
pyrexia                                   93
dizziness                                 92
febrile neutropenia                       90
decreased appetite                        87
osteonecrosis                             87
osteonecrosis of jaw                      86
fatigue                                   85
confusional state                         79
prostatic specific antigen increased      78
renal failure                             76
sepsis                                    68
arthralgia                                67
blood crea

In [57]:
li=df_reac['pt'].value_counts().head(5).index.tolist()
li.remove('no reaction')
df_reac.loc[~df_reac['pt'].isin(li), 'pt']=0
df_reac['pt']=df_reac['pt'].astype("category").cat.codes
df_reac['pt']=df_reac['pt'].astype(float)
df_reac['pt'].value_counts()

0.0    8344
3.0     694
2.0     179
1.0     118
4.0     115
Name: pt, dtype: int64

In [58]:
dummy_col = pd.get_dummies(df_reac[['target','indi_pt','pathway']])
df_reac=df_reac.drop(['target','indi_pt','pathway'] , axis=1)
df_reac = pd.concat([df_reac, dummy_col], axis=1)
df_reac.head()

Unnamed: 0,age,wt,gndr_cod,pt,target_2-oxoglutarate and iron-dependent oxygenase domain-containing protein 1,target_2-oxoglutarate and iron-dependent oxygenase domain-containing protein 2,target_3 beta-hydroxysteroid dehydrogenase/Delta 5-->4-isomerase type 1,"target_3-beta-hydroxysteroid-Delta(8),Delta(7)-isomerase",target_3-hydroxy-3-methylglutaryl-coenzyme A reductase,target_3-phosphoinositide-dependent protein kinase 1,...,pathway_Tyrosine Hydroxylase Deficiency,pathway_Tyrosine Metabolism,pathway_Tyrosinemia Type I,"pathway_Tyrosinemia, Transient, of the Newborn",pathway_Vinblastine Action Pathway,pathway_Vincristine Action Pathway,pathway_Vinorelbine Action Pathway,pathway_Wolman Disease,pathway_Xanthophyll Cycle,pathway_Zoledronate Action Pathway
28,50.58,100.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,50.58,100.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,50.58,100.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
31,50.58,100.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,60.43,78.0,1.0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
len(df_reac)

9450

In [60]:
X=df_reac.drop(['pt'], axis=1)
y=df_reac['pt']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [61]:
from sklearn.multiclass import OneVsRestClassifier
clf2 = RandomForestClassifier(n_estimators=100, class_weight='balanced', n_jobs=-1, random_state=0)    #building 50 decision trees
clf3=OneVsRestClassifier(clf2)
clf3.fit(X_train, y_train)
print("(pt) model")
#print("oob score:"), clf2.oob_score_
print (metrics.accuracy_score(y_test, clf3.predict(X_test)))

print (metrics.confusion_matrix(y_test, clf3.predict(X_test)))
print ("=======================================================")
print (metrics. f1_score(y_test, clf3.predict(X_test), average='weighted'))

(pt) model
0.8703703703703703
[[1507   50   69    1   42]
 [  13    0   10    0    0]
 [  29    4    0    0    3]
 [   3    0    0  136    0]
 [  16    0    5    0    2]]
0.8954078307227549
