# Model Stacking

## Imports

In [1]:
%%time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Wall time: 6.63 s


In [3]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score,roc_curve, auc

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer


## Models

In [13]:
df= pd.read_csv('../raw_data/wra_CT_PM_conclusions.csv')

In [14]:
df['thera_count'] = df['Therapeutic area'].apply(lambda x: x.count(',') + 1)

In [16]:
df_numerical= df.select_dtypes(exclude='object')

In [17]:
df_numerical.drop(labels= 'Unnamed: 0', inplace=True, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [19]:
df_numerical.columns

Index(['Authorisation status', 'Orphan medicine', 'n_trials',
       'status_not_yet_recruiting', 'status_recruiting',
       'status_enrolling_by_invitation', 'status_active_not_recruiting',
       'status_suspended', 'status_terminated', 'status_completed',
       'status_withdrawn', 'status_unknown', 'org_fed', 'org_indiv',
       'org_industry', 'org_network', 'org_nih', 'org_other', 'org_other_gov',
       'phase_early_1', 'phase_not_applicable', 'phase_1', 'phase_2',
       'phase_3', 'phase_4', 'pm_results', 'thera_count'],
      dtype='object')

In [20]:
percent_list= df_numerical.columns[3:-1]

In [21]:
for column in percent_list:
    df_numerical[column]=((df_numerical[column]*100)/df_numerical['n_trials']).replace([np.inf, -np.inf, np.nan], 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
df_cleaned=df_numerical[df_numerical['n_trials']< 3000]


In [25]:
top_list= ['org_industry',  'n_trials',  'phase_4',
             'org_other', 'status_completed', 'status_recruiting',
             'phase_3',  'pm_results', 'status_not_yet_recruiting',
             'phase_2','Authorisation status', 'thera_count']

In [27]:
df_model=df_cleaned[top_list]

In [28]:
X= df_model.drop(labels='Authorisation status', axis=1)
y=df_model['Authorisation status']

In [29]:
optimized_numericals= {'RandomForestClassifier()': RandomForestClassifier(bootstrap= False, max_depth= 12, max_features= 'auto', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 100),
 'KNeighborsClassifier()':KNeighborsClassifier(algorithm= 'auto', leaf_size=10, n_jobs= -1, n_neighbors= 5, p= 1, weights= 'distance'),
 'MLPClassifier()': MLPClassifier(activation= 'tanh',alpha= 1e-05, early_stopping= True, hidden_layer_sizes= 250, learning_rate= 'invscaling', learning_rate_init= 0.01, max_iter= 3000, n_iter_no_change= 10, solver= 'lbfgs'),
 'BernoulliNB()': BernoulliNB(alpha=0.1),
 'ComplementNB()':ComplementNB(alpha=1.2) }

In [104]:
model_list= list(optimized_numericals.keys())
model_list.append('TfIdf()')
model_list

['RandomForestClassifier()',
 'KNeighborsClassifier()',
 'MLPClassifier()',
 'BernoulliNB()',
 'ComplementNB()',
 'TfIdf()']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state =1,test_size = 0.2 )


In [31]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
import numpy as np
import warnings

In [37]:
##RANDOM FOREST

In [36]:
model_rf= optimized_numericals['RandomForestClassifier()']

clf1= model_rf.fit(X_train, y_train)

y1_pred=clf1.predict(X_test)

f1_score(y_test, y1_pred)

0.2692307692307692

In [38]:
## KNeighbors

In [42]:
model_kn= optimized_numericals['KNeighborsClassifier()']

clf2= model_kn.fit(X_train, y_train)

y2_pred=clf2.predict(X_test)

f1_score(y_test, y2_pred)

0.4109589041095891

In [43]:
## MLP

In [44]:
model_mlp= optimized_numericals['MLPClassifier()']
clf3= model_mlp.fit(X_train, y_train)
y3_pred=clf3.predict(X_test)
f1_score(y_test, y3_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


0.41025641025641024

In [45]:
## Bernoulli

In [46]:
model_bern= optimized_numericals['BernoulliNB()']
clf4= model_bern.fit(X_train, y_train)
y4_pred=clf4.predict(X_test)
f1_score(y_test, y4_pred)

0.3658536585365854

In [47]:
## ComplementNB

In [48]:
model_comp= optimized_numericals['ComplementNB()']
clf5= model_comp.fit(X_train, y_train)
y5_pred=clf5.predict(X_test)
f1_score(y_test, y5_pred)

0.25000000000000006

In [51]:
## Random Forest for text

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [86]:
X_text=df['conclusions']
X_text=X_text.replace([np.nan], " ")
y_text=df['Authorisation status']
y_text.shape

(1363,)

In [92]:
vectorizer = TfidfVectorizer(max_df = 0.8, min_df=0.5, max_features = 50, ngram_range=(1, 1))
X_text_vect = vectorizer.fit_transform(X_text)

In [93]:
X_vectors = X_text_vect.toarray()

model_text= RandomForestClassifier(max_depth=2, random_state=0, class_weight='balanced')

In [95]:
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_vectors,y_text, random_state =1,test_size = 0.2 )

In [97]:
clf6= model_text.fit(X_text_train, y_text_train)
y6_pred=clf6.predict(X_text_test)
f1_score(y_text_test, y6_pred)
y6_tpred=clf6.predict(X_text_train)
f1_score(y_text_train, y6_tpred)

0.471311475409836

In [49]:
## STACKING


In [112]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state =1,test_size = 0.2 )

In [113]:
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(X_vectors,y_text, random_state =1,test_size = 0.2 )

In [114]:
model_list

['RandomForestClassifier()',
 'KNeighborsClassifier()',
 'MLPClassifier()',
 'BernoulliNB()',
 'ComplementNB()',
 'TfIdf()']

In [119]:
optimized_numericals= {'RandomForestClassifier()': RandomForestClassifier(bootstrap= False, max_depth= 12, max_features= 'auto', min_samples_leaf= 1, min_samples_split= 2, n_estimators= 100),
 'KNeighborsClassifier()':KNeighborsClassifier(algorithm= 'auto', leaf_size=10, n_jobs= -1, n_neighbors= 5, p= 1, weights= 'distance'),
 'MLPClassifier()': MLPClassifier(activation= 'tanh',alpha= 1e-05, early_stopping= True, hidden_layer_sizes= 250, learning_rate= 'invscaling', learning_rate_init= 0.01, max_iter= 3000, n_iter_no_change= 10, solver= 'lbfgs'),
 'BernoulliNB()': BernoulliNB(alpha=0.1),
 'ComplementNB()':ComplementNB(alpha=1.2) }

In [115]:
classifiers=[optimized_numericals['RandomForestClassifier()'].fit(X_train, y_train),
            optimized_numericals['KNeighborsClassifier()'].fit(X_train, y_train),
             optimized_numericals['MLPClassifier()'].fit(X_train, y_train),
             optimized_numericals['BernoulliNB()'].fit(X_train, y_train),
             optimized_numericals['ComplementNB()'].fit(X_train, y_train)]

In [117]:
clf6= model_text.fit(X_text_train, y_text_train)

In [118]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingCVClassifier
import numpy as np
import warnings

warnings.simplefilter('ignore')

RANDOM_SEED = 1


# Starting from v0.16.0, StackingCVRegressor supports
# `random_state` to get deterministic result.
sclf = StackingCVClassifier(classifiers=classifiers,
                            meta_classifier=clf6,
                            random_state=RANDOM_SEED)

print('5-fold cross validation:\n')

for clf, label in zip([clf1, clf2, clf3,clf4, clf5, sclf], 
                      ['RandomForestClassifier()',
                         'KNeighborsClassifier()',
                         'MLPClassifier()',
                         'BernoulliNB()',
                         'ComplementNB()',
                         'StackingClassifier()']):

    scores = model_selection.cross_val_score(clf, X, y, 
                                              cv=5, scoring='f1',
                                             n_jobs=-1,verbose=1)
    print("F1: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

5-fold cross validation:



[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.9s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.1s


F1: 0.31 (+/- 0.07) [RandomForestClassifier()]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


F1: 0.25 (+/- 0.03) [KNeighborsClassifier()]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   58.6s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   59.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


F1: 0.27 (+/- 0.13) [MLPClassifier()]
F1: 0.44 (+/- 0.09) [BernoulliNB()]
F1: 0.27 (+/- 0.06) [ComplementNB()]


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.9min remaining:  2.8min


F1: 0.45 (+/- 0.06) [StackingClassifier()]


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.1min finished
