# Exploring models on the new conclusions feature

In [65]:
import pandas as pd
import numpy as np
from tempfile import mkdtemp
from shutil import rmtree
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [4]:
df = pd.read_csv("../raw_data/data_CT_PM_conclusions.csv", index_col=0)

In [5]:
df.head()

Unnamed: 0,Medicine name,Therapeutic area,INN,Authorisation status,Generic,Biosimilar,Orphan medicine,First published,n_trials,status_not_yet_recruiting,...,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,conclusions
0,DuoTrav,"Open-Angle Glaucoma, Ocular Hypertension",travoprost / timolol,0,0,0,0,2018-02-15 01:00:00,54,0,...,12,2,0,2,1,2,17,30,44,Travoprost/timolol group resulted in an IOP r...
1,Palynziq,Phenylketonurias,pegvaliase,0,0,0,1,2019-05-29 13:43:00,12,0,...,2,0,0,0,1,4,3,0,11,A total of 34 guidance statements were includ...
2,Ifirmacombi,Hypertension,irbesartan / hydrochlorothiazide,0,1,0,0,2017-12-20 12:01:00,20,0,...,3,0,0,2,0,0,5,12,36,The prevalence of microalbuminuria and left v...
3,Topotecan Hospira,"Uterine Cervical Neoplasms, Small Cell Lung Ca...",topotecan,0,0,0,0,2018-04-13 20:29:00,111,3,...,53,4,1,0,34,65,21,0,523,We conducted laboratory studies which found s...
4,CoAprovel,Hypertension,irbesartan / hydrochlorothiazide,0,0,0,0,2017-08-22 00:09:00,20,0,...,3,0,0,2,0,0,5,12,36,The prevalence of microalbuminuria and left v...


In [9]:
df[df.conclusions.isna()]

Unnamed: 0,Medicine name,Therapeutic area,INN,Authorisation status,Generic,Biosimilar,Orphan medicine,First published,n_trials,status_not_yet_recruiting,...,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,conclusions
44,Adrovance,Postmenopausal Osteoporosis,"alendronic acid, colecalciferol",0,0,0,0,2018-07-06 11:38:00,14,0,...,4,0,0,1,1,3,6,3,0,
66,Actrapid,Diabetes Mellitus,human insulinÂ (rDNA),0,0,0,0,2018-03-14 01:00:00,0,0,...,0,0,0,0,0,0,0,0,0,
80,Jivi,Hemophilia A,damoctocog alfa pegol,0,0,0,0,2019-01-28 13:01:00,5,2,...,0,0,0,0,1,0,0,1,0,
81,VeraSeal,Surgical Hemostasis,human fibrinogen / human thrombin,0,0,0,0,2017-11-10 17:47:00,29,0,...,9,0,0,2,0,4,11,8,0,
107,Ondexxya,Drug-Related Side Effects and Adverse Reactions,andexanet alfa,0,0,0,0,2019-06-27 16:05:00,0,0,...,0,0,0,0,0,0,0,0,0,
111,Infanrix Hexa,"Hepatitis B, Tetanus, Immunization, Meningitis...","diphtheria (D), tetanus (T), pertussis (acellu...",0,0,0,0,2018-04-12 00:00:00,0,0,...,0,0,0,0,0,0,0,0,0,
112,Onivyde pegylated liposomal (previously known ...,Pancreatic Neoplasms,irinotecan hydrochloride trihydrate,0,0,0,1,2017-05-22 12:10:00,5,1,...,1,0,0,0,1,3,1,0,0,
132,Fosavance,Postmenopausal Osteoporosis,"alendronic acid, colecalciferol",0,0,0,0,2018-07-09 23:09:00,14,0,...,4,0,0,1,1,3,6,3,0,
153,Vaxelis,"Meningitis, Haemophilus, Poliomyelitis, Tetanu...","diphtheria, tetanus, pertussis (acellular, com...",0,0,0,0,2018-04-06 17:21:00,0,0,...,0,0,0,0,0,0,0,0,0,
169,Vitrakvi,Abdominal Neoplasms,larotrectinib,0,0,0,0,2019-10-24 14:30:00,0,0,...,0,0,0,0,0,0,0,0,0,


In [11]:
df.conclusions.fillna("", inplace=True)

In [22]:
X.head()

Unnamed: 0,conclusions
0,Travoprost/timolol group resulted in an IOP r...
1,A total of 34 guidance statements were includ...
2,The prevalence of microalbuminuria and left v...
3,We conducted laboratory studies which found s...
4,The prevalence of microalbuminuria and left v...


In [58]:
vectorizer = TfidfVectorizer(max_features=50, max_df=0.9, ngram_range=(1,3), stop_words=['clinicaltrials', 'gov'])
vectorizer.fit(X['conclusions'])
#X_trans = vectorizer.transform(X)

In [59]:
vectorizer.get_feature_names()

['adverse',
 'after',
 'an',
 'are',
 'associated',
 'associated with',
 'at',
 'between',
 'both',
 'but',
 'by',
 'clinical',
 'combination',
 'compared',
 'data',
 'disease',
 'dose',
 'drug',
 'effective',
 'effects',
 'efficacy',
 'events',
 'for the',
 'from',
 'group',
 'have',
 'in patients',
 'in patients with',
 'in the',
 'may',
 'mg',
 'more',
 'no',
 'not',
 'of the',
 'or',
 'patient',
 'patients with',
 'results',
 'risk',
 'significant',
 'significantly',
 'study',
 'than',
 'therapy',
 'these',
 'treatment of',
 'use',
 'we',
 'well']

In [60]:
pd.DataFrame(X_trans.toarray(), columns=vectorizer.get_feature_names())

Unnamed: 0,adverse,after,an,are,associated,associated with,at,between,both,but,...,significant,significantly,study,than,therapy,these,treatment of,use,we,well
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
column_total= 'n_trials'
percentage_list= ['status_not_yet_recruiting', 'status_recruiting',\
       'status_enrolling_by_invitation', 'status_active_not_recruiting',\
       'status_suspended', 'status_terminated', 'status_completed',\
       'status_withdrawn', 'status_unknown', 'org_fed', 'org_indiv',\
       'org_industry', 'org_network', 'org_nih', 'org_other', 'org_other_gov',\
       'phase_early_1', 'phase_not_applicable', 'phase_1', 'phase_2',\
       'phase_3', 'phase_4']

def percentage_columns(df,column_list,column_total):
    for column in column_list:
        df[column]=((df[column]*100)/df[column_total]).replace([np.inf, -np.inf, np.nan], 0)
    return df

df = percentage_columns(df, percentage_list, column_total)

In [97]:
df = pd.read_csv("../raw_data/data_CT_PM_conclusions.csv", index_col=0)
df.conclusions.fillna("", inplace=True)

In [104]:
#X = df[['conclusions']]
X = df[['conclusions', 'phase_4', 'Orphan medicine', 'org_indiv', 'n_trials',
       'status_terminated', 'phase_2', 'phase_3', 'status_not_yet_recruiting',
       'org_fed', 'org_nih', 'Generic']]
y = df['Authorisation status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## SVC

In [93]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

vectorizer = TfidfVectorizer(max_features=40, max_df=0.9, ngram_range=(3,3), stop_words=['clinicaltrials', 'gov'])

model = SVC(class_weight='balanced')

numericals = ['phase_4', 'Orphan medicine', 'org_indiv', 'n_trials',\
       'status_terminated', 'phase_2', 'phase_3', 'status_not_yet_recruiting',\
       'org_fed', 'org_nih']

column_trans = ColumnTransformer([('tfidf', vectorizer, 'conclusions'),
                                   ('scale', scaler, numericals)])
                                 
pipe = Pipeline([('preprocessing', column_trans),
                 ('model', model)])

In [94]:
y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.74      0.84       939
           1       0.09      0.66      0.16        38

   micro avg       0.73      0.73      0.73       977
   macro avg       0.54      0.70      0.50       977
weighted avg       0.95      0.73      0.82       977



## Logistic Regression

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1222 entries, 0 to 1221
Data columns (total 33 columns):
Medicine name                     1222 non-null object
Therapeutic area                  1222 non-null object
INN                               1222 non-null object
Authorisation status              1222 non-null int64
Generic                           1222 non-null int64
Biosimilar                        1222 non-null int64
Orphan medicine                   1222 non-null int64
First published                   1222 non-null object
n_trials                          1222 non-null int64
status_not_yet_recruiting         1222 non-null int64
status_recruiting                 1222 non-null int64
status_enrolling_by_invitation    1222 non-null int64
status_active_not_recruiting      1222 non-null int64
status_suspended                  1222 non-null int64
status_terminated                 1222 non-null int64
status_completed                  1222 non-null int64
status_withdrawn         

In [109]:
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=20, max_df=0.9, ngram_range=(3,3), stop_words=['clinicaltrials', 'gov'])

model = LogisticRegression(class_weight='balanced')

numericals = ['phase_4', 'Orphan medicine', 'org_indiv', 'n_trials',\
       'status_terminated', 'phase_2', 'phase_3', 'status_not_yet_recruiting',\
       'org_fed', 'org_nih']

column_trans = ColumnTransformer([('tfidf', vectorizer, 'conclusions'),
                                   ('scale', scaler, numericals)])
                                 

pipe = Pipeline([('preprocessing', column_trans),
                 ('model', model)])

In [110]:
y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.75      0.85       939
           1       0.10      0.71      0.18        38

   micro avg       0.75      0.75      0.75       977
   macro avg       0.54      0.73      0.51       977
weighted avg       0.95      0.75      0.82       977



## KN Classifier

In [113]:
from sklearn.neighbors import KNeighborsClassifier

vectorizer = TfidfVectorizer(max_features=20, max_df=0.9, ngram_range=(3,3), stop_words=['clinicaltrials', 'gov'])

model = KNeighborsClassifier()

numericals = ['phase_4', 'Orphan medicine', 'org_indiv', 'n_trials',\
       'status_terminated', 'phase_2', 'phase_3', 'status_not_yet_recruiting',\
       'org_fed', 'org_nih']

column_trans = ColumnTransformer([('tfidf', vectorizer, 'conclusions'),
                                   ('scale', scaler, numericals)])
                                 

pipe = Pipeline([('preprocessing', column_trans),
                 ('model', model)])

In [114]:
y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98       939
           1       0.00      0.00      0.00        38

   micro avg       0.96      0.96      0.96       977
   macro avg       0.48      0.50      0.49       977
weighted avg       0.92      0.96      0.94       977



## Random forrest

In [116]:
from sklearn.ensemble import RandomForestClassifier

vectorizer = TfidfVectorizer(max_features=20, max_df=0.9, ngram_range=(3,3), stop_words=['clinicaltrials', 'gov'])

model = RandomForestClassifier(class_weight='balanced')

numericals = ['phase_4', 'Orphan medicine', 'org_indiv', 'n_trials',\
       'status_terminated', 'phase_2', 'phase_3', 'status_not_yet_recruiting',\
       'org_fed', 'org_nih']

column_trans = ColumnTransformer([('tfidf', vectorizer, 'conclusions'),
                                   ('scale', scaler, numericals)])
                                 

pipe = Pipeline([('preprocessing', column_trans),
                 ('model', model)])

In [117]:
y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97       939
           1       0.19      0.08      0.11        38

   micro avg       0.95      0.95      0.95       977
   macro avg       0.58      0.53      0.54       977
weighted avg       0.93      0.95      0.94       977



## AdaBoost

In [122]:
from sklearn.ensemble import AdaBoostClassifier

vectorizer = TfidfVectorizer(max_features=20, max_df=0.9, ngram_range=(3,3), stop_words=['clinicaltrials', 'gov'])

base_model = LogisticRegression(class_weight='balanced')

model = AdaBoostClassifier(base_estimator=base_model)

numericals = ['phase_4', 'Orphan medicine', 'org_indiv', 'n_trials',\
       'status_terminated', 'phase_2', 'phase_3', 'status_not_yet_recruiting',\
       'org_fed', 'org_nih']

column_trans = ColumnTransformer([('tfidf', vectorizer, 'conclusions'),
                                   ('scale', scaler, numericals)])
                                 

pipe = Pipeline([('preprocessing', column_trans),
                 ('model', model)])

In [123]:
y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.70      0.82       939
           1       0.10      0.82      0.18        38

   micro avg       0.71      0.71      0.71       977
   macro avg       0.54      0.76      0.50       977
weighted avg       0.95      0.71      0.80       977

