# Cross-validate pipelines

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tempfile import mkdtemp
from shutil import rmtree
import warnings
# Silence every FutureWarning from here on so cell outputs stay readable.
# NOTE(review): this also hides deprecation notices raised by the libraries
# used below — confirm that is intended before upgrading dependencies.
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load data

In [2]:
# Read the enriched CSV (path is relative to the notebook's folder).
enriched_csv_path = '../raw_data/enriched_CT_PM.csv'
df = pd.read_csv(enriched_csv_path)

In [3]:
# Drop the leftover index column written by a previous `to_csv`.
# `errors='ignore'` keeps this cell re-runnable: once the column is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Medicine name,Therapeutic area,INN,Authorisation status,Generic,Biosimilar,Orphan medicine,First published,n_trials,status_not_yet_recruiting,...,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,pm_titles,pm_abstracts
0,DuoTrav,"Open-Angle Glaucoma, Ocular Hypertension",travoprost / timolol,0,0,0,0,2018-02-15 01:00:00,54,0,...,2,0,2,1,2,17,30,44,Comparison of the efficacy and safety of fixed...,Combining two medications in one bottle may im...
1,Palynziq,Phenylketonurias,pegvaliase,0,0,0,1,2019-05-29 13:43:00,12,0,...,0,0,0,1,4,3,0,11,Evidence- and consensus-based recommendations ...,Phenylketonuria (PKU) is a rare metabolic diso...
2,Ifirmacombi,Hypertension,irbesartan / hydrochlorothiazide,0,1,0,0,2017-12-20 12:01:00,20,0,...,0,0,2,0,0,5,12,36,Efficacy and safety of a fixed combination of ...,"In a multi-center, single-arm, prospective stu..."
3,Topotecan Hospira,"Uterine Cervical Neoplasms, Small Cell Lung Ca...",topotecan,0,0,0,0,2018-04-13 20:29:00,111,3,...,4,1,0,34,65,21,0,523,A Phase II Clinical Trial of CPI-613 in Patien...,Small cell lung cancer (SCLC) is a common lung...
4,CoAprovel,Hypertension,irbesartan / hydrochlorothiazide,0,0,0,0,2017-08-22 00:09:00,20,0,...,0,0,2,0,0,5,12,36,Efficacy and safety of a fixed combination of ...,"In a multi-center, single-arm, prospective stu..."


## Fill NA

In [5]:
# Replace missing text with empty strings so the vectorizer downstream
# never receives NaN. Assign the result back to the column rather than
# calling `fillna(..., inplace=True)` on `df[col]`: that chained in-place
# form operates on a temporary Series and does not update `df` when pandas
# Copy-on-Write is active.
for text_column in ('pm_titles', 'pm_abstracts'):
    df[text_column] = df[text_column].fillna('')

## Train test split

In [8]:
# Target: the authorisation-status column.
y = df['Authorisation status']

# Features: only the abstract text, kept as a one-column DataFrame.
# Alternative feature set (all remaining columns), kept for reference:
#X = df.drop(columns=['Medicine name', 'Therapeutic area', 'INN', 'First published'])
X = df[['pm_abstracts']]

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
# NOTE(review): the classes look heavily imbalanced in the report further
# down — consider whether a stratified split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [9]:
# Display the training features to confirm the split kept only the abstracts column.
X_train

Unnamed: 0,pm_abstracts
99,Cystinosis is a rare autosomal-recessive lysos...
971,The EXCITE (clinical EXperienCe of amlodIpine ...
430,
336,Ziprasidone is increasingly used for the treat...
651,Primary biliary cirrhosis (PBC) is characteriz...
76,After a hospital-wide formulary change resulte...
211,"Sildenafil citrate, a drug used to treat erect..."
983,Distributive shock is a subset of shock marked...
534,A fixed-dose combination of a stain and an ant...
309,To characterize the effect of concurrent stere...


In [None]:
def percentage_columns(df, column_list, column_total):
    """Rescale count columns to percentages of a total column, in place.

    Each column named in ``column_list`` is overwritten with
    ``column * 100 / df[column_total]``. Non-finite results (``inf``,
    ``-inf`` and ``NaN``, e.g. when the total is zero) are replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the count columns and the total column.
        It is modified in place.
    column_list : iterable of str
        Names of the columns to convert.
    column_total : str
        Name of the column used as the denominator.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    non_finite = [np.inf, -np.inf, np.nan]
    for name in column_list:
        # The denominator is re-read on every pass, exactly like the numerator.
        share = (df[name] * 100) / df[column_total]
        df[name] = share.replace(non_finite, 0)
    return df

# Count columns that are to be expressed as a share of the trial total.
percentage_list = [
    # trial status
    'status_not_yet_recruiting',
    'status_recruiting',
    'status_enrolling_by_invitation',
    'status_active_not_recruiting',
    'status_suspended',
    'status_terminated',
    'status_completed',
    'status_withdrawn',
    'status_unknown',
    # sponsoring organisation
    'org_fed',
    'org_indiv',
    'org_industry',
    'org_network',
    'org_nih',
    'org_other',
    'org_other_gov',
    # trial phase
    'phase_early_1',
    'phase_not_applicable',
    'phase_1',
    'phase_2',
    'phase_3',
    'phase_4',
]

# Column holding the denominator for the percentages.
column_total = 'n_trials'

## Cross-validating pipelines with bag of words

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes classifier fed with raw token counts.
mnb = MultinomialNB()

# The column is selected with a plain string (not a list) so that
# CountVectorizer receives a 1-D sequence of documents.
column_trans = ColumnTransformer([('bag_of_words', CountVectorizer(), 'pm_abstracts')])


# Create a temp folder for the pipeline's transformer cache
cachedir = mkdtemp()

try:
    pipe = Pipeline([('preprocessing', column_trans),
                     ('model', mnb)],
                    memory=cachedir)

    # Out-of-fold predictions for every training row (5-fold CV), scored
    # against the true training labels.
    y_pred = cross_val_predict(pipe, X_train, y_train, cv=5)
    print(classification_report(y_train, y_pred))
finally:
    # Clear the cache directory even when cross-validation raises, so a
    # failed run does not leave the temp folder behind.
    rmtree(cachedir)

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  **fit_params_steps[name])
If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide th

              precision    recall  f1-score   support

           0       0.98      0.81      0.89       939
           1       0.11      0.55      0.18        38

   micro avg       0.80      0.80      0.80       977
   macro avg       0.54      0.68      0.53       977
weighted avg       0.94      0.80      0.86       977

