In [1]:
import pandas as pd
import numpy as np

def get_data(drop_8000=True, path='wra_CT_PM_conclusions.csv', max_trials=8000):
    """Load the medicines / clinical-trials table from CSV.

    Parameters
    ----------
    drop_8000 : bool, default True
        When true, keep only the rows whose ``n_trials`` is strictly below
        ``max_trials``.
    path : str or path-like, default 'wra_CT_PM_conclusions.csv'
        Location of the CSV file to read.
    max_trials : int, default 8000
        Exclusive upper bound on ``n_trials`` applied when ``drop_8000`` is true.

    Returns
    -------
    pandas.DataFrame
        An independent frame that callers can safely add columns to.
    """
    # 'Unnamed: 0' is the name read_csv gives to a header-less first column
    # (typically an index written out by to_csv). Drop it when present, but do
    # not fail on a file that was saved without it.
    df = pd.read_csv(path).drop(columns='Unnamed: 0', errors='ignore')
    if drop_8000:
        # .copy() detaches the filtered rows from the frame that was read, so a
        # later `df[col] = ...` does not raise SettingWithCopyWarning.
        df = df[df['n_trials'] < max_trials].copy()
    return df

In [2]:
# Load the table with the default n_trials filter switched on.
df = get_data(drop_8000=True)

In [3]:
# Peek at the first two rows to check the columns that were loaded.
df.head(n=2)

Unnamed: 0,Medicine name,Therapeutic area,INN,Authorisation status,Orphan medicine,First published,n_trials,status_not_yet_recruiting,status_recruiting,status_enrolling_by_invitation,...,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,conclusions
0,DuoTrav,"Open-Angle Glaucoma, Ocular Hypertension",travoprost / timolol,0,0,2018-02-15 01:00:00,54,0,2,0,...,12,2,0,2,1,2,17,30,44,Travoprost/timolol group resulted in an IOP r...
1,Palynziq,Phenylketonurias,pegvaliase,0,1,2019-05-29 13:43:00,12,0,2,0,...,2,0,0,0,1,4,3,0,11,A total of 34 guidance statements were includ...


In [4]:
# Number of therapeutic areas per medicine. The areas are stored as a single
# comma-separated string, so the count is (number of commas) + 1.
# - `.str.count` is vectorised and yields NaN for a missing value, where the
#   row-wise lambda would crash calling `.count` on a float; those rows get 0.
# - `assign` returns a new frame instead of writing a column into `df`, which is
#   a filtered slice and would raise SettingWithCopyWarning on assignment.
df = df.assign(
    thera_count=(
        df['Therapeutic area']
        .str.count(',')
        .add(1)
        .fillna(0)
        .astype('int64')
    )
)

In [6]:
# Keep only the columns a model can consume directly. Selecting numeric/bool
# dtypes explicitly (an allowlist) states the intent and does not depend on how
# text columns happen to be typed: excluding 'object' would let any other
# non-numeric dtype (e.g. a pandas string or datetime column) slip through.
df_numerical = df.select_dtypes(include=['number', 'bool'])

In [70]:
# Confirm that only numeric columns remain.
df_numerical.head(n=2)

Unnamed: 0,Authorisation status,Orphan medicine,n_trials,status_not_yet_recruiting,status_recruiting,status_enrolling_by_invitation,status_active_not_recruiting,status_suspended,status_terminated,status_completed,...,org_other,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,thera_count
0,0,0,54,0,2,0,1,0,3,43,...,12,2,0,2,1,2,17,30,44,2
1,0,1,12,0,2,0,1,0,0,9,...,2,0,0,0,1,4,3,0,11,1


In [8]:
# Split the numeric table into the label and the feature matrix.
target_col = 'Authorisation status'
y = df_numerical[target_col]
X = df_numerical.drop(columns=[target_col])

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a final test set; the seed makes the split repeatable.
# NOTE(review): the positive class is only ~14% of rows; consider `stratify=y`
# so both partitions keep the same class ratio (this would change the split and
# every number reported below).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [10]:
from sklearn.preprocessing import RobustScaler
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to the held-out rows.
scaler = RobustScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest with class weights balanced against the label
# frequencies (the name `tree` is kept because later cells refer to it).
tree = RandomForestClassifier(
    class_weight='balanced',
    max_depth=2,
    random_state=0,
)

# 10-fold cross-validation on the training partition, scored on several metrics.
scoring_metrics = ['f1_macro', 'accuracy', 'f1', 'precision']
cv_results = cross_validate(
    estimator=tree,
    X=X_train_sc,
    y=y_train,
    scoring=scoring_metrics,
    cv=10,
)

# Mean macro-F1 across the folds.
cv_results['test_f1_macro'].mean()

0.6421039308866584

In [13]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

# Out-of-fold predictions for every training row, summarised per class.
# NOTE(review): this uses cv=5 while the cross_validate call above uses cv=10;
# confirm the difference is intended.
y_pred = cross_val_predict(estimator=tree, X=X_train_sc, y=y_train, cv=5)

report = classification_report(y_train, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.74      0.83       939
           1       0.31      0.76      0.44       148

    accuracy                           0.74      1087
   macro avg       0.63      0.75      0.64      1087
weighted avg       0.86      0.74      0.78      1087



In [14]:
from sklearn.metrics import confusion_matrix
# Fit on the full training partition, then evaluate once on the held-out rows.
# In the matrix, rows are the true class and columns the predicted class.
tree.fit(X_train_sc, y_train)
y_test_pred = tree.predict(X_test_sc)
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

array([[170,  63],
       [ 13,  26]], dtype=int64)

In [15]:
# Class balance of the held-out labels, most frequent class first.
y_test.value_counts(sort=True)

0    233
1     39
Name: Authorisation status, dtype: int64

In [17]:
# Recall on the held-out test set, as a percentage: the share of actual
# positives (class 1) that the fitted forest predicts as positive. Computed
# from the predictions rather than typed in from the confusion-matrix printout,
# so it stays correct if the data, the split or the model changes.
from sklearn.metrics import recall_score

recall_score(y_test, tree.predict(X_test_sc)) * 100

66.66666666666666

In [62]:
# Line up the true training labels with their out-of-fold predictions,
# row by row, on a fresh 0..n-1 index.
pred_df = pd.DataFrame({
    'true': y_train.to_numpy(),
    'pred': y_pred,
})

In [63]:
# Show the first five true/predicted pairs.
pred_df.head(n=5)

Unnamed: 0,true,pred
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1


In [65]:
# Rows where the prediction disagrees with the true label.
is_wrong = pred_df['true'].ne(pred_df['pred'])
diff = pred_df.loc[is_wrong]

In [68]:
# Number of misclassified training rows.
diff.shape[0]

283

In [69]:
# Accuracy of the out-of-fold predictions: 1 minus the misclassification rate.
n_wrong, n_total = len(diff), len(pred_df)
1 - n_wrong / n_total

0.7396504139834407