In [30]:
import pandas as pd
from collections import Counter

import matplotlib.pyplot as plt
import matplotlib
import numpy as np

from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold

from pprint import pprint
from time import time

from sklearn.metrics import f1_score

In [2]:
dataset = pd.read_csv('../raw_data/enriched_CT_PM.csv')

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Medicine name,Therapeutic area,INN,Authorisation status,Generic,Biosimilar,Orphan medicine,First published,n_trials,...,org_other_gov,phase_early_1,phase_not_applicable,phase_1,phase_2,phase_3,phase_4,pm_results,pm_titles,pm_abstracts
0,0,DuoTrav,"Open-Angle Glaucoma, Ocular Hypertension",travoprost / timolol,0,0,0,0,2018-02-15 01:00:00,54,...,2,0,2,1,2,17,30,44,Comparison of the efficacy and safety of fixed...,Combining two medications in one bottle may im...
1,1,Palynziq,Phenylketonurias,pegvaliase,0,0,0,1,2019-05-29 13:43:00,12,...,0,0,0,1,4,3,0,11,Evidence- and consensus-based recommendations ...,Phenylketonuria (PKU) is a rare metabolic diso...
2,2,Ifirmacombi,Hypertension,irbesartan / hydrochlorothiazide,0,1,0,0,2017-12-20 12:01:00,20,...,0,0,2,0,0,5,12,36,Efficacy and safety of a fixed combination of ...,"In a multi-center, single-arm, prospective stu..."
3,3,Topotecan Hospira,"Uterine Cervical Neoplasms, Small Cell Lung Ca...",topotecan,0,0,0,0,2018-04-13 20:29:00,111,...,4,1,0,34,65,21,0,523,A Phase II Clinical Trial of CPI-613 in Patien...,Small cell lung cancer (SCLC) is a common lung...
4,4,CoAprovel,Hypertension,irbesartan / hydrochlorothiazide,0,0,0,0,2017-08-22 00:09:00,20,...,0,0,2,0,0,5,12,36,Efficacy and safety of a fixed combination of ...,"In a multi-center, single-arm, prospective stu..."


In [7]:
pm_abstracts = dataset['pm_abstracts'].dropna()

In [14]:
tf_idf_vectorizer = TfidfVectorizer(max_df = 0.8, min_df=0.5, max_features = 5)

X = tf_idf_vectorizer.fit_transform(pm_abstracts)

X.toarray()

vectorized = pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

In [12]:
len(tf_idf_vectorizer.get_feature_names())

2

In [15]:
tf_idf_vectorizer.get_feature_names()

['cancer', 'inhibitor', 'inhibitors', 'kg', 'survival']

In [19]:
def lemmatize(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    text = " ".join([wordnet_lemmatizer.lemmatize(w, pos='v') for w in text.split()])
    return text

In [20]:
pm_abstracts_lemmatized = pm_abstracts.map(lambda x: lemmatize(x))

In [25]:
tf_idf_vectorizer = TfidfVectorizer(max_df = 0.8, min_df=0.5, max_features = 50)

X = tf_idf_vectorizer.fit_transform(pm_abstracts_lemmatized)

X.toarray()

vectorized = pd.DataFrame(X.toarray(),columns = tf_idf_vectorizer.get_feature_names())

print(tf_idf_vectorizer.get_feature_names())

['acid', 'acute', 'adults', 'arm', 'bone', 'cancer', 'cardiovascular', 'children', 'cycle', 'death', 'disorder', 'expression', 'failure', 'grade', 'growth', 'heart', 'hr', 'hypertension', 'index', 'induced', 'infection', 'inhibitor', 'inhibitors', 'kg', 'liver', 'maintenance', 'medication', 'ml', 'mm', 'monotherapy', 'mortality', 'negative', 'pain', 'participants', 'pressure', 'progression', 'renal', 'resistance', 'resistant', 'scale', 'serum', 'stage', 'survival', 'switch', 'syndrome', 'toxicity', 'tumor', 'twice', 'weight', 'women']


In [3]:
data = dataset[['pm_abstracts', 'Authorisation status']].dropna()

In [63]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

In [5]:
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    # 'clf__max_iter': (10, 50, 80),
}

In [20]:
from sklearn.metrics import f1_score, make_scorer
f1_scorer = make_scorer(f1_score)

In [21]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3, scoring=f1_scorer)

In [22]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data['pm_abstracts'], data['Authorisation status'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05, 1e-06),
 'clf__max_iter': (20,),
 'clf__penalty': ('l2', 'elasticnet'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__ngram_range': ((1, 1), (1, 2))}
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 13.3min finished


done in 870.085s

Best score: 0.134
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


In [23]:
grid_search.best_score_

0.1336675020885547

In [65]:
parameters = {
    'vect__max_df': (1.0,),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 2), (1, 3)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (20,),
    'clf__alpha': (0.00001,),
    'clf__penalty': ('l2',),
    # 'clf__max_iter': (10, 50, 80),
}

In [66]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3, scoring=f1_scorer)

In [67]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data['pm_abstracts'], data['Authorisation status'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__alpha': (1e-05,),
 'clf__max_iter': (20,),
 'clf__penalty': ('l2',),
 'vect__max_df': (1.0,),
 'vect__ngram_range': ((1, 2), (1, 3))}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:  3.5min finished


done in 293.360s

Best score: 0.074
Best parameters set:
	clf__alpha: 1e-05
	clf__max_iter: 20
	clf__penalty: 'l2'
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


In [59]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC()),
])

In [29]:
parameters = {
    'vect__max_df': (1.0,),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 2), (1, 2), (1, 3)),  # unigrams or bigrams
    # 'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    #'clf__max_iter': (20,),
    #'clf__alpha': (0.00001,),
    #'clf__penalty': ('l2',),
    # 'clf__max_iter': (10, 50, 80),
}

In [53]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=StratifiedKFold(3), scoring=f1_scorer)

In [32]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data['pm_abstracts'], data['Authorisation status'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'svc']
parameters:
{'vect__max_df': (1.0,), 'vect__ngram_range': ((1, 2), (1, 2), (1, 3))}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:  3.5min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  7.6min finished


done in 590.303s

Best score: 0.000
Best parameters set:
	vect__max_df: 1.0
	vect__ngram_range: (1, 2)


In [57]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, cv=3, scoring=f1_scorer)

In [58]:
parameters = {
    'vect__max_df': (1.0,),
    # 'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 2)),  # unigrams or bigrams
}

In [60]:
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(data['pm_abstracts'], data['Authorisation status'])
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'svc']
parameters:
{'vect__max_df': (1.0,), 'vect__ngram_range': (1, 2)}
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:    3.0s finished


TypeError: cannot unpack non-iterable int object

In [45]:
pipeline_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', SVC(kernel="linear", class_weight="balanced"))
])

In [46]:
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)

In [47]:
grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = {'svc__C': [0.01, 0.1, 1], 'vect__ngram_range': ((1, 2)), 
                                  'vect__max_df': (0.8, 1.0)}, 
                    cv = kfolds,
                    scoring=f1_scorer,
                    verbose=1,   
                    n_jobs=-1) 

In [48]:
grid_svm.fit(data['pm_abstracts'], data['Authorisation status'])

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   18.5s finished


TypeError: cannot unpack non-iterable int object