In [1]:
import eli5
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from joblib import dump
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_predict, RandomizedSearchCV
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn_crfsuite.metrics import flat_classification_report, flat_accuracy_score

from crf_extraction.first_crf_model.first_model_features import sent_to_features, sent_to_labels
from utils.DrugRecords import DrugRecords
from utils.helper_functions import generate_heatmap_percentage, join_true_values_with_predicted, save_predicted_results
from utils.helper_functions import get_data

In [2]:
# Input CSV files for the three dataset splits (paths relative to this notebook).
TRAINING_DATASET_PATH, VALIDATION_DATASET_PATH, TEST_DATASET_PATH = (
    "../../datasets/train_dataset.csv",
    "../../datasets/validation_dataset.csv",
    "../../datasets/test_dataset.csv",
)

# Output CSV that receives the predicted results.
PREDICTED_RESULTS_PATH = "predicted_results.csv"

# File names used when dumping the baseline and the tuned model with joblib.
JOBLIB_MODEL, JOBLIB_TUNED_MODEL = "trained.joblib", "trained_tuned.joblib"

Prepare the data

In [3]:
# Load the training split through the project helper get_data
# (imported from utils.helper_functions).
training_data = get_data(TRAINING_DATASET_PATH)
# Preview the first ten rows. In the rendered output, row 0 repeats the column
# names ("Drug: #", "Word", "Tag") instead of holding a real token.
training_data.head(10)

Unnamed: 0,Drug: #,Word,Tag
0,Drug: #,Word,Tag
1,Drug: 0,ventamol,NAME
2,Drug: 0,2,STRENGTH
3,Drug: 0,mg,STRENGTH
4,Drug: 0,/,STRENGTH
5,Drug: 0,5,STRENGTH
6,Drug: 0,ml,STRENGTH
7,Drug: 0,syrup,FORM
8,Drug: 0,120,PACK
9,Drug: 0,ml,PACK


In [4]:
# Build the drug records from the raw table and drop the first one in a single
# expression; the first record comes from the repeated header row.
train_drug_records = DrugRecords(training_data).get_drug_records()[1:]  # without header

In [5]:
# One feature sequence and one label sequence per training drug record.
x_train = list(map(sent_to_features, train_drug_records))
y_train = list(map(sent_to_labels, train_drug_records))

In [6]:
# Inspect the feature dict of the first token of the first training record.
first_record_features = x_train[0]
first_record_features[0]  # features for word "ventamol"

{'bias': 1.0,
 'word[-2:]': 'ol',
 'word[:-3]': 'venta',
 'word.isdigit()': False,
 'BOS': True,
 '+1:bias': 1.0,
 '+1:word[-2:]': '2',
 '+1:word[:-3]': '',
 '+1:word.isdigit()': True}

Fit and save the model

In [7]:
# Baseline CRF settings. Per the sklearn-crfsuite documentation, c1 and c2 are
# the L1 and L2 regularization coefficients. Every option not listed here stays
# at None, as the get_params() output below shows.
crf_settings = dict(
    c1=0,
    c2=1,
    max_iterations=100,
    all_possible_transitions=False,
)
crf = CRF(**crf_settings)
crf.get_params()

{'algorithm': None,
 'all_possible_states': None,
 'all_possible_transitions': False,
 'averaging': None,
 'c': None,
 'c1': 0,
 'c2': 1,
 'calibration_candidates': None,
 'calibration_eta': None,
 'calibration_max_trials': None,
 'calibration_rate': None,
 'calibration_samples': None,
 'delta': None,
 'epsilon': None,
 'error_sensitive': None,
 'gamma': None,
 'keep_tempfiles': None,
 'linesearch': None,
 'max_iterations': 100,
 'max_linesearch': None,
 'min_freq': None,
 'model_filename': None,
 'num_memories': None,
 'pa_type': None,
 'period': None,
 'trainer_cls': None,
 'variance': None,
 'verbose': False}

In [None]:
%%time

# Fit the baseline CRF on the whole training split, then persist it with joblib.
crf.fit(X=x_train, y=y_train)
dump(value=crf, filename=JOBLIB_MODEL)

In [None]:
# x_train / y_train were already built from train_drug_records earlier in the
# notebook and nothing since then reassigns them, so recomputing them here is
# redundant. Validate the invariants cross-validation relies on instead.
assert len(x_train) == len(y_train) == len(train_drug_records), \
    "feature and label sequences must cover every training record"
assert all(len(features) == len(labels) for features, labels in zip(x_train, y_train)), \
    "each record needs exactly one label per token feature dict"

In [None]:
# Out-of-fold predictions for the training records using 5-fold cross-validation.
y_pred = cross_val_predict(
    estimator=crf,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Text report comparing the cross-validated predictions with the true labels,
# printed with four decimal places.
report_options = dict(y_true=y_train, y_pred=y_pred, digits=4)
class_report = flat_classification_report(**report_options)
print(class_report)

In [None]:
# Render the learned weights of the baseline CRF, limited to the top 20 entries.
eli5.show_weights(estimator=crf, top=20)

Evaluate on validation data

In [None]:
# Load the validation split and build its drug records; the first record is
# dropped, exactly as is done for the training split.
validation_data = get_data(VALIDATION_DATASET_PATH)
validation_drug_records = DrugRecords(validation_data).get_drug_records()[1:]

In [None]:
# One feature sequence and one label sequence per validation drug record.
x_val = list(map(sent_to_features, validation_drug_records))
y_val = list(map(sent_to_labels, validation_drug_records))

In [None]:
# Label the validation records with the baseline CRF.
predicted_values = crf.predict(X=x_val)

In [None]:
# Token-level accuracy of the baseline CRF on the validation split.
# flat_accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the arguments now follow the documented order used
# by the flat_classification_report calls in this notebook.
acc_score = flat_accuracy_score(y_val, predicted_values)
print(acc_score)

Evaluate on test data

In [None]:
# Load the test split and build its drug records; the first record is dropped,
# exactly as is done for the training split.
test_data = get_data(TEST_DATASET_PATH)
test_drug_records = DrugRecords(test_data).get_drug_records()[1:]

In [None]:
# One feature sequence and one label sequence per test drug record.
x_test = list(map(sent_to_features, test_drug_records))
y_test = list(map(sent_to_labels, test_drug_records))

In [None]:
# Label the test records with the baseline CRF.
predicted_values_test = crf.predict(X=x_test)

In [None]:
# Token-level accuracy of the baseline CRF on the test split.
# flat_accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the arguments now follow the documented order used
# by the flat_classification_report calls in this notebook.
acc_score = flat_accuracy_score(y_test, predicted_values_test)
print(acc_score)


Confusion matrix on test data (row-normalized, in %)

In [None]:
# Flatten the per-record predictions into one token-level Series.
y_pred = pd.Series([tag for record_tags in predicted_values_test for tag in record_tags])
# Element 1 of every token is taken as the reference value.
# NOTE(review): presumably this is the tag column; confirm against DrugRecords.
y_true_test = pd.Series([token[1] for record in test_drug_records for token in record])

# Cross-tabulate reference vs. predicted values. normalize="index" converts each
# row into fractions, which are rounded and then scaled to percentages.
matrixPercentage = (
    pd.crosstab(
        index=y_true_test,
        columns=y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        normalize="index",
    )
    .round(6)
    .mul(100)
)

In [None]:
# Open a 10x5 figure, render the percentage matrix with the project heatmap
# helper, and write the resulting figure to a PNG file.
plt.figure(figsize=(10, 5))
sns_plot_percentage = generate_heatmap_percentage(matrixPercentage)

heatmap_figure = sns_plot_percentage.figure
heatmap_figure.savefig("firstCrfModelMatrix.png")

Save predicted results into csv file

In [None]:
# Combine the test records with the baseline model's test predictions using the
# project helper from utils.helper_functions.
# NOTE(review): the structure of the result is defined by that helper; confirm there.
joined_values = join_true_values_with_predicted(test_drug_records, predicted_values_test)
# Peek at the first two entries of the joined result.
joined_values[:2]


In [None]:
# Hand the joined values and the target path (PREDICTED_RESULTS_PATH, a CSV file
# name) to the project helper that saves the predicted results.
save_predicted_results(joined_values, PREDICTED_RESULTS_PATH)

Hyper-parameter tuning

In [None]:
# Search space for the randomized search: c1 and c2 are drawn from exponential
# distributions, the other two options from explicit candidate lists.
# max_iterations gets ten evenly spaced values between 40 and 150, truncated
# to plain Python ints.
iteration_candidates = np.linspace(start=40, stop=150, num=10).astype(int).tolist()
params_space = dict(
    c1=scipy.stats.expon(scale=0.5),
    c2=scipy.stats.expon(scale=0.05),
    max_iterations=iteration_candidates,
    all_possible_transitions=[True, False],
)

In [None]:
# Scorer for the hyper-parameter search: weighted F1 restricted to these labels.
scored_labels = ['O', 'NAME', 'STRENGTH', 'PACK', 'FORM']
f1_scorer = make_scorer(
    metrics.flat_f1_score,
    labels=scored_labels,
    average='weighted',
)

In [None]:
# Randomized search over params_space: 50 sampled configurations, each scored
# with f1_scorer under 4-fold cross-validation, using all available CPU cores.
# random_state pins the sampling from the c1/c2 distributions and the candidate
# lists, so a re-run of the notebook evaluates the same 50 configurations and
# reports the same best parameters.
rand_search_cv = RandomizedSearchCV(crf, params_space,
                        cv=4,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=42)

In [None]:
%%time

# Run the randomized hyper-parameter search on the training split.
rand_search_cv.fit(X=x_train, y=y_train)

In [None]:
# Summarize the search: winning parameters, their cross-validation score, and
# the size_ attribute of the winning estimator expressed in millions.
print('best params:', rand_search_cv.best_params_)
print('best CV score:', rand_search_cv.best_score_)

best_model = rand_search_cv.best_estimator_
model_size_millions = best_model.size_ / 1000000
print('model size: {:0.2f}M'.format(model_size_millions))

In [None]:
# Keep the best estimator found by the search and persist it with joblib.
tuned_crf = rand_search_cv.best_estimator_
dump(value=tuned_crf, filename=JOBLIB_TUNED_MODEL)

Evaluate tuned model with cross-validation

In [None]:
%%time

# Out-of-fold predictions of the tuned CRF on the training records (5 folds).
# NOTE(review): the hyper-parameters were selected on this same training data,
# so these scores may be optimistic.
y_pred = cross_val_predict(
    estimator=tuned_crf,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Text report comparing the tuned model's cross-validated predictions with the
# true labels, printed with four decimal places.
report_options = dict(y_true=y_train, y_pred=y_pred, digits=4)
class_report = flat_classification_report(**report_options)
print(class_report)

In [None]:
# Render the learned weights of the tuned CRF, limited to the top 20 entries.
eli5.show_weights(estimator=tuned_crf, top=20)

Evaluate tuned model on validation data

In [None]:
# Label the validation records with the tuned CRF.
predicted_values_val = tuned_crf.predict(X=x_val)

In [None]:
# Token-level accuracy of the tuned CRF on the validation split.
# flat_accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the arguments now follow the documented order used
# by the flat_classification_report calls in this notebook.
acc_score = flat_accuracy_score(y_val, predicted_values_val)
print(acc_score)

Evaluate tuned model on test data

In [None]:
# Label the test records with the tuned CRF (replaces the baseline predictions).
predicted_values_test = tuned_crf.predict(X=x_test)

In [None]:
# Token-level accuracy of the tuned CRF on the test split.
# flat_accuracy_score takes (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the arguments now follow the documented order used
# by the flat_classification_report calls in this notebook.
acc_score = flat_accuracy_score(y_test, predicted_values_test)
print(acc_score)


Confusion matrix of the tuned model on test data (row-normalized, in %)

In [None]:
# Flatten the tuned model's per-record test predictions into one Series.
y_pred = pd.Series([tag for record_tags in predicted_values_test for tag in record_tags])

# Cross-tabulate against y_true_test, which was built in the baseline matrix
# cell. normalize="index" converts each row into fractions, which are rounded
# and then scaled to percentages.
matrixPercentage = (
    pd.crosstab(
        index=y_true_test,
        columns=y_pred,
        rownames=['True'],
        colnames=['Predicted'],
        normalize="index",
    )
    .round(6)
    .mul(100)
)

In [None]:
# Open a 10x5 figure, render the tuned model's percentage matrix with the
# project heatmap helper, and write the resulting figure to a PNG file.
plt.figure(figsize=(10, 5))
sns_plot_percentage = generate_heatmap_percentage(matrixPercentage)

heatmap_figure = sns_plot_percentage.figure
heatmap_figure.savefig("firstTunedCrfModelMatrix.png")
