In [None]:
import pickle

import numpy
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.svm import SVC
from lightgbm import LGBMClassifier

In [4]:
# Occurrence threshold used to filter out rare MedDRA terms.
min_occurence = 10

# NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.

# Read further down as a mapping from term to its count
# (presumably MedDRA preferred-term counts -- TODO confirm).
with open('pt_counts.pkl', mode='rb') as counts_file:
    tag_counts = pickle.load(counts_file)

# Pickled spell checker; see the get_features docstring for its intended use.
with open("spell_checker.pkl", mode="rb") as checker_file:
    spell = pickle.load(checker_file)

def get_features(case):
    """
    Template to adapt to your own data format: extract the model inputs of one case.

    Intended return value: effect_description, drugname, [sex, age, imc]
        - effect_description: the free text written by the patient; it must be
            tokenized (the original note points to a `tokenize` helper that is
            not defined in this part of the notebook)
        - drugname: the drug name; the authors corrected it with a spell checker
            trained on specific text (python library pyspellchecker==0.5.0)
        - sex: can be encoded as an int (0-1)
        - age and imc: numerical values (int or float for age, float for imc)

    As written, this stub ignores `case` and returns None.
    """
    extracted = None  # nothing is extracted until the stub is implemented
    return extracted

## Data processing

In [None]:
# NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.

# 'dataset.pkl' contains your dataset of features: a dict with one unique key
# per case.
with open('dataset.pkl', 'rb') as file:
    data = pickle.load(file)

# 'regex_match.pkl' contains the MedDRA terms matched to each case by a regex
# engine: a dict with the same keys as 'dataset.pkl'.
with open('regex_match.pkl', 'rb') as file:
    regex_match = pickle.load(file)

# 'tags.pkl' contains the MedDRA tags that correspond to your dataset: a dict
# with the same keys as 'dataset.pkl'. Only the most common terms are kept
# (i.e. with a number of occurrences greater than the min_occurence parameter).
with open('tags.pkl', 'rb') as file:
    tags = pickle.load(file)

# Fail early, with an explicit message, if the three dicts are not aligned.
# The loop below would raise a bare KeyError on the first missing key anyway.
missing_keys = [key for key in data if key not in tags or key not in regex_match]
if missing_keys:
    raise KeyError(
        "{} case key(s) of 'dataset.pkl' are missing from 'tags.pkl' or "
        "'regex_match.pkl', e.g. {!r}".format(len(missing_keys), missing_keys[0])
    )

# Build the X and Y arrays from the features and tags.
# X components are numeric vectors of features; they can be a mixture of text
# vectorisation (TF-IDF or any text embedding algorithm), numerical features
# (age, weight, ...) and one-hot encoding of categorical features (gender).
#
# WARNING: if you use a text vectorization model that is not pre-trained, fit
# it on the train sample only, after the train-test split (next cell), to avoid
# biasing the evaluation. Computing e.g. TF-IDF on the whole dataset (before
# splitting) would let test data leak into the word-frequency computation.
X = []
Y = []
re_match = []
# The three lists are filled in the same iteration order, so position i in
# X, Y and re_match always refers to the same case.
for key, value in data.items():
    X.append(value)
    Y.append(tags[key])
    re_match.append(regex_match[key])

## Bootstrap

In [None]:
def export_results(pred, true, model_name, output_path='result_bootstrap.txt'):
    """
    Compute evaluation metrics for one model on one split and append them as a
    single tab-separated line to `output_path`.

    Parameters:
        pred: flat sequence of scores, one per (sample, label) pair.
        true: flat sequence of binary ground-truth values (0/1) aligned with `pred`.
        model_name: label written in the first column of the output line.
        output_path: file the line is appended to (defaults to the historical
            'result_bootstrap.txt').

    Written columns, in order:
        model_name, best F1, tn, tp, fn, fp, ROC AUC,
        'ROC_fpr', '$'-joined fpr values, 'ROC_tpr', '$'-joined tpr values.

    The confusion matrix is computed at the threshold that maximizes F1 on the
    precision-recall curve.
    """
    pred = numpy.asarray(pred)

    fpr, tpr, _ = roc_curve(true, pred)
    roc_auc = auc(fpr, tpr)

    p, r, t = precision_recall_curve(true, pred)
    # F1 = 2pr / (p + r), defined as 0 wherever p + r is 0 (or undefined).
    # A guarded divide avoids generating NaN and patching it afterwards.
    denom = p + r
    F1 = numpy.divide(
        2 * p * r,
        denom,
        out=numpy.zeros_like(denom, dtype=float),
        where=denom > 0,
    )
    # precision_recall_curve returns one more (precision, recall) point than
    # thresholds; that last point (precision=1, recall=0, hence F1=0) has no
    # threshold, so it is excluded to keep the argmax a valid index into `t`.
    F1 = F1[:len(t)]
    best = int(numpy.argmax(F1))
    th = t[best]

    # labels=[0, 1] guarantees a 2x2 matrix (four values to unpack) even when
    # only one class is present in `true` or in the thresholded predictions.
    tn, fp, fn, tp = confusion_matrix(
        true,
        numpy.where(pred < th, 0, 1),
        labels=[0, 1],
    ).ravel()

    fields = [
        model_name,
        str(float(F1[best])),
        str(tn),
        str(tp),
        str(fn),
        str(fp),
        str(roc_auc),
        'ROC_fpr',
        '$'.join(str(x) for x in fpr),
        'ROC_tpr',
        '$'.join(str(x) for x in tpr),
    ]
    # Append mode: every call adds one line, so results accumulate over splits.
    with open(output_path, 'a') as file:
        file.write('\t'.join(fields) + '\n')

import warnings

# Silence every warning raised during the simulations (this includes the
# "unknown class" warnings emitted by the label binarizer on the test split).
warnings.filterwarnings("ignore")

# The loop below wraps its iterator in tqdm, which was never imported.
# Use it when available, otherwise fall back to plain iteration.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **kwargs):
        """Fallback when tqdm is not installed: iterate without a progress bar."""
        return iterable


def vectorize_features(X_train, X_test):
    """
    Turn the raw train/test features of one split into model inputs.

    Hook to adapt to your data. The default assumes the items of X are already
    numeric feature vectors (as described in the data processing section) and
    only converts them to arrays. If you use a text vectorizer that is not
    pre-trained (e.g. TF-IDF), fit it here on X_train only and apply it to
    both sets, so that no test data leaks into the fitted vocabulary.

    Returns:
        (X_train_vec, X_test_vec)
    """
    return numpy.asarray(X_train), numpy.asarray(X_test)


n_split = 1000

# Binarizer for the multi-label tags; it is re-fitted on the train tags of each
# split. NOTE(review): no instance was defined in the original notebook; a
# default MultiLabelBinarizer is assumed -- confirm no `classes=` restriction
# was intended.
tag_binarizer = MultiLabelBinarizer()

# ShuffleSplit generates the indexes of the random train/test splits
# (the "bootstrap" samples of this notebook).
rs = ShuffleSplit(n_splits=n_split, test_size=.1, random_state=0)
for train_index, test_index in tqdm(rs.split(X, Y), total=n_split):
    # Build the train and test sets of the current split.
    X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
    Y_train, Y_test = [Y[i] for i in train_index], [Y[i] for i in test_index]
    # The split indexes are positional, so they must index the list `re_match`
    # (aligned with X and Y), not the dict `regex_match`, which is keyed by case.
    regex_train, regex_test = [re_match[i] for i in train_index], [re_match[i] for i in test_index]

    X_train_vec, X_test_vec = vectorize_features(X_train, X_test)

    # Binarize the tags as well as the regex matches. Labels that were not
    # seen in Y_train are ignored by transform().
    Y_train = tag_binarizer.fit_transform(Y_train)
    Y_test = tag_binarizer.transform(Y_test)
    regex_test_bin = tag_binarizer.transform(regex_test)
    # NOTE(review): computed but not used by any model below -- kept from the
    # original; confirm whether it was meant to feed `regex_test_bin`.
    regex_test_filtered = [[s for s in l if tag_counts.get(s, 0) >= min_occurence] for l in regex_test]

    # Row-major flattening of the ground truth; it matches the row-major
    # flattening of the (n_samples, n_labels) prediction matrices below.
    Y_test_flat = Y_test.flatten('C')

    # Train and test each model on the current split, then export the results.

    # regex only
    regex_test_flat = numpy.ravel(regex_test_bin, order='C')
    export_results(regex_test_flat, Y_test_flat, 'regex')

    # lgbm
    lgbm = OneVsRestClassifier(
        LGBMClassifier(
            max_depth=2,
            n_estimators=50
        ),
        n_jobs=10
    )
    lgbm.fit(X_train_vec, Y_train)
    pred_test = lgbm.predict_proba(X_test_vec)
    export_results(numpy.ravel(pred_test, order='C'), Y_test_flat, 'lgbm')

    # lgbm + regex: a regex match adds 1 to the model score, capped at 1.
    pred_test_regex = numpy.minimum(pred_test + regex_test_bin, 1.0)
    export_results(numpy.ravel(pred_test_regex, order='C'), Y_test_flat, 'lgbm + regex')

    # Random Forest
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=4,
        n_jobs=8
    )
    clf.fit(X_train_vec, Y_train)

    # For multi-label targets predict_proba returns one (n_samples, 2) array
    # per label. Stacking them gives label-major order, hence the column-major
    # ('F') flattening of Y_test; column 1 holds the positive-class probability.
    pred_test = clf.predict_proba(X_test_vec)
    pred_test_flat = numpy.vstack(pred_test)[:, 1]
    export_results(pred_test_flat, Y_test.flatten('F'), 'random_forests')

    # SVM
    svc = OneVsRestClassifier(
        SVC(probability=True),
        n_jobs=8
    )
    svc.fit(X_train_vec, Y_train)
    pred_test = svc.predict_proba(X_test_vec)
    export_results(numpy.ravel(pred_test, order='C'), Y_test_flat, 'svm')

    # logit
    # multi_class='ovr' was dropped: the argument is deprecated in recent
    # scikit-learn and redundant here, since OneVsRestClassifier already
    # trains one binary classifier per label.
    logit = OneVsRestClassifier(
        LogisticRegression(),
        n_jobs=8
    )
    logit.fit(X_train_vec, Y_train)
    pred_test = logit.predict_proba(X_test_vec)
    export_results(numpy.ravel(pred_test, order='C'), Y_test_flat, 'logit')