## Bag of Words Baseline Model
Uses CountVectorizer embeddings as input for a Logistic Regression model using L2 regularization. This model establishes a robust baseline performance to compare against for our ClinicalBERT models, just as the authors suggested.

In [1]:
import pandas as pd
from nltk import word_tokenize
import string, sys, os
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from run_readmission import *

stop = list(stopwords.words('english'))

timepoints = ['early', 'discharge']

#### Evaluation Metrics

In [2]:
def vote_score(df, score, readmission_mode, output_dir):
    df['pred_score'] = score
    df_sort = df.sort_values(by=['ID'])
    #score 
    temp = (df_sort.groupby(['ID'])['pred_score'].agg(max)+df_sort.groupby(['ID'])['pred_score'].agg(sum)/2)/(1+df_sort.groupby(['ID'])['pred_score'].agg(len)/2)
    x = df_sort.groupby(['ID'])['Label'].agg(np.min).values
    df_out = pd.DataFrame({'logits': temp.values, 'ID': x})

    fpr, tpr, thresholds = roc_curve(x, temp.values)
    auc_score = auc(fpr, tpr)

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(fpr, tpr, label='Val (area = {:.3f})'.format(auc_score))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    string = 'auroc_bow_'+readmission_mode+'.png'
    plt.savefig(os.path.join(output_dir, string))
    plt.close('all')

    return fpr, tpr, df_out
    
def pr_curve_plot(y, y_score, readmission_mode, output_dir):
    precision, recall, _ = precision_recall_curve(y, y_score)
    area = auc(recall,precision)
    step_kwargs = ({'step': 'post'}
                   if 'step' in signature(plt.fill_between).parameters
                   else {})
    
    plt.figure(2)
    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: AUC={0:0.2f}'.format(
              area))
    
    string = 'auprc_bow_'+readmission_mode+'.png'
    plt.savefig(os.path.join(output_dir, string))
    plt.close('all')
    
def vote_pr_curve(df, score, readmission_mode, output_dir):
    df['pred_score'] = score
    df_sort = df.sort_values(by=['ID'])
    #score 
    temp = (df_sort.groupby(['ID'])['pred_score'].agg(max)+df_sort.groupby(['ID'])['pred_score'].agg(sum)/2)/(1+df_sort.groupby(['ID'])['pred_score'].agg(len)/2)
    y = df_sort.groupby(['ID'])['Label'].agg(np.min).values
    
    precision, recall, thres = precision_recall_curve(y, temp)
    pr_thres = pd.DataFrame(data =  list(zip(precision, recall, thres)), columns = ['prec','recall','thres'])
    vote_df = pd.DataFrame(data =  list(zip(temp, y)), columns = ['score','label'])
    
    pr_curve_plot(y, temp, readmission_mode, output_dir)
    
    temp = pr_thres[pr_thres.prec > 0.799999].reset_index()
    
    rp80 = 0
    if temp.size == 0:
        print('Test Sample too small or RP80=0')
    else:
        rp80 = temp.iloc[0].recall
        print('Recall at Precision of 80 is {}', rp80)

    return rp80

#### CountVectorizer Embeddings --> Logistic Regression Model 

In [3]:
def tokenizer_better(text):    
    punc_list = string.punctuation+'0123456789'
    t = str.maketrans(dict.fromkeys(punc_list, " "))
    text = text.lower().translate(t)
    tokens = word_tokenize(text)
    return tokens

for timepoint in timepoints:
    readmission_mode = 'discharge' if timepoint == 'discharge' else 'early'
    if timepoint == 'discharge':
        df_train = pd.read_csv('data/discharge/train.csv')
        df_test = pd.read_csv('data/discharge/test.csv')
    else: 
        df_train = pd.read_csv('data/3days/train.csv')
        df_test = pd.read_csv('data/2days/test.csv').append(pd.read_csv('data/3days/test.csv'))
    vect = CountVectorizer(max_features = 5000, stop_words = stop, tokenizer = tokenizer_better)
    vect.fit(df_train['TEXT'].values)

    X_train_tf = vect.transform(df_train.TEXT.values)
    X_test_tf = vect.transform(df_test.TEXT.values)

    y_train = df_train.Label
    y_test = df_test.Label

    clf = LogisticRegression(C = 0.0001, penalty = 'l2', random_state = 42)
    clf.fit(X_train_tf, y_train)
    print('Training completed. Number of parameters: ' + str(len(clf.get_params().keys())))

    model = clf
    y_train_preds = model.predict_proba(X_train_tf)[:,1]
    y_test_preds = model.predict_proba(X_test_tf)[:,1]
    y_test_actual = df_test['Label']
    print(y_test_preds)
    
    output_dir = './results/bow/result_bow_' + readmission_mode

    fpr, tpr, df_out = vote_score(df_test, y_test_preds, readmission_mode, output_dir)
    rp80 = vote_pr_curve(df_test, y_test_preds, readmission_mode, output_dir)

    y_test_pred = [1 if i > 0.5 else 0 for i in y_test_preds]
    result = {'eval_loss': 'N/A',
              'eval_accuracy': str(accuracy_score(y_test_actual, y_test_pred)),                 
              'global_step': 'N/A',
              'training loss': 'N/A',
              'RP80': rp80}
    output_eval_file = os.path.join(output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
    print('Completed evaluation of baseline Bag-of-Words model for readmission task: ' + readmission_mode)



Training completed. Number of parameters: 15
[0.48229836 0.56906856 0.55412124 ... 0.47267605 0.35944838 0.43165818]


04/16/2022 13:03:24 - INFO - run_readmission -   ***** Eval results *****
04/16/2022 13:03:24 - INFO - run_readmission -     RP80 = 0.05776173285198556
04/16/2022 13:03:24 - INFO - run_readmission -     eval_accuracy = 0.6050552922590837
04/16/2022 13:03:24 - INFO - run_readmission -     eval_loss = N/A
04/16/2022 13:03:24 - INFO - run_readmission -     global_step = N/A
04/16/2022 13:03:24 - INFO - run_readmission -     training loss = N/A


Recall at Precision of 80 is {} 0.05776173285198556
Completed evaluation of baseline Bag-of-Words model for readmission task: early




Training completed. Number of parameters: 15
[0.47047724 0.43949609 0.46887014 ... 0.51466221 0.410015   0.44151218]


04/16/2022 13:04:45 - INFO - run_readmission -   ***** Eval results *****
04/16/2022 13:04:45 - INFO - run_readmission -     RP80 = 0.2103448275862069
04/16/2022 13:04:45 - INFO - run_readmission -     eval_accuracy = 0.6111655239960823
04/16/2022 13:04:45 - INFO - run_readmission -     eval_loss = N/A
04/16/2022 13:04:45 - INFO - run_readmission -     global_step = N/A
04/16/2022 13:04:45 - INFO - run_readmission -     training loss = N/A


Recall at Precision of 80 is {} 0.2103448275862069
Completed evaluation of baseline Bag-of-Words model for readmission task: discharge
