In [9]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from typing import Union
from tqdm import tqdm
import os
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data Exploration, Luke edition

#### Setup

In [10]:
# NBME dataset
from project.data.data_loaders import get_train, get_features, get_patient_notes
train = get_train()
features = get_features()
patient_notes = get_patient_notes()

merged_df = train.merge(patient_notes, on="pn_num", how="left").merge(features, on="feature_num", how="left")
merged_df.head(10)

Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,case_num_y,pn_history,case_num,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],0,HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],0,HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],0,HPI: 17yo M presents with palpitations. Patien...,0,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",0,HPI: 17yo M presents with palpitations. Patien...,0,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],0,HPI: 17yo M presents with palpitations. Patien...,0,Lightheaded
5,00016_005,0,16,5,[],[],0,HPI: 17yo M presents with palpitations. Patien...,0,No-hair-changes-OR-no-nail-changes-OR-no-tempe...
6,00016_006,0,16,6,"['adderall', 'adderrall', 'adderrall']","['321 329', '404 413', '652 661']",0,HPI: 17yo M presents with palpitations. Patien...,0,Adderall-use
7,00016_007,0,16,7,[],[],0,HPI: 17yo M presents with palpitations. Patien...,0,Shortness-of-breath
8,00016_008,0,16,8,[],[],0,HPI: 17yo M presents with palpitations. Patien...,0,Caffeine-use
9,00016_009,0,16,9,"['palpitations', 'heart beating/pounding']","['26 38', '96 118']",0,HPI: 17yo M presents with palpitations. Patien...,0,heart-pounding-OR-heart-racing


### Basic Text Statistics

In [11]:
# get all unique notes from training data
unique_patients = nbme['pn_num'].unique()
nbme_notes = patient_notes[patient_notes['pn_num'].isin(unique_patients)]
nbme_notes_raw = list(nbme_notes['pn_history'])

# Character count
nbme_notes['pn_history'].str.len().hist(bins=50, figsize=(5,5))

NameError: name 'nbme' is not defined

Perhaps we should explore the right-hand tail here? Did notes get cut off? And if so, can we take advantage of those notes / filter them out? Lets explore in cells below.

### Action: Create a dataframe with the notes and whether a feature is present or not

In [12]:
all_features = set(features.feature_text.unique())
patient_num = 41
def get_note_and_features(patient_num: int, merged_df: pd.DataFrame, all_features: set):
    this_note_data:pd.DataFrame = merged_df.query(f'pn_num == {patient_num}')
    features_present = set()
    for (feature_name, feature_num, annotation) in this_note_data[['feature_text', 'feature_num', 'annotation']].itertuples(index=False):
        feature_is_present = len(eval(annotation)) > 0
        if feature_is_present:
            features_present.add(feature_name)
    features_not_present = all_features.difference(features_present)
    return pd.Series([this_note_data['pn_history'].iloc[0], *np.ones(len(features_present)), *np.zeros(len(features_not_present))], index=['pn_history'] + list(features_present) + list(features_not_present))

present_df = pd.DataFrame([get_note_and_features(patient_num, merged_df, all_features) for patient_num in merged_df.pn_num.unique()])

### Models: 
# Naive Bayes classification 

We shall experiment with a few input types:
1. raw text
2. Bag-of-words tokenization
3. DL Transformer embedding

#### Action: Clean up written notes for Naive Bayes

In [13]:
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer


# Classify presence of NAUSEA
data = present_df[['pn_history','Nausea']]
data['pn_history'] = data['pn_history'].str.replace('[^A-Za-z0-9 ]+', '')
data['pn_history'] = data['pn_history'].str.strip().str.lower()

# Stem the notes


  data['pn_history'] = data['pn_history'].str.replace('[^A-Za-z0-9 ]+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pn_history'] = data['pn_history'].str.replace('[^A-Za-z0-9 ]+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pn_history'] = data['pn_history'].str.strip().str.lower()


In [14]:
# Split data. X% is used as training; 1-X% as testing. The data will be randomly split
TEST_SIZE = 0.2 
train, test = train_test_split(data, test_size=0.2, random_state=37)
print(f'We have {len(train)} train points and {len(test)} test points')

We have 800 train points and 200 test points


In [15]:
class BayesClassifier(ClassifierMixin, BaseEstimator):
    """
    Simple wrapper for Naive Bayes classifier.
    """

    def __init__(self, alpha=1.0):
        self.classifier = None
        

    def fit(self, X: pd.DataFrame, y: Union[np.ndarray, list]):
        """
        Fit Naive Bayes classifier according to X, y (ground truth).
        """
        labeled_featuresets = list(zip(X,y))
        self.classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets)
        return self

    def predict(self, X):
        """
        Predict class labels for X.
        """
        return self.classifier.classify_many(X)

    def predict_proba(self, X):
        """
        Predict class probabilities for X.
        """
        return self.classifier.prob_classify_many(X)

In [16]:
# Convert to dict {"note": train['pn_history']}
train_featureset = []
for note in train['pn_history']:
    train_featureset.append({"note": note})

test_featureset = []
for note in test['pn_history']:
    test_featureset.append({"note": note})


clf = BayesClassifier().fit(train_featureset, train['Nausea'])

In [17]:
roc_auc_score(test['Nausea'], clf.predict(test_featureset))

0.5

Naive Bayes can't perform better than a coinflip when we just feed it raw text. Let's pursue the deep learning approach and perhaps bag of words or other forms of classical tokenization

#### Action: Use zero-shot approach for detection of Nausea in patient notes

In [18]:
from transformers import pipeline
class NLIClassifier(ClassifierMixin, BaseEstimator):
    classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
    labels = ['nausea', 'no nausea']

    def predict(self, X: Union[pd.DataFrame, pd.Series, list]) -> np.ndarray:
        assert len(X.shape) == 1 or X.shape[1] == 1, 'can only fit on text'
        out = []
        for t in tqdm(X, desc='Predicting'):
            prediction = self.classifier(t, self.labels)
            prediction = pd.Series(index=prediction['labels'], data=prediction['scores'])
            out.append(prediction['nausea'])
        return np.array(out)

In [20]:
predictions = NLIClassifier().predict(train['pn_history']) if 'train_NLI_predictions_nausea.npz' not in os.listdir('.') else np.fromfile('train_NLI_predictions_nausea.npz')
predictions.tofile('train_NLI_predictions_nausea.npz')

In [21]:
roc_auc_score(train['Nausea'], predictions)

0.8763496296296296

In [23]:
hard_prediction = (predictions > 0.5).astype(int)
from sklearn.metrics import classification_report
print(classification_report(train['Nausea'], hard_prediction))

              precision    recall  f1-score   support

         0.0       0.98      0.42      0.59       675
         1.0       0.24      0.96      0.38       125

    accuracy                           0.51       800
   macro avg       0.61      0.69      0.49       800
weighted avg       0.87      0.51      0.56       800



In [25]:
present_df

Unnamed: 0,pn_history,17-year,Adderall-use,Family-history-of-thyroid-disorder,Few-months-duration,Lightheaded,Intermittent-symptoms,heart-pounding-OR-heart-racing,Family-history-of-MI-OR-Family-history-of-myocardial-infarction,Male,...,Heavy-caffeine-use,viral-symptoms-OR-rhinorrhea-OR-scratchy-throat,No-blood-in-stool,Episodes-last-15-to-30-minutes,LMP-2-months-ago-or-Last-menstrual-period-2-months-ago,No-shortness-of-breath,Lack-of-other-thyroid-symptoms,FHx-of-PUD-OR-Family-history-of-peptic-ulcer-disease,Irregular-menses,Diminished-appetite
0,HPI: 17yo M presents with palpitations. Patien...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Mr. Cleveland is a 17yo M who was consented by...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17 yo M w/ no cardiac or arrhythmia PMH presen...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,HPI: Dillon Cleveland is an otherwise healthy ...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Pt is 20 yo F w headache since yesterday morni...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
996,"20 F no PMH, lives w/ roommate in apartment ha...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
997,20 y/o F with no PMH is presenting with 1 day ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
998,Ms. Madden is a 20 yo female presenting w/ the...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
