In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
from typing import Union
from tqdm import tqdm
import os
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
import pickle

  from pandas import MultiIndex, Int64Index


# Data Exploration, Luke edition

#### Setup

In [2]:
# NBME dataset
train = pd.read_csv("data/train.csv")
features = pd.read_csv("data/features.csv")
patient_notes = pd.read_csv("data/patient_notes.csv")

merged_df = train.merge(patient_notes, on="pn_num", how="left").merge(features, on="feature_num", how="left")
merged_df.head(10)

Unnamed: 0,id,case_num_x,pn_num,feature_num,annotation,location,case_num_y,pn_history,case_num,feature_text
0,00016_000,0,16,0,['dad with recent heart attcak'],['696 724'],0,HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-MI-OR-Family-history-of-myoc...
1,00016_001,0,16,1,"['mom with ""thyroid disease']",['668 693'],0,HPI: 17yo M presents with palpitations. Patien...,0,Family-history-of-thyroid-disorder
2,00016_002,0,16,2,['chest pressure'],['203 217'],0,HPI: 17yo M presents with palpitations. Patien...,0,Chest-pressure
3,00016_003,0,16,3,"['intermittent episodes', 'episode']","['70 91', '176 183']",0,HPI: 17yo M presents with palpitations. Patien...,0,Intermittent-symptoms
4,00016_004,0,16,4,['felt as if he were going to pass out'],['222 258'],0,HPI: 17yo M presents with palpitations. Patien...,0,Lightheaded
5,00016_005,0,16,5,[],[],0,HPI: 17yo M presents with palpitations. Patien...,0,No-hair-changes-OR-no-nail-changes-OR-no-tempe...
6,00016_006,0,16,6,"['adderall', 'adderrall', 'adderrall']","['321 329', '404 413', '652 661']",0,HPI: 17yo M presents with palpitations. Patien...,0,Adderall-use
7,00016_007,0,16,7,[],[],0,HPI: 17yo M presents with palpitations. Patien...,0,Shortness-of-breath
8,00016_008,0,16,8,[],[],0,HPI: 17yo M presents with palpitations. Patien...,0,Caffeine-use
9,00016_009,0,16,9,"['palpitations', 'heart beating/pounding']","['26 38', '96 118']",0,HPI: 17yo M presents with palpitations. Patien...,0,heart-pounding-OR-heart-racing


### Basic Text Statistics

In [3]:
# get all unique notes from training data
unique_patients = nbme['pn_num'].unique()
nbme_notes = patient_notes[patient_notes['pn_num'].isin(unique_patients)]
nbme_notes_raw = list(nbme_notes['pn_history'])

# Character count
nbme_notes['pn_history'].str.len().hist(bins=50, figsize=(5,5))

NameError: name 'nbme' is not defined

Perhaps we should explore the right-hand tail here? Did notes get cut off? And if so, can we take advantage of those notes / filter them out? Lets explore in cells below.

### Action: Create a dataframe with the notes and whether a feature is present or not

In [47]:
all_features = set(features.feature_text.unique())
patient_num = 41
def get_note_and_features(patient_num: int, merged_df: pd.DataFrame, all_features: set):
    this_note_data:pd.DataFrame = merged_df.query(f'pn_num == {patient_num}')
    features_present = set()
    for (feature_name, feature_num, annotation) in this_note_data[['feature_text', 'feature_num', 'annotation']].itertuples(index=False):
        feature_is_present = len(eval(annotation)) > 0
        if feature_is_present:
            features_present.add(feature_name)
    features_not_present = all_features.difference(features_present)
    return pd.Series([this_note_data['pn_history'].iloc[0], *np.ones(len(features_present)), *np.zeros(len(features_not_present))], index=['pn_history'] + list(features_present) + list(features_not_present))

present_df = pd.DataFrame([get_note_and_features(patient_num, merged_df, all_features) for patient_num in merged_df.pn_num.unique()])

### Models: 
# Naive Bayes classification 

We shall experiment with a few input types:
1. raw text
2. Bag-of-words tokenization
3. DL Transformer embedding

#### Action: Clean up written notes for Naive Bayes

In [64]:
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer


# Classify presence of NAUSEA
data = present_df[['pn_history','Nausea', 'Irregular-flow-OR-Irregular-frequency-OR-Irregular-intervals']]
data['pn_history'] = data['pn_history'].str.replace('[^A-Za-z0-9 ]+', '')
data['pn_history'] = data['pn_history'].str.strip().str.lower()

# Stem the notes


  data['pn_history'] = data['pn_history'].str.replace('[^A-Za-z0-9 ]+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pn_history'] = data['pn_history'].str.replace('[^A-Za-z0-9 ]+', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pn_history'] = data['pn_history'].str.strip().str.lower()


In [101]:
# Split data. X% is used as training; 1-X% as testing. The data will be randomly split
TEST_SIZE = 0.2 
train, test = train_test_split(data, test_size=0.2, random_state=37)
print(f'We have {len(train)} train points and {len(test)} test points')

We have 800 train points and 200 test points


In [50]:
class BayesClassifier(ClassifierMixin, BaseEstimator):
    """
    Simple wrapper for Naive Bayes classifier.
    """

    def __init__(self, alpha=1.0):
        self.classifier = None
        

    def fit(self, X: pd.DataFrame, y: Union[np.ndarray, list]):
        """
        Fit Naive Bayes classifier according to X, y (ground truth).
        """
        labeled_featuresets = list(zip(X,y))
        self.classifier = nltk.NaiveBayesClassifier.train(labeled_featuresets)
        return self

    def predict(self, X):
        """
        Predict class labels for X.
        """
        return self.classifier.classify_many(X)

    def predict_proba(self, X):
        """
        Predict class probabilities for X.
        """
        return self.classifier.prob_classify_many(X)

In [51]:
# Convert to dict {"note": train['pn_history']}
train_featureset = []
for note in train['pn_history']:
    train_featureset.append({"note": note})

test_featureset = []
for note in test['pn_history']:
    test_featureset.append({"note": note})


clf = BayesClassifier().fit(train_featureset, train['Nausea'])

In [52]:
roc_auc_score(test['Nausea'], clf.predict(test_featureset))

0.5

Naive Bayes can't perform better than a coinflip when we just feed it raw text. Let's pursue the deep learning approach and perhaps bag of words or other forms of classical tokenization

#### Action: Use zero-shot approach for detection of Nausea in patient notes

In [53]:
from transformers import pipeline
class NLIClassifier(ClassifierMixin, BaseEstimator):
    classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
    labels = ['nausea', 'no nausea']

    def predict(self, X: Union[pd.DataFrame, pd.Series, list]) -> np.ndarray:
        assert len(X.shape) == 1 or X.shape[1] == 1, 'can only fit on text'
        out = []
        for t in tqdm(X, desc='Predicting'):
            prediction = self.classifier(t, self.labels)
            prediction = pd.Series(index=prediction['labels'], data=prediction['scores'])
            out.append(prediction['nausea'])
        return np.array(out)

KeyboardInterrupt: 

In [None]:
predictions = NLIClassifier().predict(train['pn_history']) if 'train_NLI_predictions_nausea.npz' not in os.listdir('.') else np.fromfile('train_NLI_predictions_nausea.npz')
predictions.tofile('train_NLI_predictions_nausea.npz')

In [66]:
roc_auc_score(train['Nausea'], predictions)

0.35134814814814813

## Action: classifier for `Irregular-flow-OR-Irregular-frequency-OR-Irregular-intervals` (212)

Row example: `'gone without a period is 4 months', 'Bleeding lasts 2-6 days', 'can be heavy or light']","['130 163', '165 188', '193 214']"`

### Naíve Zero Shot

Must associate with menstruation via query, which is way harder than with simple Male/Female classifier

In [104]:
from transformers import pipeline
class ZeroShotIrregularMenstruationClassifier(ClassifierMixin, BaseEstimator):
    classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")
    labels = ['Irregular flow', 'Irregular frequency', 'irregular intervals', 'Irregular frequency of period', 'irregular menstruation intervals']

    def predict(self, X: Union[pd.DataFrame, pd.Series, list]) -> np.ndarray:
        assert len(X.shape) == 1 or X.shape[1] == 1, 'can only fit on text'
        out = []
        for t in tqdm(X, desc='Predicting'):
            prediction = self.classifier(t, self.labels)
            prediction = pd.Series(index=prediction['labels'], data=prediction['scores'])
            # out.append(prediction['irregular menstruation'])
            print(prediction)
            maximum_term = prediction[1:].max()
            out.append(maximum_term)
        return np.array(out)

In [105]:
zeroshot_train = train

# Note: This takes about an hour to run if not cached
predictions = ZeroShotIrregularMenstruationClassifier().predict(zeroshot_train['pn_history']) if 'train_predictions_irregular_menstruation2.npz' not in os.listdir('.') else np.fromfile('train_predictions_irregular_menstruation2.npz')
predictions.tofile('train_predictions_irregular_menstruation2.npz')


[A
[A
[A
[A
Predicting:   0%|          | 4/800 [00:37<2:05:54,  9.49s/it]


KeyboardInterrupt: 

In [102]:
roc_auc_score(train['Irregular-flow-OR-Irregular-frequency-OR-Irregular-intervals'], predictions)

0.966891327808434

In [103]:
print(classification_report(train['Irregular-flow-OR-Irregular-frequency-OR-Irregular-intervals'].to_numpy(), predictions.round()))

              precision    recall  f1-score   support

         0.0       1.00      0.74      0.85       733
         1.0       0.26      1.00      0.42        67

    accuracy                           0.77       800
   macro avg       0.63      0.87      0.64       800
weighted avg       0.94      0.77      0.82       800



Shoot! Zero-shot is quite bad at classifying this highly compound phrase.

In [29]:
import os
from pathlib import Path
_FILE_PATH = os.path.abspath('')
_DIR_PATH = os.path.dirname(_FILE_PATH)
_PROJECT_PATH = Path(_DIR_PATH).parent.absolute()
EMBEDDINGS_NAME = "all-mpnet-base-v2"
EMBEDDINGS_DIR = f"{_PROJECT_PATH}/data/embeddings/{EMBEDDINGS_NAME}"

# # Stack up data/embeddings/EMBEDDINGS_NAME/*.npy files, each of which is a row of cleaned training data
# train = np.fromfile(f"{EMBEDDINGS_DIR}/16.npy")
# type(train)

numpy.ndarray

In [91]:
# from project.data.data_loaders import get_clean_train_data
# # Run inference
# predictions = []
# train = get_clean_train_data()
# train = train.drop_duplicates(subset=['pn_num'])
# test = train.sample(n=10, random_state=1)

# classifier = ZeroShotIrregularMenstruationClassifier()
# for idx, r in tqdm(test.iterrows(), desc='Eval:', total=len(test)):
#     if r['feature_text'] == 'Irregular-flow-OR-Irregular-frequency-OR-Irregular-intervals':
        
#         prediction = classifier.predict(r["pn_history"])

    

### Encoder + Decision Tree / Logistic Regression

In [87]:
# Retrieve cached embedding from filesystem
import os
from pathlib import Path
_FILE_PATH = os.path.abspath('')
_DIR_PATH = os.path.dirname(_FILE_PATH)
_PROJECT_PATH = Path(_DIR_PATH).parent.absolute()
EMBEDDINGS_NAME = "all-mpnet-base-v2"
EMBEDDINGS_DIR = f"{_PROJECT_PATH}/data/embeddings/{EMBEDDINGS_NAME}"

def get_embedding(pn_num: int) -> np.ndarray:
    """
    Retrieve cached embedding from filesystem, or return None if not found.
    """
    embedding_path = f"{EMBEDDINGS_DIR}/{pn_num}.npy"
    if os.path.exists(embedding_path):
        return np.load(embedding_path)
    else:
        return None


In [94]:
train = get_clean_train_data()
irregular_menstruation_data = train.query('feature_text == "Irregular-flow-OR-Irregular-frequency-OR-Irregular-intervals"')
print(train.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14300 entries, 0 to 14299
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            14300 non-null  object
 1   case_num      14300 non-null  int64 
 2   pn_num        14300 non-null  int64 
 3   feature_num   14300 non-null  int64 
 4   annotation    14300 non-null  object
 5   location_raw  14300 non-null  object
 6   feature_text  14300 non-null  object
 7   pn_history    14300 non-null  object
 8   location      14300 non-null  object
dtypes: int64(3), object(6)
memory usage: 1.6+ MB
None


In [97]:
from project.embedding.embedding_mechanism import CorpusEmbedder
train['embedding'] = [CorpusEmbedder('all-mpnet-base-v2').embed(row[['pn_num', 'pn_history']]) for _, row in tqdm(train.iterrows())]



KeyboardInterrupt: 



In [17]:
class EncoderClassifier(ClassifierMixin, BaseEstimator):
    def __init__(self):
        self.classifier = XGBClassifier(eval_metric='logloss')

    def fit(self, X: pd.DataFrame, y: Union[np.ndarray, list]):
        assert len(X.shape) == 1 or X.shape[1] == 1, 'can only fit on text'
        embeddings = get_embedding()
        self.classifier = self.classifier.fit(embeddings, y)
        return self

    def predict(self, X: pd.DataFrame) -> np.ndarray:
        assert len(X.shape) == 1 or X.shape[1] == 1, 'can only fit on text'
        assert len(X) > 1, 'assuming we have batches'
        embeddings = np.stack(self.embeddings.loc[X.index.tolist()].values)
        return self.classifier.predict_proba(embeddings)[:, 1]  # 1 because 1 = MALE, 0 = FEMALE
        


Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location_raw,feature_text,pn_history,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...,"[(696, 724)]"
13,00041_000,0,41,0,[],[],Family-history-of-MI-OR-Family-history-of-myoc...,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,[]
26,00046_000,0,46,0,[father: heart attack],[824 844],Family-history-of-MI-OR-Family-history-of-myoc...,Mr. Cleveland is a 17yo M who was consented by...,"[(824, 844)]"
39,00082_000,0,82,0,[Father MI],[622 631],Family-history-of-MI-OR-Family-history-of-myoc...,17 yo M w/ no cardiac or arrhythmia PMH presen...,"[(622, 631)]"
52,00100_000,0,100,0,[Dad-MI],[735 741],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: Dillon Cleveland is an otherwise healthy ...,"[(735, 741)]"
...,...,...,...,...,...,...,...,...,...
14215,95145_900,9,95145,900,[],[],No-relief-with-Motrin-OR-no-relief-with-tylenol,Pt is 20 yo F w headache since yesterday morni...,[]
14232,95228_900,9,95228,900,[],[],No-relief-with-Motrin-OR-no-relief-with-tylenol,"20 F no PMH, lives w/ roommate in apartment ha...",[]
14249,95243_900,9,95243,900,"[tylenol has not helped, ibuprofen which has n...","[310 317;338 352, 322 352]",No-relief-with-Motrin-OR-no-relief-with-tylenol,20 y/o F with no PMH is presenting with 1 day ...,"[(310, 317), (338, 352), (322, 352)]"
14266,95330_900,9,95330,900,"[tylenol not helped, Ibuprofen not helped]","[473 480;493 503, 462 471;493 503]",No-relief-with-Motrin-OR-no-relief-with-tylenol,Ms. Madden is a 20 yo female presenting w/ the...,"[(473, 480), (493, 503), (462, 471), (493, 503)]"


In [None]:
clf = EncoderClassifier().fit(train['pn_history'], train['Male'].to_numpy())