In [1]:
import ast
from pathlib import Path

import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy import displacy
from termcolor import colored
from wordcloud import WordCloud,STOPWORDS
# import warnings
# warnings.filterwarnings("ignore")


""" Data Loading """
DATA_PATH = Path('input/nbme-score-clinical-patient-notes')

# Read the three competition tables in one pass; the unpacking order matches
# the order of the file names in the tuple.
features_df, patient_notes_df, train_df = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('features.csv', 'patient_notes.csv', 'train.csv')
)

# Load an English pipeline optimized for CPU
nlp = spacy.load('en_core_web_sm')

In [2]:
def _annotation_ents(locations, label="Annotation"):
    """Turn parsed location lists into displaCy manual-mode entity dicts.

    Parameters
    ----------
    locations : iterable of list of str
        One list per annotated feature. Each string holds one or more
        ``"start end"`` character-offset pairs; several pairs in the same
        string are separated by ``;``.
    label : str
        Label attached to every produced entity.

    Returns
    -------
    list of dict
        ``{'start', 'end', 'label'}`` dicts, sorted by ``(start, end)`` with
        exact duplicate spans removed.

    Raises
    ------
    ValueError
        If a non-empty segment is not exactly two integers.
    """
    spans = set()
    for location in locations:
        for loc in location:
            for segment in loc.split(';'):
                if not segment.strip():
                    continue
                start, end = segment.split()
                # A set keeps each span once; emitting the same span twice
                # makes the rendered text repeat the highlighted fragment.
                spans.add((int(start), int(end)))
    return [
        {'start': start, 'end': end, 'label': label}
        for start, end in sorted(spans)
    ]


def patient_data(pn_num):
    """Print and render an overview of a single patient note.

    Shows, in order: the note with its annotated spans highlighted, the
    named entities found by the spaCy pipeline, the dependency parse of the
    note's longest sentence (by word count), and the text of every feature
    listed for this note in ``train_df``.

    Parameters
    ----------
    pn_num : int
        Patient-note number, matched against the ``pn_num`` column of
        ``train_df`` and ``patient_notes_df``.

    Raises
    ------
    ValueError
        If ``patient_notes_df`` does not hold exactly one row for ``pn_num``
        (raised by ``Series.item()``), or if a location string is malformed.
    """
    subset = train_df[train_df['pn_num'] == pn_num]
    features_lst = subset['feature_num'].tolist()

    # literal_eval only accepts Python literals, so a crafted CSV cell cannot
    # execute code the way it could with eval.
    locations = subset['location'].apply(ast.literal_eval)

    print("*"*80)
    print(colored("Patient Number: " + str(pn_num), 'blue'))
    patient_history = patient_notes_df.loc[
        patient_notes_df['pn_num'] == pn_num, 'pn_history'
    ].item()
    print(colored("Annotated Text", 'blue'))

    # Manually supplied entities: displaCy renders them without running a model.
    annotated_doc = {
        'text': patient_history,
        'ents': _annotation_ents(locations),
    }
    colors = {"Annotation" :"linear-gradient(to right, #0d6307, #0ef00e, #ffffff);" }
    options = {"colors": colors}
    displacy.render(annotated_doc,
                    style='ent',
                    options=options,
                    manual=True,
                    jupyter=True,
                    )

    print(colored("\nVisualizing NER", 'blue'))
    ner_doc = nlp(patient_history)
    displacy.render(ner_doc, style='ent', jupyter=True)

    print(colored("\nVisualizing POS tagging", 'green'))
    sentences = sent_tokenize(patient_history)
    # Only the longest sentence is parsed, to keep the dependency plot readable.
    pos_text = max(sentences, key=lambda sentence: len(word_tokenize(sentence)))
    dep_doc = nlp(pos_text)
    displacy.render(dep_doc, style="dep", jupyter=True)

    print(colored("\nFeatures", 'green'))
    for feature_num in features_lst:
        # Positional access: the row label of features_df is not guaranteed
        # to equal feature_num, so label-based lookup can raise KeyError.
        feature = features_df.loc[
            features_df['feature_num'] == feature_num, 'feature_text'
        ].iloc[0]
        print(colored(feature, 'blue'))
        
# Show the annotation, NER, dependency and feature views for patient note 331.
patient_data(331)
# Alternative patient numbers (currently disabled):
# patient_data(46)
# patient_data(100)

********************************************************************************
Patient Number: 331[0m
[34mAnnotated Text[0m


[34m
Visualizing NER[0m


[32m
Visualizing POS tagging[0m


[32m
Features[0m
[34mFamily-history-of-MI-OR-Family-history-of-myocardial-infarction[0m
[34mFamily-history-of-thyroid-disorder[0m
[34mChest-pressure[0m
[34mIntermittent-symptoms[0m
[34mLightheaded[0m
[34mNo-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance[0m
[34mAdderall-use[0m
[34mShortness-of-breath[0m
[34mCaffeine-use[0m
[34mheart-pounding-OR-heart-racing[0m
[34mFew-months-duration[0m
[34m17-year[0m
[34mMale[0m
