In [2]:
import pandas as pd
import spacy
from spacy import displacy
import scispacy
import medspacy
from medspacy.ner import TargetRule
import en_core_sci_sm
import en_ner_bc5cdr_md

In [3]:
note_path = '../MIMIC/physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv'
diag_path = '../MIMIC/physionet.org/files/mimiciii/1.4/DIAGNOSES_ICD.csv'

In [4]:
notes_df = pd.read_csv(note_path)
diag_df = pd.read_csv(diag_path)

In [5]:

summary_df = notes_df.loc[notes_df['CATEGORY'] == 'Discharge summary', ['SUBJECT_ID', 'HADM_ID', 'TEXT']]
disease_df = diag_df[diag_df['ICD9_CODE'].notna() & diag_df['ICD9_CODE'].str.startswith(('303'))].copy()
patients_df = pd.merge(summary_df, disease_df, on=['SUBJECT_ID', 'HADM_ID'] )


In [6]:
nlp = spacy.load('en_core_web_sm')
for index, row in patients_df.head(1).iterrows():
  entities = nlp(row['TEXT'])
  displacy.render(entities, style='ent', jupyter=True)

In [7]:
nlp = spacy.load('en_core_sci_sm')
for index, row in patients_df.head(1).iterrows():
  entities = nlp(row['TEXT'])
  displacy.render(entities, style='ent', jupyter=True)

In [8]:
nlp = medspacy.load()
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = [
  TargetRule('EtOH abuse', 'DISEASE'),
  TargetRule('EtOH', 'SUBSTANCE'),
  TargetRule('Alcohol Withdrawl', 'PROBLEM'),
  TargetRule('EtOH Withdrawl', 'PROBLEM')
]
target_matcher.add(target_rules)
for index, row in patients_df.head(1).iterrows():
  entities = nlp(row['TEXT'])
  displacy.render(entities, style='ent', jupyter=True)

In [9]:
pd.options.mode.chained_assignment = None
import re
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [10]:
def tsne_plot(model,words,sample_size, preTrained=False):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in words[:sample_size]:
      if preTrained:
          tokens.append(model[word])
      else:
          tokens.append(model.wv[word])
      labels.append(word)

    tokens = np.array(tokens)
    tsne_model = TSNE(perplexity=20, early_exaggeration=12, n_components=2, init='pca', n_iter=1000, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()

In [11]:

import spacy
nlp = spacy.load('en_core_sci_sm')
df = notes_df.loc[:100, 'TEXT'].tolist()
corpus = []
for row in range(0, len(df)):
  str_tokens=[]
  tokens=nlp(df[row]).ents
  for i in range(0, len(tokens)):
    str_tokens.append(tokens[i].text)
  corpus.append(str_tokens)

print(corpus)

[['Admission', 'Discharge', 'ADDENDUM', 'RADIOLOGIC', 'STUDIES', 'Radiologic studies', 'cavitary', 'lesions', 'left lung apex', 'consistent with', 'infectious process/tuberculosis', 'moderate-sized left pleural effusion', 'HEAD', 'CT', 'intracranial hemorrhage', 'mass\neffect', 'infarction', 'consistent with', 'ABDOMINAL', 'CT', 'Abdominal CT', 'lesions', 'T10', 'sacrum', 'osteoporosis', 'repeat imaging', 'outpatient', 'M.D.', 'By:[**Hospital', 'MEDQUIST36', 'JOB'], ['Admission', 'Discharge', 'Birth', 'Sex', 'Service', 'MICU', 'female', 'history of emphysema', 'home O2', 'days', 'shortness', 'breath', 'COPD', 'flare', 'days', 'admission', 'taper', 'day', 'oxygen', 'oxygen saturation', 'levofloxacin', 'nebulizers', '[**Hospital1 18**] Emergency Room', '[**Hospital3 **] Emergency Room', 'oxygen saturation', 'CPAP', 'nebulizer', 'treatment', 'Solu-Medrol', 'Review of', 'systems', 'negative', 'Fevers', 'nausea', 'vomiting', 'night sweats', 'weight', 'neurologic changes', 'rashes', 'palpita

In [12]:
model = Word2Vec(corpus, min_count=1)
vocabs = model.wv.key_to_index.keys()
print(vocabs)
print(model.wv['patient'])

dict_keys(['patient', 'day', 'Tablet Sig', 'Tablet', 'PO', 'Daily', 'days', 'Tablet PO', 'admission', 'Admission', 'Patient', 'hours', 'stable', 'daily', 'BLOOD', 'negative', 'BID', 'discharge', 'Pt', 'evidence', 'ED', "patient's", 'mouth', 'baseline', 'HCO3', 'weeks', 'pain', 'breath', 'LF', 'increased', 'admitted', 'history', 'INR', 'TID', 'Delayed Release', 'Please', 'bilaterally', 'MD', 'COPD', 'changes', 'Medications', 'consistent with', 'CXR', 'BP', 'Capsule Sig', 'intact', 'medications', 'CT', 'Service', 'Social History', 'PCP', 'Birth', 'Illness', 'Sex', 'floor', 'Name3', 'Surgical', 'Invasive Procedure', 'week', 'Family History', 'Location', 'time', 'severe', 'HTN', 'Discharge Medications', 'Physical Exam', 'Followup Instructions', 'decreased', 'Capsule PO', 'O2', 'Inhalation', 'hypotension', 'shortness', 'Tablet PO DAILY', 'DAILY', 'Tablet PO DAILY\n', 'DISCHARGE', 'appointment', 'post', 'ICU', 'nausea', 'Coumadin', 'improved', 'symptoms', 'Brief Hospital Course', 'chest pain

In [None]:
new_v = np.array(list(vocabs))
tsne_plot(model, new_v, -1)

<gensim.models.word2vec.Word2Vec at 0x7fe04f178820>