In [1]:
import pickle
import pandas as pd
import numpy as np
import nltk
import gensim
import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
style.use('ggplot')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
with open('EHR_df_part2', 'rb') as f:
    EHR_df = pickle.load(f)

In [3]:
EHR_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,ICD9_CODE,LONG_TITLE,MEDICAL_HISTORY,SOCIAL_HISTORY,FAMILY_HISTORY,HISTORY_INFO,stroke_label
1,22532,167853,Discharge summary,admission date: discharge date: ...,11,"Pulmonary tuberculosis, unspecified, tubercle ...",": cardiomyopathy idiopathic, echocardiogram...",: the patient is an african american female ...,: there is a of colon cancer,": cardiomyopathy idiopathic, echocardiogram...",non-stroke
2,13702,196489,Discharge summary,admission date: discharge date...,518,Acute and chronic respiratory failure,": copd flare fev1 40 in , on 5l oxygen, s...",: the patient is married and worked as a clin...,": fhx cad father with an mi in his 40 s, d...",": copd flare fev1 40 in , on 5l oxygen, s...",non-stroke
3,26880,135453,Discharge summary,admission date: discharge date...,805,Closed fracture of sixth cervical vertebra,: coronary artery disease s p cabg chf htn ai...,: patient recently discharged from for seve...,: non contributory,: coronary artery disease s p cabg chf htn ai...,non-stroke
4,53181,170490,Discharge summary,admission date: discharge date...,225,Benign neoplasm of cerebral meninges,", past surgical history, facial history, and",to the initial note on she cam to the bt...,": cancer, diabetes, hearing loss, and heart d...",", past surgical history, facial history, and ...",non-stroke
5,20646,134727,Discharge summary,admission date: discharge date...,518,Acute respiratory failure,: cad s p stent in chf htn pe pancreati...,: the patient has been in rehab for the past ...,: doesn t know about siblings health childre...,: cad s p stent in chf htn pe pancreati...,non-stroke


In [4]:
#Removing features(SUBJECT_ID, HADM_ID, CATEGORY) from the final DataFrame
EHR_df.drop(columns=['SUBJECT_ID', 'HADM_ID','CATEGORY','ICD9_CODE'],inplace=True)

## NLP Pre-Processing

Cleaning the 'HISTORY_INFO' corpus with NLP pre-processing techniques

In [5]:
#Loading the Medical Abbreviation dictionary pkl 
with open('med_dict.plk', 'rb') as f:
    med_dict = pickle.load(f, encoding ='UTF-8')

In [6]:
#Creating the following helper functions:
def abbr_replace(string):
    '''
    Input: Input a given string that contains a description of the medical, social , 
    and family history of the patient
    
    Output: A cleaned string where all the medical abbreviations is replaced.
    '''
    re_string = ' '
    med_abbr = 0
    for s in string:
        if s in med_dict:
            s = med_dict[s]
            re_string += ' ' + s
        else:
            re_string += ' ' + s
    return re_string
            
def lem(string):
    '''Lemmatization of the words in the string'''
    lem = WordNetLemmatizer()
    lemma = [lem.lemmatize(t) for t in string]
    return lemma

def punc(x):
    '''Removing any punctuations'''
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern, '', str(x))
    return x


stop = stopwords.words('english');

stop_words = ["disease, diseases, disorder, symptom, symptoms, drug, drugs, problems, problem,prob, probs, med, meds,\
, pill, pills, medicine, medicines, medication, medications, treatment, treatments, caps, capsules, capsule,\
, tablet, tablets, tabs, doctor, dr, dr., doc, physician, physicians, test, tests, testing, specialist, specialists,\
, side-effect, side-effects, patient, patients, pharmaceutical, pharmaceuticals, pharma, diagnosis, diagnose, diagnosed, exam,\
, challenge, device, condition, conditions, suffer, suffering ,suffered, feel, feeling, prescription, prescribe,\
, prescribed, over-the-counter, a, about, above, after, again, against, all, am, an, and, any, are, aren't, as, at, be, because, been, before,\
, being, below, between, both, but, by, can, can't, cannot, could, couldn't, did, didn't, do, does, doesn't,\
, doing, don't, down, during, each, few, for, from, further, had, hadn't, has, hasn't, have, haven't, having, he,\
, he'd, he'll, he's, her, here, here's, hers, herself, him, himself, his, how, how's, i, i'd, i'll, i'm, i've, if, in, into,\
, is, isn't, it, it's, its, itself, let's, me, more, most, mustn't, my, myself, no, nor, not, of, off, on, once, only, or,\
, other, ought, our, ours , ourselves, out, over, own, same, shan't, she, she'd, she'll, she's, should, shouldn't,\
, so, some, such, than, that, that's, the, their, theirs, them, themselves, then, there, there's, these, they,\
, they'd, they'll, they're, they've, this, those, through, to, too, under, until, up, very, was, wasn't, we, we'd,\
, we'll, we're, we've, were, weren't, what, what's, when, when's, where, where's, which, while, who, who's,\
, whom, why, why's, with, won't, would, wouldn't, you, you'd, you'll, you're, you've, your, yours, yourself,\
, yourselves, n't, 're, 've, 'd, 's, 'll, 'm",'life','wife','year','ago','non','contributory']

stop.extend(stop_words)

In [7]:
EHR_df.head()

Unnamed: 0,TEXT,LONG_TITLE,MEDICAL_HISTORY,SOCIAL_HISTORY,FAMILY_HISTORY,HISTORY_INFO,stroke_label
1,admission date: discharge date: ...,"Pulmonary tuberculosis, unspecified, tubercle ...",": cardiomyopathy idiopathic, echocardiogram...",: the patient is an african american female ...,: there is a of colon cancer,": cardiomyopathy idiopathic, echocardiogram...",non-stroke
2,admission date: discharge date...,Acute and chronic respiratory failure,": copd flare fev1 40 in , on 5l oxygen, s...",: the patient is married and worked as a clin...,": fhx cad father with an mi in his 40 s, d...",": copd flare fev1 40 in , on 5l oxygen, s...",non-stroke
3,admission date: discharge date...,Closed fracture of sixth cervical vertebra,: coronary artery disease s p cabg chf htn ai...,: patient recently discharged from for seve...,: non contributory,: coronary artery disease s p cabg chf htn ai...,non-stroke
4,admission date: discharge date...,Benign neoplasm of cerebral meninges,", past surgical history, facial history, and",to the initial note on she cam to the bt...,": cancer, diabetes, hearing loss, and heart d...",", past surgical history, facial history, and ...",non-stroke
5,admission date: discharge date...,Acute respiratory failure,: cad s p stent in chf htn pe pancreati...,: the patient has been in rehab for the past ...,: doesn t know about siblings health childre...,: cad s p stent in chf htn pe pancreati...,non-stroke


In [8]:
#Applying the helper functions to help clean the corpus
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].str.split()
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].apply(lambda x:abbr_replace(x))

In [9]:
#Applying the helper functions to help clean the corpus
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].str.split()
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].apply(lambda x:lem(x))
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].apply(lambda x:punc(x))
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].apply(lambda x:gensim.utils.simple_preprocess(str(x)))
EHR_df['HISTORY_INFO'] = EHR_df['HISTORY_INFO'].apply(lambda x:[item for item in x if item not in stop])
#joining all the words together before tokenizing the corpus
EHR_df['HISTORY_INFO']=[" ".join(review) for review in EHR_df['HISTORY_INFO'].values]


In [10]:
tmp = EHR_df[pd.notnull(EHR_df['HISTORY_INFO'])]

none_list = ['none', 'none none', 'none none none', 'unknown', 'unknown unknown',
             'unknown unknown unknown', 'none unknown unknown', 'denies', 'noncontributory',
            'unknown unknown nc', 'as above', 'non contributory']
null_count = 0 


for word in none_list:
    null_count += tmp[tmp['HISTORY_INFO'] == word].shape[0]
    
EHR_df = tmp[~tmp['HISTORY_INFO'].isin (none_list)]
EHR_df = EHR_df[EHR_df['HISTORY_INFO'] != '']
print ('Number of none or unknown value: %d' %null_count)


Number of none or unknown value: 81


Random Sampling of the non-stroke 

In [None]:
# sampling out stroke and non-stroke data and assign new user IDs
stroke = EHR_df[EHR_df['stroke_label'] == 'stroke']
non_stroke = EHR_df[EHR_df['stroke_label'] == 'non-stroke']
non_stroke = non_stroke.sample(2000)
EHR_df = pd.concat([stroke, non_stroke])
EHR_df['ID'] = range(1, EHR_df.shape[0] + 1)
# random shuffle data
EHR_df = EHR_df.sample(frac=1).reset_index(drop=True)
# examine the sampled data
EHR_df.head()

In [None]:
stroke_corpus = EHR_df[EHR_df.stroke_label == 'stroke']['HISTORY_INFO'].values
non_stroke_corpus = EHR_df[EHR_df.stroke_label == 'non-stroke']['HISTORY_INFO'].values

In [None]:
#Creating a helper function that will return the top 10 features in a record

def topfeaturesfreq(tokenizer,sparse_matrix, top_n):
    ''' This method will return the top 10 words associated with the tokenizer'''
    features_counts = tokenizer.get_feature_names()
    counts = sparse_matrix.astype(float)
    #sort the index by the word occurance in descending order
    counts_ind = np.argsort(counts.toarray().sum(axis=0))[::-1]
    top_features_count = [(features_counts[i],counts.toarray().sum(axis=0)[i]) for i in counts_ind[:top_n]]
    return top_features_count

def bottomfeaturesfreq(tokenizer,sparse_matrix, bot_n):
    ''' This method will return the bottom 10 words associated with the tokenizer'''
    features_counts = tokenizer.get_feature_names()
    counts = sparse_matrix.astype(float)
    #sort the index by the word occurance in descending order
    counts_ind = np.argsort(counts.toarray().sum(axis=0))[::-1]
    bottom_features_count = [(features_counts[i],counts.toarray().sum(axis=0)[i]) for i in counts_ind[-bot:]]
    return bottom_features_count

def plot_words(top_features,title,ylabel):
    ''' This fxn will help plot the top words'''
    plt.figure(figsize=(10,10))
    x_tic = range(len(top_features))
    plt.bar(x_tic, [t[1] for t in top_features], align = 'center')
    labels = [t[0] for t in top_features]
    plt.title(title)
    plt.xticks(x_tic,labels,rotation = 70)
    plt.ylabel(ylabel)
    plt.show()

### Count Vec
Generating the top 10 words that are associated to strokes and non-strokes from Count_Vec

In [None]:
count_vec = CountVectorizer(ngram_range=(2,2))
stroke_count = count_vec.fit_transform(stroke_corpus)

In [None]:
stroke_count_df = pd.DataFrame(stroke_count.toarray(), columns=count_vec.get_feature_names())
stroke_count_df.head()

In [None]:
plot_words(topfeaturesfreq(count_vec,stroke_count,top_n = 10),title = 'Top 10 words in Stroke Data using CountVec', ylabel = 'Number of Words')

In [None]:
non_stroke_count = count_vec.fit_transform(non_stroke_corpus)

In [None]:
plot_words(topfeaturesfreq(count_vec,non_stroke_count,top_n = 10),title = 'Top 10 words in Non-Stroke Data using CountVec', ylabel = 'Number of Words')

In [None]:
non_stroke_count_df = pd.DataFrame(non_stroke_count.toarray(), columns=count_vec.get_feature_names())

In [None]:
non_stroke_count_df = pd.DataFrame(non_stroke_count.toarray(), columns=count_vec.get_feature_names())
non_stroke_count_df.head()

### TFIDF Vec

Generating the top 10 words that are associated to strokes and non-strokes from TFIDF

In [None]:
tfidf_vec = TfidfVectorizer(ngram_range=(2,2))
stroke_tfidf = tfidf_vec.fit_transform(stroke_corpus)

In [None]:
plot_words(topfeaturesfreq(tfidf_vec,stroke_tfidf,top_n = 10),title = 'Top 10 words in Stroke Data using TFIDF Vec', ylabel = 'TFIDF')

In [None]:
tfidf_vec = TfidfVectorizer(ngram_range=(2,2))
non_stroke_tfidf = tfidf_vec.fit_transform(non_stroke_corpus)
plot_words(topfeaturesfreq(tfidf_vec,non_stroke_tfidf,top_n = 10),title = 'Top 10 words in Non-Stroke Data using TFIDF Vec', ylabel = 'TFIDF')

### Word2Vec Pre-Trained

In [None]:
class MySentences(object):
    """MySentences is a generator to produce a list of tokenized sentences 
    
    Takes a list of numpy arrays containing documents.
    
    Args:
        arrays: List of arrays, where each element in the array contains a document.
    """
    def __init__(self, *arrays):
        self.arrays = arrays
 
    def __iter__(self):
        for array in self.arrays:
            for document in array:
                for sent in nltk.sent_tokenize(document):
                    yield nltk.word_tokenize(sent)

def get_word2vec(sentences, location):
    """Returns trained word2vec
    
    Args:
        sentences: iterator for sentences
        
        location (str): Path to save/load word2vec
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        model = gensim.models.Word2Vec.load(location)
        return model
    
    print('{} not found. training model'.format(location))
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
    print('Model done training. Saving to disk')
    model.save(location)
    return model


In [None]:
w2vec = get_word2vec(MySentences(EHR_df.HISTORY_INFO.values),'w2vmodel')

In [None]:
class MyTokenizer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        return np.array(transformed_X)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    def fit_transform(self, X, y=None):
        return self.transform(X)
    

In [None]:
# mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)

In [None]:
# mean_embedding_vectorizer.word2vec

## Testing Visualization

In [17]:
EHR_df.head()

Unnamed: 0,TEXT,LONG_TITLE,MEDICAL_HISTORY,SOCIAL_HISTORY,FAMILY_HISTORY,HISTORY_INFO,stroke_label
1,admission date: discharge date: ...,"Pulmonary tuberculosis, unspecified, tubercle ...",": cardiomyopathy idiopathic, echocardiogram...",: the patient is an african american female ...,: there is a of colon cancer,cardiomyopathy idiopathic echocardiogram showe...,non-stroke
2,admission date: discharge date...,Acute and chronic respiratory failure,": copd flare fev1 40 in , on 5l oxygen, s...",: the patient is married and worked as a clin...,": fhx cad father with an mi in his 40 s, d...",copd flare fev oxygen intubation distal trache...,non-stroke
3,admission date: discharge date...,Closed fracture of sixth cervical vertebra,: coronary artery disease s p cabg chf htn ai...,: patient recently discharged from for seve...,: non contributory,coronary artery disease cabg chf htn aicd atri...,non-stroke
4,admission date: discharge date...,Benign neoplasm of cerebral meninges,", past surgical history, facial history, and",to the initial note on she cam to the bt...,": cancer, diabetes, hearing loss, and heart d...",past surgical history facial history initial n...,non-stroke
5,admission date: discharge date...,Acute respiratory failure,: cad s p stent in chf htn pe pancreati...,: the patient has been in rehab for the past ...,: doesn t know about siblings health childre...,cad stent chf htn pe pancreatic mass depressio...,non-stroke


## PCA VISUALIZATION

In [None]:
EHR_vis = EHR_df[['HISTORY_INFO','stroke_label']]

In [None]:
# convert text into word frequency
count_vec = CountVectorizer(ngram_range = (2,2))
X_count = count_vec.fit_transform(EHR_vis.HISTORY_INFO)
counts = X_count.astype(float)
# converted notes into term frequency–inverse document frequency with TfidfVectorizer
tfidf_vec = TfidfVectorizer(ngram_range = (2,2))
X_tfidf = tfidf_vec.fit_transform(EHR_vis.HISTORY_INFO)
print (counts.shape)
print (X_tfidf.shape)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D

def plot_2d(plot_columns, title, xlabel, ylabel):
    """
    this method generates 2D plot for dimensionality reduction
    """
    # Fit the model on the numeric columns from earlier.
    len_s = len(stroke_corpus)
    # Make a scatter plot, shaded according to cluster assignment.
    plt.figure(figsize = (8,8))
    plt.scatter(x=plot_columns[:len_s, 0], y=plot_columns[:len_s,1], c = ['red', 'green'], 
                cmap = cm.brg_r, label = 'Stroke')
    plt.scatter(x=plot_columns[len_s:,0], y=plot_columns[len_s:,1], c = 'green', 
                cmap = cm.brg_r, label = 'Non-Stroke')
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(loc = 4)

In [None]:
dataset_name = 'word ocurance'
X = counts.toarray()
# 1 for stroke and 0 for non-stroke
pca_2_count = PCA(2)
plot_columns_count = pca_2_count.fit_transform(X)
title = "Two PCA directions (%s)"%dataset_name
xlabel = "1st eigenvector"
ylabel = "2nd eigenvector"


In [None]:
# visulize the data with word occurance
plot_2d(plot_columns_count, title, xlabel, ylabel)

In [None]:
dataset_name = 'tfidf'
X = X_tfidf.toarray()
# 1 for stroke and 0 for non-stroke
pca_2_tfidf = PCA(2)
plot_columns_tfidf = pca_2_tfidf.fit_transform(X)
title = "Two PCA directions (%s)"%dataset_name
xlabel = "1st eigenvector"
ylabel = "2nd eigenvector"

# visulize the data with word tfidf
plot_2d(plot_columns_tfidf, title, xlabel, ylabel)

In [None]:
with open('EHR_df_final', 'wb') as f: #change
    pickle.dump(EHR_df, f)         #change 