# Naive Bayes Classifier on MIMICIII medical Notes
    By Binghui Zhang (bzhang62)

    Import libraries

In [1]:
import os
import pickle
import json
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import tensorflow
import re
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import precision_score
from sklearn.metrics import average_precision_score

    Load training and testing datasets. Label response categories according to readmission.
    If readmitted within 30 days label as 1, else 0 (not readmitted)

In [2]:
data_train = pd.read_csv("Data/train.csv",keep_default_na=False)
data_test = pd.read_csv("Data/test.csv",keep_default_na=False)
train_set = data_train["TEXT"]
train_y = data_train["DAYS_TO_READMIT"]
test_set = data_test["TEXT"]
test_y = data_test["DAYS_TO_READMIT"]
for i in range(len(train_y)):
    train_set[i] = re.sub(r"[^a-z0-9]"," ",train_set[i].lower())
    train_y[i] = int(train_y[i] or 0)
    if train_y[i] <= 30 : train_y[i]=1
    else : train_y[i] = 0
for i in range(len(test_y)):
    test_set[i] = re.sub(r"[^a-z0-9]"," ",test_set[i].lower())
    test_y[i] = int(test_y[i] or 0)
    if test_y[i] <= 30 : test_y[i]=1
    else : test_y[i] = 0

In [3]:
train_y = train_y.astype('int')
test_y = test_y.astype('int')

    Create tokenizer and tokenize vocab into integers. Features reflect whether a word was used in the note or not.

In [4]:
tokenizer = Tokenizer(num_words=10000,oov_token=0)
tokenizer.fit_on_texts(train_set)
tokenizer.fit_on_texts(test_set)
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

In [5]:
X_train = tokenizer.texts_to_matrix(train_set, mode='binary')
X_test = tokenizer.texts_to_matrix(test_set, mode='binary')

print('vocab size:', vocab_size)
for word in ['admission', 'date']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))
print(test_set[2][0:20])
print(X_test[2][55],X_test[2][57])

vocab size: 238470
admission: 55
date: 57
admission date      
1.0 1.0


    Train Naive Bayes Classifier with different priors

In [6]:
clf = CategoricalNB(class_prior=[0.8,0.2])
clf.fit(X_train,train_y)

CategoricalNB(class_prior=[0.8, 0.2])

In [7]:
print(clf.score(X_train,train_y))
print(clf.score(X_test,test_y))

0.722638278395505
0.6470642370697589


In [8]:
clf2 = CategoricalNB(class_prior=[0.5,0.5])
clf2.fit(X_train,train_y)

CategoricalNB(class_prior=[0.5, 0.5])

In [9]:
print(clf2.score(X_train,train_y))
print(clf2.score(X_test,test_y))

0.7229776118814069
0.6474323578133628


    Train two Gaussian Naive Bayes model with different priors

In [10]:
gnb = GaussianNB(priors=[0.8,0.2])
gnb.fit(X_train,train_y)

GaussianNB(priors=[0.8, 0.2])

In [11]:
print(gnb.score(X_train,train_y))
print(gnb.score(X_test,test_y))

0.7460128315406529
0.676513896558071


In [12]:
gnb2 = GaussianNB(priors=[0.5,0.5])
gnb2.fit(X_train,train_y)

GaussianNB(priors=[0.5, 0.5])

In [13]:
print(gnb2.score(X_train,train_y))
print(gnb2.score(X_test,test_y))

0.7460365059699019
0.676513896558071


    Display precision scores of each model

In [38]:
print('Precisions Scores:')
print('    Naive Bayes Classifier with informed prior:',round(average_precision_score(test_y,clf.predict_proba(X_test)[:,1]),3))
print('    Naive Bayes Classifier with flat prior:', round(average_precision_score(test_y, clf2.predict_proba(X_test)[:,1]),3))
print('    Gaussian Naive Bayes Classifier with informed prior:', round(average_precision_score(test_y, gnb.predict_proba(X_test)[:,1]),3))
print('    Gaussian Naive Bayes Classifier with flat prior:', round(average_precision_score(test_y, gnb2.predict_proba(X_test)[:,1]),3))

Precisions Scores:
    Naive Bayes Classifier with informed prior: 0.96
    Naive Bayes Classifier with flat prior: 0.96
    Gaussian Naive Bayes Classifier with informed prior: 0.947
    Gaussian Naive Bayes Classifier with flat prior: 0.947


In [34]:
clf.predict_proba(X_test[0:10])[:,1]

array([5.72133528e-118, 8.94651515e-090, 4.37404031e-263, 3.78201287e-166,
       8.80862544e-161, 1.00000000e+000, 8.17366011e-011, 1.76031340e-148,
       1.00000000e+000, 1.00000000e+000])

In [21]:
test_y[0:10]

0    1
1    1
2    0
3    1
4    0
5    1
6    1
7    0
8    1
9    1
Name: DAYS_TO_READMIT, dtype: int32