In [1]:
import csv
import numpy as np
import random

In [2]:
file = open("new_train.csv", "r", encoding='utf-8')
data = list(csv.reader(file, delimiter=","))
file.close()
data = data[1:]
data[0]

['0',
 ' Emergency Room Reports',
 'REASON FOR THE VISIT:,  Very high PT/INR.,HISTORY: , The patient is an 81-year-old lady whom I met last month when she came in with pneumonia and CHF.  She was noticed to be in atrial fibrillation, which is a chronic problem for her.  She did not want to have Coumadin started because she said that she has had it before and the INR has had been very difficult to regulate to the point that it was dangerous, but I convinced her to restart the Coumadin again.  I gave her the Coumadin as an outpatient and then the INR was found to be 12.  So, I told her to come to the emergency room to get vitamin K to reverse the anticoagulation.,PAST MEDICAL HISTORY:,1.  Congestive heart failure.,2.  Renal insufficiency.,3.  Coronary artery disease.,4.  Atrial fibrillation.,5.  COPD.,6.  Recent pneumonia.,7.  Bladder cancer.,8.  History of ruptured colon.,9.  Myocardial infarction.,10.  Hernia repair.,11.  Colon resection.,12.  Carpal tunnel repair.,13.  Knee surgery.,M

In [3]:
# Get a map from label numbers to names
label_map = {}
for sample in data:
    if int(sample[-1]) not in label_map:
        label_map[int(sample[-1])] = sample[1].strip()
print(label_map)

{0: 'Emergency Room Reports', 1: 'Surgery', 2: 'Radiology', 3: 'Podiatry', 4: 'Neurology', 5: 'Gastroenterology', 6: 'Orthopedic', 7: 'Cardiovascular / Pulmonary', 8: 'Nephrology', 9: 'ENT - Otolaryngology', 10: 'General Medicine', 11: 'Hematology - Oncology', 12: 'Cosmetic / Plastic Surgery', 13: 'SOAP / Chart / Progress Notes', 14: 'Chiropractic', 15: 'Psychiatry / Psychology', 16: 'Consult - History and Phy.', 17: 'Hospice - Palliative Care', 18: 'Neurosurgery', 19: 'Obstetrics / Gynecology', 20: 'Urology', 21: 'Discharge Summary', 22: 'Autopsy', 23: 'Dermatology', 24: 'Letters', 25: 'Office Notes', 26: 'Lab Medicine - Pathology', 27: 'Ophthalmology', 28: 'Speech - Language', 29: 'Dentistry', 30: 'Pediatrics - Neonatal', 31: 'Physical Medicine - Rehab', 32: 'Bariatrics', 33: 'Endocrinology', 34: 'Pain Management', 35: 'IME-QME-Work Comp etc.', 36: 'Allergy / Immunology', 37: 'Sleep Medicine', 38: 'Diets and Nutritions', 39: 'Rheumatology'}


In [4]:
# Find frequency of each label in training data
def get_probabilities(data):
    counts = {}
    for sample in data:
        if int(sample[-1]) not in counts:
            counts[int(sample[-1])] = 0
        counts[int(sample[-1])] += 1
    s = sum(counts.values())
    probabilities = counts
    for k in probabilities:
        probabilities[k] /= s
    return probabilities
get_probabilities(data)

{0: 0.015873015873015872,
 1: 0.21743512219702696,
 2: 0.05366591080876795,
 3: 0.008818342151675485,
 4: 0.04283194759385236,
 5: 0.044343663391282435,
 6: 0.072814310909549,
 7: 0.07785336356764928,
 8: 0.015873015873015872,
 9: 0.020660115898211137,
 10: 0.052658100277147894,
 11: 0.017132779037540943,
 12: 0.004787100025195263,
 13: 0.034013605442176874,
 14: 0.0030234315948601664,
 15: 0.011337868480725623,
 16: 0.10330057949105569,
 17: 0.0015117157974300832,
 18: 0.017888636936255985,
 19: 0.030990173847316706,
 20: 0.033761652809271854,
 21: 0.021919879062736205,
 22: 0.001763668430335097,
 23: 0.005291005291005291,
 24: 0.004787100025195263,
 25: 0.009574200050390527,
 26: 0.0012597631645250692,
 27: 0.016880826404635927,
 28: 0.002015621063240111,
 29: 0.005291005291005291,
 30: 0.013857394809775762,
 31: 0.004031242126480222,
 32: 0.003779289493575208,
 33: 0.004031242126480222,
 34: 0.013605442176870748,
 35: 0.0030234315948601664,
 36: 0.0015117157974300832,
 37: 0.0030234

In [5]:
def tokenize(s):
    '''Filter out non-alphabet characters
    Standardize case
    Return list of words'''
    out = []
    cur = ""
    for c in s:
        if c.isalpha():
            cur += c.lower()
        elif len(cur) > 0:
            out.append(cur)
            cur = ""
    return out

In [11]:
# Get full list of words that appear
def get_word_list(data):
    words = set()
    for sample in data:
        for word in tokenize(sample[2]):
            # Filter out common words
            if len(word) > 2 and word not in ['the','and','when','where','who']:
                words.add(word)
    return words
word_list = get_word_list(data)

In [12]:
# Get frequency of words in data set
def get_word_frequencies(data):
    word_frequencies = {}
    for sample in data:
        words = set(tokenize(sample[2]))
        for word in words:
            if word not in word_frequencies:
                word_frequencies[word] = 0
            word_frequencies[word] += 1
    s = sum(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= s
    return word_frequencies

In [13]:
# Create a look-up table for the probability of a sample being a given class
# given a word w is in the sample
def create_lookup_table(word_list, data):
    counts = {}
    table = {}
    sample_sets = []
    for sample in data:
        word_set = set(tokenize(sample[2]))
        k = int(sample[3])  # class
        if k not in counts:
            counts[k] = 0
        counts[k] += 1
        for word in word_set:
            if word not in word_list:
                continue
            if word not in table:
                table[word] = {}
            if k not in table[word]:
                table[word][k] = 0
            table[word][k] += 1
    for word in table:
        for k in table[word]:
            table[word][k] /= counts[k]

    return table

In [14]:
def naive_bayes(text, probabilities, prob_table, word_frequencies):
    words = set(tokenize(text))
    best_class = -1
    best_prob = 0
    for label in label_map:
        if label not in probabilities:
            continue
        cur_prob = np.log(probabilities[label])
        for word in words:
            if word in prob_table and word in word_frequencies:
                if label in prob_table[word]:
                    cur_prob += np.log(prob_table[word][label])
                else:
                    cur_prob = 1
            if cur_prob == 1:
                break
        if 1 > cur_prob and (cur_prob > best_prob or best_class == -1):
            best_class = label
            best_prob = cur_prob
    return best_class, best_prob


In [15]:
# Test model
random.shuffle(data)
segments = []
for i in range(10):
    segments.append(data[(i*len(data))//10:((i+1)*len(data))//10])
for i in range(10):
    train = []
    for j in range(10):
        if j != i:
            train.extend(segments[i])
    test = segments[i]
    probabilities = get_probabilities(train)
    word_list = get_word_list(train)
    prob_table = create_lookup_table(word_list, train)
    word_frequencies = get_word_frequencies(train)
    correct = 0
    preds = []
    ts = []
    for sample in test:
        pred, _ = naive_bayes(sample[2], probabilities, prob_table, word_frequencies)
        if pred == int(sample[3]):
            correct += 1
        preds.append(pred)
        ts.append(int(sample[3]))
    # Calculate F1 score
    f1 = 0
    for j in label_map:
        f_neg = 0
        pos = 0
        p_pos = 0
        t_pos = 0
        for pred, t in zip(preds, ts):
            if pred == j:
                p_pos += 1
            if t == j:
                pos += 1
                if pred != j:
                    f_neg += 1
                else:
                    t_pos += 1
        if t_pos > 0:
            f1 += 2 / (p_pos / t_pos + pos / t_pos)
    f1 /= len(label_map)
    print(i, f1)


0 0.7956568303310714
1 0.8523403004482049
2 0.7508536202086405
3 0.7354380144401643
4 0.7646817178145245
5 0.8320737335578651
6 0.8960095139936183
7 0.7812623389515083
8 0.883453684720318
9 0.8576167387765959


In [None]:
print(list(zip(preds, t)))

In [None]:
print(probabilities)

In [None]:
naive_bayes(data[1][2], probabilities, prob_table, word_frequencies)

In [None]:
label_map

In [None]:
word_frequencies