In [None]:
import csv
import numpy as np
import random
import spacy

In [None]:
# Load spaCy's small English pipeline with the 'tok2vec', 'parser' and 'ner'
# components disabled, so only the remaining components run on `nlp(text)`.
# NOTE(review): tokenize() below relies on `lemma_`; confirm that lemma quality
# is not degraded by disabling 'tok2vec' with the installed spaCy/model version.
nlp = spacy.load('en_core_web_sm', disable=['tok2vec', 'parser', 'ner'])
# spaCy's default English stop words; used to filter the vocabulary in get_word_list().
stopwords = nlp.Defaults.stop_words
# Alternative minimal stop-word list (kept for experimentation):
# stopwords = ['the','and','when','where','who']

In [None]:
# Read the whole training CSV into memory as a list of rows (each row is a list
# of strings). The `with` block guarantees the handle is closed even if parsing
# raises, and `newline=''` is what the csv module requires so that newlines
# embedded in quoted fields are parsed correctly.
with open("new_train.csv", "r", encoding='utf-8', newline='') as file:
    data = list(csv.reader(file, delimiter=","))
# Drop the header row.
data = data[1:]
# Peek at the first record. Later cells read column 1 as the label name,
# column 2 as the text and the last column as the integer label id.
data[0]

In [None]:
# Shuffle once with a fixed seed so the fold assignment is reproducible.
random.seed(2023)
random.shuffle(data)

# Number of cross-validation folds.
N_FOLDS = 10

# Cut the shuffled rows into N_FOLDS contiguous, near-equal segments.
# Segment i is the held-out test set of fold i.
segments = []
for i in range(N_FOLDS):
    segments.append(data[(i*len(data))//N_FOLDS:((i+1)*len(data))//N_FOLDS])

# The training set of fold i is every segment EXCEPT segment i.
# (Extending with segments[i] here would make the model train on nine copies
# of its own test segment and leak the test data.)
train_data = []
for i in range(N_FOLDS):
    train = []
    for j in range(N_FOLDS):
        if j != i:
            train.extend(segments[j])
    train_data.append(train)

In [None]:
# Build a lookup from numeric label id (last column) to label name (column 1).
# Only the first name encountered for each id is recorded.
label_map = {}
for sample in data:
    label_id = int(sample[-1])
    if label_id in label_map:
        continue
    label_map[label_id] = sample[1].strip()
print(label_map)

In [None]:
def get_probabilities(data):
    """Return the relative frequency of each label in `data`.

    Args:
        data: iterable of rows whose last element is the label id
            (anything accepted by `int()`).

    Returns:
        dict mapping each integer label id to its share of the rows.
        An empty input yields an empty dict.
    """
    tally = {}
    for row in data:
        label = int(row[-1])
        tally[label] = tally.get(label, 0) + 1
    total = sum(tally.values())
    return {label: n / total for label, n in tally.items()}
# Display the label distribution over the rows currently held in `data`.
get_probabilities(data)

In [None]:
def tokenize(s):
    """Run `s` through the spaCy pipeline and return each token's lemma, in order."""
    lemmas = []
    for token in nlp(s):
        lemmas.append(token.lemma_)
    return lemmas

In [None]:
# For every training fold, turn each sample's text (column 2) into the set of
# lemmas it contains. tokenized_train_data[i][n] lines up with train_data[i][n].
#
# The loop variable is deliberately not named `data`: that would rebind the
# notebook-level `data` (the full data set) to the last training fold and
# silently change what later cells see.
#
# The same text can occur in several folds, so token sets are memoized per text
# to avoid re-running spaCy on it. The resulting sets are shared between folds
# and must be treated as read-only.
token_cache = {}
tokenized_train_data = []
for fold in train_data:
    curr = []
    for sample in fold:
        text = sample[2]
        if text not in token_cache:
            token_cache[text] = set(tokenize(text))
        curr.append(token_cache[text])
    tokenized_train_data.append(curr)

In [None]:
def get_word_list(tokenized_data):
    """Collect the vocabulary of a tokenized data set.

    Args:
        tokenized_data: iterable of per-sample word collections.

    Returns:
        set of every word that is longer than two characters and is not in
        the module-level `stopwords`.
    """
    return {
        word
        for words in tokenized_data
        for word in words
        if len(word) > 2 and word not in stopwords
    }

In [None]:
def get_word_frequencies(tokenized_data):
    """Return the relative frequency of every word in a tokenized data set.

    Every occurrence of a word in every sample is counted. (The increment must
    sit inside the inner loop; outside it, only the last word of each sample
    is counted and an empty first sample raises UnboundLocalError.)

    Args:
        tokenized_data: iterable of per-sample word collections.

    Returns:
        dict mapping each word to (occurrences of word) / (total occurrences).
        An empty input, or one with only empty samples, yields an empty dict.
    """
    word_counts = {}
    for words in tokenized_data:
        for word in words:
            word_counts[word] = word_counts.get(word, 0) + 1
    total = sum(word_counts.values())
    # `total` is zero only when there are no words, in which case nothing is divided.
    return {word: count / total for word, count in word_counts.items()}

In [None]:
def create_lookup_table(word_list, data, tokenized_data):
    """Build a table of per-class word likelihoods.

    For every vocabulary word and every class the word was seen with, the table
    stores (number of samples of that class containing the word) divided by
    (number of samples of that class), i.e. an estimate of
    P(word present | class). Word/class pairs that never co-occur are absent
    from the table rather than stored as zero.

    Args:
        word_list: container of vocabulary words; other words are ignored.
        data: rows whose last element is the class id. The last column is used
            so the keys agree with get_probabilities() and label_map.
        tokenized_data: per-sample word collections, aligned with `data`.
            Assumed to contain each word at most once per sample (e.g. sets);
            duplicates would inflate the counts.

    Returns:
        dict: word -> {class id -> likelihood}.
    """
    counts = {}  # class id -> number of samples of that class
    table = {}   # word -> {class id -> number of samples containing the word}
    for word_set, sample in zip(tokenized_data, data):
        k = int(sample[-1])  # class
        counts[k] = counts.get(k, 0) + 1
        for word in word_set:
            if word not in word_list:
                continue
            per_class = table.setdefault(word, {})
            per_class[k] = per_class.get(k, 0) + 1
    # Normalize raw counts by class size to get likelihoods.
    for per_class in table.values():
        for k in per_class:
            per_class[k] /= counts[k]

    return table

In [None]:
def naive_bayes(text, probabilities, prob_table, word_frequencies, tokenized=False):
    """Pick the most likely label for `text` using log-space naive Bayes.

    Args:
        text: raw string, or an iterable of words when `tokenized` is True.
        probabilities: label -> prior probability.
        prob_table: word -> {label -> likelihood of the word given the label}.
        word_frequencies: only used for membership; words missing from it
            are skipped.
        tokenized: set to True when `text` is already a collection of words.

    Returns:
        (label, log_score) for the best-scoring label among those in the
        module-level `label_map`. A label is discarded outright as soon as one
        known word has no likelihood entry for it. If every label is
        discarded, or none has a prior, the result is (-1, 0).
    """
    if tokenized:
        tokens = text
    else:
        tokens = set(tokenize(text))

    winner, winner_score = -1, 0
    for label in label_map:
        if label not in probabilities:
            continue
        score = np.log(probabilities[label])
        for token in tokens:
            # Words unknown to the model carry no evidence either way.
            if token not in prob_table or token not in word_frequencies:
                continue
            likelihoods = prob_table[token]
            if label not in likelihoods:
                # Word never seen with this label: rule the label out.
                break
            score += np.log(likelihoods[label])
        else:
            # Reached only when the label was not ruled out.
            if winner == -1 or score > winner_score:
                winner, winner_score = label, score
    return winner, winner_score


In [None]:
# Cross-validate the model: for each fold, fit the tables on that fold's
# training rows, predict every held-out row, and print the fold index with its
# macro-averaged F1 score.
for i, (test, train, tokenized_train) in enumerate(zip(segments, train_data, tokenized_train_data)):
    probabilities = get_probabilities(train)
    word_list = get_word_list(tokenized_train)
    prob_table = create_lookup_table(word_list, train, tokenized_train)
    word_frequencies = get_word_frequencies(tokenized_train)
    correct = 0
    preds = []
    ts = []
    for sample in test:
        # Classify the held-out sample's OWN text (column 2). Pairing test rows
        # with entries of `tokenized_train` would score the wrong texts.
        # Indexing [0] instead of unpacking into `_` avoids clobbering
        # IPython's last-output variable.
        pred = naive_bayes(sample[2], probabilities, prob_table, word_frequencies)[0]
        # The true label is the last column, as in get_probabilities()/label_map.
        truth = int(sample[-1])
        if pred == truth:
            correct += 1
        preds.append(pred)
        ts.append(truth)
    # Calculate F1 score (macro average over every label in label_map; a label
    # with no true positives in this fold contributes 0).
    f1 = 0
    for j in label_map:
        pos = 0    # samples whose true label is j
        p_pos = 0  # samples predicted as j
        t_pos = 0  # samples correctly predicted as j
        for pred, t in zip(preds, ts):
            if pred == j:
                p_pos += 1
            if t == j:
                pos += 1
                if pred == j:
                    t_pos += 1
        if t_pos > 0:
            # p_pos / t_pos is 1/precision and pos / t_pos is 1/recall,
            # so this is the harmonic mean of precision and recall.
            f1 += 2 / (p_pos / t_pos + pos / t_pos)
    f1 /= len(label_map)
    print(i, f1)


In [None]:
# Show (predicted label, true label) pairs for the most recently evaluated fold.
print(list(zip(preds, ts)))

In [None]:
# Label priors left over from the most recently evaluated fold.
print(probabilities)

In [None]:
# Spot-check: classify the text (column 2) of the second row of `data` using the
# tables left over from the most recently evaluated fold.
naive_bayes(data[1][2], probabilities, prob_table, word_frequencies)

In [None]:
# Display the label id -> label name mapping.
label_map

In [None]:
# Display the word frequencies left over from the most recently evaluated fold
# (one entry per distinct word, so the output can be long).
word_frequencies