### 2. Naive Bayes Classifier for fortune cookie messages

In [4]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer

Utility function that reads a file and returns its contents as a list of lines (with surrounding whitespace stripped)

In [5]:
def read_file(file_name):
    """Read a text file and return its lines with surrounding whitespace removed.

    Parameters
    ----------
    file_name : str or path-like
        Path of the file to open (text mode, platform default encoding).

    Returns
    -------
    list of str
        One entry per line of the file, each passed through ``str.strip()``.
        Blank lines are kept as empty strings.
    """
    stripped = []
    with open(file_name) as handle:
        for raw_line in handle:
            stripped.append(raw_line.strip())
    return stripped

#### Pre-processing step1: Build vocabulary by removing stop words

In [6]:
# Load the stop words (one entry per line of "stoplist") and configure a
# word-level vectorizer that drops them and records presence/absence
# (binary=True) instead of raw counts.
stop_words_list = read_file("stoplist")
vectorizer = CountVectorizer(
    analyzer='word',
    binary=True,
    stop_words=stop_words_list,
)

#### Pre-processing step2: Transform input messages into message-word binary matrix

In [7]:
# Learn the vocabulary and build the binary message-word matrix in one pass
# (fit_transform avoids tokenizing the training messages twice).
training_messages = read_file('traindata')
msg_word_matrix = vectorizer.fit_transform(training_messages)

# Read the training labels (one integer per line) and check that they line up
# with the messages before combining them.
labels = np.array(read_file("trainlabels"), dtype='int64')
if labels.size != msg_word_matrix.shape[0]:
    raise ValueError(
        "trainlabels has %d entries but traindata has %d messages"
        % (labels.size, msg_word_matrix.shape[0])
    )

# Attach the labels as the last column of the message-word matrix
msg_word_label_matrix = hstack([msg_word_matrix, labels.reshape(-1, 1)])

#### Pre-processing step3: Save the message-word matrix into a file

In [8]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back to the old
# method only on releases that do not provide it.
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names_out() returns an ndarray; convert to a list so that
    # the extra column name can be appended.
    words = vectorizer.get_feature_names_out().tolist()
else:
    words = list(vectorizer.get_feature_names())
words.append('label')

# Densify the message-word-label matrix and write it out as CSV with a header row
msg_word_df = pd.DataFrame(msg_word_label_matrix.toarray(), columns=words)
msg_word_df.to_csv("preprocessed.txt", index=False)

### Classify test messages using Naive Bayes

Since each word feature is binary, we treat it as a Bernoulli random variable
and therefore use the `BernoulliNB` classifier.

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score

# Fixed seed so the hold-out split, and therefore the reported accuracy,
# is the same on every Restart & Run All.
SPLIT_SEED = 42

# Hold out 20% of the labelled messages, keeping the class proportions
# the same in both parts (stratify=labels).
X_train, X_test, y_train, y_test = train_test_split(
    msg_word_matrix,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=SPLIT_SEED,
)

# Fit Bernoulli Naive Bayes on the training part and score it on the held-out part
ber_nb = BernoulliNB()
ber_nb.fit(X_train, y_train)
y_predicted = ber_nb.predict(X_test)
accuracy_score(y_test, y_predicted)

0.8153846153846154

Predict labels for the test messages and save them to `results.txt`

In [10]:
# Vectorize the unseen messages with the already-fitted vectorizer, classify
# them, and write one predicted label per line to results.txt.
test_messages = read_file('testdata')
test_msg_word_matrix = vectorizer.transform(test_messages)
predicted_labels = ber_nb.predict(test_msg_word_matrix)
np.savetxt(
    "results.txt",
    predicted_labels,
    fmt="%s",
    newline="\n",
)