In [1]:
from collections import Counter
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV, PredefinedSplit
import numpy as np
import pandas as pd

In [2]:
def get_vocab(data: pd.DataFrame, input_col: str) -> pd.Series:
    """Count how often each token occurs in a text column.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the result is split on
    whitespace into tokens.

    Args:
        data: Frame holding the text column. It is left unmodified.
        input_col: Name of the column in ``data`` that contains the text.

    Returns:
        The result of ``value_counts()`` over all tokens: a Series indexed
        by token whose values are occurrence counts.
    """
    # Work on a derived Series instead of assigning back into ``data`` so the
    # caller's frame keeps its raw text and re-running the cell is idempotent.
    cleaned = (
        data[input_col]
        .str.lower()
        # regex=True must be explicit: pandas >= 2.0 defaults to a literal
        # match, which would leave all punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
    )
    tokens = cleaned.str.split(expand=True)
    return tokens.stack().value_counts()

In [3]:
def pre_process_binary(data, input_col, vocab):
    """Encode a text column as a binary bag-of-words matrix.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the result is split on
    whitespace. Entry ``[i, j]`` of the output is 1 when the ``j``-th
    vocabulary word occurs at least once in the ``i``-th row of ``data``
    (by position, not by index label) and 0 otherwise.

    Args:
        data: pandas DataFrame holding the text column. It is left unmodified.
        input_col: Name of the column in ``data`` that contains the text.
        vocab: Sized iterable of words (e.g. a ``pd.Index`` or a list); its
            order defines the column order of the output.

    Returns:
        A float NumPy array of shape ``(number of rows, len(vocab))``.
    """
    # Work on a derived Series instead of assigning back into ``data`` so the
    # caller's frame keeps its raw text.
    cleaned = (
        data[input_col]
        .str.lower()
        # regex=True must be explicit: pandas >= 2.0 defaults to a literal
        # match, which would leave all punctuation in place.
        .str.replace(r'[^\w\s]', '', regex=True)
    )
    tokens = cleaned.str.split(expand=True)

    # Map every word to the column(s) it owns so each row costs one lookup per
    # distinct token instead of a scan over the whole vocabulary.
    columns_of = {}
    for j, word in enumerate(vocab):
        columns_of.setdefault(word, []).append(j)

    out = np.zeros(shape=(tokens.shape[0], len(vocab)))
    # enumerate() yields positions; the frame's index labels are not guaranteed
    # to be 0..n-1 and must not be used to address rows of ``out``.
    for i, row in enumerate(tokens.to_numpy()):
        # Padding cells produced by expand=True (None/NaN) are not strings.
        present = {tok for tok in row if isinstance(tok, str)}
        for tok in present:
            cols = columns_of.get(tok)
            if cols:
                out[i, cols] = 1
    return out

In [4]:
# Read the three tab-separated Yelp splits; the files carry no header row, so
# the column names are supplied here.
yelp_train, yelp_valid, yelp_test = (
    pd.read_csv(path, sep='\t', names=["Comments", "Rating"])
    for path in (
        './data/yelp-train.txt',
        './data/yelp-valid.txt',
        './data/yelp-test.txt',
    )
)

In [5]:
# Build the vocabulary from the training split only, then encode every split
# against the same first 10000 entries of the vocabulary index.
vocab = get_vocab(data=yelp_train, input_col="Comments")
yelp_valid_x, yelp_train_x, yelp_test_x = (
    pre_process_binary(frame, "Comments", vocab.index[0:10000])
    for frame in (yelp_valid, yelp_train, yelp_test)
)

In [6]:
# Pull the "Rating" column of each split out as a NumPy array of labels.
yelp_train_y, yelp_valid_y, yelp_test_y = (
    np.array(frame["Rating"])
    for frame in (yelp_train, yelp_valid, yelp_test)
)

In [7]:
# Sweep the BernoulliNB smoothing parameter and record, for each value, the
# weighted F1 score on the validation split.
alphas = np.arange(1e-2, 1, 0.01)
# Size the table from the grid itself: the length of a float-step arange is
# subject to rounding, so a hard-coded row count could raise IndexError.
# Column 0 holds alpha, column 1 holds the corresponding F1 score.
f1s = np.zeros(shape=(len(alphas), 2))
for i, alpha in enumerate(alphas):
    nb_clf = BernoulliNB(alpha=alpha)
    nb_clf.fit(yelp_train_x, yelp_train_y)
    f1s[i, 0] = alpha
    f1s[i, 1] = f1_score(yelp_valid_y, nb_clf.predict(yelp_valid_x), average='weighted')

In [14]:
# Show the sweep results: one row per alpha, as (alpha, weighted F1) pairs.
print(f1s)

[[0.01       0.40903082]
 [0.02       0.40915768]
 [0.03       0.41011247]
 [0.04       0.41077684]
 [0.05       0.40583422]
 [0.06       0.40206818]
 [0.07       0.40021266]
 [0.08       0.39896917]
 [0.09       0.39966249]
 [0.1        0.39607291]
 [0.11       0.39500699]
 [0.12       0.39484421]
 [0.13       0.39540054]
 [0.14       0.38982818]
 [0.15       0.39168808]
 [0.16       0.38827599]
 [0.17       0.3859998 ]
 [0.18       0.38704911]
 [0.19       0.38831163]
 [0.2        0.38830908]
 [0.21       0.3850284 ]
 [0.22       0.38587802]
 [0.23       0.38587802]
 [0.24       0.38583814]
 [0.25       0.38595184]
 [0.26       0.38703283]
 [0.27       0.38703283]
 [0.28       0.38820864]
 [0.29       0.38931594]
 [0.3        0.38806247]
 [0.31       0.38813807]
 [0.32       0.38832293]
 [0.33       0.38832293]
 [0.34       0.38464353]
 [0.35       0.38601921]
 [0.36       0.3861923 ]
 [0.37       0.38701097]
 [0.38       0.38588123]
 [0.39       0.38588123]
 [0.4        0.38787051]
