In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive; the mount
# call prompts interactively for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
import pickle

# Directory on the mounted Drive that holds the pickled dataset splits.
_DATA_DIR = '/content/drive/My Drive/PROJECT 2/DATA'


def _load_pickle(file_name):
    """Unpickle and return the object stored in `_DATA_DIR/file_name`.

    The file is opened in a `with` block so the handle is closed as soon as
    the object has been read (the bare `pickle.load(open(...))` form leaves
    the handle open until garbage collection).

    SECURITY NOTE: `pickle.load` can execute arbitrary code embedded in the
    file; only load pickles that come from a trusted source.
    """
    with open(_DATA_DIR + '/' + file_name, 'rb') as handle:
        return pickle.load(handle)


x_train = _load_pickle('x_train_split.pkl')
y_train = _load_pickle('y_train_split.pkl')

x_val = _load_pickle('x_val_split.pkl')
y_val = _load_pickle('y_val_split.pkl')

x_test = _load_pickle('x_test.pkl')
y_test = _load_pickle('y_test.pkl')

In [3]:
from collections import defaultdict
import numpy as np


class TfIdfVectorizer:
    """Whitespace-tokenising TF-IDF vectoriser producing sparse vectors.

    `fit` learns a vocabulary mapping each kept word to its IDF,
    ``log10(corpus_size / document_frequency)``.  `transform` turns each text
    into a list of ``(word, tf_idf)`` pairs where
    ``tf_idf = idf * tf / max_tf`` and ``max_tf`` is the highest in-vocabulary
    term frequency within that text.
    """

    def __init__(self):
        """Create an unfitted vectoriser; call `fit` before `transform`."""

    def fit(self, data, min_df=1, max_df=1e9):
        """Learn the vocabulary and IDF weights from `data`.

        Args:
            data: sized iterable of strings (documents); tokens are obtained
                with `str.split()`.
            min_df: keep only words appearing in at least this many documents.
            max_df: keep only words appearing in at most this many documents.

        Side effects:
            Sets `self._vocabulary` (word -> idf) and `self._words_id`
            (word -> integer index) and prints the vocabulary size.
        """
        doc_count = defaultdict(int)
        corpus_size = len(data)
        for text in data:
            # A set so that each word is counted once per document.
            for word in set(text.split()):
                doc_count[word] += 1
        self._vocabulary = {
            word: np.log10(corpus_size/df)
            for word, df in doc_count.items()
            if min_df <= df <= max_df
        }
        self._words_id = {word: index
                          for index, word in enumerate(self._vocabulary)}
        print("Vocabulary size: "+str(len(self._vocabulary)))

    def transform(self, data):
        """Convert each text in `data` to a list of ``(word, tf_idf)`` pairs.

        Words outside the fitted vocabulary are dropped.  A text with no
        in-vocabulary word yields an empty list.

        Returns:
            A list with one entry per input text, in input order.
        """
        vocabulary = self._vocabulary
        data_tf_idf = []
        for text in data:
            words = [word for word in text.split() if word in vocabulary]
            if not words:
                data_tf_idf.append([])
                continue
            # Count every term in one pass instead of calling list.count()
            # once per unique word, which was quadratic in document length.
            term_count = defaultdict(int)
            for word in words:
                term_count[word] += 1
            max_tf = max(term_count.values())
            # Iterate over set(words) so the output order matches what a
            # plain set traversal of the document's words produces.
            data_tf_idf.append(
                [(word, (vocabulary[word]*term_count[word])/max_tf)
                 for word in set(words)])
        return data_tf_idf


In [4]:
# Install the Google Sheets client and authenticate the Colab user.
# Order matters: the packages are installed before they are imported, and
# authenticate_user() runs before the default credentials are requested.
!pip install --quiet gspread
from google.colab import auth
!pip install --upgrade oauth2client
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials 
# `gc` is the authorized gspread client used by later cells to open sheets.
gc = gspread.authorize(GoogleCredentials.get_application_default())

Requirement already up-to-date: oauth2client in /usr/local/lib/python3.6/dist-packages (4.1.3)


In [5]:
import numpy as np


class NaiveBayes:
    """Naive Bayes classifier over sparse ``(word, weight)`` document vectors.

    Training accumulates the per-label sum of word weights and applies
    additive (alpha) smoothing; all probabilities are stored as log10 values.
    """

    def __init__(self, vocab):
        """Store the vocabulary (a sized iterable of words) used for smoothing."""
        self._vocabulary = vocab

    @staticmethod
    def _ordered(labels):
        """Return the labels in a fixed order (sorted when they are orderable)."""
        try:
            return sorted(labels)
        except TypeError:
            # Labels of mixed/unorderable types: fall back to set order.
            return list(labels)

    def fit(self, X, Y, alpha=1):
        """Estimate log10 priors and log10 smoothed word likelihoods.

        Args:
            X: iterable of documents, each a list of ``(word, value)`` pairs.
            Y: sized iterable of labels aligned with `X`.
            alpha: additive smoothing constant.

        Side effects:
            Sets `self.labels` (set of labels), `self.log10_labels`
            (array of log10 priors, ordered like the sorted labels) and
            `self.log10_word_in_label` (``(label, word)`` -> log10 likelihood).
        """
        count_word_in_label = defaultdict(int)
        total_word_in_label = defaultdict(int)
        count_doc_in_label = defaultdict(int)
        for vector, label in zip(X, Y):
            count_doc_in_label[label] += 1
            for word, value in vector:
                count_word_in_label[(label, word)] += value
                total_word_in_label[label] += value

        self.labels = set(Y)
        # A fixed label order makes prior positions and argmax positions map
        # back to labels unambiguously.
        self._label_order = self._ordered(self.labels)

        # Priors do not depend on the word loop, so compute them once.
        n_docs = len(Y)
        self.log10_labels = np.array(
            [np.log10(count_doc_in_label[label]/n_docs)
             for label in self._label_order])

        vocab_size = len(self._vocabulary)
        self.log10_word_in_label = dict()
        for label in self._label_order:
            for word in self._vocabulary:
                self.log10_word_in_label[(label, word)] = np.log10(
                    (count_word_in_label[(label, word)]+alpha) /
                    (total_word_in_label[label]+vocab_size*alpha))

    def predict(self, X, pre_proba=None):
        """Return the most likely label for every document vector in `X`.

        Args:
            X: iterable of documents, each a list of ``(word, value)`` pairs
                whose words all belong to the vocabulary.
            pre_proba: optional log-priors overriding the fitted ones.  Either
                a sequence with one entry per label in sorted-label order, or
                a dict keyed by label.  Ignored unless its length equals the
                number of labels.

        Returns:
            A list of labels (the actual label values, not positions).

        NOTE(review): the ``value`` weight of each pair is not used when
        scoring; every distinct word contributes its log-likelihood once.
        This is kept as-is to preserve existing results -- confirm whether
        weighting by ``value`` was intended.
        """
        label_order = self._label_order
        if pre_proba is not None and len(pre_proba) == len(label_order):
            if isinstance(pre_proba, dict):
                log_priors = [pre_proba[label] for label in label_order]
            else:
                log_priors = pre_proba
        else:
            log_priors = self.log10_labels

        predicted = []
        for vector in X:
            scores = [
                log_priors[position] + np.sum(
                    [self.log10_word_in_label[(label, word)]
                     for word, _ in vector])
                for position, label in enumerate(label_order)
            ]
            # Translate the winning position back into its label.
            predicted.append(label_order[int(np.argmax(scores))])
        return predicted


def compute_accuracy(predicted, expected):
    """Return the fraction of positions at which `predicted` equals `expected`.

    Both arguments are sequences of labels of the same length; the result is
    the number of element-wise matches divided by ``len(expected)``.
    """
    matches = np.equal(predicted, expected)
    n_correct = np.sum(matches)
    return n_correct/len(expected)

In [6]:
def train_model(X_train, Y_train, X_val, Y_val, alpha, min_df, name):
    """Train TF-IDF + naive Bayes once and log the accuracies to a worksheet.

    Args:
        X_train, Y_train: training texts and their labels.
        X_val, Y_val: validation texts and their labels.
        alpha: smoothing constant passed to `NaiveBayes.fit`.
        min_df: minimum document frequency passed to `TfIdfVectorizer.fit`.
        name: title of the worksheet (inside the hard-coded spreadsheet)
            that receives the result row.

    Side effects:
        Prints the hyper-parameters and both accuracies, and appends the row
        ``[alpha, min_df, train_acc, val_acc]`` to the worksheet through the
        notebook-global gspread client `gc`.
    """
    print("alpha =", alpha, "min_df =", min_df)
    cv = TfIdfVectorizer()
    # The vocabulary is learned from the training split only.
    cv.fit(X_train, min_df=min_df)

    train_count_vector = cv.transform(X_train)
    val_count_vector = cv.transform(X_val)

    naive = NaiveBayes(cv._vocabulary.keys())
    naive.fit(train_count_vector, Y_train, alpha=alpha)

    train_acc = compute_accuracy(naive.predict(train_count_vector), Y_train)
    print("Train_acc: ", train_acc)

    val_acc = compute_accuracy(naive.predict(val_count_vector), Y_val)
    print("Val_acc: ", val_acc)

    wb = gc.open_by_url(
        'https://docs.google.com/spreadsheets/d/1ocIsqhCjbYmLQLgYiODkDG3JMhkvPOY42l_ACHwUYzM/edit#gid=0')
    sheet = wb.worksheet(name)
    # The previous full-sheet read (get_all_values) was never used, so the
    # row is appended directly without that extra network round-trip.
    sheet.append_row([alpha, min_df, train_acc, val_acc])

In [None]:
# Sweep min_df over 1..19 with alpha fixed at 1; each run logs one row to
# the worksheet named in the last argument.
alpha = 1
for min_df in range(1, 20):
  train_model(x_train, y_train, x_val, y_val, alpha, min_df, "TF-IDF Vector - alpha = 1 - min_df thay doi")

alpha = 1 min_df = 1
Vocabulary size: 110473
Train_acc:  0.8872884807642463
Val_acc:  0.8578199052132701
alpha = 1 min_df = 2
Vocabulary size: 49657
Train_acc:  0.8981375199022475
Val_acc:  0.8739632701421801
alpha = 1 min_df = 3
Vocabulary size: 37226
Train_acc:  0.8989150960861999
Val_acc:  0.8770734597156398
alpha = 1 min_df = 4
Vocabulary size: 31295
Train_acc:  0.8992483430221795
Val_acc:  0.8779620853080569
alpha = 1 min_df = 5
Vocabulary size: 27525
Train_acc:  0.8986188765875514
Val_acc:  0.8794431279620853
alpha = 1 min_df = 6
Vocabulary size: 24878
Train_acc:  0.8982115747769097
Val_acc:  0.8789988151658767
alpha = 1 min_df = 7
Vocabulary size: 22827
Train_acc:  0.8975821083422816
Val_acc:  0.8789988151658767
alpha = 1 min_df = 8
Vocabulary size: 21206
Train_acc:  0.89717480653164
Val_acc:  0.8782582938388626
alpha = 1 min_df = 9
Vocabulary size: 19875
Train_acc:  0.8963972303476876
Val_acc:  0.8775177725118484
alpha = 1 min_df = 10
Vocabulary size: 18708
Train_acc:  0.896101

In [8]:
# Sweep alpha over 0.01..0.19 (step 0.01) with min_df fixed at 5.
min_df = 5
for alpha in range(1, 20):
  train_model(x_train, y_train, x_val, y_val, alpha*0.01, min_df, "TF-IDF Vector - alpha thay doi - min_df =5")

alpha = 0.01 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9260191802125375
Val_acc:  0.8763329383886256
alpha = 0.02 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9222794090421002
Val_acc:  0.8767772511848341
alpha = 0.03 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9193912689302773
Val_acc:  0.8764810426540285
alpha = 0.04 min_df = 5
Vocabulary size: 27525
Train_acc:  0.918021253749028
Val_acc:  0.876925355450237
alpha = 0.05 min_df = 5
Vocabulary size: 27525
Train_acc:  0.916836375754434
Val_acc:  0.8767772511848341
alpha = 0.06 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9159847446958196
Val_acc:  0.8770734597156398
alpha = 0.07 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9154663605731848
Val_acc:  0.8767772511848341
alpha = 0.08 min_df = 5
Vocabulary size: 27525
Train_acc:  0.914466619765246
Val_acc:  0.8764810426540285
alpha = 0.09 min_df = 5
Vocabulary size: 27525
Train_acc:  0.913837153330618
Val_acc:  0.8770734597156398
alpha = 0.1 min_df = 5
Vocabulary size: 27

In [None]:
# Sweep alpha over 0.01..0.19 (step 0.01) with min_df fixed at 1.
min_df = 1
for alpha in range(1, 20):
  train_model(x_train, y_train, x_val, y_val, alpha*0.01, min_df, "TF-IDF Vector - alpha thay doi - min_df =1")

alpha = 0.01 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9488280816084719
Val_acc:  0.8807760663507109
alpha = 0.02 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9445328988780687
Val_acc:  0.8816646919431279
alpha = 0.03 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9408671825822935
Val_acc:  0.882553317535545
alpha = 0.04 min_df = 1
Vocabulary size: 110473
Train_acc:  0.938201207094457
Val_acc:  0.8818127962085308
alpha = 0.05 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9362387529159107
Val_acc:  0.8819609004739336
alpha = 0.06 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9345354907986818
Val_acc:  0.8816646919431279
alpha = 0.07 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9332765579294257
Val_acc:  0.8816646919431279
alpha = 0.08 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9321657348094938
Val_acc:  0.8822571090047393
alpha = 0.09 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9310178842522309
Val_acc:  0.8821090047393365
alpha = 0.1 min_df = 1
Vocabul

In [9]:
# Sweep min_df over 1..19 with alpha fixed at 0.03.
alpha = 0.03
for min_df in range(1, 20):
  train_model(x_train, y_train, x_val, y_val, alpha, min_df, "TF-IDF Vector - alpha = 0.03 - min_df thay doi")

alpha = 0.03 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9408671825822935
Val_acc:  0.882553317535545
alpha = 0.03 min_df = 2
Vocabulary size: 49657
Train_acc:  0.9293516495723331
Val_acc:  0.8791469194312796
alpha = 0.03 min_df = 3
Vocabulary size: 37226
Train_acc:  0.9247972747806125
Val_acc:  0.8776658767772512
alpha = 0.03 min_df = 4
Vocabulary size: 31295
Train_acc:  0.9217610249194653
Val_acc:  0.8775177725118484
alpha = 0.03 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9193912689302773
Val_acc:  0.8764810426540285
alpha = 0.03 min_df = 6
Vocabulary size: 24878
Train_acc:  0.9180953086236902
Val_acc:  0.8773696682464455
alpha = 0.03 min_df = 7
Vocabulary size: 22827
Train_acc:  0.9166142111304476
Val_acc:  0.8770734597156398
alpha = 0.03 min_df = 8
Vocabulary size: 21206
Train_acc:  0.914985003887881
Val_acc:  0.8772215639810427
alpha = 0.03 min_df = 9
Vocabulary size: 19875
Train_acc:  0.9136149887066316
Val_acc:  0.8763329383886256
alpha = 0.03 min_df = 10
Vocabulary si

In [None]:
# Final evaluation: fit the vectoriser and classifier on the training split
# with the hyper-parameters fixed below, then report accuracy on the test split.
min_df = 1
alpha = 0.03

cv = TfIdfVectorizer()
# The vocabulary is learned from the training texts only; the test texts are
# transformed with that same vocabulary.
cv.fit(x_train, min_df=min_df)
train_count_vector = cv.transform(x_train)
test_count_vector = cv.transform(x_test)
naive = NaiveBayes(cv._vocabulary.keys())
naive.fit(train_count_vector, y_train, alpha=alpha)

predicted = naive.predict(test_count_vector)
test_acc = compute_accuracy(predicted, y_test)
print('Test accuracy : ', test_acc)

Vocabulary size: 110473
Test accuracy :  0.890774819843964
