In [1]:
# Mount Google Drive inside the Colab runtime. The data-loading cell below reads
# its pickle files from paths under this mount point ('/content/drive/...').
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pickle

# Directory on the mounted Drive that holds every pickled split.
DATA_DIR = '/content/drive/My Drive/PROJECT 2/DATA'


def _load_pickle(filename):
    """Unpickle and return the object stored in DATA_DIR/filename.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes, instead of being left open for the life of the kernel.

    NOTE(review): pickle.load can execute arbitrary code while loading; only
    use this on files whose origin is trusted.
    """
    with open(DATA_DIR + '/' + filename, 'rb') as handle:
        return pickle.load(handle)


# Training and validation splits.
x_train = _load_pickle('x_train_split.pkl')
y_train = _load_pickle('y_train_split.pkl')

x_val = _load_pickle('x_val_split.pkl')
y_val = _load_pickle('y_val_split.pkl')

# Held-out test set; only used by the final evaluation cell.
x_test = _load_pickle('x_test.pkl')
y_test = _load_pickle('y_test.pkl')

In [0]:
from collections import defaultdict
import numpy as np

class CountVectorizer:
    """Bag-of-words vectorizer over whitespace-tokenized text.

    ``fit`` learns a vocabulary filtered by document frequency and
    ``transform`` turns each text into a sparse list of ``(word, count)``
    pairs restricted to that vocabulary. ``fit`` must be called before
    ``transform``.
    """

    def __init__(self):
        return

    def fit(self, data, min_df=1, max_df=1e9):
        """Learn the vocabulary from an iterable of strings.

        Args:
            data: iterable of documents; each is split on whitespace.
            min_df: smallest document frequency (inclusive) a word needs to be kept.
            max_df: largest document frequency (inclusive) a word may have.

        Side effects:
            Sets ``self._vocabulary`` to a dict mapping each kept word to its
            document frequency and prints the vocabulary size.
        """
        doc_count = defaultdict(int)
        for text in data:
            # set() so that a word repeated inside one document counts once.
            for word in set(text.split()):
                doc_count[word] += 1
        self._vocabulary = {word: df for word, df in doc_count.items()
                            if min_df <= df <= max_df}
        print("Vocabulary size: "+str(len(self._vocabulary)))

    def transform(self, data):
        """Convert documents into sparse count vectors.

        Args:
            data: iterable of documents; each is split on whitespace.

        Returns:
            A list with one entry per document. Each entry is a list of
            ``(word, count)`` tuples for the in-vocabulary words of that
            document, ordered by first occurrence; a document with no
            in-vocabulary words yields ``[]``.
        """
        vocabulary = self._vocabulary
        data_count_vector = []
        for text in data:
            # Single pass per document: the previous list.count() call per
            # distinct word made this quadratic in the document length.
            counts = defaultdict(int)
            for word in text.split():
                if word in vocabulary:
                    counts[word] += 1
            data_count_vector.append(list(counts.items()))
        return data_count_vector

In [0]:
# Install gspread via a shell escape; --quiet suppresses pip's output.
!pip install --quiet gspread
from google.colab import auth
# Upgrade oauth2client, the package GoogleCredentials is imported from below.
!pip install --upgrade oauth2client
# Authenticate the Colab user before the application-default credentials are requested.
auth.authenticate_user()
import gspread
from oauth2client.client import GoogleCredentials 
# `gc` is the authorized gspread client that train_model uses to open the results spreadsheet.
# NOTE(review): this name would shadow the standard-library `gc` module if that were ever imported here.
gc = gspread.authorize(GoogleCredentials.get_application_default())

Requirement already up-to-date: oauth2client in /usr/local/lib/python3.6/dist-packages (4.1.3)


In [0]:
import numpy as np


class NaiveBayes:
    """Naive Bayes classifier over sparse ``(word, count)`` vectors.

    All probabilities are stored as base-10 logarithms. Labels may be any
    sortable, hashable values (ints, strings, ...); predictions are returned
    as the original label values.
    """

    def __init__(self, vocab):
        """Store the vocabulary: a sized iterable of the words to model."""
        self._vocabulary = vocab
        return

    def fit(self, X, Y, alpha=1):
        """Estimate log10 class priors and smoothed log10 word likelihoods.

        Args:
            X: sequence of documents, each a list of ``(word, count)`` pairs.
            Y: sequence of labels aligned with ``X``.
            alpha: additive smoothing constant added to every word count.

        Side effects:
            Sets ``self.labels`` (set of distinct labels), ``self.log10_labels``
            (array of log10 priors, ordered like ``sorted(self.labels)``) and
            ``self.log10_word_in_label`` (dict keyed by ``(label, word)``).
        """
        count_word_in_label = defaultdict(int)
        total_word_in_label = defaultdict(int)
        count_doc_in_label = defaultdict(int)
        for vector, label in zip(X, Y):
            count_doc_in_label[label] += 1
            for word, value in vector:
                count_word_in_label[(label, word)] += value
                total_word_in_label[label] += value

        self.labels = set(Y)
        # A fixed, deterministic label order shared by fit() and predict().
        # For labels 0..k-1 this equals the label value itself, which is the
        # only case the former "index by label" scheme handled correctly.
        self._ordered_labels = sorted(self.labels)

        # Priors are computed once; they do not depend on the word loop below.
        n_docs = len(Y)
        self.log10_labels = np.array(
            [np.log10(count_doc_in_label[label] / n_docs)
             for label in self._ordered_labels])

        vocab_size = len(self._vocabulary)
        self.log10_word_in_label = dict()
        for label in self._ordered_labels:
            denominator = total_word_in_label[label] + vocab_size * alpha
            for word in self._vocabulary:
                self.log10_word_in_label[(label, word)] = np.log10(
                    (count_word_in_label[(label, word)] + alpha) / denominator)

    def predict(self, X, pre_proba=None, weight_by_count=False):
        """Predict a label for every document in ``X``.

        Args:
            X: sequence of documents, each a list of ``(word, count)`` pairs
                whose words are all in the vocabulary (a ``KeyError`` is
                raised otherwise).
            pre_proba: optional replacement for the learned priors. It is added
                directly to the log10 likelihoods, so values must already be in
                log10 space. Either a sequence ordered like
                ``sorted(self.labels)`` or a dict keyed by label. It is ignored
                unless its length equals the number of labels.
            weight_by_count: when False (default, the historical behaviour)
                each distinct word contributes its log-likelihood once; when
                True the contribution is multiplied by the word's count, as in
                the standard multinomial model.

        Returns:
            A list with the predicted label for each document. Ties go to the
            label that comes first in sorted order.
        """
        labels = self._ordered_labels
        if pre_proba is not None and len(pre_proba) == len(labels):
            log_priors = pre_proba
        else:
            log_priors = self.log10_labels
        priors_by_label = isinstance(log_priors, dict)

        predicted = []
        for vector in X:
            scores = []
            for position, label in enumerate(labels):
                prior = log_priors[label] if priors_by_label else log_priors[position]
                likelihood = np.sum(
                    [(value if weight_by_count else 1)
                     * self.log10_word_in_label[(label, word_id)]
                     for word_id, value in vector])
                scores.append(prior + likelihood)
            # Map the winning position back to the real label value.
            predicted.append(labels[int(np.argmax(scores))])
        return predicted


def compute_accuracy(predicted, expected):
    """Return the fraction of positions where ``predicted`` equals ``expected``.

    The element-wise matches are counted and divided by ``len(expected)``.
    """
    matches = np.equal(predicted, expected)
    n_expected = len(expected)
    return np.sum(matches) / n_expected

In [0]:
def train_model(X_train, Y_train, X_val, Y_val, alpha, min_df, name,
                sheet_url='https://docs.google.com/spreadsheets/d/1ocIsqhCjbYmLQLgYiODkDG3JMhkvPOY42l_ACHwUYzM/edit#gid=0'):
    """Train a count-vector Naive Bayes model and log its accuracies.

    A CountVectorizer is fitted on the training texts only, both splits are
    vectorized with it, a NaiveBayes model is trained, and the train and
    validation accuracies are printed and appended as one row to a Google
    Sheets worksheet.

    Args:
        X_train, Y_train: training texts and labels.
        X_val, Y_val: validation texts and labels.
        alpha: smoothing constant passed to ``NaiveBayes.fit``.
        min_df: minimum document frequency passed to ``CountVectorizer.fit``.
        name: title of the worksheet that receives the result row.
        sheet_url: URL of the spreadsheet containing that worksheet; defaults
            to the project's results spreadsheet.

    Returns:
        ``(train_acc, val_acc)`` so callers can use the metrics directly.
    """
    print("alpha =", alpha, "min_df =", min_df)
    cv = CountVectorizer()
    cv.fit(X_train, min_df=min_df)

    train_count_vector = cv.transform(X_train)
    val_count_vector = cv.transform(X_val)

    naive = NaiveBayes(cv._vocabulary.keys())
    naive.fit(train_count_vector, Y_train, alpha=alpha)

    train_acc = compute_accuracy(naive.predict(train_count_vector), Y_train)
    print("Train_acc: ", train_acc)

    val_acc = compute_accuracy(naive.predict(val_count_vector), Y_val)
    print("Val_acc: ", val_acc)

    # Relies on the module-level authorized gspread client `gc`.
    # The previous unused get_all_values() call was dropped: it downloaded the
    # whole worksheet on every run without its result ever being read.
    sheet = gc.open_by_url(sheet_url).worksheet(name)
    sheet.append_row([alpha, min_df, train_acc, val_acc])
    return train_acc, val_acc

In [0]:
# Sweep min_df over 1..19 with the smoothing constant held at 1.
# Each run is logged to the worksheet named in the call.
alpha = 1
for min_df in range(1, 20):
    train_model(
        x_train, y_train, x_val, y_val,
        alpha=alpha,
        min_df=min_df,
        name="Count Vector - alpha = 1 - min_df thay doi",
    )

alpha = 1 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9124301107120376
Val_acc:  0.8794431279620853
alpha = 1 min_df = 2
Vocabulary size: 49657
Train_acc:  0.9072462694856889
Val_acc:  0.8775177725118484
alpha = 1 min_df = 3
Vocabulary size: 37226
Train_acc:  0.9048024586218388
Val_acc:  0.8754443127962085
alpha = 1 min_df = 4
Vocabulary size: 31295
Train_acc:  0.9033583885659273
Val_acc:  0.8752962085308057
alpha = 1 min_df = 5
Vocabulary size: 27525
Train_acc:  0.9018032361980227
Val_acc:  0.8752962085308057
alpha = 1 min_df = 6
Vocabulary size: 24878
Train_acc:  0.9003221387047803
Val_acc:  0.8757405213270142
alpha = 1 min_df = 7
Vocabulary size: 22827
Train_acc:  0.8991002332728552
Val_acc:  0.8757405213270142
alpha = 1 min_df = 8
Vocabulary size: 21206
Train_acc:  0.8985077942755582
Val_acc:  0.8747037914691943
alpha = 1 min_df = 9
Vocabulary size: 19875
Train_acc:  0.8981375199022475
Val_acc:  0.8742594786729858
alpha = 1 min_df = 10
Vocabulary size: 18708
Train_acc:  0.8975

In [0]:
# Sweep the smoothing constant over 0.01 .. 0.19 (loop index times 0.01)
# with min_df held at 1. Each run is logged to the worksheet named in the call.
min_df = 1
for alpha in range(1, 20):
    train_model(
        x_train, y_train, x_val, y_val,
        alpha=alpha * 0.01,
        min_df=min_df,
        name="Count Vector - alpha thay doi - min_df =1",
    )

alpha = 0.01 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9520494686562743
Val_acc:  0.8791469194312796
alpha = 0.02 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9483467249231681
Val_acc:  0.8798874407582938
alpha = 0.03 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9455696671233383
Val_acc:  0.8785545023696683
alpha = 0.04 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9435701855074611
Val_acc:  0.8791469194312796
alpha = 0.05 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9420890880142185
Val_acc:  0.8792950236966824
alpha = 0.06 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9407190728329692
Val_acc:  0.8791469194312796
alpha = 0.07 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9395341948383752
Val_acc:  0.8792950236966824
alpha = 0.08 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9377568778464842
Val_acc:  0.8794431279620853
alpha = 0.09 min_df = 1
Vocabulary size: 110473
Train_acc:  0.9365349724145592
Val_acc:  0.8794431279620853
alpha = 0.1 min_df = 1
Vocab

In [5]:
# Final evaluation on the held-out test set.
# alpha = 0.02 with min_df = 1 has the highest validation accuracy among the
# sweep results shown above.
alpha = 0.02
min_df = 1

# The vocabulary is learned from the training texts only.
cv = CountVectorizer()
cv.fit(x_train, min_df=min_df)
test_count_vector = cv.transform(x_test)
train_count_vector = cv.transform(x_train)

naive = NaiveBayes(cv._vocabulary.keys())
naive.fit(train_count_vector, y_train, alpha=alpha)

predicted = naive.predict(test_count_vector)
test_acc = compute_accuracy(predicted, y_test)
print('Test accuracy : ', test_acc)

Vocabulary size: 110473
Test accuracy :  0.8886506660314057
