In [5]:
import numpy as np
import math

In [6]:
class TF_IDF:
    """Per-class TF-IDF scorer for a two-label SMS corpus.

    Documents whose label equals ``1`` are counted in the "iklan" class
    (Indonesian for "advertisement"); every other label is counted in the
    "non iklan" class. For each word and class the score is::

        total occurrences of the word in the class
            * log(documents in the class / documents in the class containing the word)

    Typical use: ``set_train_and_label`` -> ``compute_freq`` ->
    ``get_tf_idf_scores``.
    """

    def __init__(self):
        # Training documents (space-separated strings) and their labels,
        # aligned by position.
        self.sms = []
        self.label = []
        self._reset_counts()

    def _reset_counts(self):
        """Put every counter filled by ``compute_freq`` back to its empty state."""
        # Number of documents seen in each class.
        self.n_iklan = 0
        self.n_non_iklan = 0
        # word -> {doc_number: occurrences inside that document}, per class.
        self.dict_iklan = {}
        self.dict_non_iklan = {}
        # word -> total occurrences over all documents of the class.
        self.all_iklan = {}
        self.all_non_iklan = {}
        # word -> total occurrences over the whole corpus.
        self.all_words = {}

    def set_train_and_label(self, sms, label):
        """Store the training corpus.

        Args:
            sms: indexable sequence of documents; each must support
                ``.split(" ")``.
            label: indexable sequence of labels, same length as ``sms``.
                A label equal to ``1`` selects the "iklan" class.
        """
        self.sms = sms
        self.label = label

    @staticmethod
    def _count_tokens(tokens, doc_number, totals, per_document):
        """Add one document's tokens to a class's total and per-document counters."""
        for word in tokens:
            totals[word] = totals.get(word, 0) + 1
            occurrences = per_document.setdefault(word, {})
            occurrences[doc_number] = occurrences.get(doc_number, 0) + 1

    def compute_freq(self):
        """Count term and document frequencies for the stored corpus.

        All counters are rebuilt from scratch on every call, so calling this
        method more than once (for example after re-running a notebook cell
        or after replacing the corpus) never double-counts.

        Documents are tokenised with ``split(" ")``; consecutive spaces
        therefore produce empty-string tokens, which are counted like any
        other word.
        """
        self._reset_counts()
        for doc_number in range(len(self.sms)):
            tokens = self.sms[doc_number].split(" ")
            for word in tokens:
                self.all_words[word] = self.all_words.get(word, 0) + 1

            if self.label[doc_number] == 1:
                self.n_iklan += 1
                self._count_tokens(tokens, doc_number, self.all_iklan, self.dict_iklan)
            else:
                self.n_non_iklan += 1
                self._count_tokens(tokens, doc_number, self.all_non_iklan, self.dict_non_iklan)

    @staticmethod
    def _class_score(word, totals, per_document, n_documents):
        """Return ``tf * log(N / df)`` for one word in one class, or 0 if df is 0."""
        # Document frequency = number of distinct documents containing the word.
        document_frequency = len(per_document.get(word, ()))
        if document_frequency == 0:
            return 0
        return totals.get(word, 0) * math.log(n_documents / document_frequency)

    def get_tf_idf_scores(self):
        """Return the TF-IDF score of every corpus word for both classes.

        Returns:
            dict mapping each word to ``{0: non_iklan_score, 1: iklan_score}``.
            A word that never occurs in a class scores ``0`` there; a word
            that occurs in every document of a class scores ``0.0`` there
            because ``log(1) == 0``.
        """
        scores = {}
        for word in self.all_words:
            scores[word] = {
                0: self._class_score(
                    word, self.all_non_iklan, self.dict_non_iklan, self.n_non_iklan
                ),
                1: self._class_score(
                    word, self.all_iklan, self.dict_iklan, self.n_iklan
                ),
            }
        return scores

In [7]:
class MNB_TextClassifier:
    """Multinomial Naive Bayes text classifier for labels 0 and 1.

    ``fit`` estimates class priors and Laplace-smoothed per-term conditional
    probabilities, weighting terms either by raw counts or by the per-class
    TF-IDF scores produced by ``TF_IDF``. ``predict`` returns 1 when class 1
    is strictly more likely than class 0, otherwise 0.

    Attributes set by ``fit``:
        prior: class -> fraction of training documents with that class.
        condprob: term -> {class: smoothed P(term | class)}. The key ``''``
            holds the probability used for terms not seen in training.
        terms: set of all training terms.
        classes: set of all labels present in ``y``.
        is_tfidf: the ``tfidf`` flag passed to ``fit``.
        scores: (TF-IDF mode only) the raw scores from ``TF_IDF``.
    """

    def __init__(self):
        pass

    def _term_proba(self, term, y):
        """Return P(term | y), falling back to the unknown-term slot ``''``."""
        return self.condprob[term][y] if term in self.terms else self.condprob[''][y]

    @staticmethod
    def _safe_log(value):
        """Natural log that maps non-positive input to -inf instead of raising."""
        return math.log(value) if value > 0 else float('-inf')

    def proba_y_given_x(self, y, x):
        """Return the unnormalised posterior ``P(y) * prod(P(term | y))``.

        Args:
            y: class label.
            x: iterable of terms (one tokenised document).

        Note: the product underflows to 0.0 for long documents; ``predict``
        and ``predict_single`` therefore compare log-probabilities instead.
        """
        proba = self.proba_y(y)
        for term in x:
            proba *= self._term_proba(term, y)
        return proba

    def _log_proba_y_given_x(self, y, x):
        """Return ``log P(y) + sum(log P(term | y))`` for one tokenised document."""
        total = self._safe_log(self.proba_y(y))
        for term in x:
            total += self._safe_log(self._term_proba(term, y))
        return total

    def proba_y(self, y):
        """Return the prior probability of class ``y``."""
        return self.prior[y]

    def _set_condprob(self, weights, class_totals):
        """Fill ``self.condprob`` with add-one smoothed probabilities.

        Args:
            weights: term -> {class: weight of the term in that class}.
            class_totals: class -> sum of all term weights in that class.
        """
        vocabulary_size = len(self.terms)
        for term in self.terms:
            self.condprob[term] = {
                cls: (weights[term][cls] + 1) / (class_totals[cls] + vocabulary_size)
                for cls in self.classes
            }
        # Unknown term: zero weight plus the smoothing constant.
        # NOTE: written after the loop, so an empty-string training token
        # (produced by consecutive spaces) shares this slot and ends up with
        # the unknown-term probability.
        self.condprob[''] = {
            cls: 1 / (class_totals[cls] + vocabulary_size) for cls in self.classes
        }

    def fit(self, X, y, tfidf=False):
        """Estimate priors and conditional probabilities from training data.

        Args:
            X: either a pandas Series of space-separated strings, or a numpy
                array whose rows are already sequences of terms.
            y: labels aligned with ``X`` (array-like). ``predict`` compares
                classes 0 and 1; TF-IDF mode additionally requires every
                label to be 0 or 1.
            tfidf: weight terms by per-class TF-IDF scores instead of counts.

        Raises:
            ValueError: if ``X`` and ``y`` have a different number of rows.
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X.str.split(' '))
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError('number of rows in X isn\'t match with number of rows in y')

        self.prior = {}
        self.condprob = {}
        self.is_tfidf = tfidf

        self.terms = {term for row in X for term in row}
        self.classes = set(y)

        m = X.shape[0]
        for cls in self.classes:
            self.prior[cls] = sum(1 for label in y if label == cls) / m

        if self.is_tfidf:
            # TF_IDF tokenises with split(" "), so hand it space-joined
            # documents; for string input this reproduces the original text
            # exactly, and it also makes pre-tokenised array input work.
            tf_idf = TF_IDF()
            tf_idf.set_train_and_label([' '.join(row) for row in X], y.tolist())
            tf_idf.compute_freq()
            self.scores = tf_idf.get_tf_idf_scores()

            word_scores_sum = {
                cls: sum(self.scores[term][cls] for term in self.terms)
                for cls in self.classes
            }
            self._set_condprob(self.scores, word_scores_sum)
        else:
            # term -> {class: occurrences}; every observed class starts at 0
            # so terms absent from a class still have an entry.
            word_sum = {}
            words_count_in_cls = {cls: 0 for cls in self.classes}
            for row, label in zip(X, y):
                words_count_in_cls[label] += len(row)
                for term in row:
                    if term not in word_sum:
                        word_sum[term] = {cls: 0 for cls in self.classes}
                    word_sum[term][label] += 1

            self._set_condprob(word_sum, words_count_in_cls)

    def predict_single(self, x):
        """Classify one tokenised document; ties go to class 0.

        Log-probabilities are compared so that long documents do not
        underflow to 0.0 for both classes.
        """
        return 1 if self._log_proba_y_given_x(1, x) > self._log_proba_y_given_x(0, x) else 0

    def predict(self, X):
        """Classify every document in ``X``.

        Args:
            X: pandas Series of space-separated strings, or a numpy array
                whose rows are already sequences of terms (as in ``fit``).

        Returns:
            list of 0/1 predictions, one per document.
        """
        rows = X if isinstance(X, np.ndarray) else X.str.split(' ')
        return [self.predict_single(x) for x in rows]

In [9]:
import time
import pandas as pd
from sklearn.metrics import accuracy_score

from utils import fair_train_test_split


def train_and_evaluate(X_train, y_train, X_test, y_test, use_tfidf, variant):
    """Fit an MNB_TextClassifier, report its test accuracy and elapsed time.

    Args:
        X_train, y_train: training documents and labels.
        X_test, y_test: held-out documents and labels.
        use_tfidf: forwarded to ``MNB_TextClassifier.fit`` as ``tfidf``.
        variant: text inserted into the accuracy line, e.g. ``'with TF-IDF'``.

    Returns:
        ``(classifier, predictions)`` so both stay available to later cells.
    """
    started = time.perf_counter()
    clf = MNB_TextClassifier()
    clf.fit(X_train, y_train, tfidf=use_tfidf)
    predictions = clf.predict(X_test)
    # accuracy_score's documented argument order is (y_true, y_pred).
    print('Accuracy of MNB_TextClassifier ' + variant + ' : ', accuracy_score(y_test, predictions))
    print('Time elapsed :', time.perf_counter() - started)
    return clf, predictions


# Load the dataset and hold out 10% of it for testing.
# NOTE(review): no seed is passed to fair_train_test_split; confirm whether the
# split (and therefore the reported accuracies) is reproducible across runs.
t1 = time.perf_counter()
df = pd.read_csv('clean_dataset_with_stemming.csv')
X = df['Teks']
y = df['label']
X_train, X_test, y_train, y_test = fair_train_test_split(X, y, test_size=0.1)
print('Splitting data train and test is done in', time.perf_counter() - t1)
print()

# Baseline: raw term counts.
clf_without_tf_idf, y_pred = train_and_evaluate(
    X_train, y_train, X_test, y_test, use_tfidf=False, variant='without TF-IDF'
)
print()

# Same classifier with TF-IDF term weights.
clf_with_tf_idf, y_pred = train_and_evaluate(
    X_train, y_train, X_test, y_test, use_tfidf=True, variant='with TF-IDF'
)

Splitting data train and test is done in 0.09890365600585938

Accuracy of MNB_TextClassifier without TF-IDF :  0.9228650137741047
Time elapsed : 0.7784121036529541

Accuracy of MNB_TextClassifier with TF-IDF :  0.931129476584022
Time elapsed : 0.8723371028900146
