In [51]:
import ast
import json
import math
import re

import numpy as np
import pandas as pd

In [74]:
class MNB_TextClassifier:
    """Multinomial Naive Bayes text classifier with add-one (Laplace) smoothing.

    Each training example is a sequence of terms paired with a sequence of
    class labels; every (document, label) pair contributes to that label's
    statistics.

    Attributes set by ``fit``:
        prior: class -> p(class), the share of label occurrences.
        condprob: term -> class -> smoothed p(term | class).
        y_count: class -> number of label occurrences.
        condprob_denom: class -> total number of terms seen with that class.
        vocabulary: set of all terms seen during training.
    """

    def __init__(self):
        self.prior = {}
        self.condprob = {}
        self.y_count = {}
        self.condprob_denom = {}
        self.vocabulary = set()

    def _log_joint(self, y, x):
        """Return log(p(y) * prod_t p(t | y)) over the terms ``t`` of ``x``.

        Working in log space keeps long documents from underflowing to 0.0.
        Terms never seen in training are NOT added to the vocabulary; they are
        scored with the zero-count smoothed estimate 1 / (denom[y] + |V|).
        """
        log_p = math.log(self.prior[y])
        unseen_log_p = None  # computed lazily, only if an unseen term shows up
        for term in x:
            per_class = self.condprob.get(term)
            if per_class is None or y not in per_class:
                if unseen_log_p is None:
                    unseen_log_p = math.log(
                        1 / (self.condprob_denom[y] + len(self.vocabulary))
                    )
                log_p += unseen_log_p
            else:
                log_p += math.log(per_class[y])
        return log_p

    # p(class=y | {term0=x[0], term1=x[1], ...} )
    # please note that you dont add new data to vocabulary when predicting
    def proba_y_given_x(self, y, x):
        """Return the posterior p(y | x), normalised over all known classes.

        Args:
            y: a class label seen during ``fit``.
            x: iterable of terms (one document).

        Returns:
            A float in [0, 1]; the values for all known classes sum to 1.

        Raises:
            KeyError: if ``y`` is not a known class.

        Note: every call scores all classes in order to normalise.
        """
        if y not in self.prior:
            raise KeyError(y)
        x = list(x)  # x is traversed once per class
        log_joint = {label: self._log_joint(label, x) for label in self.prior}
        # log-sum-exp: shift by the maximum so the largest term is exp(0) == 1.
        peak = max(log_joint.values())
        evidence = sum(math.exp(value - peak) for value in log_joint.values())
        return math.exp(log_joint[y] - peak) / evidence

    # p(c)
    def proba_y(self, y):
        """Return the prior probability p(y) of class ``y``."""
        return self.prior[y]

    # update self.prior[class] as p(class=class)
    # update self.condprob[term][class] as p(term=term | class=class)
    def fit(self, X, y):
        """Estimate priors and smoothed term likelihoods from training data.

        Args:
            X: sequence of documents, each a sequence of terms.
            y: sequence of label sequences, aligned with ``X``.

        Raises:
            ValueError: if ``X`` and ``y`` have different lengths.

        Any state from a previous ``fit`` call is discarded.
        """
        if len(X) != len(y):
            raise ValueError(
                "X and y must have the same length, got %d and %d" % (len(X), len(y))
            )

        # Start from scratch so that refitting does not accumulate old counts.
        self.y_count = {}
        self.condprob_denom = {}

        for labels in y:
            for label in labels:
                self.y_count[label] = self.y_count.get(label, 0) + 1
        total_labels = sum(self.y_count.values())
        self.prior = {
            label: count / total_labels for label, count in self.y_count.items()
        }

        self.vocabulary = {term for doc in X for term in doc}
        # Raw counts first; converted to probabilities below.
        self.condprob = {
            term: dict.fromkeys(self.y_count, 0) for term in self.vocabulary
        }

        for doc, labels in zip(X, y):
            for label in labels:
                for term in doc:
                    self.condprob[term][label] += 1
                self.condprob_denom[label] = (
                    self.condprob_denom.get(label, 0) + len(doc)
                )

        # Add-one smoothing: (count + 1) / (terms in class + |V|).
        vocab_size = len(self.vocabulary)
        for per_class in self.condprob.values():
            for label in per_class:
                per_class[label] = (per_class[label] + 1) / (
                    self.condprob_denom[label] + vocab_size
                )

    def predict_single(self, x):
        """Return the most probable class for document ``x``.

        Classes are compared by log joint probability, which has the same
        argmax as the posterior. Returns -1 when no class is known.
        """
        x = list(x)
        best_class = -1
        best_score = float("-inf")
        for label in self.prior:
            score = self._log_joint(label, x)
            if score > best_score:
                best_class = label
                best_score = score
        return best_class

    def predict(self, X):
        """Return the predicted class for every document in ``X``."""
        return [self.predict_single(x) for x in X]

In [3]:
# Read the training, cross-validation and test splits into DataFrames.
train_df, cv_df, test_df = map(
    pd.read_csv,
    (
        'data/cerpen-training.csv',
        'data/cerpen-cross_validation.csv',
        'data/cerpen-test.csv',
    ),
)

In [5]:
# Preview the first rows of the training frame to check the loaded columns.
train_df.head()

Unnamed: 0.1,Unnamed: 0,title,source,authors,categories,text
0,0,Adios,http://cerpenmu.com/cerpen-perpisahan/adios.html,['Salman Reza Al-Fachrezy'],"['Cerpen Pengalaman Pribadi', 'Cerpen Perpisah...","Lady, begitulah nama dia. \r\r\nDia adalah ana..."
1,1,My Love Story,http://cerpenmu.com/cerpen-cinta-segitiga/my-l...,['Sherly Milenia Islamiati'],['Cerpen Cinta Segitiga'],“anggap saja malam ini kita pacaran” kata kata...
2,2,Menunggu,http://cerpenmu.com/cerpen-cinta-dalam-hati-te...,['Yulia Nurhasanah'],['Cerpen Cinta Dalam Hati (Terpendam)'],"Mengaguminya dalam diam, setiap detik hanya bi..."
3,3,Zea dan Persahabatan,http://cerpenmu.com/cerpen-remaja/zea-dan-pers...,['Na'],['Cerpen Remaja'],Pagi ini Zidan bermaksud menyapa Zea. Sekaligu...
4,4,Hadiah Untuk Rysta,http://cerpenmu.com/cerpen-persahabatan/hadiah...,['Deshinta Maharani'],"['Cerpen Anak', 'Cerpen Persahabatan']","Hai, namaku Shofieya Rika Alyanabila, temanku ..."


In [72]:
def _to_object_array(sequences):
    """Pack a list of (possibly ragged) sequences into a 1-D object array."""
    packed = np.empty(len(sequences), dtype=object)
    # Assign element by element so NumPy never tries to build a 2-D array
    # out of rows that happen to share a length.
    for index, sequence in enumerate(sequences):
        packed[index] = sequence
    return packed


# given pandas dataframe return X(examples) and Y(class)
def data_to_examples(df):
    """Convert a DataFrame into token lists and category lists.

    Args:
        df: DataFrame with a ``categories`` column holding the string form of
            a Python list (e.g. ``"['Cerpen Anak', 'Cerpen Remaja']"``) and a
            ``text`` column.

    Returns:
        ``(X, y)``: two 1-D object arrays of equal length. ``X[i]`` is the
        list of lowercase a-z tokens of row ``i``; ``y[i]`` is its parsed
        category list.

    Raises:
        ValueError / SyntaxError: if a ``categories`` cell is not a valid
        Python literal.
    """
    token_lists = []
    category_lists = []
    for raw_categories, raw_text in zip(df['categories'], df['text']):
        # literal_eval parses the list repr directly, so category names that
        # contain apostrophes or double quotes survive intact.
        category_lists.append(ast.literal_eval(str(raw_categories)))

        text = str(raw_text).lower()
        text = re.sub('[^a-z]+', ' ', text)  # keep only runs of a-z
        token_lists.append(text.split())
    return _to_object_array(token_lists), _to_object_array(category_lists)

In [73]:
# load training data to examples
# X holds one token list per row of train_df, y the matching category lists.
X, y = data_to_examples(train_df)

In [70]:
def jaccard_similarity(A, B):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both arguments are treated as sets, so duplicate elements are ignored.
    Two empty collections are identical and therefore score 1.0.

    Args:
        A: iterable of hashable items.
        B: iterable of hashable items.

    Returns:
        A float in [0, 1].
    """
    set_a, set_b = set(A), set(B)
    union_size = len(set_a | set_b)
    if union_size == 0:
        return 1.0
    return len(set_a & set_b) / union_size

In [75]:
# Train naive bayes
# Fits on whatever X and y are bound to when this cell runs; later cells
# rebind those names, so run this right after loading the training examples.
mnb_clf = MNB_TextClassifier()
mnb_clf.fit(X, y)

In [76]:
# Distinct genre labels across all label lists currently bound to y.
genres = {label for labels in y for label in labels}

59


In [77]:
# load cross validation data to examples
# NOTE: this rebinds X and y, replacing the training examples under those names.
X, y = data_to_examples(cv_df)

In [None]:
# find threshold: pick the candidate cut-off with the best average Jaccard
# similarity between predicted and true genre sets on the current X, y.

tresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
treshold = 0.5
best_jaccard_similarity = 0

# The per-genre scores do not depend on the threshold, so compute them once
# per document instead of once per (threshold, document) pair.
genre_scores = [
    [(genre, mnb_clf.proba_y_given_x(genre, x)) for genre in genres]
    for x in X
]

for t in tresholds:
    total_jaccard = 0
    for scores, y_i in zip(genre_scores, y):
        predicted_genres = [genre for genre, score in scores if score > t]
        total_jaccard += jaccard_similarity(predicted_genres, y_i)
    # Guard against an empty validation set.
    avg_jaccard = total_jaccard / len(X) if len(X) else 0
    if avg_jaccard > best_jaccard_similarity:
        best_jaccard_similarity = avg_jaccard
        treshold = t

print(treshold, best_jaccard_similarity)

In [None]:
# load test data to examples
# NOTE: this rebinds X and y, replacing the examples previously held under those names.
X, y = data_to_examples(test_df)

In [None]:
# Score the current X, y with the threshold selected in the search cell.
# Use `treshold` (the best candidate), not `t`, which is only the leftover
# loop variable and always equals the last candidate tried.
avg_jaccard = 0
for x, y_i in zip(X, y):
    predicted_genres = [
        genre for genre in genres
        if mnb_clf.proba_y_given_x(genre, x) > treshold
    ]
    avg_jaccard += jaccard_similarity(predicted_genres, y_i)
avg_jaccard /= len(X)
print("Average jaccard similarity: ", avg_jaccard)