In [1]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import math
import re
import pandas as pd
import numpy as np

In [2]:
stemm = StemmerFactory()
stemmer = stemm.create_stemmer()
stop = StopWordRemoverFactory()
stopwords = stop.get_stop_words()

In [3]:
data = pd.read_csv("coba.csv", delimiter=",", encoding='mac_roman')

In [4]:
data.head()

Unnamed: 0,Status,Judul,Nama Penulis,Sinopsis
0,Akidah,Syarah Akidah Ahlus Sunnah Wal Jamaah,Edi Ah Iyubenu,Alasan Penting Memiliki Buku Ini : 1. Buku ini...
1,Sirah,25 Kisah Kehidupan Sahabat Nabi Yang Paling He...,Mustafa Ibrahim,Buku ini memuat kisah kehidupan dari 25 sahaba...
2,Sirah,Utsman Bin Affan : Akhir Tragis Sahabat Paling...,Imam Al Ghazali,"tuduhan miring telah terjawab, tetapi fitnah m..."
3,Akidah,Al Irsyad Ila Shahihil I'tiqad,@nashihatku,Layaknya seorang musafir yang menempuh suatu p...
4,Sirah,Samudra Keteladanan Muhammad,Wahyudi Nurr Arifin,Buku ini memotret begitu banyak keteladanan sa...


### Model Split

In [5]:
# Pembagian data training & testing
x_train, x_test, y_train, y_test = train_test_split(data['Sinopsis'], data['Status'], train_size=0.6,
                                                    test_size=0.4, shuffle=False)

In [6]:
train_data = pd.DataFrame({'teks':x_train, 'label':y_train})
test_data = pd.DataFrame({'teks':x_test, 'label':y_test})

In [7]:
train_data.reset_index(drop=True, inplace=True)

In [8]:
train_data

Unnamed: 0,teks,label
0,Alasan Penting Memiliki Buku Ini : 1. Buku ini...,Akidah
1,Buku ini memuat kisah kehidupan dari 25 sahaba...,Sirah
2,"tuduhan miring telah terjawab, tetapi fitnah m...",Sirah


In [9]:
# Ukuran data training atau total seluruh kelas
print('Ukuran data train:', train_data.shape)
n_train = train_data.shape[0]

Ukuran data train: (3, 2)


### Hitung jumlah kelas pada dokumen training

In [10]:
kelas_sirah = train_data['label'].value_counts()[0]
kelas_akidah = train_data['label'].value_counts()[1]

# Preprocessing

### Cleaning

In [11]:
cleaner = lambda doc: re.sub(r'[0-9]|\.|:|;|,|\?|!|\(|\)\'|\"|-', '', doc)

# data latih
for i in range(train_data.shape[0]):
    train_data.iloc[i].teks = cleaner(train_data.iloc[i].teks)
    
# data uji
for i in range(test_data.shape[0]):
    test_data.iloc[i].teks = cleaner(test_data.iloc[i].teks)

In [12]:
train_data.teks

0    Alasan Penting Memiliki Buku Ini   Buku ini di...
1    Buku ini memuat kisah kehidupan dari  sahabat ...
2    tuduhan miring telah terjawab tetapi fitnah me...
Name: teks, dtype: object

In [13]:
test_data.teks

3    Layaknya seorang musafir yang menempuh suatu p...
4    Buku ini memotret begitu banyak keteladanan sa...
Name: teks, dtype: object

### Functions

In [115]:
def tokenisasi(teks):
    # Case folding
    teks = teks.lower()
    
    # Tokenisasi
    token = re.findall('[A-Za-z]+', teks)
    token = np.array(token)
    
    return token

def filtering(token):
    # Stoplist
    stopwords = StopWordRemoverFactory().get_stop_words()

    # Hapus stopwords
    stopwordRemover = ~np.isin(token, stopwords)
    kata = token[stopwordRemover]
    
    return kata
  
def stemming(kata):
    # Inisial Stemmer
    stemmer = StemmerFactory().create_stemmer()

    # Stemming
    stem = np.vectorize(lambda t: stemmer.stem(t))
    kata = stem(kata)
    
    return kata

def preprocessing(teks):
    # Tokenisasi
    token = tokenisasi(teks)

    # Filtering
    kata = filtering(token)

    # Stemming
    kata = stemming(kata)
    
    return kata

def getFitur(dokumen):
    fitur = [preprocessing(teks) for teks in dokumen]
    fitur = [term for listTerm in fitur for term in listTerm]
    fitur = np.unique(fitur)
    return fitur

def term_presence(dokumen):
    # get Fitur
    fitur = getFitur(dokumen)
    
    # Term Pressence Table
    zero_data = np.zeros((dokumen.shape[0], fitur.shape[0]))
    tf = pd.DataFrame(zero_data, columns=fitur)
    
    for i, teks in enumerate(dokumen):
        kata = preprocessing(teks)
        # Term presence        
        temp = np.where(np.isin(fitur, kata), 1, 0)        
        
        tf.iloc[i] = temp
        
    return tf

def get_percentage(dokumen, percent):

    start = 0
    end = int(len(dokumen) * percent)

    return dokumen.iloc[start:end]

def entropy(rows, n_term, kelas):
    temp = 0
    
    for i, val in enumerate(kelas):
        temp += -(rows[i]/kelas[i] * 
                  np.where(np.isinf(np.log(rows[i]/kelas[i])), 0, np.log(rows[i]/kelas[i])))
    
    value = rows.iloc[-1] / n_term * temp
    
    return value

def information_gain(dokumen_1, dokumen_0, *kelas):

    # Entropy total
    entropy_total = 0
    
    for i, val in enumerate(kelas):
        entropy_total -= val / sum(kelas) * np.where(np.isinf(np.log(val / sum(kelas))), 
                                                     0, 
                                                     np.log(val / sum(kelas)))
    
    jumlah_term = dokumen_1.iloc[:, -1].sum()  
    
    s1_entropy = []
    s0_entropy = []
    
    idx = 0
    # dokumen nilai 1
    for i in dokumen_1.iterrows():
        """
        i = data per rows (tuple)
        ex : ("term", Index | Value
                          0 | 90
                          1 | 80
                          2 | 50 )
                 
        jumlah_term = jumlah term nilai 1
        kelas = jumlah per kelas
        ex : index | value 
            akidah | 1
            sirah  | 2
        """
        s1_entropy.append(entropy(i[1], jumlah_term, kelas))
        idx += 1
        
    # Dokumen nilai 0
    for i in dokumen_0.iterrows():
        s0_entropy.append(entropy(i[1], jumlah_term, kelas))

    # Convert to array matrix 
    s1_entropy = np.array(s1_entropy)
    s0_entropy = np.array(s0_entropy)
    
    information_gain = entropy_total - s1_entropy + s0_entropy

    # return as Pandas dataframe
    ig = pd.DataFrame(dokumen_1.index, columns=["Term"])    
    ig['IG'] = information_gain

    return ig

def posterior(dokumen, n_term):
    for i, val in enumerate(dokumen.iloc[:, 1:].iteritems()):
        dokumen[i] = (np.array(val[1]) + 1) / (val[1].sum() + n_term)        
        
    return dokumen

def prior(kelas):
    prior = []
    
    for i in kelas:
        prior.append(i / sum(kelas))
        
    print("Prior : \n", prior)
    return prior
    
def multinomial_naive_bayes(term, doc_uji, *kelas):    
    print("term : \n\n", term)
    n_total = term.shape[0]
    
    # post = term.apply(lambda x: (x.iloc[:, 1:] + 1) / (x.iloc[:, 1:].sum() + n_term))
    post = posterior(term, n_total)
    pr = prior(kelas)
    
    print("data uji : \n ", data_uji)
    data = []
    for i, val in enumerate(data_uji):
#         index_data.append(np.where(term[term['Term'].isin(val)], True, False))
#         data.append(term[term['Term'].isin(val)])
        data.append(ig_100[ig_100['Term'].isin(val)])
#         index_data.append(np.where(ig_100[ig_100['Term'].isin(i)], True, False))
#         index_data.append(np.where(np.isin(i, term['Term']), True, False))
    
#     Check if empty then fill the value
    result = []
    for i, val in enumerate(data):
        result.append(np.where(data[i].empty, kelas[i], data[i] * kelas[i]))
#     print(np.where(data[0].empty, True, False))
    print("result : \n", result[0][0][1])
    
# #   Convert to dataframe    
#     post = pd.DataFrame(post).T
    
#   result = pr * term_yang_ketemu

    return post
# # get term table
# df = tf_1[tf_1.index.isin(ig_10['Term'])].rename_axis('Term').reset_index()

multinomial_naive_bayes(df, data_uji, kelas_akidah, kelas_sirah)

term : 

        Term         0         1
0     alqur  0.166667  0.166667
1        an  0.166667  0.166667
2  ayatayat  0.166667  0.166667
3     bagai  0.166667  0.166667
4    bantai  0.166667  0.166667
5     besar  0.166667  0.166667
Prior : 
 [0.3333333333333333, 0.6666666666666666]
data uji : 
  [array(['layak', 'orang', 'musafir', 'tempuh', 'suatu', 'jalan', 'tuju',
       'tempat', 'nan', 'jauh', 'siap', 'bekal', 'lengkap', 'jalan',
       'manusia', 'tuju', 'allah', 'bukankah', 'hidup', 'dunia', 'jalan',
       'grafis', 'finishnya', 'haribaan', 'allah', 'hembus', 'nafas',
       'gera', 'perilaku', 'bahkan', 'lintas', 'hati', 'sama', 'ayun',
       'kaki', 'lewat', 'siasia', 'semua', 'catat', 'kitab', 'kelak',
       'hisab'], dtype='<U9'), array(['buku', 'potret', 'banyak', 'teladan', 'sang', 'nabi', 'hari',
       'kupas', 'biasa', 'beliau', 'kala', 'jahit', 'baju', 'robekketika',
       'dipasarsaat', 'jalan', 'akrab', 'anakanak', 'mulia', 'tamu',
       'banyak', 'biasa', 'be

Unnamed: 0,Term,0,1
0,alqur,0.166667,0.166667
1,an,0.166667,0.166667
2,ayatayat,0.166667,0.166667
3,bagai,0.166667,0.166667
4,bantai,0.166667,0.166667
5,besar,0.166667,0.166667


In [66]:
df

Unnamed: 0,Term,0,1
0,alqur,0,1
1,an,0,1
2,ayatayat,0,1
3,bagai,0,1
4,bantai,0,1
5,besar,0,1


In [15]:
# Get test_data fitur
data_uji = [preprocessing(teks) for teks in test_data.teks]

In [17]:
# index_data = []

# for i, val in enumerate(data_uji):
#     index_data.append(np.where(np.isin(data_uji[i], ig_100['Term']), True, False))
    
# data_uji[1][index_data[1]]

# Proses data training

## Term Presences

In [18]:
binary_tf = term_presence(train_data.teks)

In [19]:
binary_tf

Unnamed: 0,ahli,alas,alqur,an,ayatayat,baca,bagai,bahas,bahasa,bantai,...,saw,sekaligus,si,tahqiq,teladan,terus,tuduh,tulis,ulama,upaya
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


### TF Tables

In [20]:
tf = pd.concat([binary_tf, train_data.label], axis=1)

In [21]:
tf

Unnamed: 0,ahli,alas,alqur,an,ayatayat,baca,bagai,bahas,bahasa,bantai,...,sekaligus,si,tahqiq,teladan,terus,tuduh,tulis,ulama,upaya,label
0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,Akidah
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sirah
2,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,Sirah


### Hitung jumlah nilai 1 & 0 pada tiap kelas

In [22]:
# Hitung jumlah nilai 1
akidah_1 = tf[tf.label == 'Akidah'].sum()
sirah_1 = tf[tf.label == 'Sirah'].sum()
total_1 = tf.sum(axis=0)

In [23]:
tf_1 = pd.concat([akidah_1, sirah_1, total_1], axis=1)

# Drop label
tf_1 = tf_1[:-1]

tf_1

Unnamed: 0,0,1,2
ahli,1,0,1
alas,1,0,1
alqur,0,1,1
an,0,1,1
ayatayat,0,1,1
baca,0,2,2
bagai,0,1,1
bahas,1,0,1
bahasa,1,0,1
bantai,0,1,1


In [24]:
# Hitung jumlah nilai 0
akidah_0 = (tf[tf.label == 'Akidah'] == 0).sum()
sirah_0 = (tf[tf.label == 'Sirah'] == 0).sum()

In [25]:
tf_0 = pd.concat([akidah_0, sirah_0], axis=1)

# Hitung total
tf_0[tf_0.shape[1]] = tf_0.sum(axis=1)

# Drop label
tf_0 = tf_0[:-1]

tf_0

Unnamed: 0,0,1,2
ahli,0,2,2
alas,0,2,2
alqur,1,1,2
an,1,1,2
ayatayat,1,1,2
baca,1,0,1
bagai,1,1,2
bahas,0,2,2
bahasa,0,2,2
bantai,1,1,2


## Information Gain

In [26]:
ig = information_gain(tf_1, tf_0, kelas_akidah, kelas_sirah)

# Sorted IG
ig = ig.sort_values(by=["IG", "Term"], ascending=[False, True]).reset_index(drop=True)



In [27]:
# Information Gain threshold
# 100%
ig_100 = ig

# 50%
ig_50 = get_percentage(ig, .5)

# 10%
ig_10 = get_percentage(ig, .1)


In [65]:
# get term table
df = tf_1[tf_1.index.isin(ig_10['Term'])].rename_axis('Term').reset_index()

# drop total column
df = df.drop(df.columns[len(df.columns)-1], axis=1)

df

Unnamed: 0,Term,0,1
0,alqur,0,1
1,an,0,1
2,ayatayat,0,1
3,bagai,0,1
4,bantai,0,1
5,besar,0,1
