## LOAD DATA

In [1]:
import pandas as pd

In [2]:
# Forward slashes avoid the invalid "\d" escape sequence of the old literal
# and are accepted as path separators on Windows as well as POSIX systems.
path = '../data/data_berita.csv'
df = pd.read_csv(path)
# Keep only the text column and the target column; .copy() makes the result
# an independent frame, so new columns can be assigned to it later.
df = df[['dokumen','label']].copy()
df.head()

Unnamed: 0,dokumen,label
0,Presenter Raffi Ahmad baru saja membeli mobil ...,entertaiment
1,Pedangdut Ayu Ting Ting memang selalu menjadi ...,entertaiment
2,Lucinta Luna begitu percaya diri akan kecantik...,entertaiment
3,Lucinta Luna kembali menyita perhatian dengan ...,entertaiment
4,"Atlet badminton, Loh Kean Yew saat ini tengah...",entertaiment


-----

## PREPROCESSING

In [3]:
# preprocessing all

def case_folding(str_data):
    """Return ``str_data`` converted to lowercase."""
    lowered = str_data.lower()
    return lowered

from nltk.tokenize import word_tokenize
def tokenizing(str_data):
    """Split ``str_data`` into tokens with NLTK's ``word_tokenize`` and return them."""
    tokens = word_tokenize(str_data)
    return tokens

from nltk.corpus import stopwords
stopword_id = stopwords.words('indonesian')
stopword_en = stopwords.words('english')
stopword_all = stopword_id + stopword_en
# Hashed copy of the combined list: each membership test is a constant-time
# lookup instead of a linear scan. ``stopword_all`` itself stays a list so any
# other code that uses it keeps working.
_stopword_lookup = frozenset(stopword_all)

def stopword_removal(list_str_data):
    """Drop stopwords and non-alphabetic tokens from a token list.

    Args:
        list_str_data: iterable of string tokens.

    Returns:
        A new list holding, in their original order, the tokens that are not
        in the combined Indonesian + English stopword list and for which
        ``str.isalpha()`` is true.
    """
    return [
        s for s in list_str_data
        # Logical ``and`` (not bitwise ``&``) is the idiomatic way to combine
        # the two boolean conditions.
        if s not in _stopword_lookup and s.isalpha()
    ]

def preprocessing(str_data):
    """Run the full text-cleaning pipeline on one document string.

    Steps, in order: ``case_folding``, ``tokenizing``, ``stopword_removal``,
    then the remaining tokens are joined back together with single spaces.

    Returns the cleaned text as one string.
    """
    tokens = tokenizing(case_folding(str_data))
    kept_tokens = stopword_removal(tokens)
    return ' '.join(kept_tokens)

In [4]:
# Clean every document with preprocessing() and store the result in a new 'preprocessing' column.
df['preprocessing'] = df.dokumen.apply(preprocessing)

In [5]:
# Preview the first rows to check the new 'preprocessing' column.
df.head()

Unnamed: 0,dokumen,label,preprocessing
0,Presenter Raffi Ahmad baru saja membeli mobil ...,entertaiment,presenter raffi ahmad membeli mobil mewah roll...
1,Pedangdut Ayu Ting Ting memang selalu menjadi ...,entertaiment,pedangdut ayu ting ting sorotan publik terkini...
2,Lucinta Luna begitu percaya diri akan kecantik...,entertaiment,lucinta luna percaya kecantikannya lucinta men...
3,Lucinta Luna kembali menyita perhatian dengan ...,entertaiment,lucinta luna menyita perhatian penampilan cant...
4,"Atlet badminton, Loh Kean Yew saat ini tengah...",entertaiment,atlet badminton loh kean yew perbincangan kala...


-------

## NGRAM

In [6]:
import re
def generate_ngrams(s, n):
    """Return the word-level n-grams of ``s`` as space-joined strings.

    Args:
        s: input text.
        n: number of consecutive tokens in each n-gram.

    Returns:
        A list of n-grams in order of appearance. The list is empty when
        ``s`` has fewer than ``n`` tokens or when ``n`` is not positive.
    """
    # Convert to lowercase
    s = s.lower()

    # Replace every character that is neither alphanumeric nor whitespace with a space
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

    # Split on ANY whitespace run. The substitution above keeps tabs and
    # newlines, so splitting only on " " would leave them inside tokens.
    # str.split() with no argument also discards empty tokens.
    tokens = s.split()

    # zip over n progressively shifted views of the token list yields the
    # sliding windows; it stops at the shortest view, so no partial n-grams.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]

def n_gram(data):
    """Build the unigram, bigram and trigram lists for one text.

    Returns a three-element list ``[unigrams, bigrams, trigrams]``; each
    element is the list produced by ``generate_ngrams`` for n = 1, 2, 3.
    """
    return [generate_ngrams(data, size) for size in (1, 2, 3)]

In [7]:
# n_gram() returns [unigrams, bigrams, trigrams]; index 1 keeps only the bigram list.
df['bigram'] = df.preprocessing.apply(lambda x:n_gram(x)[1])

In [8]:
# Preview the first rows to check the new 'bigram' column.
df.head()

Unnamed: 0,dokumen,label,preprocessing,bigram
0,Presenter Raffi Ahmad baru saja membeli mobil ...,entertaiment,presenter raffi ahmad membeli mobil mewah roll...,"[presenter raffi, raffi ahmad, ahmad membeli, ..."
1,Pedangdut Ayu Ting Ting memang selalu menjadi ...,entertaiment,pedangdut ayu ting ting sorotan publik terkini...,"[pedangdut ayu, ayu ting, ting ting, ting soro..."
2,Lucinta Luna begitu percaya diri akan kecantik...,entertaiment,lucinta luna percaya kecantikannya lucinta men...,"[lucinta luna, luna percaya, percaya kecantika..."
3,Lucinta Luna kembali menyita perhatian dengan ...,entertaiment,lucinta luna menyita perhatian penampilan cant...,"[lucinta luna, luna menyita, menyita perhatian..."
4,"Atlet badminton, Loh Kean Yew saat ini tengah...",entertaiment,atlet badminton loh kean yew perbincangan kala...,"[atlet badminton, badminton loh, loh kean, kea..."


------

## TF-IDF

In [9]:
import math
import mpu

def creating_bag_of_word(doc_l):
    """Return the sorted list of distinct terms found across all documents.

    doc_l : iterable of documents, each itself an iterable of terms
    """
    vocabulary = {term for doc in doc_l for term in doc}
    return sorted(vocabulary)

def count_doc_freq(bow, doc_l):
    """Count, for every term in ``bow``, how many documents contain it.

    bow   : iterable of terms
    doc_l : collection of documents; a document counts for a term when
            ``term in document`` is true

    Returns a dict mapping each term to its document count.
    """
    return {
        term: sum(1 for doc in doc_l if term in doc)
        for term in bow
    }

def count_inv_doc_freq(doc_freq, doc_l):
    """Compute the inverse document frequency of every term.

    doc_freq : dict mapping term -> number of documents containing it
    doc_l    : the document collection; only its length is used

    Returns a dict mapping each term to ``log10(len(doc_l) / frequency)``.
    A frequency of 0 raises ZeroDivisionError.
    """
    return {
        term: math.log10(len(doc_l) / freq)
        for term, freq in doc_freq.items()
    }

def count_log_term_freq(bow, doc_l):
    """Compute the log-scaled term frequency of every term in every document.

    bow   : iterable of terms
    doc_l : sequence of documents; each must support ``.count(term)``

    Returns a nested dict ``{term: {document_index: weight}}`` where the
    weight is 0 when the term does not occur in the document and
    ``1 + log10(count)`` otherwise.
    """
    def _weight(raw_count):
        # log10(0) is undefined, so absent terms get a plain 0.
        return 0 if raw_count == 0 else 1 + math.log10(raw_count)

    return {
        term: {idx: _weight(doc.count(term)) for idx, doc in enumerate(doc_l)}
        for term in bow
    }

def count_tfidf(document_l):
    '''
    Compute a TF-IDF vector for every document.

    document_l : list of documents, each already converted to a list of n-grams

    Returns a list with one entry per document; each entry is the list of
    TF-IDF weights of that document, one weight per term of the sorted
    vocabulary built from all documents.
    '''
    vocabulary = creating_bag_of_word(document_l)
    doc_freq = count_doc_freq(vocabulary, document_l)
    idf = count_inv_doc_freq(doc_freq, document_l)
    log_tf = count_log_term_freq(vocabulary, document_l)

    # Optional persistence of the vocabulary and IDF table (disabled):
    # mpu.io.write('bag_of_word.pickle', vocabulary)
    # mpu.io.write('idf.pickle', idf)

    # Scale every log term frequency by the IDF of its term.
    tfidf = {
        term: {doc_idx: weight * idf[term] for doc_idx, weight in per_doc.items()}
        for term, per_doc in log_tf.items()
    }

    # A dict of dicts becomes a frame with one column per term and one row
    # per document index, so each row is the vector of one document.
    return pd.DataFrame(tfidf).to_numpy().tolist()


In [10]:
import mpu
# mpu.io.read('idf.pickle')

In [11]:
# Compute TF-IDF vectors from the bigram lists of all documents and store one vector per row.
df['tfidf'] = count_tfidf(df.bigram.tolist())

In [12]:
# Preview the first rows to check the new 'tfidf' column.
df.head()

Unnamed: 0,dokumen,label,preprocessing,bigram,tfidf
0,Presenter Raffi Ahmad baru saja membeli mobil ...,entertaiment,presenter raffi ahmad membeli mobil mewah roll...,"[presenter raffi, raffi ahmad, ahmad membeli, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.301..."
1,Pedangdut Ayu Ting Ting memang selalu menjadi ...,entertaiment,pedangdut ayu ting ting sorotan publik terkini...,"[pedangdut ayu, ayu ting, ting ting, ting soro...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Lucinta Luna begitu percaya diri akan kecantik...,entertaiment,lucinta luna percaya kecantikannya lucinta men...,"[lucinta luna, luna percaya, percaya kecantika...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Lucinta Luna kembali menyita perhatian dengan ...,entertaiment,lucinta luna menyita perhatian penampilan cant...,"[lucinta luna, luna menyita, menyita perhatian...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,"Atlet badminton, Loh Kean Yew saat ini tengah...",entertaiment,atlet badminton loh kean yew perbincangan kala...,"[atlet badminton, badminton loh, loh kean, kea...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


------------

## NAIVE BAYES

In [14]:
from sklearn.model_selection import train_test_split
# from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Hold out 20% of the documents for evaluation; the fixed random_state keeps the split repeatable.
# NOTE(review): the TF-IDF vectors were computed from all documents before this split,
# so the test rows influenced the vocabulary and IDF weights - confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(df.tfidf.tolist(), df.label, test_size=0.2, random_state=0)

# NOTE: despite the variable name, `gnb` holds a MultinomialNB classifier.
gnb = MultinomialNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

# Per-class precision / recall / F1 on the held-out documents.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

entertaiment       0.67      1.00      0.80         2
    olahraga       1.00      0.50      0.67         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



In [15]:
# Predict the label of the document at row position 1, wrapped in a list to form a one-sample batch.
gnb.predict([df.iloc[1].tfidf])

array(['entertaiment'], dtype='<U12')

In [22]:
# Class probabilities for the document at row position 1 (one-sample batch, first row of the result).
proba = gnb.predict_proba([df.iloc[1].tfidf])[0]
# predict_proba orders its columns like gnb.classes_, so pair each probability
# with its label instead of hard-coding which index belongs to which class.
for label, label_proba in zip(gnb.classes_, proba):
    print(f'Probality untuk label {label} : ')
    print(label_proba)


Probality untuk label entertaiment : 
1.0
Probality untuk label olahraga : 
0.0


In [8]:
import joblib
import numpy as np

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code -
# only load model files from a trusted source.
# NOTE(review): 'naivebayes_model.sav' is the file name written by the joblib.dump cell
# further down, so this cell needs that file to already exist on disk - confirm the intended cell order.
saved_model_path = 'naivebayes_model.sav'
loaded_model = joblib.load(saved_model_path)
# data = [df.iloc[1].tfidf]
# proba = loaded_model.predict_proba(data)[0]
# data = loaded_model.predict(data)[0]
# print()
# print()
# print('predict')
# print(data)
# print('Probality untuk label entertaiment : ')
# print(proba[0])
# print('Probality untuk label olahraga : ')
# print(proba[1])

In [9]:
# loaded_model.class_weight()

AttributeError: 'GaussianNB' object has no attribute 'class_weight'

In [93]:
import joblib
# save the fitted classifier `gnb` to disk so it can be restored later with joblib.load
filename = 'naivebayes_model.sav'
joblib.dump(gnb, filename)
  
# load the model from disk
# loaded_model = joblib.load(filename)

In [5]:
import mpu

# Load the pickled vocabulary and display it.
# NOTE(review): the mpu.io.write call that would create 'bag_of_word.pickle' is commented out
# in count_tfidf, so this file must already exist from an earlier run - confirm.
bow = mpu.io.read('bag_of_word.pickle')
bow


['abimanyu mantan',
 'abimanyu proyek',
 'acara bertajuk',
 'acara keterangannya',
 'acara perayaan',
 'afc cup',
 'aff sungguh',
 'ahmad acara',
 'ahmad membeli',
 'ahmad mengaku',
 'ahmad menghubungi',
 'ahmad mengomelinya',
 'ahmad menyebut',
 'ahmad pembelian',
 'ahmad raffi',
 'ahmad rans',
 'ahmad ronaldinho',
 'air klub',
 'air rumah',
 'ajang indonesia',
 'akun instagram',
 'alamat palsu',
 'albagir layak',
 'albagir mistar',
 'alias pelakor',
 'amal charity',
 'amal jakarta',
 'amali halaman',
 'amali ketua',
 'amali rombongan',
 'amanda manopo',
 'andini jefri',
 'andini konfrensi',
 'andini nichol',
 'aneh tindakan',
 'angkat besi',
 'announcement selasa',
 'apresiasi laga',
 'arema fc',
 'arsenal mikel',
 'arsenal sepakat',
 'arteta menyebut',
 'artis pedangdut',
 'asmara amanda',
 'asmawi indonesia',
 'asmawi ketua',
 'atensi tampil',
 'atlet badminton',
 'atlet cabor',
 'atlet indonesia',
 'atlet loh',
 'atletik bulu',
 'ayu ting',
 'babak penyisihan',
 'badminton loh',
 

In [6]:
# Number of terms in the loaded vocabulary.
len(bow)

764

In [3]:
# Load and display the pickled term -> IDF mapping.
# NOTE(review): the mpu.io.write call that would create 'idf.pickle' is commented out
# in count_tfidf, so this file must already exist from an earlier run - confirm.
mpu.io.read('idf.pickle')

{'abimanyu mantan': 1.3010299956639813,
 'abimanyu proyek': 1.3010299956639813,
 'acara bertajuk': 1.3010299956639813,
 'acara keterangannya': 1.3010299956639813,
 'acara perayaan': 1.3010299956639813,
 'afc cup': 1.3010299956639813,
 'aff sungguh': 1.3010299956639813,
 'ahmad acara': 1.3010299956639813,
 'ahmad membeli': 1.3010299956639813,
 'ahmad mengaku': 1.3010299956639813,
 'ahmad menghubungi': 1.3010299956639813,
 'ahmad mengomelinya': 1.3010299956639813,
 'ahmad menyebut': 1.3010299956639813,
 'ahmad pembelian': 1.3010299956639813,
 'ahmad raffi': 1.3010299956639813,
 'ahmad rans': 1.3010299956639813,
 'ahmad ronaldinho': 1.3010299956639813,
 'air klub': 1.3010299956639813,
 'air rumah': 1.3010299956639813,
 'ajang indonesia': 1.3010299956639813,
 'akun instagram': 1.3010299956639813,
 'alamat palsu': 1.3010299956639813,
 'albagir layak': 1.3010299956639813,
 'albagir mistar': 1.3010299956639813,
 'alias pelakor': 1.3010299956639813,
 'amal charity': 1.3010299956639813,
 'amal 