# NB-SVM

Reference: https://github.com/Joshua-Chin/nbsvm

In [1]:
import argparse

import pandas as pd
from fastai.text import *
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

from src.nbsvm import NBSVM
from src.utils import load_data as loadD

In [2]:
# Experiment selectors; both are interpolated into the CSV path below.
lang, dataset = "id", "indosum"

# Test split with every row that contains a missing value discarded.
# (Reader options formerly commented out here -- sep="\t", encoding='utf-8',
# header=None, names=["label", "text"] -- are not used for this CSV.)
df_test = pd.read_csv(f"data/csv/{lang}/{dataset}_test.csv").dropna()

In [None]:
# Stand-alone NB-SVM fit on the 1000-example training sample.
df_train = pd.read_csv(f"data/sampled_{dataset}/csv/1000_{dataset}.csv")
df_train.dropna(inplace=True)

# `loadD` is the alias under which src.utils.load_data is imported in the
# first cell; the name previously called here (`lda`) is neither defined nor
# imported in the visible cells and raised NameError.
# Only the label/text columns are passed, matching the sample-size sweep cell.
X_train, X_test, y_train, y_test = loadD(df_train[['label', 'text']], df_test[['label', 'text']])
mnbsvm = NBSVM()
mnbsvm.fit(X_train, y_train)

In [11]:
# Sweep over the pre-sampled training-set sizes and record the NB-SVM test
# accuracy obtained for each one.
sample_size_list = [200, 500]
test_accuracy = {}
for n in sample_size_list:
    print(f"Training on {n}")
    # Each sample size has its own CSV; only the label/text columns are used.
    df_train = pd.read_csv(f"data/sampled_{dataset}/csv/{n}_{dataset}.csv")
    train_frame, test_frame = df_train[['label', 'text']], df_test[['label', 'text']]
    X_train, X_test, y_train, y_test = loadD(train_frame, test_frame)

    print("Fitting Model")
    mnbsvm = NBSVM()
    mnbsvm.fit(X_train, y_train)
    print("works")

    acc = mnbsvm.score(X_test, y_test)
    print("Test Accuracy: %s" % acc)
    test_accuracy.update({f"{n}_samples": acc})
print(f"Accuracies : {test_accuracy}")


Training on 200
Vectorizing...
Encoding Labels...
Fitting Model
works
Test Accuracy: 0.7778310429317141
Training on 500
Vectorizing...
Encoding Labels...
Fitting Model
works
Test Accuracy: 0.8645467135400021
Accuracies : {'200_samples': 0.7778310429317141, '500_samples': 0.8645467135400021}


# ULMFiT

In [3]:
from src.ulmfit import ULMFiT, LangTokenizer
from pathlib import Path
import pandas as pd

In [4]:
# Labelled 200-example sample used as the classifier training frame.
df_train = pd.read_csv(f'data/sampled_{dataset}/csv/200_{dataset}.csv').dropna()

# Full test and train splits; the train split is later passed to ULMFiT as
# `df_lm`. Rows with any missing value are dropped from every frame.
# (Reader options formerly commented out here -- sep="\t", encoding='utf-8',
# header=None, names=["label", "text"] -- are not used for these CSVs.)
df_test = pd.read_csv(f'data/csv/{lang}/{dataset}_test.csv').dropna()
df_lm = pd.read_csv(f'data/csv/{lang}/{dataset}_train.csv').dropna()

In [5]:
# Relative data directory; passed to the ULMFiT constructor as its `path` argument.
path = Path('./data')

In [6]:
# First ULMFiT configuration: train_lang_model and finetune are both enabled.
# Keyword names (including the `n_epocs_*` spelling) are exactly the ones the
# constructor is called with.
ulmfit_settings = dict(
    lang="id",
    path=path,
    train_lang_model=True,
    df_lm=df_lm,
    finetune=True,
    load_encoder=True,
    n_epocs_cls=32,
    n_epocs_lm=2,
)
model = ULMFiT(**ulmfit_settings)


In [7]:
# Peek at the first rows of the three frames; the resulting tuple is the
# cell's displayed output.
tuple(frame.head() for frame in (df_train, df_test, df_lm))

(   Unnamed: 0                                               text      label
 0       27973  Anda yang gemar menonton film action atau yang...  teknologi
 1       14484  Awalnya , Andika Mahesa terkenal sebagai vokal...    showbiz
 2       49360  Jakarta , CNN Indonesia - - Adele belum boleh ...    showbiz
 3       52714  Seperti aplikasi keluaran Google lainnya , Map...  teknologi
 4       33471  Jakarta , CNN Indonesia - - Babak baru perusah...  teknologi,
    Unnamed: 0                                               text        label
 0           0  Jerussalem ( ANTARA News ) - Menteri Energi Is...  tajuk utama
 1           1  Suara.com - Gempa berkekuatan 6,1 skala richte...  tajuk utama
 2           2  Setelah melewati prosesi lamaran pada Minggu (...      showbiz
 3           3  Jakarta , CNN Indonesia - - Menteri Koordinato...  tajuk utama
 4           4  Jakarta , CNN Indonesia - - Setelah liburan di...    inspirasi,
    Unnamed: 0                                               t

In [8]:
# Run training on the 200-example sample. Per the log printed below, with the
# settings above this fine-tunes the language model, saves the encoder and
# then trains the classifier; the returned learner is shown as the cell output.
# NOTE(review): the log says "Fitting Classifier for 32 epocs" but lists only
# epochs 0-9 -- confirm how n_epocs_cls is applied inside src.ulmfit.
model.fit(df_train, df_test)

Data loaded successfully
Finetuning the model
Loading Wiki-Language Model
Fitting LM for 2 epocs


epoch,train_loss,valid_loss,accuracy,time
0,4.60872,4.490697,0.298326,15:24
1,4.483107,4.35504,0.30971,15:25


Encoder Saved : data_id_enc successfully
Training the classifier
Loading Encoder
Fitting Classifier for 32 epocs


epoch,train_loss,valid_loss,accuracy,time
0,1.584896,1.586629,0.230769,00:01
1,1.477913,1.56261,0.230769,00:01
2,1.356734,1.514639,0.435897,00:01
3,1.239125,1.430662,0.666667,00:01
4,1.141495,1.305243,0.794872,00:01
5,1.056592,1.146425,0.820513,00:01
6,0.986228,0.96188,0.923077,00:01
7,0.914348,0.784841,0.923077,00:01
8,0.847274,0.629507,0.923077,00:01
9,0.793738,0.504449,0.923077,00:01


Saving Classifier Model as :id_32_epocs


RNNLearner(data=TextClasDataBunch;

Train: LabelList (160 items)
x: TextList
▁ xx bos ▁anda ▁yang ▁gemar ▁menonton ▁film ▁action ▁atau ▁yang ▁menampilkan ▁lakon ▁seorang ▁mata - mata ▁maupun ▁agen ▁rahasia ▁pastinya ▁ingat ▁dengan ▁adegan ▁dimana ▁seorang ▁karakter ▁sedang ▁menunjuk ▁ke ▁gambar ▁tangkapan ▁kamera ▁pengawas ▁ , ▁lalu ▁menye le tuk kan ▁kata ▁“ ▁en han ce ▁“ ▁ . ▁setelah nya ▁ , ▁adegan ▁biasanya ▁akan ▁berlanjut ▁menampilkan ▁versi xxunk ▁lebih ▁tinggi ▁dari ▁gambar ▁tersebut ▁ . ▁pertanyaan ▁kita ▁sebagai ▁penonton ▁umumnya ▁ , ▁“ ▁apakah ▁hal ▁seperti ▁itu ▁masuk ▁akal ▁di ▁dunia ▁nyata ▁? ▁” ▁gambar ▁yang ▁tadinya ▁tampak ▁kabur ▁atau ▁pixel ated ▁ , ▁seketika ▁juga ▁menjadi ▁sangat ▁tajam ▁sampai - sampai ▁wajah ▁seseorang ▁yang ▁tengah ▁diincar ▁sang ▁lakon ▁bisa ▁langsung ▁dikenali ▁ . ▁kalau ▁anda ▁pernah ▁mencoba ▁memperbesar ▁ukuran ▁gambar ▁ , ▁pastinya ▁anda ▁tahu ▁bahwa ▁hal ▁semacam ▁ini ▁hanya ▁bisa ▁terjadi ▁di ▁film ▁ . ▁akan ▁tetapi ▁perkembangan ▁pesat

In [None]:
# Number of rows left in the language-model frame after dropping missing values.
df_lm.shape[0]

In [17]:
# Classifier-only run on the 500-example sample: train_lang_model and finetune
# are disabled while load_encoder stays enabled.
# NOTE(review): the log below reports the classifier being saved as
# "id_32_epocs", the same name as the other runs -- presumably each run
# overwrites the previous checkpoint; confirm in src.ulmfit.
df_train = pd.read_csv(f'data/sampled_{dataset}/csv/500_{dataset}.csv').dropna()
df_test = pd.read_csv(f'data/csv/{lang}/{dataset}_test.csv').dropna()
df_lm = pd.read_csv(f'data/csv/{lang}/{dataset}_train.csv').dropna()
path = Path('./data')

ulmfit_settings = dict(
    lang="id",
    path=path,
    train_lang_model=False,
    df_lm=df_lm,
    finetune=False,
    load_encoder=True,
    n_epocs_cls=32,
    n_epocs_lm=2,
)
model = ULMFiT(**ulmfit_settings)
model.fit(df_train, df_test)

Data loaded successfully
Training the classifier
Loading Encoder
Fitting Classifier for 32 epocs


epoch,train_loss,valid_loss,accuracy,time
0,1.632174,1.725384,0.33,00:02
1,1.398257,1.57336,0.69,00:02
2,1.227032,1.251446,0.83,00:02
3,1.091214,0.88733,0.88,00:02
4,0.96608,0.602172,0.89,00:02
5,0.846706,0.480924,0.88,00:02
6,0.743999,0.470508,0.88,00:02
7,0.655065,0.506406,0.88,00:02
8,0.580439,0.53669,0.89,00:02
9,0.515043,0.54639,0.89,00:02


Saving Classifier Model as :id_32_epocs


RNNLearner(data=TextClasDataBunch;

Train: LabelList (400 items)
x: TextList
▁ xx bos ▁nubia xxup ▁z 17 ▁mini ▁diluncurkan ▁pertama ▁kali ▁di ▁tiongkok ▁dengan ▁daya ▁tarik ▁utama ▁terletak ▁pada ▁kamera ▁ganda ▁dan ▁desain nya ▁ . ▁tapi ▁ , ▁itu ▁saja ▁ternyata ▁tak ▁membuat ▁nubia ▁ber puas ▁diri ▁ . ▁demi ▁menggaet ▁lebih ▁banyak ▁pengguna ▁ , ▁nubia ▁– ▁sub ▁brand xxup ▁z xxup te ▁kembali ▁menggu lir kan ▁varian ▁baru ▁untuk ▁pasar ▁india ▁ , ▁di ▁mana ▁kali ▁ini ▁mereka ▁juga ▁mencoba ▁menonjolkan ▁sisi xxup ▁ram ▁dengan ▁kapasitas ▁6 xxup ▁gb ▁ . ▁dengan ▁demikian ▁ , ▁selain ▁tampil ▁ cia mik ▁ , ▁mampu ▁menj ep ret ▁foto ▁yang ▁api k ▁ , ▁nubia xxup ▁z 17 ▁mini ▁juga ▁menawarkan ▁kinerja ▁multi ▁- ▁tas king ▁yang ▁mulus ▁lewat ▁varian ▁limited ▁edition ▁dengan ▁balutan ▁warna ▁aurora ▁blue ▁ . ▁di bandingkan ▁varian ▁sebelumnya ▁ , ▁nubia xxup ▁z 17 ▁mini ▁aurora ▁blue ▁tak ▁menghadirkan ▁banyak ▁perbedaan ▁ . ▁jero annya ▁masih ▁mengadopsi ▁snap dra gon ▁6 53 ▁dengan ▁memori ▁

In [18]:
# Classifier-only run on the 1000-example sample: train_lang_model and finetune
# are disabled while load_encoder stays enabled.
# NOTE(review): the log below reports the classifier being saved as
# "id_32_epocs", the same name as the other runs -- presumably each run
# overwrites the previous checkpoint; confirm in src.ulmfit.
df_train = pd.read_csv(f'data/sampled_{dataset}/csv/1000_{dataset}.csv').dropna()
df_test = pd.read_csv(f'data/csv/{lang}/{dataset}_test.csv').dropna()
df_lm = pd.read_csv(f'data/csv/{lang}/{dataset}_train.csv').dropna()
path = Path('./data')

ulmfit_settings = dict(
    lang="id",
    path=path,
    train_lang_model=False,
    df_lm=df_lm,
    finetune=False,
    load_encoder=True,
    n_epocs_cls=32,
    n_epocs_lm=2,
)
model = ULMFiT(**ulmfit_settings)
model.fit(df_train, df_test)

Data loaded successfully
Training the classifier
Loading Encoder
Fitting Classifier for 32 epocs


epoch,train_loss,valid_loss,accuracy,time
0,1.467601,1.64754,0.695,00:03
1,1.197154,1.126684,0.9,00:03
2,0.999957,0.598562,0.92,00:04
3,0.833055,0.317662,0.93,00:04
4,0.681729,0.251156,0.935,00:03
5,0.565536,0.249411,0.925,00:03
6,0.478178,0.262744,0.935,00:03
7,0.412469,0.251386,0.94,00:04
8,0.367508,0.235152,0.935,00:03
9,0.329979,0.246104,0.94,00:04


Saving Classifier Model as :id_32_epocs


RNNLearner(data=TextClasDataBunch;

Train: LabelList (800 items)
x: TextList
▁ xx bos ▁jakarta ▁ , xxup ▁cnn ▁indonesia ▁- ▁- ▁partai ▁golkar ▁memecat ▁kader nya ▁yang ▁menjadi ▁motor ▁penggerak ▁generasi ▁muda ▁partai ▁golkar ▁ahmad xxunk ▁kurnia ▁ , ▁karena ▁dinilai ▁telah ▁melanggar ▁aturan ▁ . ▁salah ▁satunya ▁terkait ▁p elibat an ▁lembaga ▁lain ▁dalam ▁pertemuan ▁antara ▁setya xxunk ▁dengan ▁ketua ▁mahkamah ▁agung ▁hatta ▁ali ▁ . ▁sekretaris ▁jenderal ▁partai ▁golkar ▁idrus ▁mar ham ▁mengatakan ▁ , ▁p elibat an ▁lembaga ▁lain ▁seperti ▁komisi ▁pemberantas an ▁korupsi ▁ , xxup ▁ma ▁dan ▁komisi ▁yudisial ▁seharusnya ▁tidak ▁dilakukan ▁ . ▁sebab ▁pertemuan ▁antara ▁setya ▁dengan ▁hatta ▁disebut ▁terjadi ▁dalam ▁sidang ▁doktoral ▁politikus ▁golkar ▁adi es ▁kadir ▁ . ▁sebelum ▁memecat xxunk ▁ , ▁idrus ▁mengklaim ▁telah ▁mengirim ▁surat ▁peringatan ▁terlebih ▁dulu ▁kepada ▁yang ▁bersangkutan ▁ , ▁namun ▁surat ▁peringatan ▁itu ▁tidak ▁di gu bris ▁ . ▁" ▁oleh ▁karena ▁itu ▁ , ▁beberapa ▁h

In [19]:
# Classifier-only run on the 5000-example sample: train_lang_model and finetune
# are disabled while load_encoder stays enabled.
# NOTE(review): the log below reports the classifier being saved as
# "id_32_epocs", the same name as the other runs -- presumably each run
# overwrites the previous checkpoint; confirm in src.ulmfit.
df_train = pd.read_csv(f'data/sampled_{dataset}/csv/5000_{dataset}.csv').dropna()
df_test = pd.read_csv(f'data/csv/{lang}/{dataset}_test.csv').dropna()
df_lm = pd.read_csv(f'data/csv/{lang}/{dataset}_train.csv').dropna()
path = Path('./data')

ulmfit_settings = dict(
    lang="id",
    path=path,
    train_lang_model=False,
    df_lm=df_lm,
    finetune=False,
    load_encoder=True,
    n_epocs_cls=32,
    n_epocs_lm=2,
)
model = ULMFiT(**ulmfit_settings)
model.fit(df_train, df_test)

Data loaded successfully
Training the classifier
Loading Encoder
Fitting Classifier for 32 epocs


epoch,train_loss,valid_loss,accuracy,time
0,0.973444,0.585811,0.902,00:15
1,0.614445,0.295815,0.922,00:16
2,0.402421,0.282316,0.92,00:15
3,0.317474,0.278657,0.93,00:16
4,0.296221,0.274172,0.925,00:15
5,0.278189,0.279667,0.922,00:15
6,0.269087,0.280961,0.929,00:15
7,0.267483,0.279817,0.925,00:15
8,0.261336,0.27052,0.928,00:14
9,0.258795,0.302698,0.928,00:15


Saving Classifier Model as :id_32_epocs


RNNLearner(data=TextClasDataBunch;

Train: LabelList (4000 items)
x: TextList
▁ xx bos ▁jur gen ▁klo pp ▁yakin ▁liverpool ▁klub ▁sempurna ▁untuk ▁para ▁pemain ▁memenuhi ▁potensi ▁mereka ▁ . ▁hal ▁ini ▁diungkapkan ▁sang ▁manajer ▁di ▁tengah ▁persiapan ▁merekrut ▁anggota ▁baru ▁di ▁bursa ▁transfer ▁musim ▁dingin ▁ . ▁pelatih ▁ber pa spor ▁jerman ▁tersebut ▁baru - baru ▁ini ▁membuka ▁peluang ▁perekrutan ▁pemain ▁pada ▁januari ▁menyusul ▁partisipasi ▁sadi o ▁mane ▁di ▁piala ▁afrika ▁dan ▁cedera ▁yang ▁dialami ▁philippe ▁co ut inho ▁serta ▁danny ▁ing s ▁ . ▁klo pp ▁kemudian ▁menegaskan ▁an field ▁adalah ▁tempat ▁sempurna ▁untuk ▁pemain ▁mana ▁pun ▁memaksimalkan ▁bakat ▁mereka ▁ , ▁merujuk ▁pada ▁ke ge mi langan ▁co ut inho ▁ , ▁roberto ▁firm ino ▁ , ▁adam ▁lal lana ▁dan ▁di vo ck ▁ori gi ▁di ▁bawah ▁komando ▁pelatih ▁sejak ▁tiba ▁musim ▁lalu ▁ . xxup ▁sim xxup ak xxup ▁ju xxup ga ▁: ▁klo pp ▁sin dir ▁dr ax ler ▁“ ▁mungkin ▁ada ▁beberapa ▁klub ▁di ▁dunia ▁sepak bola ▁yang ▁dapat ▁bermain ▁le

In [20]:
# Classifier-only run on the 9000-example sample: train_lang_model and finetune
# are disabled while load_encoder stays enabled. The recorded output for this
# cell ends in a KeyboardInterrupt after two epochs.
df_train = pd.read_csv(f'data/sampled_{dataset}/csv/9000_{dataset}.csv').dropna()
df_test = pd.read_csv(f'data/csv/{lang}/{dataset}_test.csv').dropna()
df_lm = pd.read_csv(f'data/csv/{lang}/{dataset}_train.csv').dropna()
path = Path('./data')

ulmfit_settings = dict(
    lang="id",
    path=path,
    train_lang_model=False,
    df_lm=df_lm,
    finetune=False,
    load_encoder=True,
    n_epocs_cls=32,
    n_epocs_lm=2,
)
model = ULMFiT(**ulmfit_settings)
model.fit(df_train, df_test)

Data loaded successfully
Training the classifier
Loading Encoder
Fitting Classifier for 32 epocs


epoch,train_loss,valid_loss,accuracy,time
0,0.691723,0.372891,0.922222,00:25
1,0.409441,0.265597,0.932778,00:25


KeyboardInterrupt: 