In [1]:
import math
import nltk
import pandas as pd
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## Load Data
Ambil data yang ingin dilakukan text processing, dimana nantinya data itu kita lakukan pembobotan teks menggunakan 5 metode, <br>diantaranya yaitu :
1. Raw Term Weighting
2. Binary Term Weighting
3. Log Frequency Weighting
4. Term Frequency Inverse Document Frequency
5. TF-IDF Weighting

<img src="./image/proses.png" width="700px" align="center">

In [2]:
df = pd.read_csv('./datasets/motogpmandalika.csv', usecols=['teks', 'label'])
df.head()

Unnamed: 0,teks,label
0,b ada catat motor balap sesi kali naremaketurki,Positif
1,b ada rute bus gratis tonton motogp mandalika ...,Positif
2,b ahli sirkuit mandalika motogp baik titik utama,Positif
3,b airlangga hartarto kira official crew balap ...,Positif
4,b aspal sirkuit mandalika benah tes pramusim m...,Positif


## Text Preprocessing
Tahap preprocessing mencakup berbagai proses seperti case folding, tokenizing, filtering,
stemming

### Case Folding
Case folding merupakan proses dalam text preprocessing yang dilakukan untuk menyeragamkan karakter pada data.<br> Proses case folding adalah proses mengubah seluruh huruf menjadi huruf kecil.

In [3]:
df['text_folded'] = df['teks'].str.lower()
df.head()

Unnamed: 0,teks,label,text_folded
0,b ada catat motor balap sesi kali naremaketurki,Positif,b ada catat motor balap sesi kali naremaketurki
1,b ada rute bus gratis tonton motogp mandalika ...,Positif,b ada rute bus gratis tonton motogp mandalika ...
2,b ahli sirkuit mandalika motogp baik titik utama,Positif,b ahli sirkuit mandalika motogp baik titik utama
3,b airlangga hartarto kira official crew balap ...,Positif,b airlangga hartarto kira official crew balap ...
4,b aspal sirkuit mandalika benah tes pramusim m...,Positif,b aspal sirkuit mandalika benah tes pramusim m...


### Text Filtering
Menghilangkan kata yang tidak memiliki arti seperti simbol, link, dan lainnya.

In [4]:
df['text_filtered'] = df['text_folded'].str.replace('(@[a-z0-9]+)\w+',' ')\
          .str.replace("(b)",' ')\
          .str.replace('(http\S+)', ' ')\
          .str.replace('([^0-9a-z \t])',' ')\
          .str.replace(' +',' ')

df.head()

  df['text_filtered'] = df['text_folded'].str.replace('(@[a-z0-9]+)\w+',' ')\


Unnamed: 0,teks,label,text_folded,text_filtered
0,b ada catat motor balap sesi kali naremaketurki,Positif,b ada catat motor balap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki
1,b ada rute bus gratis tonton motogp mandalika ...,Positif,b ada rute bus gratis tonton motogp mandalika ...,ada rute us gratis tonton motogp mandalika ac...
2,b ahli sirkuit mandalika motogp baik titik utama,Positif,b ahli sirkuit mandalika motogp baik titik utama,ahli sirkuit mandalika motogp aik titik utama
3,b airlangga hartarto kira official crew balap ...,Positif,b airlangga hartarto kira official crew balap ...,airlangga hartarto kira official crew alap mo...
4,b aspal sirkuit mandalika benah tes pramusim m...,Positif,b aspal sirkuit mandalika benah tes pramusim m...,aspal sirkuit mandalika enah tes pramusim mot...


### Text Stemming
Merupakan proses pemotongan imbuhan pada kata berimbuhan yang dijalankan dengan algoritme tertentu.

In [5]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [6]:
df['text_stemmed'] = [stemmer.stem(i) for i in df['text_filtered']]
df.head()

Unnamed: 0,teks,label,text_folded,text_filtered,text_stemmed
0,b ada catat motor balap sesi kali naremaketurki,Positif,b ada catat motor balap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki
1,b ada rute bus gratis tonton motogp mandalika ...,Positif,b ada rute bus gratis tonton motogp mandalika ...,ada rute us gratis tonton motogp mandalika ac...,ada rute us gratis tonton motogp mandalika aca...
2,b ahli sirkuit mandalika motogp baik titik utama,Positif,b ahli sirkuit mandalika motogp baik titik utama,ahli sirkuit mandalika motogp aik titik utama,ahli sirkuit mandalika motogp aik titik utama
3,b airlangga hartarto kira official crew balap ...,Positif,b airlangga hartarto kira official crew balap ...,airlangga hartarto kira official crew alap mo...,airlangga hartarto kira official crew alap mot...
4,b aspal sirkuit mandalika benah tes pramusim m...,Positif,b aspal sirkuit mandalika benah tes pramusim m...,aspal sirkuit mandalika enah tes pramusim mot...,aspal sirkuit mandalika enah tes pramusim moto...


### Text Tokenizing
Tokenizing adalah operasi memisahkan teks menjadi potongan-potongan berupa token, bisa berupa potongan huruf,<br> kata, atau kalimat, sebelum dianalisis lebih lanjut.

In [7]:
df['tokenized'] = [i.split() for i in df['text_stemmed']]
df['stopworded'] = df['tokenized']
df.head()

Unnamed: 0,teks,label,text_folded,text_filtered,text_stemmed,tokenized,stopworded
0,b ada catat motor balap sesi kali naremaketurki,Positif,b ada catat motor balap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki,"[ada, catat, motor, alap, sesi, kali, naremake...","[ada, catat, motor, alap, sesi, kali, naremake..."
1,b ada rute bus gratis tonton motogp mandalika ...,Positif,b ada rute bus gratis tonton motogp mandalika ...,ada rute us gratis tonton motogp mandalika ac...,ada rute us gratis tonton motogp mandalika aca...,"[ada, rute, us, gratis, tonton, motogp, mandal...","[ada, rute, us, gratis, tonton, motogp, mandal..."
2,b ahli sirkuit mandalika motogp baik titik utama,Positif,b ahli sirkuit mandalika motogp baik titik utama,ahli sirkuit mandalika motogp aik titik utama,ahli sirkuit mandalika motogp aik titik utama,"[ahli, sirkuit, mandalika, motogp, aik, titik,...","[ahli, sirkuit, mandalika, motogp, aik, titik,..."
3,b airlangga hartarto kira official crew balap ...,Positif,b airlangga hartarto kira official crew balap ...,airlangga hartarto kira official crew alap mo...,airlangga hartarto kira official crew alap mot...,"[airlangga, hartarto, kira, official, crew, al...","[airlangga, hartarto, kira, official, crew, al..."
4,b aspal sirkuit mandalika benah tes pramusim m...,Positif,b aspal sirkuit mandalika benah tes pramusim m...,aspal sirkuit mandalika enah tes pramusim mot...,aspal sirkuit mandalika enah tes pramusim moto...,"[aspal, sirkuit, mandalika, enah, tes, pramusi...","[aspal, sirkuit, mandalika, enah, tes, pramusi..."


### Remove Stopwords
Stopword merupakan kata yang diabaikan dalam pemrosesan dan biasanya disimpan di dalam stop lists.<br> Stop list ini berisi daftar kata umum yang mempunyai fungsi tapi tidak mempunyai arti.

In [8]:
swords = open('./datasets/stopwords-id.txt', 'r')
swords = swords.read()
swords = swords.replace('\n', ',').split(',')

In [9]:
# remove stopwords
def remove_stopwords(df_name):
    for index, value in enumerate(df_name):
        for j in df_name[index]:
            if j in swords:
                df_name[index].remove(df_name[index][df_name[index].index(j)])

In [10]:
remove_stopwords(df['stopworded'])
df.head()

Unnamed: 0,teks,label,text_folded,text_filtered,text_stemmed,tokenized,stopworded
0,b ada catat motor balap sesi kali naremaketurki,Positif,b ada catat motor balap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki,ada catat motor alap sesi kali naremaketurki,"[catat, motor, alap, sesi, kali, naremaketurki]","[catat, motor, alap, sesi, kali, naremaketurki]"
1,b ada rute bus gratis tonton motogp mandalika ...,Positif,b ada rute bus gratis tonton motogp mandalika ...,ada rute us gratis tonton motogp mandalika ac...,ada rute us gratis tonton motogp mandalika aca...,"[rute, us, gratis, tonton, motogp, mandalika, ...","[rute, us, gratis, tonton, motogp, mandalika, ..."
2,b ahli sirkuit mandalika motogp baik titik utama,Positif,b ahli sirkuit mandalika motogp baik titik utama,ahli sirkuit mandalika motogp aik titik utama,ahli sirkuit mandalika motogp aik titik utama,"[ahli, sirkuit, mandalika, motogp, aik, titik,...","[ahli, sirkuit, mandalika, motogp, aik, titik,..."
3,b airlangga hartarto kira official crew balap ...,Positif,b airlangga hartarto kira official crew balap ...,airlangga hartarto kira official crew alap mo...,airlangga hartarto kira official crew alap mot...,"[airlangga, hartarto, official, crew, alap, mo...","[airlangga, hartarto, official, crew, alap, mo..."
4,b aspal sirkuit mandalika benah tes pramusim m...,Positif,b aspal sirkuit mandalika benah tes pramusim m...,aspal sirkuit mandalika enah tes pramusim mot...,aspal sirkuit mandalika enah tes pramusim moto...,"[aspal, sirkuit, mandalika, enah, tes, pramusi...","[aspal, sirkuit, mandalika, enah, tes, pramusi..."


#### kekurangan pada Sastrawi terdapat beberapa kalimat yang tidak sesuai ketika dicari akar katanya seperti diminati menjadi mati

## Text Weighting

### Binary Term Weighting

Bobot suatu term pada binary term weighting adalah 1 (jika term tersebut muncul pada suatu dokumen)<br>atau 0 (jika term tersebut tidak muncul di dokumen)
<img src="./image/bin.png" alt="log weight" width="300"/>
<br>Binary term weighting tidak memperhatikan frekuensi kemunculan kata pada sebuah dokumen, pertama untuk melakukan Binary Weighting yaitu membuat dataframe baru, kemudian disimpan pada variabel bin_weight. 

Keterangan :
- <b>𝑤(𝑡,𝑑)</b> = pembobotan kemunculan (frekuensi) term `t` pada dokumen `d`

In [11]:
term = [item for value in df['stopworded'] for item in value]
doc = [f'd{i}' for i in range(0,len(df))]

variabel text_appear digunakan tf(t,d) untuk melihat setiap kemunculan pada data

In [12]:
text_appear = term

Sebelum itu kita buat dataframe baru dengan nama columnnya `d1`, `d2`, `d3`, dan `d4`

In [13]:
idx = list(set(term))
bin_weight = pd.DataFrame(columns=doc, index=idx)
bin_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103
adan,,,,,,,,,,,...,,,,,,,,,,
al,,,,,,,,,,,...,,,,,,,,,,
joni,,,,,,,,,,,...,,,,,,,,,,
musim,,,,,,,,,,,...,,,,,,,,,,
ninanews,,,,,,,,,,,...,,,,,,,,,,


Selanjutnya melakukan pembobotan menggunakan Binary Weighting dengan cara jika term muncul pada suatu dokumen maka berikan nilai 1, jika tidak muncul berikan 0.

In [14]:
for index, value in enumerate(df['stopworded']):
    for item in value:
        if df['stopworded'][index][df['stopworded'][index].index(item)] in term:
            bin_weight[bin_weight.columns[index]].loc[item] = 1

Mengganti nilai NaN menjadi 0 menggunakan .fillna()

In [15]:
bin_weight = bin_weight.fillna(0)

In [16]:
bin_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103
adan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
al,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joni,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
musim,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ninanews,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Raw Term Weighting

Bobot suatu term pada sebuah dokumen merupakan jumlah kemunculan term tersebut pada dokumen, <br>Menghitung jumlah kemunculan text pada setiap document dan disimpan pada variabel text_count, berikut rumus dari Raw Term Weighting.

<h2><center>𝑤(𝑡,𝑑) = 𝑡𝑓(𝑡,𝑑)</center></h2>

Keterangan : 
- <b>𝑡𝑓(𝑡,𝑑)</b> = jumlah kemunculan (frekuensi) term `t` pada dokumen `d`

Sebelum itu kita buat dataframe baru dengan nama columnnya `d1`, `d2`, `d3`, dan `d4`

In [17]:
raw_weight = pd.DataFrame(columns=doc, index=idx)
raw_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103
adan,,,,,,,,,,,...,,,,,,,,,,
al,,,,,,,,,,,...,,,,,,,,,,
joni,,,,,,,,,,,...,,,,,,,,,,
musim,,,,,,,,,,,...,,,,,,,,,,
ninanews,,,,,,,,,,,...,,,,,,,,,,


Selanjutnya menghitung setiap kemunculan text pada dataset yang nantinya akan dilakukan pembobotan, sebelum itu kita perlu hilangkan nilai NaN terlebih dahulu menggunakan .fillna(0) atau mengisikanya dengan nilai 0.

In [18]:
raw_weight = raw_weight.fillna(0)

for index, value in enumerate(df['stopworded']):
    for item in value:
        if df['stopworded'][index][df['stopworded'][index].index(item)] in text_appear:
            raw_weight[raw_weight.columns[index]].loc[item] += 1

In [19]:
raw_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103
adan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
al,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joni,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
musim,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ninanews,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Log Term Weighting

Bobot term pada sebuah dokumen merupakan logaritma dari frekuensi kemunculan term pada dokumen<br>
<img src="./image/logweight.png" alt="log weight" width="300"/>
<br>
Pertama kita copy nilai dari raw_weight ke log_weight sehingga tidak mengganggu nilai yang ada di raw_weight, yang nantinya kita dapat membandingkan satu sama lain.

In [20]:
log_weight = raw_weight.copy()

Berikut cara pembobotan dengan Log Term, pada line terakhir kita isikan 0 untuk data yang NaN

In [21]:
for col in log_weight.columns:
    log_weight[col] = log_weight[log_weight[col] != 0][col].apply(lambda x: math.log10(x)+1)

log_weight = log_weight.fillna(0)

Hasil dari Log Term Weighting

In [22]:
log_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103
adan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
al,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
joni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
musim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ninanews,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Document Term Frequency

Document frequency <code>(df<sub>t</sub>)</code> merupakan jumlah dokumen yang mengandung term `t`<br>
<br>Istilah Term :
- <code>Rare terms</code> merupakan term yang memiliki nilai `df` yang kecil
- <code>Frequent terms</code> merupakan term yang memiliki nilai `df` besar

<br><code>Rare terms</code> seharusnya memiliki bobot yang lebih besar dari `Frequent terms` karena rare terms lebih informatif, Kata-kata yang muncul di banyak dokumen adalah kata yang tidak penting, karena tidak bisa membedakan isi dokumen-dokumen tersebut, Meskipun telah dilakukan filtering, masih terdapat kata-kata yang sering muncul
<code>Contoh : merupakan, tinggi, bisa, dll</code>

<br>Pertama kita copy nilai dari log_weight ke doc_weight sehingga tidak mengganggu nilai yang ada di log_weight, yang nantinya kita dapat membandingkan satu sama lain di akhir.

In [23]:
doc_weight = log_weight.copy()

Langkah selanjutnya yaitu membuat kolom baru dengan nama `df` dan isinya 0

In [24]:
doc_weight['df'] = 0

Selanjutnya yaitu menghitung setiap kemunculan `term` pada dokumen

In [25]:
for row in range(len(doc_weight)):
    for col in range(0, len(doc)):
        if doc_weight.iloc[row][col] > 0:
            doc_weight['df'].iloc[row] += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [26]:
doc_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d95,d96,d97,d98,d99,d100,d101,d102,d103,df
adan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
al,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
joni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
musim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
ninanews,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


## IDF Weight

Inverse document frequency weight (IDF) dapat dicari menggunakan :
<img src="./image/idf.png" alt="log weight" width="300"/>
Keterangan :
- <b>𝑁</b> merupakan total kolom pada kasus ini 4 yaitu `d1, d2, d3, d4`
- <b><code>df<sub>t</sub></code></b> = Document frequency of `t` (jumlah dokumen yang mengandung term `t`)
- Perhitungan <b><code>idf<sub>t</sub></code></b> dapat menggunakan logaritma basis berapapun

<br>Pertama kita copy nilai dari doc_weight ke idf_weight sehingga tidak mengganggu nilai yang ada di doc_weight, yang nantinya kita dapat membandingkan satu sama lain di akhir.

In [27]:
idf_weight = doc_weight.copy()

Langkah selanjutnya yaitu membuat kolom baru dengan nama `idf` dan isinya 0

In [28]:
idf_weight['idf'] = 0

In [29]:
for row in range(len(idf_weight)):
    for col in range(0, len(doc)):
        if idf_weight.iloc[row][col] > 0:
            idf_weight['idf'].iloc[row] = math.log10(len(doc)/idf_weight['df'].iloc[row])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [30]:
idf_weight.head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d96,d97,d98,d99,d100,d101,d102,d103,df,idf
adan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.017033
al,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1.716003
joni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.017033
musim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1.716003
ninanews,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,2.017033


## TF-IDF

tf-idf merupakan term weighting yang paling populer<br><br>
Catatan : 
- Tanda “-” pada notasi tf-idf adalah tanda hubung, bukan pengurangan!
- Term yang sering muncul di satu dokumen dan jarang muncul pada dokumen lain akan mendapatkan nilai tinggi

Nilai `tf-idf` dari sebuah term `t` merupakan perkalian antara nilai `tf` dan nilai `idf` nya.
<img src="./image/tfidf.png" alt="log weight" width="300"/>

<br>Pertama kita copy nilai dari idf_weight ke tfidf sehingga tidak mengganggu nilai yang ada di idf_weight, yang nantinya kita dapat membandingkan satu sama lain di akhir. untk variabel result digunakan untuk menampung hasil akhir TF-IDF

In [31]:
tfidf = idf_weight.copy()

Selanjutnya melakukan perhitungan menggunakan TF-IDF

In [32]:
for row in range(len(tfidf)):
    for col in range(0, len(doc)):
        tfidf[doc[col]].iloc[row] = math.log(tfidf[doc[col]].iloc[row]+1) * tfidf['idf'].iloc[row]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [33]:
tfidf[doc].head()

Unnamed: 0,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103
adan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
al,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
joni,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
musim,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.189443,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ninanews,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.398101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
print(tfidf[tfidf['d1'] > 0][['d1']], tfidf[tfidf['d2'] > 0][['d2']], sep='\n\n')

                 d1
lengkap    0.812324
us         1.189443
mandalika  0.165580
tonton     0.913612
motogp     0.150893
rute       1.398101
aca        0.913612
gratis     1.067386

                 d2
sirkuit    0.354811
mandalika  0.165580
ahli       1.398101
aik        0.913612
motogp     0.150893
utama      1.189443
titik      1.189443


## Feature Selecetion

In [35]:
df['rank'] = 0
df['text_choosed'] = None

In [36]:
for index, val in enumerate(tfidf[doc]):
    df['rank'].loc[index] = tfidf[val].loc[tfidf[val] == max(tfidf[val])][0]
    df['text_choosed'].loc[index] = tfidf[val].loc[tfidf[val] == max(tfidf[val])].index.values[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [37]:
df[['teks', 'rank', 'text_choosed']].sort_values(by='rank', ascending=False).head(10)

Unnamed: 0,teks,rank,text_choosed
15,b beritaterkini polda ntb tempat personel khus...,1.680909,polda
31,b hafizh syahrin german moto classicmotogp mot...,1.680909,hafizh
9,b balik indah sirkuit mandalika emak-emak self...,1.680909,emak
101,b yuk libur lombok lombok travelist melayani n...,1.430043,jalan
34,b hai traveller yuk libur lombok lombok travel...,1.430043,li
42,b jadwal lintas balap motogp musim rilis musim...,1.430043,musim
10,b bandara igustingurahrai bandara hubung lombo...,1.430043,andara
92,b target sirkuitmandalika rampung cepat target...,1.430043,target
53,b kemenparekraf-dorna sports jalin kerja kait ...,1.430043,sports
16,b bmw indonesia serah unit bmw e m sport mgpa ...,1.430043,mw


## The Next
- Lemmatization
- Remove Duplicate Text
- Remove Length Text 1 char


## Classification With Naive Byess

In [122]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#### Split Data

In [123]:
X = tfidf[doc].to_numpy().reshape(len(tfidf[doc].columns), len(tfidf[doc].index))
y = df['label']

In [124]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [125]:
model = GaussianNB()

In [126]:
model.fit(x_train, y_train)

GaussianNB()

In [127]:
pred = model.predict(x_test)

In [128]:
print(pred)

['Positif' 'Positif' 'Positif' 'Positif' 'Positif' 'Positif' 'Positif'
 'Positif' 'Positif' 'Positif' 'Positif' 'Positif' 'Positif' 'Positif'
 'Positif' 'Positif' 'Positif' 'Positif' 'Positif' 'Positif' 'Positif']


#### Melihat Fit tidaknya Model

In [129]:
confusion_matrix(y_test, pred)

array([[19,  0],
       [ 2,  0]], dtype=int64)

In [130]:
accuracy_score(y_test, pred)

0.9047619047619048

In [131]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

     Positif       0.90      1.00      0.95        19
     negatif       0.00      0.00      0.00         2

    accuracy                           0.90        21
   macro avg       0.45      0.50      0.48        21
weighted avg       0.82      0.90      0.86        21



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## TF-IDF With TfidfVectorizer Library

In [132]:
vectorizer = TfidfVectorizer(max_features=2500)

In [133]:
dm = df['stopworded'].copy()
for idx, val in enumerate(dm):
    dm[idx] = ' '.join(val)

In [134]:
X = vectorizer.fit_transform(dm).toarray()
y = df['label']

In [135]:
model2 = GaussianNB()

In [136]:
model2.fit(x_train, y_train)

GaussianNB()

In [137]:
y_preds = model2.predict(x_test)

#### Melihat Fit tidaknya Model

In [138]:
print(confusion_matrix(y_test,y_preds))
print(classification_report(y_test,y_preds))
print('nilai akurasinya adalah ',accuracy_score(y_test, y_preds))

[[19  0]
 [ 2  0]]
              precision    recall  f1-score   support

     Positif       0.90      1.00      0.95        19
     negatif       0.00      0.00      0.00         2

    accuracy                           0.90        21
   macro avg       0.45      0.50      0.48        21
weighted avg       0.82      0.90      0.86        21

nilai akurasinya adalah  0.9047619047619048


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Kita bisa melihat dari metode TF-IDF Manual dengan TF-IDF menggunakan Library akurasi model yang dihasilkan sama