## Masukan library yang digunakan

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
pip install nltk





[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nurul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load dataset

In [4]:
data = pd.read_csv('dataset_sms_spam_v1.csv')
data.head()

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


## Text Preprocessing

### Case Folding

In [5]:
import re

In [6]:
# membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower()                                 # mengubah kalimat menjadi huruf kecil
    text = re.sub(r'https?://\S+|wwww\.\S+', '', text)  # menghapus url dari kalimat
    text = re.sub(r'[-+]?[0-9]+', '', text)              # menghapus angka dari kalimat
    text = re.sub(r'[^\w\s]', '', text)                 # menghapus tanda baca
    text = text.strip()
    return text

In [7]:
# membandingkan before dan after case folding
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)

print('Raw data\t : ', raw_sample)
print('Case Folding\t : ', case_folding)

Raw data	 :  2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case Folding	 :  plg yth sisa kuota flash anda kb download mytelkomsel apps di  utk cek kuotabeli paket flash atau hub


### Word Normalization

In [8]:
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
    if (key_norm['singkat'] == word).any()
    else word for word in text.split()
    ])

    text = str.lower(text)
    return text

In [9]:
# Membandingkan before dan after word normalization
raw_data = data['teks'].iloc[696]
word_normal = text_normalize(case_folding)

print('Raw Data\t :', raw_data)
print('Word Normalize\t :', word_normal)

Raw Data	 : Btw magicomnya yg sedang Gais, gaada yg gede
Word Normalize	 : pelanggan yang terhormat sisa kuota flash anda kb download mytelkomsel apps di untuk cek kuotabeli paket flash atau hub


### Filtering (Stopword Removal)

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')

In [11]:
len(stopwords_ind)

758

In [12]:
# Melihat daftar stopwords dari nltk
stopwords_ind

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [13]:
# Membuat fungsi stopwords removal
# Menambahkan kata ke dalam stopwords
more_stopword = ['tsel', 'gb', 'rb', 'btw']
stopwords_ind = stopwords_ind + more_stopword

def remove_stop_word(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)

In [14]:
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)

print('Raw Data \t\t :', raw_data)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t\t', stopword_removal)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
Stopword Removal 		 magicomnya yg gais gaada yg gede


### Stemming

In [15]:
pip -q install sastrawi

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [18]:
# Mengubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Membuat fungsi untuk stemming bahasa indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text

In [19]:
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Raw Data \t\t :', raw_sample)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t\t :', stopword_removal)
print('Stemming \t\t :', text_stemming)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
Stopword Removal 		 : magicomnya yg gais gaada yg gede
Stemming 		 : magicomnya yg gais gaada yg gede


### Text Preprocessing Pipeline

In [20]:
# Membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
    text = casefolding(text)
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text

In [21]:
%%time
data['clean_teks'] = data['teks'].apply(text_preprocessing_process)

CPU times: total: 5min 40s
Wall time: 13min 2s


In [22]:
data

Unnamed: 0,teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash my telkomsel app dpt ex...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,rupiah ribu spesial pilih aktif promo sd novem...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,langgan hormat sisa kuota flash kb download my...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,rupiah ribu spesial pilih aktif buru skb
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo oke umumin grup kelas
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,ga nulis kerudung kirain warna jins
1140,Mba mau kirim 300 ya,0,mbak kirim ya
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


In [23]:
# Menyimpan data yang sudah dipreprocessing ke dalam file csv
data.to_csv('clean_data.csv')

## Feature Engineering

In [24]:
# Memisahkan kolom feature dan target
x = data['clean_teks']
y = data['label']

In [25]:
x

0       promo beli paket flash my telkomsel app dpt ex...
1       rupiah ribu spesial pilih aktif promo sd novem...
2       langgan hormat sisa kuota flash kb download my...
3       langgan hormat sisa kuota flash kb download my...
4                rupiah ribu spesial pilih aktif buru skb
                              ...                        
1138                           yooo oke umumin grup kelas
1139                  ga nulis kerudung kirain warna jins
1140                                        mbak kirim ya
1141        nama beaok bwrangkat pagimau cas atay tranfer
1142                                       nomor bri nama
Name: clean_teks, Length: 1143, dtype: object

In [26]:
y

0       2
1       2
2       2
3       2
4       2
       ..
1138    0
1139    0
1140    0
1141    0
1142    0
Name: label, Length: 1143, dtype: int64

### Feature Extraction (TF-IDF dan N-Gram)

In [34]:
# Save model
import pickle

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform(x)

pickle.dump(vec_TF_IDF.vocabulary_, open("feature_tf-idf.sav", "wb"))

In [35]:
# Menampilkan vocabulary dari TF-IDF
vec_TF_IDF.vocabulary_

{'promo': 2459,
 'beli': 322,
 'paket': 2252,
 'flash': 916,
 'my': 2044,
 'telkomsel': 3038,
 'app': 161,
 'dpt': 788,
 'extra': 887,
 'kuota': 1694,
 'lte': 1810,
 'telpon': 3041,
 'mnthr': 1995,
 'buru': 479,
 'cek': 520,
 'tselmemytsel': 3174,
 'sk': 2855,
 'rupiah': 2667,
 'ribu': 2616,
 'spesial': 2914,
 'pilih': 2339,
 'aktif': 66,
 'sd': 2721,
 'november': 2153,
 'langgan': 1724,
 'hormat': 1192,
 'sisa': 2848,
 'kb': 1514,
 'download': 784,
 'mytelkomsel': 2046,
 'apps': 166,
 'kuotabeli': 1695,
 'hub': 1211,
 'skb': 2856,
 'ekstra': 846,
 'pulsa': 2496,
 'dg': 683,
 'internet': 1310,
 'bulan': 465,
 'sjk': 2854,
 'augsept': 216,
 'detail': 678,
 'iring': 1333,
 'dgn': 684,
 'tarif': 3005,
 'hr': 1201,
 'panjang': 2265,
 'hits': 1181,
 'armada': 179,
 'curi': 608,
 'hati': 1142,
 'tekan': 3032,
 'okcall': 2205,
 'info': 1271,
 'eks': 843,
 'loh': 1796,
 'internetan': 1311,
 'pakai': 2250,
 'volume': 3290,
 'ultima': 3227,
 'mbhr': 1902,
 'hrga': 1204,
 'tariflokasi': 3007,
 't

In [36]:
# Menampilkan vocabulary dari TF-IDF
print(vec_TF_IDF.get_feature_names_out())

['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [37]:
# Melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))

3558


In [41]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1, columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
data_tabular_tf_idf.iloc[10:20, 60:70]

Unnamed: 0,akang,akangteteh,akbar,akreditasi,akses,aksi,aktif,aktifasi,aktivasi,aktivitas
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.144165,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.261863,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.244053,0.0,0.382416,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

In [44]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [45]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# untuk reduced features
print('Original Feature Number', x_train.shape[1])
print('Reduced Feature Number', x_kbest_features.shape[1])

Original Feature Number 3558
Reduced Feature Number 3000


In [46]:
Data = pd.DataFrame(chi2_features.scores_, columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.931289
1,0.360753
2,1.526771
3,0.686416
4,0.758344
...,...
3553,1.124796
3554,0.497736
3555,0.686416
3556,2.842066


In [47]:
# Menampilkan feature beserta nilainya
feature = vec_TF_IDF.get_feature_names_out()
feature

Data['Fitur'] = feature
Data

Unnamed: 0,Nilai,Fitur
0,0.931289,aa
1,0.360753,aamiiiin
2,1.526771,aamiin
3,0.686416,ab
4,0.758344,abadi
...,...,...
3553,1.124796,zalora
3554,0.497736,zarkasi
3555,0.686416,zjt
3556,2.842066,zona


In [48]:
# Mengurutkan nilai feature terbaik
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
2252,47.900669,paket
1694,44.307593,kuota
1102,42.783528,hadiah
2342,36.553838,pin
322,33.346075,beli
...,...,...
1980,0.051754,minta
306,0.044229,bca
617,0.040518,daftar
1856,0.033167,maksimal


In [49]:
mask = chi2_features.get_support()
mask

array([ True, False,  True, ...,  True,  True,  True])

In [51]:
# Menampilkan fitur yang terpilih berdasarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi square
new_feature = []
for bool, f in zip(mask, feature):
    if bool:
        new_feature.append(f)
    selected_feature = new_feature
selected_feature

['aa',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acara',
 'acaratks',
 'ada',
 'adapromo',
 'adi',
 'adik',
 'adison',
 'admin',
 'administrasi',
 'adminlte',
 'ado',
 'adrian',
 'adu',
 'aduh',
 'advertising',
 'aea',
 'aesthetic',
 'afbe',
 'affc',
 'afr',
 'afrika',
 'agam',
 'agen',
 'agendain',
 'agenpulsa',
 'ags',
 'agst',
 'agsts',
 'agt',
 'agtskinfodlj',
 'agua',
 'agun',
 'agus',
 'agust',
 'agustuskunjungi',
 'ahaha',
 'ahub',
 'aigoo',
 'air',
 'ajaa',
 'ajabri',
 'ajak',
 'ajeng',
 'akang',
 'akbar',
 'akreditasi',
 'akses',
 'aksi',
 'aktif',
 'aktifasi',
 'aktivasi',
 'aktivitas',
 'akucintaislam',
 'akumulasi',
 'akun',
 'akurasi',
 'akurat',
 'alaikum',
 'alaikumsaya',
 'alaiqum',
 'alam',
 'alamat',
 'alamsyah',
 'alat',
 'alesannya',
 'algoritma',
 'alhamdulillah',
 'alhuda',
 'ali',
 'aliando',
 'all',
 'allah',
 'alphard',
 'alquran',
 'aman',
 'amanda',
 'amin',
 'ampuun',
 'an',
 'anabdullah',
 'anak',
 'ananda',
 'and',
 'anda',
 'andaafr',
 'a

In [52]:
# Membuat vocabulary baru berdasarkan fitur yang terseleksi
new_selected_feature = {}

for (k, v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k] = v

new_selected_feature

{'promo': 2459,
 'beli': 322,
 'paket': 2252,
 'flash': 916,
 'my': 2044,
 'telkomsel': 3038,
 'app': 161,
 'dpt': 788,
 'extra': 887,
 'kuota': 1694,
 'lte': 1810,
 'telpon': 3041,
 'mnthr': 1995,
 'buru': 479,
 'cek': 520,
 'tselmemytsel': 3174,
 'sk': 2855,
 'rupiah': 2667,
 'ribu': 2616,
 'spesial': 2914,
 'pilih': 2339,
 'aktif': 66,
 'sd': 2721,
 'november': 2153,
 'langgan': 1724,
 'hormat': 1192,
 'sisa': 2848,
 'kb': 1514,
 'download': 784,
 'mytelkomsel': 2046,
 'apps': 166,
 'kuotabeli': 1695,
 'hub': 1211,
 'skb': 2856,
 'ekstra': 846,
 'pulsa': 2496,
 'dg': 683,
 'internet': 1310,
 'bulan': 465,
 'sjk': 2854,
 'augsept': 216,
 'detail': 678,
 'iring': 1333,
 'dgn': 684,
 'tarif': 3005,
 'hr': 1201,
 'panjang': 2265,
 'hits': 1181,
 'armada': 179,
 'curi': 608,
 'hati': 1142,
 'tekan': 3032,
 'okcall': 2205,
 'info': 1271,
 'eks': 843,
 'loh': 1796,
 'internetan': 1311,
 'pakai': 2250,
 'volume': 3290,
 'ultima': 3227,
 'mbhr': 1902,
 'hrga': 1204,
 'tariflokasi': 3007,
 't

In [53]:
len(new_selected_feature)

3000

In [54]:
pickle.dump(new_selected_feature, open("new_selected_feature_tf-idf.sav", "wb"))

In [55]:
# Menampilkan fitur-fitur yang sudah diseleksi
data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,ada,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modeling

In [56]:
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [57]:
# Import library
import random
from sklearn.model_selection import train_test_split

# Import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB

In [58]:
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [59]:
# Menampilkan jumlah data training dan data testing
print('Banyaknya X_train : ', len(x_train))
print('Banyaknya X_test : ', len(x_test))
print('Banyaknya Y_train : ', len(y_train))
print('Banyaknya Y_test : ', len(y_test))

Banyaknya X_train :  914
Banyaknya X_test :  229
Banyaknya Y_train :  914
Banyaknya Y_test :  229


In [62]:
# Proses training menggunakan naive bayes
text_algorithm = MultinomialNB()

In [63]:
model = text_algorithm.fit(x_train, y_train)

In [66]:
# Membuat model prediksi
data_input = ("tolong belikan mama pulsa nomor as mama teman mama celaka kluarganya hubung mama ganti uangnyapenting")
data_input = text_preprocessing_process(data_input)

# Load
tfidf = TfidfVectorizer
loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

# Hasil
hasil = model.predict(loaded_vec.fit_transform([data_input]))

if (hasil == 0):
    s = "SMS Normal"
elif (hasil == 1):
    s = "SMS Fraud"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Fraud


## Evaluasi Model

In [67]:
# Masukan library yang dibutuhkan
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       126
           1       0.89      0.88      0.89        66
           2       0.82      0.86      0.84        37

    accuracy                           0.91       229
   macro avg       0.89      0.90      0.89       229
weighted avg       0.91      0.91      0.91       229



In [68]:
# Menyimpan model
pickle.dump(model, open("model_fraud.sav", "wb"))