### Masukkan library yang digunakan

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import nltk
# Fetch the NLTK stopword corpus; it is read later via stopwords.words('indonesian').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load dataset

In [None]:
# Read the SMS dataset from CSV and preview the first rows.
data = pd.read_csv('dataset_sms_spam_v1.csv')
data.head()

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


### Text Preprocessing
#### Case Folding

In [None]:
import re

# membuat fungsi untuk case folding
def casefolding(text):
    """Lower-case ``text`` and strip URLs, numbers and punctuation from it.

    The removals are applied in a fixed order (URLs, then optionally signed
    digit runs, then any character that is neither a word character nor
    whitespace). Removed spans are replaced by nothing, so inner whitespace is
    left as-is; only leading/trailing whitespace is trimmed at the end.
    """
    removal_patterns = (
        r'https?://\S+|www\.\S+',  # URLs
        r'[-+]?[0-9]+',            # numbers, with an optional sign
        r'[^\w\s]',                # punctuation
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()

In [None]:
# Compare one sample message before and after case folding
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)

print('Raw data\t :',raw_sample)
print('Case Folding\t :', case_folding)

Raw data	 : 2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case Folding	 : plg yth sisa kuota flash anda kb download mytelkomsel apps di  utk cek kuotabeli paket flash atau hub


### Normalisasi Kata

In [None]:
# Normalisation table: text_normalize() maps its 'singkat' column to its 'hasil' column.
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text, norm_table=None):
    """Replace every word of ``text`` that has an entry in the normalisation table.

    Args:
        text: Whitespace-separated input string.
        norm_table: Optional table with a ``'singkat'`` column (word to look
            for) and a ``'hasil'`` column (replacement). Defaults to the
            module-level ``key_norm`` table.

    Returns:
        The words joined by single spaces, with known words replaced, and the
        whole result lower-cased.
    """
    table = key_norm if norm_table is None else norm_table

    # Build a plain dict once per call instead of scanning the whole table
    # twice for every word. setdefault keeps the FIRST row for a duplicated
    # key, matching the previous `.values[0]` behaviour.
    lookup = {}
    for short_form, full_form in zip(table['singkat'], table['hasil']):
        lookup.setdefault(short_form, full_form)

    normalized = ' '.join(lookup.get(word, word) for word in text.split())
    return normalized.lower()

In [None]:
# Compare one sample message before and after word normalisation

raw_data = data['teks'].iloc[696]
# Normalise the SAME message that is printed as "raw". Previously this used the
# `case_folding` variable left over from an earlier cell, which held another row.
word_normal = text_normalize(casefolding(raw_data))

print('Raw data\t :', raw_data)
print('Word Normalize\t :', word_normal)

Raw data	 : Btw magicomnya yg sedang Gais, gaada yg gede
Word Normalize	 : pelanggan yang terhormat sisa kuota flash anda kb download mytelkomsel apps di untuk cek kuotabeli paket flash atau hub


### Filtering (Stopword Removal)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Indonesian stopword list provided by the NLTK stopwords corpus
stopwords_ind = stopwords.words('indonesian')

In [None]:
# Number of entries in the Indonesian stopword list
len(stopwords_ind)

758

In [None]:
# Show the stopword list loaded from NLTK
stopwords_ind

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [None]:
# membuat fungsi stopword removal

# Extend the stopword list with extra domain-specific tokens.
# Only tokens that are not already present are appended, so re-running this
# cell no longer grows the list with duplicate entries.
more_stopword = ['tsel', 'gb', 'rb', 'btw']
stopwords_ind = stopwords_ind + [word for word in more_stopword if word not in stopwords_ind]

def remove_stop_word(text, stop_words=None):
    """Drop stopwords from a whitespace-separated string.

    Args:
        text: Input string; it is split on whitespace.
        stop_words: Optional iterable of words to remove. When omitted, only
            the four tokens 'tsel', 'gb', 'rb' and 'btw' are removed, which is
            the behaviour this function has always had.
            NOTE(review): the default does not include the NLTK list held in
            ``stopwords_ind``; pass that list explicitly if those words should
            be filtered too.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = ('tsel', 'gb', 'rb', 'btw')
    # Named `blocked` rather than `stopwords` so the imported nltk
    # `stopwords` module is not shadowed inside the function.
    blocked = set(stop_words)
    return ' '.join(word for word in text.split() if word not in blocked)

In [None]:
# Compare one sample message before and after stopword removal
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)

# Print this cell's own `raw_sample` rather than `raw_data`, which is only
# defined in an earlier cell.
print('Raw Data \t\t :', raw_sample)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t :', stopword_removal)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
Stopword Removal 	 : magicomnya yg sedang gais gaada yg gede


### Stemming

In [None]:
!pip -q install sastrawi

In [None]:
# merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer once; stemming() reuses this single instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()


def stemming(text):
    """Return ``text`` after passing it through the Sastrawi Indonesian stemmer."""
    return stemmer.stem(text)

In [None]:
# Walk one sample message through case folding, stopword removal and stemming,
# printing the result of every stage (word normalisation is not applied here).
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Raw Data \t\t :', raw_sample)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t :', stopword_removal)
print('Stemming \t\t :', text_stemming)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
Stopword Removal 	 : magicomnya yg sedang gais gaada yg gede
Stemming 		 : magicomnya yg sedang gais gaada yg gede


### Text Preprocessing Pipeline

In [None]:
# membuat fungsi untuk menggabungkan seluruh langkah text preprocessing
def text_preprocessing_process(text):
    """Run the complete cleaning pipeline on one message.

    The stages run in this order: case folding, word normalisation,
    stopword removal, stemming. Each stage receives the previous stage's
    output, and the final string is returned.
    """
    stages = (casefolding, text_normalize, remove_stop_word, stemming)
    for stage in stages:
        text = stage(text)
    return text

In [None]:
%%time
# Apply the full preprocessing pipeline to every message and store it in a new
# 'clean_teks' column (the recorded run below took several minutes).
data['clean_teks'] = data['teks'].apply(text_preprocessing_process)

CPU times: total: 3min 29s
Wall time: 3min 32s


In [None]:
# Show the dataset, now including the 'clean_teks' column
data

Unnamed: 0,teks,label,clean_teks
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash mulai di my telkomsel a...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,hari hanya rupiah ribu spesial buat anda yang ...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,langgan yang hormat sisa kuota flash anda kb d...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,langgan yang hormat sisa kuota flash anda kb d...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,hari hanya rupiah ribu spesial buat anda yang ...
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo sama oke nanti saya umumin di grup kelas
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,belum ga ada nulis kerudung kirain warna jins
1140,Mba mau kirim 300 ya,0,mbak mau kirim ya
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


In [None]:
# Save the preprocessed data to a CSV file
data.to_csv('clean_data.csv')

### Feature Engineering


In [None]:
# Separate the feature column (cleaned text) from the target column (label)
x = data['clean_teks']
y = data['label']

In [None]:
# Preview the cleaned-text feature series
x

0       promo beli paket flash mulai di my telkomsel a...
1       hari hanya rupiah ribu spesial buat anda yang ...
2       langgan yang hormat sisa kuota flash anda kb d...
3       langgan yang hormat sisa kuota flash anda kb d...
4       hari hanya rupiah ribu spesial buat anda yang ...
                              ...                        
1138        yooo sama oke nanti saya umumin di grup kelas
1139        belum ga ada nulis kerudung kirain warna jins
1140                                    mbak mau kirim ya
1141        nama beaok bwrangkat pagimau cas atay tranfer
1142                        nomor bri atas nama kamu mana
Name: clean_teks, Length: 1143, dtype: object

In [None]:
# Preview the label series
y

0       2
1       2
2       2
3       2
4       2
       ..
1138    0
1139    0
1140    0
1141    0
1142    0
Name: label, Length: 1143, dtype: int64

### Feature Extraction (TF-IDF dan N-Gram)

In [None]:
# save model
import pickle

# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF: learn vocabulary and IDF weights from the cleaned messages
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF.transform(x)

# Persist the learned vocabulary. The context manager guarantees the file is
# flushed and closed, instead of leaving an open handle behind.
with open("feature_tf-idf.sav", "wb") as vocabulary_file:
    pickle.dump(vec_TF_IDF.vocabulary_, vocabulary_file)

In [None]:
# Show the TF-IDF vocabulary (term -> column index)
vec_TF_IDF.vocabulary_

{'promo': 2550,
 'beli': 345,
 'paket': 2337,
 'flash': 959,
 'mulai': 2108,
 'di': 728,
 'my': 2120,
 'telkomsel': 3169,
 'app': 171,
 'dpt': 830,
 'extra': 930,
 'kuota': 1749,
 'lte': 1872,
 'dan': 662,
 'telpon': 3172,
 'hingga': 1224,
 'mnthr': 2070,
 'buru': 512,
 'cek': 554,
 'tselmemytsel': 3317,
 'sk': 2975,
 'hari': 1176,
 'hanya': 1167,
 'rupiah': 2760,
 'ribu': 2709,
 'spesial': 3035,
 'buat': 485,
 'anda': 113,
 'yang': 3527,
 'pilih': 2430,
 'aktif': 71,
 'sekarang': 2850,
 'juga': 1493,
 'sd': 2821,
 'november': 2233,
 'langgan': 1785,
 'hormat': 1239,
 'sisa': 2968,
 'kb': 1574,
 'download': 826,
 'mytelkomsel': 2122,
 'apps': 176,
 'untuk': 3392,
 'kuotabeli': 1750,
 'atau': 217,
 'hub': 1258,
 'skb': 2976,
 'lagi': 1770,
 'ekstra': 889,
 'pulsa': 2588,
 'dg': 723,
 'internet': 1357,
 'bulan': 497,
 'pertama': 2418,
 'sjk': 2974,
 'augsept': 229,
 'detail': 718,
 'ada': 11,
 'iring': 1380,
 'dgn': 724,
 'tarif': 3135,
 'hr': 1248,
 'panjang': 2351,
 'dari': 669,
 'hits

In [None]:
# Number of features (distinct terms) learned by the vectoriser
print(len(vec_TF_IDF.get_feature_names_out()))

3563


In [None]:
# List the terms that make up the feature space
print(vec_TF_IDF.get_feature_names_out())

['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [None]:
# Densify the TF-IDF matrix and wrap it in a DataFrame with one column per term
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Inspect a small window (rows 10-19, columns 60-69) of the TF-IDF table
data_tabular_tf_idf.iloc[10:20,60:70]

Unnamed: 0,ajar,ajeng,akademik,akan,akang,akangteteh,akbar,akhir,akreditasi,akses
10,0.0,0.0,0.0,0.231654,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.139011,0.0,0.0,0.0,0.154994,0.0,0.0
12,0.0,0.0,0.0,0.160868,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.182731,0.0,0.0,0.0,0.0,0.0,0.219226
14,0.0,0.0,0.0,0.235332,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Feature Selection

In [None]:
# Inputs for feature selection. Despite the names, these hold ALL rows;
# x_train / y_train are re-assigned later by train_test_split.
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Keep the 3000 features with the highest chi-square score against the labels
chi2_features = SelectKBest(chi2, k=3000)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# Report the feature count before and after the reduction
print('Original Feature Number', x_train.shape[1])
print('Reduced Feature Number', x_kbest_features.shape[1])

Original Feature Number 3563
Reduced Feature Number 3000


In [None]:
# Table of chi-square scores, one row per feature ('Nilai' = score)
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.778662
1,0.360753
2,1.337194
3,0.716455
4,0.783102
...,...
3558,1.095635
3559,0.494099
3560,0.716455
3561,2.685028


In [None]:
# Attach each feature name ('Fitur') to its chi-square score

feature = vec_TF_IDF.get_feature_names_out()
Data['Fitur'] = feature

Data

Unnamed: 0,Nilai,Fitur
0,0.778662,aa
1,0.360753,aamiiiin
2,1.337194,aamiin
3,0.716455,ab
4,0.783102,abadi
...,...,...
3558,1.095635,zalora
3559,0.494099,zarkasi
3560,0.716455,zjt
3561,2.685028,zona


In [None]:
# Sort the features from highest to lowest chi-square score
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
2337,44.191916,paket
1145,42.656336,hadiah
1749,39.717546,kuota
2433,37.689749,pin
1663,31.976164,klik
...,...,...
1708,0.035780,kopi
939,0.032327,fb
652,0.032191,daftar
1922,0.025060,maksimal


In [None]:
# Boolean mask over the original features: True where the selector kept the feature
mask = chi2_features.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [None]:
# Collect the names of the features kept by the chi-square selector (mask is True).
# A comprehension is used so that no loop variable leaks into the notebook
# namespace; the previous loop variable was called `bool` and replaced the
# builtin for every later cell. It also defines the result for empty input.
new_feature = [name for keep, name in zip(mask, feature) if keep]
selected_feature = new_feature
selected_feature

['aa',
 'aamiiiin',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acaratks',
 'ada',
 'adalah',
 'adapromo',
 'adi',
 'adik',
 'admin',
 'administrasi',
 'adminlte',
 'ado',
 'adrian',
 'adu',
 'aduh',
 'advertising',
 'aea',
 'aesthetic',
 'afbe',
 'affc',
 'afr',
 'afrika',
 'agak',
 'agam',
 'agen',
 'agendain',
 'agenpulsa',
 'ags',
 'agst',
 'agsts',
 'agt',
 'agtskinfodlj',
 'agua',
 'agun',
 'agus',
 'agust',
 'agustuskunjungi',
 'ahaha',
 'ahub',
 'aidzin',
 'aigoo',
 'air',
 'aja',
 'ajaa',
 'ajabri',
 'ajak',
 'ajeng',
 'akan',
 'akang',
 'akbar',
 'akhir',
 'akreditasi',
 'akses',
 'aksi',
 'aktif',
 'aktifasi',
 'aktivasi',
 'aktivitas',
 'akucintaislam',
 'akumulasi',
 'akun',
 'akurasi',
 'akurat',
 'alaikum',
 'alaikumsaya',
 'alaiqum',
 'alam',
 'alamat',
 'alamsyah',
 'alat',
 'alesannya',
 'algoritma',
 'alhamdulillah',
 'alhuda',
 'ali',
 'aliando',
 'all',
 'allah',
 'alphard',
 'alquran',
 'aman',
 'amanda',
 'amat',
 'amin',
 'ampuun',
 'an',
 'anabdu

In [None]:
# Build a new vocabulary containing only the selected features.
# Values are the column indices from the ORIGINAL vocabulary (not renumbered).

# Set membership is O(1); testing against the list made this loop quadratic.
selected_lookup = set(selected_feature)

new_selected_feature = {
    term: index
    for term, index in vec_TF_IDF.vocabulary_.items()
    if term in selected_lookup
}

new_selected_feature

{'promo': 2550,
 'beli': 345,
 'paket': 2337,
 'flash': 959,
 'mulai': 2108,
 'di': 728,
 'my': 2120,
 'telkomsel': 3169,
 'app': 171,
 'dpt': 830,
 'extra': 930,
 'kuota': 1749,
 'lte': 1872,
 'dan': 662,
 'telpon': 3172,
 'hingga': 1224,
 'mnthr': 2070,
 'buru': 512,
 'cek': 554,
 'tselmemytsel': 3317,
 'sk': 2975,
 'hari': 1176,
 'hanya': 1167,
 'rupiah': 2760,
 'ribu': 2709,
 'spesial': 3035,
 'buat': 485,
 'anda': 113,
 'yang': 3527,
 'pilih': 2430,
 'aktif': 71,
 'sekarang': 2850,
 'juga': 1493,
 'sd': 2821,
 'november': 2233,
 'langgan': 1785,
 'hormat': 1239,
 'sisa': 2968,
 'kb': 1574,
 'download': 826,
 'mytelkomsel': 2122,
 'apps': 176,
 'untuk': 3392,
 'kuotabeli': 1750,
 'atau': 217,
 'hub': 1258,
 'skb': 2976,
 'lagi': 1770,
 'ekstra': 889,
 'pulsa': 2588,
 'dg': 723,
 'internet': 1357,
 'bulan': 497,
 'pertama': 2418,
 'sjk': 2974,
 'augsept': 229,
 'detail': 718,
 'ada': 11,
 'iring': 1380,
 'dgn': 724,
 'tarif': 3135,
 'hr': 1248,
 'panjang': 2351,
 'dari': 669,
 'hits

In [None]:
# Number of terms in the reduced vocabulary
len(new_selected_feature)

3000

In [None]:
# Persist the reduced vocabulary; the context manager closes the file handle
# deterministically instead of leaving it to garbage collection.
with open("new_selected_feature_tf-idf.sav", 'wb') as selected_vocabulary_file:
    pickle.dump(new_selected_feature, selected_vocabulary_file)

In [None]:
# Show the TF-IDF values restricted to the selected features

data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,ada,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.185855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Modelling

In [None]:
# Model input: the matrix of selected TF-IDF features
selected_x = x_kbest_features
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# import library
import random
from sklearn.model_selection import train_test_split

# import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Hold out 20% of the rows for testing, with a fixed random_state.
# This re-binds x, y, x_train and y_train, which held other values in earlier cells.
# NOTE(review): vec_TF_IDF and chi2_features were fitted on all rows before this
# split, so the test rows took part in building the features — confirm whether
# that is acceptable for the reported evaluation.
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)


In [None]:
# Show the number of training and testing samples
print('Banyaknya X_train : ', len(x_train))
print('Banyaknya X_test : ', len(x_test))
print('Banyaknya Y_train : ', len(y_train))
print('Banyaknya Y_test : ', len(y_test))


Banyaknya X_train :  914
Banyaknya X_test :  229
Banyaknya Y_train :  914
Banyaknya Y_test :  229


In [None]:
# Classifier used for training: multinomial Naive Bayes
text_algorithm = MultinomialNB()

In [None]:
# Fit the classifier on the training split
model = text_algorithm.fit(x_train, y_train)

In [None]:
# Classify one new message with the trained model

data_input = ("promo beli paket flash mulai di my telkomsel app dpt extra kuota g lte dan extra telpon hingga mnthr buru cek di tselmemytsel sk")
data_input = text_preprocessing_process(data_input)

# Vectorise with the TF-IDF vectoriser and chi-square selector that were fitted
# on the corpus. Fitting a fresh TfidfVectorizer on this single message (as was
# done before) recomputes the IDF weights from one document, so the features
# would not match the ones the model was trained on.
input_features = chi2_features.transform(vec_TF_IDF.transform([data_input]).toarray())

hasil = model.predict(input_features)
predicted_label = hasil[0]  # exactly one message was passed in

# Label codes: 0 = normal, 1 = fraud, anything else = promo
if predicted_label == 0:
    s = "SMS Normal"
elif predicted_label == 1:
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Promo


In [None]:
# Classify one new message with the trained model

data_input = ("saya delete saja ya nanti nama pull dulu atau sync dulu nanti masukin yang nama baru push")
data_input = text_preprocessing_process(data_input)

# Vectorise with the TF-IDF vectoriser and chi-square selector that were fitted
# on the corpus. Fitting a fresh TfidfVectorizer on this single message (as was
# done before) recomputes the IDF weights from one document, so the features
# would not match the ones the model was trained on.
input_features = chi2_features.transform(vec_TF_IDF.transform([data_input]).toarray())

hasil = model.predict(input_features)
predicted_label = hasil[0]  # exactly one message was passed in

# Label codes: 0 = normal, 1 = fraud, anything else = promo
if predicted_label == 0:
    s = "SMS Normal"
elif predicted_label == 1:
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Normal


In [None]:
# Classify one new message with the trained model

data_input = ("tolong uang di transfer saja kesini mandiri an rahmad adrian nomor rek sms saja kalo sudah kirim trims")
data_input = text_preprocessing_process(data_input)

# Vectorise with the TF-IDF vectoriser and chi-square selector that were fitted
# on the corpus. Fitting a fresh TfidfVectorizer on this single message (as was
# done before) recomputes the IDF weights from one document, so the features
# would not match the ones the model was trained on.
input_features = chi2_features.transform(vec_TF_IDF.transform([data_input]).toarray())

hasil = model.predict(input_features)
predicted_label = hasil[0]  # exactly one message was passed in

# Label codes: 0 = normal, 1 = fraud, anything else = promo
if predicted_label == 0:
    s = "SMS Normal"
elif predicted_label == 1:
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 SMS Penipuan


### Evaluasi Model

In [None]:
# masukkan library yang dibutuhkan
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Predict the held-out test split
predicted = model.predict(x_test)

# Confusion matrix of true vs. predicted labels (computed here but not displayed in this cell)
CM = confusion_matrix(y_test, predicted)

# Per-class precision / recall / F1 report
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       126
           1       0.92      0.82      0.86        66
           2       0.80      0.89      0.85        37

    accuracy                           0.90       229
   macro avg       0.88      0.89      0.88       229
weighted avg       0.91      0.90      0.90       229



In [None]:
# Save the trained model; the context manager makes sure the file is flushed and closed.
with open("model_fraud.sav", "wb") as model_file:
    pickle.dump(model, model_file)