<a href="https://colab.research.google.com/github/mfadlisy/data-portfolio/blob/main/Text_Mining_Fraud_Message_Detection/Text_mining_Fraud_Message_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
import pickle # library save model
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
import random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Load Dataset

In [None]:
data = pd.read_csv('/content/drive/MyDrive/Learn Data/Project Case Study/Text mining Fraud Message Detection/dataset_sms_spam_v1.csv')
data.head()

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2


# Text Preprocessing

## Case Folding

In [None]:
# Case-folding helper: lowercases the text and strips URLs, numbers and punctuation
def casefolding(text):
  """Normalize a raw message before word-level processing.

  Steps, in order: lowercase, remove URLs, remove digit runs (with an
  optional leading sign), remove every character that is neither a word
  character nor whitespace, then trim leading/trailing whitespace.

  Args:
    text: Raw message string.

  Returns:
    The cleaned string. Inner whitespace is not collapsed, so a removed URL
    or number can leave two consecutive spaces behind.
  """
  text = text.lower()
  # Match both "http(s)://..." links and bare "www...." links.
  # (The previous pattern spelled the prefix "wwww", so bare links survived
  # and were later mangled into a single token by the punctuation step.)
  text = re.sub(r'https?://\S+|www\.\S+', '', text)
  text = re.sub(r'[-+]?[0-9]+', '', text)   # drop numbers
  text = re.sub(r'[^\w\s]', '', text)       # drop punctuation
  return text.strip()

In [None]:
# Compare one raw message (row 2) with its case-folded version
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)

print('Raw data\t : ',raw_sample)
print('Case Folding\t : ',case_folding)

Raw data	 :  2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case Folding	 :  plg yth sisa kuota flash anda kb download mytelkomsel apps di  utk cek kuotabeli paket flash atau hub


## Word Normalization

In [None]:
# Load the normalization table used by text_normalization:
# column 'singkat' holds the abbreviation/slang form, 'hasil' its replacement.
# NOTE(review): absolute Google Drive path - only resolves when Drive is mounted at /content/drive.
key_norm = pd.read_csv('/content/drive/MyDrive/Learn Data/Project Case Study/Text mining Fraud Message Detection/key_norm.csv')
key_norm.head()

Unnamed: 0,_id,singkat,hasil
0,1,abis,habis
1,2,accent,tekanan
2,3,accept,terima
3,4,accident,kecelakaan
4,5,achievement,prestasi


In [None]:
def text_normalization(text, norm_table=None):
  """Replace abbreviated/slang words with their normalized form.

  Args:
    text: Whitespace-separated string (typically the output of casefolding).
    norm_table: Optional DataFrame with a 'singkat' column (word to replace)
      and a 'hasil' column (replacement). Defaults to the notebook-level
      `key_norm` table.

  Returns:
    The lowercased text with every word found in 'singkat' swapped for its
    'hasil' value; words are re-joined with single spaces.
  """
  table = key_norm if norm_table is None else norm_table

  # Build the lookup once per call instead of scanning the whole table twice
  # for every word. setdefault keeps the FIRST row for a duplicated key,
  # matching the previous `.values[0]` behaviour.
  lookup = {}
  for short, full in zip(table['singkat'], table['hasil']):
    lookup.setdefault(short, full)

  normalized = ' '.join(lookup.get(word, word) for word in text.split())
  return normalized.lower()

In [None]:
# Compare the raw message with its normalized version
# (reuses raw_sample / case_folding produced by the case-folding demo cell)
word_normalization = text_normalization(case_folding)

print('Raw data\t : ',raw_sample)
print('Word Normalization\t : ',word_normalization)

Raw data	 :  2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Word Normalization	 :  pelanggan yang terhormat sisa kuota flash anda kb download mytelkomsel apps di untuk cek kuotabeli paket flash atau hubungi


## Filtering (Stopword Removal)

In [None]:
# Use NLTK's Indonesian stopword list
stopwords_ina = stopwords.words('indonesian')

In [None]:
# Check how many stopwords were loaded
len(stopwords_ina)

758

In [None]:
# Peek at the first 11 NLTK stopwords
stopwords_ina[:11]

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri']

In [None]:
# Add domain-specific words to the stopword list
more_stopword = ['tsel', 'gb', 'rb', 'btw']
# Rebuild from the NLTK source list instead of appending to stopwords_ina itself,
# so re-running this cell does not keep adding duplicate entries.
stopwords_ina = stopwords.words('indonesian') + more_stopword

In [None]:
# Stopword-removal helper
def remove_stop_word(text, stop_words=None):
  """Drop every stopword from a whitespace-separated string.

  Args:
    text: Input string; it is split on whitespace.
    stop_words: Optional iterable of words to remove. Defaults to the
      notebook-level `stopwords_ina` list.

  Returns:
    The remaining words joined by single spaces (empty string if none remain).
  """
  # A set gives constant-time membership checks instead of scanning the
  # stopword list once per word.
  blocked = set(stopwords_ina if stop_words is None else stop_words)
  return " ".join(word for word in text.split() if word not in blocked)

In [None]:
# Compare a message (row 696) before and after stopword removal.
# Only casefolding is applied first; text_normalization is not part of this demo.
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)

print('Rawa data\t\t: ',raw_sample)
print('Case Folding\t\t: ', case_folding)
print('Stopword Remvoal\t: ', stopword_removal)

Rawa data		:  Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding		:  btw magicomnya yg sedang gais gaada yg gede
Stopword Remvoal	:  magicomnya yg gais gaada yg gede


## Stemming

In [None]:
# Build one Sastrawi stemmer (reduces words to their base form) and reuse it for every call
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Indonesian stemming helper
def stemming(text):
  """Return `text` after running it through the shared Sastrawi stemmer."""
  return stemmer.stem(text)

In [None]:
# Compare a message (row 696) across case folding, stopword removal and stemming.
# text_normalization is not applied in this demo.
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Rawa data\t\t: ',raw_sample)
print('Case Folding\t\t: ', case_folding)
print('Stopword Remvoal\t: ', stopword_removal)
print('Text Stemming\t\t: ', text_stemming)

Rawa data		:  Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding		:  btw magicomnya yg sedang gais gaada yg gede
Stopword Remvoal	:  magicomnya yg gais gaada yg gede
Text Stemming		:  magicomnya yg gais gaada yg gede


## Text Preprocessing Pipeline

In [None]:
# Full text-preprocessing pipeline
def text_preprocessing_process(text):
  """Run one message through every preprocessing stage.

  The stages are applied in this fixed order: casefolding,
  text_normalization, remove_stop_word, stemming. Each stage receives the
  previous stage's output.

  Args:
    text: Raw message string.

  Returns:
    The fully preprocessed string.
  """
  for stage in (casefolding, text_normalization, remove_stop_word, stemming):
    text = stage(text)
  return text

In [None]:
# Preprocessing the whole dataset (stemming in particular) takes about 5 minutes,
# so the preprocessed data is saved once and re-imported in the cells below.
# Uncomment the two lines below to regenerate the 'clean_teks' column.

# %%time
# data['clean_teks'] = data['teks'].apply(text_preprocessing_process)

In [None]:
# data.sample(10)

In [None]:
# Persist the preprocessed frame only when the 'clean_teks' column actually exists.
# While the preprocessing cell above is commented out, `data` is still the raw
# dataset, and writing it would produce a 'clean_data.csv' without any cleaned text.
if 'clean_teks' in data.columns:
  data.to_csv('clean_data.csv')

In [None]:
# Replace `data` with the previously saved, already-preprocessed dataset.
# NOTE(review): this reads from Google Drive, not from the 'clean_data.csv' written
# to the working directory by the cell above - the file must be copied there manually.
data = pd.read_csv('/content/drive/MyDrive/Learn Data/Project Case Study/Text mining Fraud Message Detection/clean_data.csv')
# sample() is unseeded, so the displayed rows differ between runs
data.sample(5)

Unnamed: 0.1,Unnamed: 0,teks,label,clean_teks
1135,1135,Yg mau ngampus aku pengen titip bawain SKL aku...,0,pergi kampus titip bawain skl prodi
467,467,Saya ARUM K PRASODJO operator TRI JKT PST beke...,1,arum k prasodjo operator tri jakarta ptpos per...
131,131,Mau Pergi Dari Hatiku nya Aliando di HP kamu? ...,2,pergi hati nya aliando hp ambil nsp gratis nya...
956,956,"oh, kelurahan dll dari data_pasien ternyata. b...",0,oh lurah lain data pasien butuh oge
164,164,Paket Flash anda 10 MB utk 1 hari akan berakhi...,2,paket flash mbak pd tarif non paket laku tangg...


# Feature Engineering

In [None]:
# Separate the feature column (cleaned text) from the target column (label)
x = data['clean_teks']
y = data['label']

In [None]:
# Preview the first cleaned messages
x.head()

0    promo beli paket flash my telkomsel app extra ...
1    rupiah ribu spesial pilih aktif promo sd novem...
2    langgan hormat sisa kuota flash kb download my...
3    langgan hormat sisa kuota flash kb download my...
4             rupiah ribu spesial pilih aktif buru skb
Name: clean_teks, dtype: object

In [None]:
# Preview the first labels
y.head()

0    2
1    2
2    2
3    2
4    2
Name: label, dtype: int64

## Feature Extraction (TF-IDF & N-Gram)

In [None]:
# Fit a unigram TF-IDF vectorizer on the cleaned messages.
# astype('U') casts every entry to a unicode string before vectorizing
# (presumably so missing values in clean_teks do not break the vectorizer - confirm).
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit_transform(x.values.astype('U'))

# Save the learned vocabulary (term -> column index), not the model itself.
# The context manager guarantees the file handle is flushed and closed.
with open('feature_tf-idf.sav', 'wb') as vocab_file:
  pickle.dump(vec_TF_IDF.vocabulary_, vocab_file)

In [None]:
# menampilkan vocabulary dari tf_idf
# vec_TF_IDF.vocabulary_

In [None]:
# Show the number of TF-IDF features (vocabulary size)
print(len(vec_TF_IDF.get_feature_names_out()))

3416


In [None]:
# Show which terms make up the feature set
print(vec_TF_IDF.get_feature_names_out())

['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [None]:
# Build a tabular view of the TF-IDF matrix: one row per message, one column per term.
# toarray() densifies the sparse result so it can be wrapped in a DataFrame.
x1 = vec_TF_IDF.transform(x.values.astype('U')).toarray()
data_tabular_tf_idf = pd.DataFrame(x1, columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Inspect a slice (rows 10-19, columns 60-69) that contains some non-zero weights
data_tabular_tf_idf.iloc[10:20,60:70]

Unnamed: 0,akang,akangteteh,akbar,akreditasi,akses,aksi,aktif,aktifasi,aktivasi,aktivitas
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.14944,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.262305,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.244053,0.0,0.382416,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection (Chi-Square)

In [None]:
# Convert the TF-IDF table and the labels to NumPy arrays.
# NOTE(review): despite their names, x_train / y_train hold the FULL dataset here;
# both names are re-bound later by train_test_split in the Modelling section.
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [None]:
# Chi-square feature selection: keep the 3000 highest-scoring terms.
# NOTE(review): the selector is fitted on every row, i.e. before the train/test
# split performed in the Modelling section, so test rows influence which features
# are kept - confirm this is acceptable for the reported metrics.
chi2_feature = SelectKBest(chi2, k=3000)
x_kbest_feature = chi2_feature.fit_transform(x_train, y_train)

# Report the feature count before and after selection
print('Original Feature Number ', x_train.shape[1])
print('Reduce Feature Number ',x_kbest_feature.shape[1])

Original Feature Number  3416
Reduce Feature Number  3000


In [None]:
# Put the chi-square score of every feature into a DataFrame (column 'Nilai' = score)
data_score = pd.DataFrame(chi2_feature.scores_,columns=['Nilai'])
data_score

Unnamed: 0,Nilai
0,0.843018
1,0.419698
2,1.558607
3,0.686416
4,0.759870
...,...
3411,1.126664
3412,0.503012
3413,0.686416
3414,2.918687


In [None]:
# Attach the term names to their scores (column 'Fitur' = feature name).
# The names come from the vectorizer in the same column order as the scored matrix.
feature = vec_TF_IDF.get_feature_names_out()

data_score['Fitur'] = feature
data_score

Unnamed: 0,Nilai,Fitur
0,0.843018,aa
1,0.419698,aamiiiin
2,1.558607,aamiin
3,0.686416,ab
4,0.759870,abadi
...,...,...
3411,1.126664,zalora
3412,0.503012,zarkasi
3413,0.686416,zjt
3414,2.918687,zona


In [None]:
# Sort features from highest to lowest chi-square score (display only; data_score is not modified)
data_score.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
2107,48.939838,paket
1567,45.025343,kuota
1031,43.690947,hadiah
2197,36.979908,pin
323,33.962373,beli
...,...,...
1536,0.044714,kopi
307,0.044468,bca
1712,0.031575,maksimal
3140,0.012716,via


In [None]:
# Boolean mask over the original features: True where the selector kept the feature
mask = chi2_feature.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [None]:
# Collect the names of the features kept by the chi-square selector (mask entry is True).
# A comprehension avoids a loop variable named `bool`, which would overwrite the
# builtin for the rest of the notebook session, and guarantees that
# selected_feature is defined even when nothing is selected.
new_feature = [name for keep, name in zip(mask, feature) if keep]
selected_feature = new_feature

# selected_feature

In [None]:
# Build a new vocabulary restricted to the selected features.
# Values are the terms' column indices in the ORIGINAL (unfiltered) vocabulary.
# Membership is checked against a set so each lookup is constant-time rather
# than a scan of the selected-feature list.
selected_lookup = set(selected_feature)
new_selected_feature = {
  term: index
  for term, index in vec_TF_IDF.vocabulary_.items()
  if term in selected_lookup
}

# new_selected_feature

In [None]:
# Check how many features were selected
len(new_selected_feature)

3000

In [None]:
# Save the selected-feature vocabulary; the context manager closes the file handle
with open('new_selected_feature_tf-idf.sav', 'wb') as feature_file:
  pickle.dump(new_selected_feature, feature_file)

In [None]:
# Show the selected features as a DataFrame (columns = selected term names)
data_selected_feature = pd.DataFrame(x_kbest_feature, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,account,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modelling

In [None]:
# Define the train and test sets.
# x is re-bound to the chi-square-selected feature matrix, y to the label column.
x = x_kbest_feature
y = data.label

# 80/20 split; random_state=0 makes the split reproducible. No stratification is requested.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
# Show the number of rows in each train/test part
print('Banyaknya x_train : ', len(x_train))
print('Banyakanya x_test : ', len(x_test))
print('Banyakanya y_train : ', len(y_train))
print('Banyakanya y_test : ', len(y_test))

Banyaknya x_train :  914
Banyakanya x_test :  229
Banyakanya y_train :  914
Banyakanya y_test :  229


In [None]:
# Train a multinomial Naive Bayes classifier (default hyperparameters) on the training split
model = MultinomialNB()
model.fit(x_train, y_train)

In [None]:
# Predict the class of a single new message
data_input = 'bpr syariah yogya layan biaya usaha andasyaratcopy ktpkkbuku nikah jamin bpkbsertipikatproses survey cepat hubung'
data_input = text_preprocessing_process(data_input)

# Vectorize with the already-fitted TF-IDF vectorizer and chi-square selector so
# the message receives the same IDF weights and column layout the model was
# trained on. Fitting a fresh TfidfVectorizer on this one message would
# recompute the IDF from a single document and discard the training weights.
input_features = chi2_feature.transform(vec_TF_IDF.transform([data_input]))

hasil = model.predict(input_features)

# Label encoding used below: 0 = normal, 1 = fraud, anything else = promo
label_names = {0: 'SMS Normal', 1: 'SMS Fraud'}
s = label_names.get(int(hasil[0]), 'SMS Promo')

print('Hasil Prediksi :',s)

Hasil Prediksi : SMS Fraud


# Model Evaluation

In [None]:
# Evaluate on the held-out test split
predicted = model.predict(x_test)
# NOTE(review): the confusion matrix is computed but not displayed in this cell
cm = confusion_matrix(y_test, predicted)

# Per-class precision / recall / F1 and overall accuracy
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       126
           1       0.92      0.89      0.91        66
           2       0.84      0.84      0.84        37

    accuracy                           0.92       229
   macro avg       0.90      0.90      0.90       229
weighted avg       0.92      0.92      0.92       229



# Save Model

In [None]:
# Save the trained classifier; the context manager closes the file handle
with open('model_fraud.sav', 'wb') as model_file:
  pickle.dump(model, model_file)