# SMS Spam Detector

Natural Language Processing (COMP6576001) - LA01 - Kelompok 7

- Nadya Tyandra (2440032820)
- Randy Antonio (2440034170)
- Farrel Rasyad (2440048560)

## 1. Importing Libraries

In [1]:
import pandas as pd
import re
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import tkinter as tk
import tkinter.font as TkFont

## 2. Data Loading

In [2]:
# Load the labelled SMS corpus; the rest of the notebook uses its
# 'Teks' (message text) and 'label' columns.
data = pd.read_csv('dataset/dataset_sms_spam_v1.csv')
# Bare expression: shows the frame when this runs as a notebook cell.
data

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2
...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0
1140,Mba mau kirim 300 ya,0
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0


## 3. Data Preprocessing

### 3.1 Data Preprocessing - Case Folding

In [3]:
# Merge label 2 into label 1 so that only the values 0 and 1 remain (the GUI
# later shows 1 as "Spam" and 0 as "Ham"). The result is assigned back to the
# column: calling replace(..., inplace=True) on data["label"] acts on an
# intermediate object and is not guaranteed to update the frame.
data["label"] = data["label"].replace({2: 1})
# Drop incomplete rows and renumber the index 0..n-1; the later preprocessing
# steps walk the rows with range(len(data)), so the index must have no gaps.
data = data.dropna().reset_index(drop=True)
# Case folding: lower-case every message.
data['Teks'] = data['Teks'].str.lower()
# Bare expression: shows the frame when this runs as a notebook cell.
data

Unnamed: 0,Teks,label
0,[promo] beli paket flash mulai 1gb di my telko...,1
1,2.5 gb/30 hari hanya rp 35 ribu spesial buat a...,1
2,"2016-07-08 11:47:11.plg yth, sisa kuota flash ...",1
3,"2016-08-07 11:29:47.plg yth, sisa kuota flash ...",1
4,4.5gb/30 hari hanya rp 55 ribu spesial buat an...,1
...,...,...
1138,"yooo sama2, oke nanti aku umumin di grup kelas",0
1139,😁 sebelumnya ga ad nulis kerudung. kirain warn...,0
1140,mba mau kirim 300 ya,0
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0


### 3.2 Data Preprocessing - Data Cleaning

In [4]:
def remove_http(s):
    """Replace every URL-like token in *s* with a single space.

    A token counts as a URL when it starts with ``http:``, ``https:`` or
    ``www.`` and it extends up to the next whitespace character. The dot
    after ``www`` is matched literally, so ordinary words that merely start
    with "www" are left alone.

    Args:
        s: Text to clean.

    Returns:
        The text with each URL replaced by ``' '``.
    """
    return re.sub(r'https?:\S+|www\.\S+', ' ', s)

def remove_hashtag(s):
    """Swap every ``#tag`` in *s* for the placeholder word ``hashtag``.

    The placeholder is padded with one space on each side, so the tag text
    itself is dropped but the fact that a hashtag was present is kept.
    """
    hashtag_pattern = r'#[\w\d]+'
    placeholder = ' hashtag '
    return re.sub(hashtag_pattern, placeholder, s)

def remove_punc(s):
    """Replace each character that is neither a word character nor
    whitespace with a space.

    Underscores count as word characters for the regex and are therefore kept.
    """
    non_word = re.compile(r'[^\w\s]')
    return non_word.sub(' ', s)

def remove_number(s):
    """Replace every run of consecutive digits in *s* with a single space."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub(' ', s)

def remove_spaces(s):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone newline or tab) is left
    unchanged.
    """
    whitespace_run = re.compile(r'\s{2,}')
    return whitespace_run.sub(' ', s)

def remove_newline(s):
    """Replace each escaped newline in *s* with a space.

    What is matched is the two-character sequence backslash followed by
    ``n``, as found in text where newlines were written out as an escape.
    Real newline characters are not touched.
    """
    escaped_newline = re.compile(r'\\n{1}')
    return escaped_newline.sub(' ', s)

def remove_leadingspace(s):
    """Return *s* without whitespace at either end (not only the leading side)."""
    trimmed = s.strip()
    return trimmed

def clean_data(data):
    """Clean the ``Teks`` column of *data* in place and return the frame.

    Each message goes through these steps in order: URL removal, escaped
    newline removal, hashtag replacement, punctuation removal, digit removal,
    whitespace collapsing and trimming.

    Args:
        data: DataFrame with a string column named ``Teks``.

    Returns:
        The same DataFrame object, with ``Teks`` overwritten by the cleaned text.
    """
    steps = (
        remove_http,
        # Escaped newlines must go before punctuation removal: once the
        # backslash has been stripped as punctuation the sequence can no
        # longer be recognised and a stray "n" stays glued to the next word.
        # Handling it first also lets it terminate a preceding URL.
        remove_newline,
        remove_hashtag,
        remove_punc,
        remove_number,
        # The steps above insert spaces, so collapse and trim last.
        remove_spaces,
        remove_leadingspace,
    )

    def _clean(text):
        for step in steps:
            text = step(text)
        return text

    # One pass over the column instead of one pass per step.
    data['Teks'] = data['Teks'].apply(_clean)
    return data

### 3.3 Data Preprocessing - Text Normalization

In [5]:
# Two informal -> formal word lists. Both frames end up with the columns
# 'nonformal' and 'formal', which is the layout normalize() reads.
kamus1 = pd.read_csv(
    'dataset/new_kamusalay.csv',
    encoding="ISO-8859-1",
    names=['nonformal', 'formal'],
)
kamus2 = pd.read_csv(
    'dataset/colloquial-indonesian-lexicon.csv',
    encoding="ISO-8859-1",
    usecols=['slang', 'formal'],
).rename(columns={'slang': 'nonformal'})

def normalize(data, kamusNormalisasi):
    """Replace informal words in ``data['Teks']`` with their formal form, in place.

    Whole words that appear in the ``nonformal`` column of the dictionary are
    replaced by the matching ``formal`` entry. Rows are processed through the
    column itself, so the frame may carry any index (for example one with
    gaps after rows were dropped).

    Args:
        data: DataFrame with a string column named ``Teks``; modified in place.
        kamusNormalisasi: DataFrame with the columns ``nonformal`` and
            ``formal``. Entries whose key or value is not a string (such as
            missing cells) and empty keys are ignored.

    Returns:
        None.
    """
    raw_mapping = kamusNormalisasi.set_index('nonformal')['formal'].to_dict()
    mapping = {
        informal: formal
        for informal, formal in raw_mapping.items()
        if isinstance(informal, str) and informal and isinstance(formal, str)
    }
    if not mapping:
        # An empty alternation would match the empty string at every word
        # boundary; with nothing to replace the text stays as it is.
        return

    # The lookup table and the pattern do not depend on the row, so build
    # them once. Keys are escaped because they are plain words, not regexes.
    pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(word) for word in mapping) + r')\b'
    )

    def _formalize(text):
        return pattern.sub(lambda match: mapping[match.group()], text)

    data['Teks'] = data['Teks'].apply(_formalize)
        
def normalization(data):
    """Normalize ``data['Teks']`` in place with both word lists.

    ``kamus1`` is applied first and ``kamus2`` second, so the second list
    sees the output of the first.
    """
    for kamus in (kamus1, kamus2):
        normalize(data, kamus)

### 3.4 Data Preprocessing - Stemming

In [6]:
# Shared Sastrawi stemmer, created once here and reused by stemming().
stemmer = StemmerFactory().create_stemmer()

def stemming(data):
    """Stem every message in ``data['Teks']`` in place.

    Uses the module-level ``stemmer``. The column is rewritten as a whole, so
    the frame may carry any index (for example one with gaps after rows were
    dropped).

    Args:
        data: DataFrame with a string column named ``Teks``; modified in place.

    Returns:
        None.
    """
    data['Teks'] = data['Teks'].apply(stemmer.stem)

### 3.5 Data Preprocessing - Removing Stop Words

In [7]:
def remove_sw(dt, factory, stopword):
    """Remove stop words from every message in ``dt['Teks']``, in place.

    The column is rewritten as a whole, so the frame may carry any index (for
    example one with gaps after rows were dropped).

    Args:
        dt: DataFrame with a string column named ``Teks``; modified in place.
        factory: Unused; kept so existing callers keep working.
        stopword: Object with a ``remove(text)`` method that returns the text
            without stop words.

    Returns:
        None.
    """
    dt['Teks'] = dt['Teks'].apply(stopword.remove)
        
def stopword(data):
    """Strip stop words from ``data['Teks']`` in place and return the frame.

    A fresh Sastrawi stop-word remover is created on every call and handed
    to ``remove_sw`` together with the factory that produced it.
    """
    sw_factory = StopWordRemoverFactory()
    remover = sw_factory.create_stop_word_remover()
    remove_sw(data, sw_factory, remover)
    return data

### 3.6 Data Preprocessing - Removing Data with Three or Fewer Words

In [8]:
def remove_three_words(data):
    """Return the rows of *data* whose ``Teks`` has more than three words.

    Words are counted by splitting on whitespace. Messages with zero words
    (for example text that became empty during cleaning) are dropped along
    with the one-, two- and three-word ones.

    Args:
        data: DataFrame with a string column named ``Teks``. It is not
            modified.

    Returns:
        A new DataFrame holding the kept rows, with an extra ``len`` column
        that stores each message's word count.
    """
    word_counts = [len(text.split()) for text in data['Teks']]
    counted = data.assign(len=word_counts)
    return counted[counted['len'] > 3]

## 4. Feature Extraction, Classification, and Evaluation

In [9]:
# Module-level state: vectorizing() stores the fitted vectorizer, modelling()
# stores the classifier and its evaluation scores, and the GUI reads them
# when a prediction is made. Everything starts out unset.
vectorizer = lr = None
precision = recall = accuracy = f1 = None

def splitting(data):
    """Split *data* into training and test parts (80% / 20%, fixed seed).

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``, where the x values
        come from the ``Teks`` column and the y values from ``label``.
    """
    texts, labels = data['Teks'], data['label']
    return tuple(train_test_split(texts, labels, test_size=0.2, random_state=0))

def vectorizing(x_train, x_test):
    """Turn the training and test texts into TF-IDF matrices.

    A new ``TfidfVectorizer`` is fitted on *x_train* only and stored in the
    module-level ``vectorizer`` so it can be reused for later predictions.

    Returns:
        The pair ``(x_train_vector, x_test_vector)``.
    """
    global vectorizer
    vectorizer = TfidfVectorizer()
    train_matrix = vectorizer.fit_transform(x_train)
    return train_matrix, vectorizer.transform(x_test)

def modelling(data):
    """Train the classifier on *data* and record its test-set scores.

    The frame is split, vectorized with TF-IDF and used to fit a logistic
    regression. The fitted model goes into the module-level ``lr``; the
    precision, recall, accuracy and F1 score measured on the test split go
    into the module-level variables of the same names. Nothing is returned.
    """
    global lr, precision, recall, accuracy, f1

    x_train, x_test, y_train, y_test = splitting(data)
    train_matrix, test_matrix = vectorizing(x_train, x_test)

    lr = LogisticRegression(random_state = 0)
    lr.fit(train_matrix, y_train)
    predicted = lr.predict(test_matrix)

    precision, recall, accuracy, f1 = (
        metric(y_test, predicted)
        for metric in (precision_score, recall_score, accuracy_score, f1_score)
    )

## 5. GUI Building

In [10]:
# Last text typed into the GUI and the one-row frame built from it; both are
# set by the "Apply Preprocessing" handler inside buildGUI().
input_text = input_data = None

def buildGUI():
    """Build the SMS Spam Detector window and run its event loop.

    The window holds an input box, three read-only boxes that show the text
    after cleaning, normalization and stemming, and two buttons:

    * "Apply Preprocessing" runs ``clean_data``, ``normalization`` and
      ``stemming`` on the typed text, stores the result in the module-level
      ``input_data`` and enables the second button.
    * "Make Prediction" classifies ``input_data`` with the module-level
      ``vectorizer`` and ``lr`` and shows the result together with the
      module-level evaluation scores.

    The call blocks in ``root.mainloop()`` until the window is closed.
    """
    root = tk.Tk()

    root.title("SMS Spam Detector")
    try:
        root.iconbitmap(r'asset/spam.ico')
    except tk.TclError:
        # The icon is purely cosmetic: .ico bitmaps are not accepted on every
        # platform and the file may be missing, so fall back to the default.
        pass
    root.geometry('900x600')
    root.resizable(width = False, height = False)

    header = TkFont.Font(family = "San Francisco", size = 24, weight = "bold")
    subheader = TkFont.Font(family = "San Francisco", size = 10)
    body = TkFont.Font(family = "San Francisco", size = 8)

    canvas1 = tk.Canvas(root, width = 900, height = 600)
    canvas1.pack()

    title = tk.Label(root, text = "SMS Spam Detector", font = header)
    canvas1.create_window(450, 40, window = title)

    # Input text
    tk.Label(root, text = "Input text", font = subheader).place(x = 40, y = 120)
    input_entry = tk.Text(root, height = 4, width = 40)
    canvas1.create_window(360, 130, window = input_entry)

    def add_stage_box(caption, label_y, box_y):
        """Create a captioned read-only text box and return the box."""
        tk.Label(root, text = caption, font = subheader).place(x = 40, y = label_y)
        box = tk.Text(root, height = 4, width = 40, state = tk.DISABLED)
        canvas1.create_window(360, box_y, window = box)
        return box

    cleaning_entry = add_stage_box("After text cleaning", 270, 280)
    nor_entry = add_stage_box("After normalization", 360, 370)
    stemming_entry = add_stage_box("After stemming", 450, 460)

    # Credit
    credit = tk.Label(root, text = "Kelompok 7 - Nadya Tyandra (2440032820), Randy Antonio (2440034170), Farrel Rasyad (2440048560) - Natural Language Processing (COMP6576001) - LA01", font = body)
    canvas1.create_window(450, 590, window = credit)

    def show_text(box, text):
        """Replace the content of a read-only text box."""
        # The box has to be editable while its content is swapped.
        box['state'] = tk.NORMAL
        box.delete("1.0", tk.END)
        box.insert(tk.END, text)
        box['state'] = tk.DISABLED

    def applyPreprocessing():
        global input_text
        global input_data
        input_text = input_entry.get("1.0", tk.END)
        # The training data is lower-cased before it is cleaned; apply the
        # same case folding here so typed text is preprocessed the same way.
        input_data = pd.DataFrame({'Teks': [input_text.lower()]})

        input_data = clean_data(input_data)
        show_text(cleaning_entry, input_data.iat[0, 0])

        normalization(input_data)
        show_text(nor_entry, input_data.iat[0, 0])

        stemming(input_data)
        show_text(stemming_entry, input_data.iat[0, 0])

        button2['state'] = tk.NORMAL

    button1 = tk.Button(root, text = "Apply Preprocessing", font = subheader, command = applyPreprocessing, fg = "white", bg = "black", activebackground = "white", activeforeground = "black")
    canvas1.create_window(290, 200, window = button1)

    # The result widgets are created once and only updated on each prediction,
    # so repeated clicks do not pile new labels on top of the old ones. They
    # stay hidden until the first prediction places them.
    result_label = tk.Label(root, font = header)
    info_labels = [tk.Label(root, font = subheader) for _ in range(7)]

    def makePrediction():
        vect = vectorizer.transform(input_data['Teks'])
        # input_data has a single row, so there is a single prediction.
        pred = lr.predict(vect)[0]
        if pred == 1:
            result_label.config(text = "Spam", fg = "red")
            result_label.place(x = 657, y = 170)
        elif pred == 0:
            result_label.config(text = "Ham", fg = "green")
            result_label.place(x = 665, y = 170)
        else:
            # Unknown class: show nothing rather than a stale result.
            result_label.place_forget()

        # The scores are read at click time so they reflect the current model.
        info_rows = [
            ('Model Information', 650, 215),
            ('Algorithm: TF-IDF Vectorizer and Logistic Regression', 550, 245),
            ('Splitting ratio: 80% : 20%', 550, 275),
            ('Precision: ' + str(precision), 550, 305),
            ('Recall: ' + str(recall), 550, 335),
            ('Accuracy: ' + str(accuracy), 550, 365),
            ('F1-Score: ' + str(f1), 550, 395),
        ]
        for label, (text, x, y) in zip(info_labels, info_rows):
            label.config(text = text)
            label.place(x = x, y = y)

    button2 = tk.Button(root, text = "Make Prediction", font = subheader, command = makePrediction, fg = "white", bg = "black", activebackground = "white", activeforeground = "black", state = tk.DISABLED)
    canvas1.create_window(290, 530, window = button2)

    root.mainloop()

In [11]:
if __name__ == '__main__':
    # Training-time preprocessing. The order matters: each step works on the
    # 'Teks' column left behind by the previous one.
    data = clean_data(data)
    normalization(data)
    stemming(data)
    data = stopword(data)
    # Drop very short messages only after all text transformations are done.
    data = remove_three_words(data)
    # Fit the vectorizer and classifier and store them, with the evaluation
    # scores, in module-level variables that the GUI reads.
    modelling(data)
    # Open the window; this blocks until it is closed.
    buildGUI()