In [3]:
from tkinter import *
from tkinter.filedialog import askopenfilename
from tkinter.messagebox import showerror
from tkinter.scrolledtext import ScrolledText
from tkinter.ttk import Combobox
import pandas as pd

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from collections import Counter
from sklearn.metrics import f1_score,accuracy_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold

from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras import backend as K
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [4]:
def recall_m(y_true, y_pred):
    """Recall metric for Keras: hits / (actual positives + epsilon).

    Both tensors are clipped to [0, 1] and rounded, so the counts are taken
    over hard 0/1 values. ``K.epsilon()`` keeps the division defined when
    ``y_true`` contains no positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric for Keras: hits / (predicted positives + epsilon).

    Both tensors are clipped to [0, 1] and rounded, so the counts are taken
    over hard 0/1 values. ``K.epsilon()`` keeps the division defined when
    nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric for Keras: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator avoids a division by zero when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

def RNN(max_words=3500, max_len=50):
    """Build and compile the LSTM binary classifier.

    Architecture: Embedding(max_words, 50) -> LSTM(64) -> Dense(256, relu)
    -> Dropout(0.25) -> Dense(1, sigmoid), compiled with binary cross-entropy,
    RMSprop, and the metrics accuracy, ``recall_m``, ``precision_m``, ``f1_m``
    (in that order, which is the order ``evaluate`` reports them after the loss).

    Parameters
    ----------
    max_words : int, default 3500
        Size of the embedding vocabulary. Must match the ``num_words`` used by
        the tokenizer that produces the input sequences.
    max_len : int, default 50
        Length of each (padded) input sequence.

    Returns
    -------
    A compiled Keras ``Model``. Calling ``RNN()`` with no arguments yields the
    same network as before these parameters existed.
    """
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, 50, input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.25)(layer)
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(),
        metrics=['accuracy', recall_m, precision_m, f1_m],
    )
    return model

In [5]:
class Trainer:
    """Train a spam classifier on a labelled text file and label new messages.

    Supported ``model_name`` values are "SVM", "KNN", "Naive Bayes", "LSTM" and
    "pretrainedLSTM". The two LSTM variants consume padded token-index
    sequences produced by a Keras ``Tokenizer``; the other models consume
    TF-IDF vectors. Labels are integers: 0 for ham, 1 for spam.
    """

    # Models that use the tokenizer / padded-sequence feature pipeline.
    _SEQUENCE_MODELS = ("LSTM", "pretrainedLSTM")
    _SUPPORTED_MODELS = ("SVM", "KNN", "Naive Bayes") + _SEQUENCE_MODELS
    _MODES = ("train", "predict", "predict_list")

    # Tokenizer vocabulary size and padded sequence length; these are the
    # same values RNN() uses by default for its embedding and input layers.
    _MAX_WORDS = 3500
    _MAX_LEN = 50

    # Naive Bayes flags a message as spam when P(spam) exceeds this value.
    _NB_SPAM_THRESHOLD = 0.25

    def __init__(self, model_name):
        """
        model_name: "SVM", "Naive Bayes", "KNN", "LSTM", or "pretrainedLSTM"
        """
        self.model_name = model_name
        self.model = None            # set by train()
        self.TfidfVectorizer = None  # fitted in preprocess(mode="train") for non-LSTM models
        self.Tokenizer = None        # fitted in preprocess(mode="train") for LSTM models

    @staticmethod
    def _read_labelled_file(input_path):
        """Read "<label> <text>" lines into (list of texts, int label array).

        The first whitespace-delimited word of each line is the label: 'ham'
        maps to 0 and anything else (e.g. 'spam' or '?') maps to 1. Blank
        lines are skipped.
        """
        texts, labels = [], []
        with open(input_path, encoding='utf-8') as f:
            for line in f:
                parts = line.split(None, 1)
                if not parts:
                    continue  # blank line: nothing to label
                labels.append(0 if parts[0] == 'ham' else 1)
                # Only the leading label is removed; later occurrences of the
                # same word inside the message are part of the text.
                # NOTE(review): the pretrained weights file may have been
                # produced when every occurrence was stripped - confirm the
                # tokenizer vocabulary still lines up for "pretrainedLSTM".
                texts.append(parts[1].strip() if len(parts) > 1 else '')
        return texts, np.array(labels, dtype=int)

    def preprocess(self, input_path, mode, random_state=4, test_size=0.15):
        """Load samples and turn them into model features.

        input_path: path to a labelled .txt file ("train" / "predict"), or a
            list of message strings ("predict_list").
        mode: "train" fits the vectorizer/tokenizer and splits the data;
            "predict" and "predict_list" reuse the fitted one and do not split.
        random_state, test_size: forwarded to ``train_test_split`` in "train" mode.

        Returns (X_train, X_test, Y_train, Y_test) in "train" mode, otherwise X.
        Raises ValueError for an unknown mode and RuntimeError when asked to
        predict before the vectorizer/tokenizer has been fitted.
        """
        if mode not in self._MODES:
            raise ValueError(f"Unknown mode {mode!r}; expected one of {self._MODES}")

        if mode == "predict_list":
            texts, labels = list(input_path), None
        else:
            texts, labels = self._read_labelled_file(input_path)

        if self.model_name in self._SEQUENCE_MODELS:
            if mode == "train":
                self.Tokenizer = Tokenizer(num_words=self._MAX_WORDS)
                self.Tokenizer.fit_on_texts(texts)
            elif self.Tokenizer is None:
                raise RuntimeError("preprocess(mode='train') must run before predicting")
            X = self.Tokenizer.texts_to_sequences(texts)
            X = sequence.pad_sequences(X, maxlen=self._MAX_LEN)
        else:
            if mode == "train":
                self.TfidfVectorizer = TfidfVectorizer()
                X = self.TfidfVectorizer.fit_transform(texts)
            elif self.TfidfVectorizer is None:
                raise RuntimeError("preprocess(mode='train') must run before predicting")
            else:
                X = self.TfidfVectorizer.transform(texts)

        if mode != "train":
            return X

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, labels, test_size=test_size, random_state=random_state)
        return X_train, X_test, Y_train, Y_test

    def _build_model(self):
        """Instantiate the (untrained) estimator for ``self.model_name``."""
        if self.model_name == "SVM":
            return SVC(C=1e4)
        if self.model_name == "KNN":
            return KNeighborsClassifier(n_neighbors=1, metric='cosine')
        if self.model_name == "Naive Bayes":
            return MultinomialNB()
        # Remaining supported names are the two LSTM variants.
        model = RNN()
        if self.model_name == "pretrainedLSTM":
            model.load_weights('weights-9529-test-9615-best.h5')
        return model

    def train(self, input_path, test_size, random_state=4):
        """
        input_path: path to .txt file for training spam samples
        test_size: fraction of the samples held out for evaluation
        return: recall, precision, f_score (measured on the held-out split)

        Raises ValueError if ``model_name`` is not a supported model.
        """
        if self.model_name not in self._SUPPORTED_MODELS:
            raise ValueError(
                f"Unknown model {self.model_name!r}; expected one of {self._SUPPORTED_MODELS}")

        X_train, X_test, Y_train, Y_test = self.preprocess(
            input_path, mode="train", random_state=random_state, test_size=test_size)

        self.model = self._build_model()

        # Training ("pretrainedLSTM" only loads weights and is not fitted).
        if self.model_name == "LSTM":
            # mode='max' is required: Keras' default mode='auto' only infers
            # "higher is better" for accuracy-like metric names, so without it
            # the checkpoint would keep the epoch with the lowest F1.
            mdcp = ModelCheckpoint(filepath='best_weights_train.h5', monitor='val_f1_m',
                                   mode='max', save_best_only=True)
            self.model.fit(X_train, Y_train, batch_size=64, epochs=7,
                           validation_split=0.2, callbacks=[mdcp])
            self.model.load_weights('best_weights_train.h5')  # retrieve best weights
        elif self.model_name != "pretrainedLSTM":
            self.model.fit(X_train, Y_train)

        # Evaluate on the held-out split.
        if self.model_name in self._SEQUENCE_MODELS:
            # evaluate() yields loss followed by the compiled metrics:
            # accuracy, recall_m, precision_m, f1_m.
            _, _, recall, precision, f_score = self.model.evaluate(X_test, Y_test, verbose=1)
            return recall, precision, f_score

        if self.model_name == "Naive Bayes":
            Y_test_pred = self.model.predict_proba(X_test)[:, 1] > self._NB_SPAM_THRESHOLD
        else:
            Y_test_pred = self.model.predict(X_test)

        # scikit-learn metrics take (y_true, y_pred) in that order; swapping
        # them exchanges precision and recall.
        y_true = Y_test.astype('uint8')
        y_pred = Y_test_pred.astype('uint8')
        f_score = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)

        return recall, precision, f_score

    def predict(self, input_path):
        """
        input_path: path to .txt file for spam samples you want to predict
            (1 line 1 sample, "?" for labels), or a list/tuple of message strings.
        return: array of 0 (ham) / 1 (spam) predictions, one per sample
            (the LSTM models return one single-element row per sample).

        Raises RuntimeError if train() has not been called.
        """
        if self.model is None:
            raise RuntimeError("train() must be called before predict()")

        if isinstance(input_path, (list, tuple)):
            X = self.preprocess(input_path, mode='predict_list')
        else:
            X = self.preprocess(input_path, mode="predict")

        if self.model_name == "Naive Bayes":
            Y_pred = (self.model.predict_proba(X)[:, 1] > self._NB_SPAM_THRESHOLD) * 1
        elif self.model_name in self._SEQUENCE_MODELS:
            Y_pred = (self.model.predict(X) > 0.5) * 1
        else:
            Y_pred = self.model.predict(X)

        return Y_pred

In [36]:
class GUI(Frame):
    """Tkinter front end: pick a data file and model, train, and label queries.

    The window holds a data-file picker, a model selector, a splitting-ratio
    field, a query box (typed and/or loaded from a file) and a read-only
    output box where metrics and predictions are appended.
    """

    # Placeholder texts shown until the user supplies real input.
    NO_DATA_TEXT = 'No data selected'
    RATIO_HINT = 'Splitting ratio (%)'
    QUERY_HINT = 'Insert query here and/or through file.'

    def __init__(self):
        Frame.__init__(self)
        self.master.title('Spam detection v1.0')
        self.master.geometry('440x510')
        self.master.rowconfigure(3, weight=0)
        self.master.columnconfigure(1, weight=0)
        self.grid(sticky=W+E+N+S)
        self.configure(background = 'lightgray')
        self.master.configure(background = 'lightgray')

        # create and configure button [Select data]
        self.browse = Button(self, text = 'Select data', command = self.load_data, width = 10)
        self.browse.grid(row = 0, column = 0, pady = 20, padx = 30, ipadx = 20, ipady = 6, columnspan = 2, sticky = W)

        # create and configure selected file path output
        self.path = Entry(self, disabledforeground = 'black')
        self.path.grid(row = 0, column = 1, padx = 15, ipadx = 56, ipady = 8, columnspan = 3, sticky = W)
        self.path.insert(0, self.NO_DATA_TEXT)  # init default text
        self.path.configure(state = 'disabled') # do not allow manual edits

        # create and configure model combobox
        self.method = Combobox(self, values = ["SVM", "Naive Bayes", "KNN", "LSTM", "pretrainedLSTM"])
        self.method.current(0)   # default to SVM
        self.method.grid(row = 1, column = 0, padx = 30, ipadx = 43, ipady = 8, columnspan = 2, sticky = W)

        # create and configure splitting ratio input
        self.splitting_rate = Entry(self, width = 19, fg = 'gray')
        self.splitting_rate.grid(row = 1, column = 2, ipady = 8, columnspan = 2, sticky = W)
        self.splitting_rate.insert(0, self.RATIO_HINT)                     # init default text
        self.splitting_rate.bind('<FocusIn>', self.onfocus_splitting)      # on focus
        self.splitting_rate.bind('<FocusOut>', self.outfocus_splitting)    # on losing focus

        # create and configure query input
        self.entry = ScrolledText(self, width = 15, height = 2, fg = 'gray')
        self.entry.grid(row = 2, column = 0, padx = 30, pady = 20, ipady = 40, ipadx = 125, columnspan = 3, sticky = W)
        self.entry.insert(INSERT, self.QUERY_HINT)                         # init default text
        self.entry.bind('<FocusIn>', self.onfocus_entry)                   # on focus
        self.entry.bind('<FocusOut>', self.outfocus_entry)                 # on losing focus

        # True once the query box holds real input rather than the placeholder
        self.entry_changed = False

        # create and configure button [Select query]
        self.query_butt = Button(self, text = 'Select query', command = self.load_sample, width = 10)
        self.query_butt.grid(row = 3, column = 0, padx = 30, pady = 10, ipady = 6, sticky = W)

        # create and configure button [Start]
        self.start_butt = Button(self, text = 'Start', command = self.run, width = 10)
        self.start_butt.grid(row = 3, column = 1, padx = 20, ipady = 6)

        # create and configure button [Clear]
        self.clear_butt = Button(self, text = 'Clear', command = self.clear, width = 10)
        self.clear_butt.grid(row = 3, column = 2, padx = 37, pady = 5, ipady = 6, sticky = W)

        # create and configure output box (read-only except while we write to it)
        self.output = ScrolledText(self, state = 'disabled', width = 15, height = 2)
        self.output.grid(row = 4, column = 0, padx = 30, pady = 10, ipady = 40, ipadx = 125, columnspan = 3, sticky = W)

    def onfocus_splitting(self, event = None):
        """Clear the ratio placeholder when the field gains focus."""
        if self.splitting_rate.get() == self.RATIO_HINT:
            self.splitting_rate.configure(fg = 'black')
            self.splitting_rate.delete(0, END)

    def outfocus_splitting(self, event = None):
        """Restore the ratio placeholder if the field is left empty."""
        if not self.splitting_rate.get():
            self.splitting_rate.configure(fg = 'gray')
            self.splitting_rate.insert(0, self.RATIO_HINT)

    def onfocus_entry(self, event = None):
        """Clear the query placeholder when the query box gains focus."""
        # Text.get(..., END) always ends with a newline added by the widget.
        if self.entry.get('1.0', END)[:-1] == self.QUERY_HINT:
            self.entry.configure(fg = 'black')
            self.entry.delete('1.0', END)
            self.entry_changed = True

    def outfocus_entry(self, event = None):
        """Restore the query placeholder if the query box is left empty."""
        if not len(self.entry.get('1.0', END)[:-1]):
            self.entry.configure(fg = 'gray')
            self.entry.insert(INSERT, self.QUERY_HINT)
            self.entry_changed = False

    def _collect_inputs(self, path):
        """Validate the form; return (splitting_ratio, query_lines, problems).

        ``problems`` is a list of user-facing messages, empty when everything
        is valid. ``splitting_ratio`` is None when it could not be parsed.
        """
        problems = []

        splitting_ratio = None
        try:
            splitting_ratio = float(self.splitting_rate.get())
        except ValueError:
            problems.append('Splitting ratio must be a number.')
        else:
            # Written as negated comparisons so NaN is rejected as well.
            if not splitting_ratio > 0:
                problems.append('Splitting ratio must be larger than zero.')
            elif not splitting_ratio < 100:
                # The ratio becomes test_size = ratio / 100, which must stay below 1.
                problems.append('Splitting ratio must be smaller than 100.')

        if path == self.NO_DATA_TEXT:
            problems.append('No data selected.')
        else:
            try:
                with open(path, 'r', encoding = 'utf-8'):
                    pass
            except OSError:
                problems.append('File not found.')

        raw_query = self.entry.get('1.0', END)
        if raw_query[:-1] == self.QUERY_HINT:
            raw_query = ''  # the placeholder is not a query
        query = [line for line in raw_query.split('\n') if line.strip()]
        if not query:
            problems.append('Query is empty.')

        return splitting_ratio, query, problems

    def run(self):
        """Validate the form, train the selected model, and print predictions."""
        model = self.method.get()
        path = self.path.get()

        splitting_ratio, query, problems = self._collect_inputs(path)
        if problems:
            for message in problems:
                showerror('Error', message)
            return

        # The output box is read-only; unlock it only while writing, and
        # re-lock it even if training or prediction raises.
        self.output.configure(state = 'normal')
        try:
            self.output.insert(INSERT, 'Data from : ' + path + '\n')
            self.output.insert(INSERT, 'Model: ' + model + '\n')
            self.output.insert(INSERT, 'Splitting ratio : ' + str(splitting_ratio) + '%\n\n')

            # train model
            myTrainer = Trainer(model)
            recall, precision, f_score = myTrainer.train(path, test_size = splitting_ratio / 100)
            self.output.insert(INSERT, f'recall: {recall}\nprecision: {precision}\nf score: {f_score}\n\n')

            # predict query; flatten because the LSTM models return one row per sample
            result = np.asarray(myTrainer.predict(query)).ravel()
            for text, label in zip(query, result):
                self.output.insert(INSERT, text + '\nPrediction: ' + ('Spam\n\n' if int(label) else 'Ham\n\n'))
        finally:
            self.output.configure(state = 'disabled')

    def load_data(self):
        """Ask for the training data file and show its path in the path field."""
        fname = askopenfilename()
        if not fname:  # dialog cancelled
            return
        self.path.configure(state = 'normal')       # temporarily editable
        try:
            self.path.delete(0, END)
            self.path.insert(0, fname)
        finally:
            self.path.configure(state = 'disabled') # back to read-only

    def load_sample(self):
        """Ask for a query file and append its content to the query box."""
        fname = askopenfilename()
        if not fname:  # dialog cancelled
            return
        try:
            with open(fname, 'r', encoding = 'utf-8') as f:
                query = f.read()
        except (OSError, UnicodeDecodeError) as exc:
            showerror('Error', f'Could not read file: {exc}')
            return

        existing = self.entry.get('1.0', END)  # ends with the widget's own newline
        if existing[:-1] != self.QUERY_HINT and existing.strip():
            query = existing + query           # keep what the user already entered
        self.entry.configure(fg = 'black')
        self.entry.delete('1.0', END)          # drop placeholder or old content
        self.entry.insert(INSERT, query)
        self.entry_changed = True              # the box now holds real input

    def clear(self):
        """Erase everything in the output box."""
        self.output.configure(state = 'normal')
        self.output.delete('1.0', END)
        self.output.configure(state = 'disabled')

In [38]:
if __name__ == '__main__':
    # Build the window and hand control to Tk's event loop; mainloop() blocks
    # until the window is closed.
    app = GUI()
    app.mainloop()

