In [2]:
## import packages required for this project
import pandas as pd
import html
import numpy as np
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding, LSTM, Dropout
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

Using TensorFlow backend.


In [2]:
from google.colab import drive
drive.mount("/content/gdrive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
## read SMS Spam dataset from UCI ML repository https://archive.ics.uci.edu/ml/datasets/sms+spam+collection
#sms = pd.read_csv("/content/gdrive/My Drive/spam.csv", encoding = "latin-1")
sms = pd.read_csv("spam.csv", encoding = "latin-1")

## data cleaning
## convert html entities to regular characters. e.g. &amp; &lt; &gt; etc.
for index, msg in sms.v2.iteritems():
    sms.v2[index] = html.unescape(msg)

In [4]:
## Recurrent Neural Network LSTM
spam = sms.loc[sms["v1"] == "spam"]
ham = sms.loc[sms["v1"] == "ham"]
spam = spam.v2.tolist()
ham = ham.v2.tolist()
x = np.asarray(spam + ham)
y = np.concatenate((np.ones(len(spam)), np.zeros(len(ham))))
tokenizer = Tokenizer(num_words = 3800)
tokenizer.fit_on_texts(x)
x = tokenizer.texts_to_sequences(x)
x = sequence.pad_sequences(x, maxlen = 380)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.5, random_state = 4450)

In [5]:
model_text_rnn = Sequential()
model_text_rnn.add(Embedding(input_dim = 3800, output_dim = 32, input_length = 380))
model_text_rnn.add(Dropout(0.2))
model_text_rnn.add(LSTM(64))
model_text_rnn.add(Dense(1, activation = "sigmoid"))
model_text_rnn.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
model_text_rnn.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 146,497
Trainable params: 146,497
Non-trainable params: 0
_________________________________________________________________


In [10]:
model_text_rnn.fit(x_train, y_train, epochs = 10, batch_size = 128, validation_data = (x_test, y_test))

Train on 2786 samples, validate on 2786 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f50b1f06e48>

In [16]:
## Naive Bayes Classifier
sms["v1"] = sms["v1"].map({"spam":1, "ham":0})

In [18]:
count_vector = CountVectorizer()
x_train, x_test, y_train, y_test = train_test_split(sms["v2"], sms["v1"], test_size = 0.8, random_state = 4421)
x_train = count_vector.fit_transform(x_train)
x_test = count_vector.transform(x_test)
naive_bayes = GaussianNB()
model_nb = naive_bayes.fit(x_train.toarray(), y_train)
pred = model_nb.predict(x_test.toarray())
accuracy_score(y_test, pred)

0.9201435621354868

In [19]:
## my own tokenizer
pattern_punctuation = r'''(?x)
[\/#!$%\^&\*;:{}=\_`~()] # basically all punctuations except , . -
'''
sms["puncPerSent"] = sms["v2"].apply(lambda x: len(nltk.regexp_tokenize(x, pattern_punctuation))/len(x))

In [20]:
def longCapital(s):
    list_of_uppercase_runs = re.findall(r"[A-Z0-9]+", s)

    # find out what the longest string is in your list
    try:
        longest_string = max(list_of_uppercase_runs, key = len)
    except ValueError:
        return 0
    # return the length of this string to the user
    return len(longest_string)
sms["longCapital"] = sms["v2"].apply(longCapital)

In [21]:
def numbers(s):
    list_of_numbers = re.findall(r"[0-9]", s)
    if list_of_numbers == None:
        return 0
    else:
        return len(list_of_numbers)
sms["numbers"] = sms["v2"].apply(lambda x: numbers(x)/len(x))

In [22]:
sms["length"] = sms["v2"].apply(len)

In [23]:
def refind(string, text):
    myregex = re.escape(string)
    l = re.findall(myregex, text)
    if l == None:
        return 0
    else:
        return len(l)
sms["v2"] = sms["v2"].apply(lambda x: x.lower())
sms["txt"] = sms["v2"].apply(lambda x: refind("txt", x))
sms["http"] = sms["v2"].apply(lambda x: refind("http", x))
sms["credit"] = sms["v2"].apply(lambda x: refind("credit", x))
sms["congrat"] = sms["v2"].apply(lambda x: refind("congrat", x))
sms["subscri"] = sms["v2"].apply(lambda x: refind("subscri", x))
sms["guarantee"] = sms["v2"].apply(lambda x: refind("guarantee", x))
sms["account"] = sms["v2"].apply(lambda x: refind("account", x))
sms["prize"] = sms["v2"].apply(lambda x: refind("prize", x))
sms["bonus"] = sms["v2"].apply(lambda x: refind("bonus", x))
sms["award"] = sms["v2"].apply(lambda x: refind("award", x))
sms["ansr"] = sms["v2"].apply(lambda x: refind("ansr", x))
sms["pobox"] = sms["v2"].apply(lambda x: refind("pobox", x))
sms["msg"] = sms["v2"].apply(lambda x: refind("msg", x))
## and more

In [24]:
def findtokens(string, text):
    list_of_tokens = re.findall(r"[a-z0-9]", text)
    return list_of_tokens.count(string)
sms["com"] = sms["v2"].apply(lambda x: findtokens("com", x))
sms["net"] = sms["v2"].apply(lambda x: findtokens("net", x))
sms["www"] = sms["v2"].apply(lambda x: findtokens("www", x))
sms["wap"] = sms["v2"].apply(lambda x: findtokens("wap", x))
sms["click"] = sms["v2"].apply(lambda x: findtokens("click", x))
sms["win"] = sms["v2"].apply(lambda x: findtokens("win", x))
sms["won"] = sms["v2"].apply(lambda x: findtokens("won", x))
sms["password"] = sms["v2"].apply(lambda x: findtokens("password", x))
sms["urgent"] = sms["v2"].apply(lambda x: findtokens("urgent", x))
sms["winner"] = sms["v2"].apply(lambda x: findtokens("winner", x))
sms["private"] = sms["v2"].apply(lambda x: findtokens("private", x))
sms["text"] = sms["v2"].apply(lambda x: findtokens("text", x))
sms["call"] = sms["v2"].apply(lambda x: findtokens("call", x))
sms["code"] = sms["v2"].apply(lambda x: findtokens("code", x))
sms["valid"] = sms["v2"].apply(lambda x: findtokens("valid", x))
sms["cash"] = sms["v2"].apply(lambda x: findtokens("cash", x))
sms["claim"] = sms["v2"].apply(lambda x: findtokens("claim", x))

In [25]:
## Linear Discriminant Analysis on my own feature extraction
x = sms.iloc[:,5:]
y = sms["v1"]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.8, random_state = 4236)

In [31]:
clf = LinearDiscriminantAnalysis()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
accuracy_score(y_test, pred)



0.9688200986989681

In [32]:
## Naive Bayes Classifier on my own feature extraction
clf = GaussianNB()
clf.fit(x_train, y_train)
pred = clf.predict(x_test)
accuracy_score(y_test, pred)

0.9515477792732167