> # <center> SPAM DETECTION ANALYSIS </center>

In [1]:
#Import libraries
import numpy as np
import pandas as pd

# word libraries
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# pre-processing and model libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.utils import shuffle

#Save and load the model
import joblib

from warnings import filterwarnings
filterwarnings('ignore')
%matplotlib inline

In [2]:
#Load the two raw datasets (both CSV files are read with Latin-1 encoding)
data, data2 = (
    pd.read_csv(csv_path, encoding="latin_1")
    for csv_path in ("../data/main_data.csv", "../data/spam.csv")
)
data.head()

Unnamed: 0,label,message
0,ham,Shuhui has bought ron's present it's a swatch ...
1,ham,"What should i eat fo lunch senor,,,"
2,ham,"Yeah it's jus rite...,,,"
3,ham,She told to hr that he want posting in chennai...
4,ham,Hhahhaahahah rofl wtf nig was leonardo in your...


In [3]:
#Preview the second dataset; its columns still carry the raw names v1 / v2
data2.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
#Give the second dataset the same column names as the first one (label / message).
#rename() returns a new frame that is bound back to data2, so the frame displayed
#by the previous cell is not mutated in place and re-running this cell is harmless.
data2 = data2.rename(columns={'v1': 'label', 'v2': 'message'})
data2.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,"Ok lar... Joking wif u oni...,,,"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
#Concatenate the two datasets into one frame with a fresh 0..n-1 index
#(the shuffle itself is done in the preprocessing section further down)
# NOTE(review): if the two CSV files share messages, duplicated rows could land on
# both sides of the train/test split -- confirm whether de-duplication is needed
df = pd.concat(objs=[data, data2], sort=False, ignore_index=True)

In [6]:
#Check the row count, dtypes and non-null counts of the combined frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11174 entries, 0 to 11173
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    11174 non-null  object
 1   message  11174 non-null  object
dtypes: object(2)
memory usage: 174.7+ KB


## <center> DATA PREPROCESSING </center>

> The method I intend to use is as follows:
1. Shuffle the data
2. Preprocess the data: remove stop words and stem the remaining words with NLTK's Porter stemmer
3. Encode the labels
4. Split the data and then vectorize the text

In [7]:
#Shuffle the data
# A fixed seed keeps the row order reproducible; the index is rebuilt afterwards
# so the rows are numbered 0..n-1 again
shuffled_rows = shuffle(df, random_state=41)
df = shuffled_rows.reset_index(drop=True)

In [8]:
# Shared helpers used by the preprocessing function
stopwordSet = {*stopwords.words('english')}  # English stop words as a set for fast membership tests
stem = PorterStemmer()  # one Porter stemmer instance reused for every word

def preprocess(text):
    """Lower-case, stop-word-filter and stem a collection of messages.

    Each message is lower-cased, split on whitespace, stripped of the words
    found in ``stopwordSet`` and the remaining words are stemmed with ``stem``
    before being joined back with single spaces. No punctuation or
    regular-expression cleaning is performed.

    Parameters
    ----------
    text : pandas.Series, iterable of str, or str
        The raw message(s). The input is never modified.

    Returns
    -------
    pandas.Series, list of str, or str
        A new Series with the same index when a Series is given, a single
        cleaned string when a single string is given, otherwise a new list
        with one cleaned string per input message.
    """
    def _clean(message):
        # Stop words are matched on the lower-cased token, before stemming
        words = message.lower().split()
        return ' '.join(stem.stem(word) for word in words if word not in stopwordSet)

    # A bare string is one message, not an iterable of one-character messages
    if isinstance(text, str):
        return _clean(text)
    # map() keeps the original index, so the result aligns on assignment even
    # when the Series is not indexed 0..n-1 (e.g. a train/test subset)
    if isinstance(text, pd.Series):
        return text.map(_clean)
    return [_clean(message) for message in text]

In [9]:
# Replace the raw messages with their cleaned (stop words removed, stemmed) version
df = df.assign(message=preprocess(df['message']))

In [10]:
#preview the dataframe after cleaning (messages are now lower-cased and stemmed)
df.head()

Unnamed: 0,label,message
0,ham,let know chang next 6hrs. even appendix age ra...
1,ham,jesu christ bitch i'm tri give drug answer fuc...
2,ham,"u talk about?,,,"
3,ham,"oh k...i'm watch here:),,,"
4,ham,co lar i'm ba dao ok... 1 pm lor... u never as...


> Label encoding for the target variable (LABEL)

In [11]:
#Label encoding
# LabelEncoder numbers the classes in sorted order, so with the ham / spam labels
# previewed above 'ham' becomes 0 and 'spam' becomes 1
le = LabelEncoder()
encoded_labels = le.fit_transform(df['label'])
df['label'] = encoded_labels

> Use TfidfVectorizer to turn the message text into numeric TF-IDF features that the models can consume

In [12]:
# vectorizer
# TF-IDF features with accent stripping, smoothed IDF and sublinear (1 + log(tf))
# term-frequency scaling. sublinear_tf is documented as a boolean, so it is passed
# as True rather than the integer 1 (which parameter validation can reject).
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [13]:
# Separate the features (cleaned message text) from the target (encoded label)
X, y = df['message'], df['label']

In [14]:
#Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [15]:
#for model selection and testing
# The vocabulary and IDF weights are learned from the training split only; the
# test split is transformed with that fitted state.
train = vectorizer.fit_transform(X_train)
test = vectorizer.transform(X_test)

#for deployment
# This call RE-FITS the same vectorizer object on the full dataset. From here on
# `vectorizer` no longer matches the feature space of `train` / `test`, so it must
# not be used to transform data for models that were fitted on `train`.
deploy = vectorizer.fit_transform(X)

In [16]:
#Candidate models to compare; the best performer is the one used for deployment.
#random_state is fixed on the scikit-learn estimators whose training is randomised
#so that their scores are reproducible after a kernel restart.
modelList = [
    ("LogisticReg", LogisticRegression()),
    ("MultinomialNB", MultinomialNB()),
    ("GradBoostClf", GradientBoostingClassifier(random_state=42)),
    ("DecisionTree", DecisionTreeClassifier(random_state=42)),
    ("RandomForest", RandomForestClassifier(random_state=42)),
    ("KNeighbors", KNeighborsClassifier(n_neighbors=5)),
    ("SVC", SVC()),
    ("XGB", XGBClassifier()),
    ("LightGBM", LGBMClassifier()),
]

In [17]:
#Train and predict function
def train_predict(x_train, x_test, y_train, y_test):
    """Fit every model in ``modelList`` and print how it scores on the test data.

    For each ``(name, classifier)`` pair the classifier is fitted on
    ``x_train`` / ``y_train`` and used to predict ``x_test``; the accuracy and
    the confusion matrix against ``y_test`` are printed, followed by a blank
    line. Nothing is returned.
    """
    for model_name, estimator in modelList:
        estimator.fit(x_train, y_train)
        predicted = estimator.predict(x_test)

        score = accuracy_score(y_test, predicted)
        print(f"{model_name} Accuracy: {score}")

        matrix = confusion_matrix(y_test, predicted)
        print("confusion matrix:\n", matrix)
        print()

In [18]:
#Compare all candidate models on the vectorized train / test split
train_predict(x_train=train, x_test=test, y_train=y_train, y_test=y_test)

LogisticReg Accuracy: 0.9698777214434835
confusion matrix:
 [[2794   10]
 [  91  458]]

MultinomialNB Accuracy: 0.9791231732776617
confusion matrix:
 [[2797    7]
 [  63  486]]

GradBoostClf Accuracy: 0.9660005964807635
confusion matrix:
 [[2793   11]
 [ 103  446]]

DecisionTree Accuracy: 0.9838950193856248
confusion matrix:
 [[2779   25]
 [  29  520]]

RandomForest Accuracy: 0.9892633462570832
confusion matrix:
 [[2801    3]
 [  33  516]]

KNeighbors Accuracy: 0.8917387414255891
confusion matrix:
 [[2804    0]
 [ 363  186]]

SVC Accuracy: 0.9901580674023263
confusion matrix:
 [[2797    7]
 [  26  523]]

XGB Accuracy: 0.9806143751864003
confusion matrix:
 [[2790   14]
 [  51  498]]

LightGBM Accuracy: 0.9898598270205786
confusion matrix:
 [[2791   13]
 [  21  528]]



> Judging by accuracy and the confusion matrices, the best model is the SVC (about 99.0% accuracy), so this is the model that will be used in the pipeline

In [19]:
#Deployment model: SVC with probability estimates enabled.
#random_state seeds the internal shuffling used for the probability estimates, so
#the saved model's predict_proba output is reproducible between runs (it is
#ignored by SVC when probability=False).
model = SVC(probability=True, random_state=42)

#Using the whole dataset to train the deployment model
model.fit(deploy, y)

In [20]:
# save the model
# Both artefacts are written into ../model/ ; joblib.dump returns the list of
# file names it wrote.
joblib.dump(model, '../model/model.pkl')

#save the vectorizer
# This is the vectorizer as re-fitted on the full dataset, i.e. the one whose
# feature space matches the deployment model saved above.
joblib.dump(vectorizer, '../model/vectorizer.pkl')

['../model/vectorizer.pkl']

In [21]:
# Prediction function
def predictions(text):
    """Classify raw messages with the saved deployment model.

    The messages are cleaned with ``preprocess`` (the same lower-casing,
    stop-word removal and stemming applied to the training data) before being
    vectorized, so that the features match what the model was trained on.

    Parameters
    ----------
    text : str or iterable of str
        One raw message, or a collection of raw messages. The input is not
        modified.

    Returns
    -------
    numpy.ndarray
        The output of ``model.predict``: one encoded label per message.
    """
    # A bare string is treated as a single message; copying into a new list
    # also protects the caller's object from any in-place cleaning.
    messages = [text] if isinstance(text, str) else list(text)
    cleaned = preprocess(messages)

    # NOTE: joblib.load unpickles the files -- only load artefacts produced by
    # this notebook (or another trusted source).
    model = joblib.load('../model/model.pkl')
    vectorizer = joblib.load('../model/vectorizer.pkl')

    vectorized_text = vectorizer.transform(cleaned)
    return model.predict(vectorized_text)

In [22]:
#test
# Smoke test of the saved artefacts on one phishing-style message; the input must
# be a list because the vectorizer expects an iterable of documents.
text = ['Dear Customer This is a confirmation that the password for your onlineAccess account has just been change If you didnt request or make this password change, Kindly copy this link bellow and PASTE on your browser http://x.co/Accesbank for security purpose.If you made this password change kindly copy this link bellow and PASTE on your browser to review your account informations http://x.co/Accesbank.Thank you for banking with us.Terms & Conditions Do Not Call Registry  Disclaimer | Multilingual Disclaimer | Code of CommitmentGroup Code of Business Conduct and Ethics,Use of Unparliamentary Language by Customers | Privacy | USA Patriot Act Certification']
result = predictions(text)
# presumably 1 is the encoded 'spam' class (LabelEncoder sorts class names) -- confirm;
# np.around leaves whole-number class labels unchanged
print(np.around(result))

[1]
