# SMS spam detector

### Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np

# for preprocessing
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

# for machine learning models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

# to convert words to feature vector
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package wordnet to /Users/maanas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Importing the dataset

In [2]:
# check encoding for importing dataset
import chardet
with open("smsspam.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.7270322499829184, 'language': ''}


In [3]:
# Reading data and manipulating the dataframe
data = pd.read_csv("smsspam.csv", encoding = "Windows-1252")
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data.rename(columns = {'v1': 'label', 'v2': 'sms'}, inplace = True)
display(data)

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Preprocessing the data

In [4]:
# checking for missing values
data.isna().sum()

label    0
sms      0
dtype: int64

In [5]:
# convert to lowercase, strip and remove punctuations
def preprocess(sentence):
    sentence = sentence.lower() 
    sentence = sentence.strip()  
    sentence = re.compile('<.*?>').sub('', sentence) 
    sentence = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', sentence)  
    sentence = re.sub('\s+', ' ', sentence)  
    sentence = re.sub(r'\[[0-9]*\]',' ',sentence) 
    sentence = re.sub(r'[^\w\s]', '', str(sentence).lower().strip())
    sentence = re.sub(r'\d',' ',sentence) 
    sentence = re.sub(r'\s+',' ',sentence) 
    return sentence

 
# removing stop words
def remove_stopwords(sentence):
    word_list = [i for i in sentence.split() if i not in stopwords.words('english')]
    return ' '.join(word_list)
 
# map nltk pos tags to wordnet pos tags
def get_wordnet_pos_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# tokenization and lemmatization
def lemmatizer(sentence):
    lem = WordNetLemmatizer()
    pos_tags = nltk.pos_tag(word_tokenize(sentence))
    a = [lem.lemmatize(tag[0], get_wordnet_pos_tag(tag[1])) for idx, tag in enumerate(pos_tags)]
    return " ".join(a)

def finalpreprocess(sentence):
    return lemmatizer(remove_stopwords(preprocess(sentence)))

data['clean_sms'] = data['sms'].apply(lambda x: finalpreprocess(x))
df_temp = pd.get_dummies(data['label'])
data = pd.concat((data, df_temp), axis=1)
data = data.drop(["ham"], axis=1)
display(data)

Unnamed: 0,label,sms,clean_sms,spam
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,1
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf life around though,0
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time try contact u u å pound prize claim ea...,1
5568,ham,Will Ì_ b going to esplanade fr home?,ì b go esplanade fr home,0
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestion,0
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like interested buying something...,0


### Split the data into training and testing data

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data["clean_sms"], data["spam"], test_size=0.2, shuffle = True)
print("X train: ", X_train.shape)
print("X test: ", X_test.shape)
print("y train: ", y_train.shape)
print("y test: ", y_test.shape)

X train:  (4457,)
X test:  (1115,)
y train:  (4457,)
y test:  (1115,)


### Generate feature vectors for the given data using a TF-IDF vectorizer

In [7]:
tfidf_vectorizer = TfidfVectorizer(use_idf = True)
X_train_feature_vector = tfidf_vectorizer.fit_transform(X_train) 
X_test_feature_vector = tfidf_vectorizer.transform(X_test)

### Classification using Machine Learning models

In [8]:
# Classification using a logistic regression model
model = LogisticRegression()
model.fit(X_train_feature_vector, y_train) 

# Prediction on test set
y_pred = model.predict(X_test_feature_vector)
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.97      0.76      0.85       149

    accuracy                           0.96      1115
   macro avg       0.96      0.88      0.91      1115
weighted avg       0.96      0.96      0.96      1115

Confusion Matrix:
 [[962   4]
 [ 36 113]]


In [9]:
# Classification using a Naive Bayes model
model = MultinomialNB()
model.fit(X_train_feature_vector, y_train) 

# Prediction on test set
y_pred = model.predict(X_test_feature_vector)
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
 [[966   0]
 [ 34 115]]


In [10]:
# Classification using SVM
model = SVC()
model.fit(X_train_feature_vector, y_train) 

# Prediction on test set
y_pred = model.predict(X_test_feature_vector)
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.98      0.85      0.91       149

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[964   2]
 [ 22 127]]


### Results

The best results(precision and f1-score) were produced using the support vector machine model in comparison to Linear Regression and Naive Bayes.

In [11]:
pd.set_option('max_rows', None)
result = pd.concat([X_test, y_test], axis=1, join='inner')
result['spam_prediction'] = y_pred
display(result)
pd.reset_option('max_columns')
pd.reset_option('max_rows')

Unnamed: 0,clean_sms,spam,spam_prediction
2092,oh love soooo good hear omg miss much today so...,0,0
5285,urgent week free membership å prize jackpot tx...,1,1
575,cash prize claim call,1,1
341,take post come must texts happy read one wiv h...,0,0
3426,haha okay today weekend leh,0,0
1816,go write ccna exam week,0,0
5364,call send girl erotic ecstacy p min stop text ...,1,1
5461,ok thk get u wan come wat,0,0
1849,get job wipro get every thing life year,0,0
2645,sorry call later,0,0
