In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Load the SMS spam dataset, decoded as ISO-8859-1. Only the label column (v1) and the
# message column (v2) are kept. rename() returns a new DataFrame, so the column assignment
# below writes to an independent frame instead of to a slice of the raw CSV frame (which is
# what makes pandas emit SettingWithCopyWarning).
data = (
    pd.read_csv('spam.csv', encoding="ISO-8859-1")[['v1', 'v2']]
    .rename(columns={'v1': 'Label', 'v2': 'Msg'})
)

# Encode the target: ham -> 0, spam -> 1. Series.map() turns any value missing from the
# dict into NaN, so fail loudly here rather than letting NaN targets reach the classifiers.
data['Label'] = data['Label'].map({'ham': 0, 'spam': 1})
assert data['Label'].notna().all(), "unexpected label value in column v1"
print(data.head())

   Label                                                Msg
0      0  Go until jurong point, crazy.. Available only ...
1      0                      Ok lar... Joking wif u oni...
2      1  Free entry in 2 a wkly comp to win FA Cup fina...
3      0  U dun say so early hor... U c already then say...
4      0  Nah I don't think he goes to usf, he lives aro...


In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string

# English stopword set from the NLTK corpus. Note that this rebinds the name `stopwords`,
# which up to this line referred to the imported corpus reader.
stopwords = set(stopwords.words('english'))
# Characters treated as punctuation by preprocess_text.
punctuations = string.punctuation

def preprocess_text(text):
    """Normalise one raw message into a space-separated string of cleaned tokens.

    Steps, in order:
      1. Lower-case the text and drop every character that cannot be encoded
         as ASCII.
      2. Rewrite the "t&c" variants to the single word 'termsandconditions'
         (kept because such words could characterise spam messages).
      3. Split into sentences with ``sent_tokenize`` and then on whitespace.
      4. Turn 2-3 character tokens made only of punctuation marks that are not
         all identical into the placeholder 'smiley'; strip punctuation
         characters from every other token.
      5. Drop tokens that end up empty or purely numeric.
      6. Replace tokens that mix letters and digits with the placeholder
         'alphanumeric'.

    Stopwords are not removed by this function.

    Args:
        text: The message as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces; may be the empty string
        when every token is filtered out.
    """

    def clean_token(raw):
        # A "smiley" is 2 or 3 characters long, consists solely of punctuation,
        # and uses at least two different marks (so "..", "!!", "||" do not count).
        if len(raw) in (2, 3) \
                and all(ch in punctuations for ch in raw) \
                and len(set(raw)) > 1:
            return "smiley"
        return ''.join(ch for ch in raw if ch not in punctuations)

    # Lower-case, then discard anything outside ASCII
    cleaned = text.lower().encode('ascii', 'ignore').decode('ascii')

    # Longest variant first, so "t&c's" is not partially rewritten via "t&c"
    for marker in ("t&c's", 't&cs', 't&c'):
        cleaned = cleaned.replace(marker, 'termsandconditions')

    kept = []
    for sentence in sent_tokenize(cleaned):
        for raw in sentence.split():
            token = clean_token(raw)
            # Skip tokens that vanished after stripping, and pure numbers
            if not token or token.isdigit():
                continue
            # Pure digits were skipped above, so "alnum but not alpha" means
            # the token mixes letters with digits
            if token.isalnum() and not token.isalpha():
                kept.append('alphanumeric')
            else:
                kept.append(token)

    return ' '.join(kept)
    
    

In [4]:
# Cleaned text, used later as the model input
data['Processed_Msg'] = data['Msg'].map(preprocess_text)
# Character count of the raw (unprocessed) message
data['Msg_len'] = data['Msg'].map(len)

In [5]:
# Rows whose processed text is empty, i.e. every token was filtered out by preprocess_text
data.loc[data['Processed_Msg'].map(len) == 0]

Unnamed: 0,Label,Msg,Processed_Msg,Msg_len
1611,0,645,,3


In [6]:
## Distribution of class labels (0 = ham, 1 = spam)
data.Label.value_counts()

0    4825
1     747
Name: Label, dtype: int64

##### Import necessary sklearn libraries

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [8]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def print_scores(y_test, y_pred):
    """Print accuracy, the classification report and the confusion matrix.

    Args:
        y_test: True labels of the evaluation set.
        y_pred: Labels predicted by the model for the same samples.

    Returns:
        None. The three metrics are written to stdout, each computed right
        before it is printed.
    """
    sections = (
        ('Accuracy : ', accuracy_score),
        ('\nclassification_report :\n ', classification_report),
        ('\nconfusion_matrix :\n ', confusion_matrix),
    )
    for heading, metric in sections:
        print(f'{heading}{metric(y_test, y_pred)}')

In [9]:
# Features are the cleaned messages; the target is the 0/1 label
X, y = data['Processed_Msg'], data['Label']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=10, test_size=0.2
)

#### Using Tfidf Vectors

In [10]:
# Fit the TF-IDF vectorizer on the training messages only, then apply the fitted
# vectorizer to the held-out messages
tfv = TfidfVectorizer()
train_vectors, test_vectors = tfv.fit_transform(X_train), tfv.transform(X_test)

In [11]:
# Support Vector classifier with default hyper-parameters, trained on the current
# train_vectors and scored on the held-out split
svc = SVC().fit(train_vectors, y_train)
y_pred = svc.predict(test_vectors)
print_scores(y_test, y_pred)

Accuracy : 0.9820627802690582

classification_report :
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115


confusion_matrix :
 [[963   2]
 [ 18 132]]


In [12]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so an unseeded model
# gives different scores on every run. Fixing random_state makes the cell reproducible;
# 10 is the same seed the train/test split uses.
rfc = RandomForestClassifier(random_state=10)
rfc.fit(train_vectors, y_train)
y_pred = rfc.predict(test_vectors)
print_scores(y_test, y_pred)

Accuracy : 0.9748878923766816

classification_report :
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115


confusion_matrix :
 [[965   0]
 [ 28 122]]


#### Using Count Vectors

In [13]:
# Raw term counts over unigrams and bigrams, with scikit-learn's built-in English
# stopword list removed.
# NOTE: train_vectors / test_vectors are rebound here and no longer hold the TF-IDF
# features; re-run the TF-IDF cell before re-running the SVC / random-forest cells.
cv = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    decode_error='ignore',
)
train_vectors, test_vectors = cv.fit_transform(X_train), cv.transform(X_test)

In [14]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the current
# train_vectors and scored on the held-out split
mnb_clf = MultinomialNB().fit(train_vectors, y_train)
y_pred = mnb_clf.predict(test_vectors)
print_scores(y_test, y_pred)

Accuracy : 0.9901345291479821

classification_report :
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.97      0.96      0.96       150

    accuracy                           0.99      1115
   macro avg       0.98      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115


confusion_matrix :
 [[960   5]
 [  6 144]]
