In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
# nltk.download('stopwords')
from nltk.tokenize import word_tokenize
# nltk.download('punkt')
import string
from nltk import PorterStemmer as Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
spam_dataset_path = 'spam.csv'

df = pd.read_csv(spam_dataset_path, encoding='latin-1', names=['label', 'line1', 'line2', 'line3', 'line4'], skiprows=1)
df = df.fillna("")
df['text'] = df['line1'] + df['line2'] + df['line3'] + df['line4']
del df['line1']
del df['line2']
del df['line3']
del df['line4']
df['text']  = df['text'].str.replace('\n', ' ')
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
def custom_analyzer(text):
    keep_punctuations = set('!?$%')
    remove_punctuations = set(string.punctuation) - keep_punctuations
    text = text.lower()
    text = ''.join([t for t in text if t not in remove_punctuations])
    for keep_punctuation in keep_punctuations:
        text = text.replace(keep_punctuation, f" {keep_punctuation} ")
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    stemmer = Stemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

spam_ham_classifier = Pipeline([
    ('vectorizer', TfidfVectorizer(analyzer=custom_analyzer)),
    ('classifier', SVC())
])

In [4]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.20, random_state = 21)
spam_ham_classifier.fit(x_train, y_train)
y_pred = spam_ham_classifier.predict(x_test)
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

         ham       1.00      0.98      0.99       992
        spam       0.87      0.99      0.93       123

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115

