In [173]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [174]:
# Path to the SMS spam CSV; edit this one constant to point at a different copy.
DATA_PATH = '/Users/mac/Desktop/Spam SMS Classification/spam.csv'
# Load the raw dataset, decoding the file as ISO-8859-1.
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [175]:
# Preview the first rows (pandas default of 5) to inspect the column layout.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [176]:
# Separate the message text (v2) from its ham/spam label (v1).
X, y = df['v2'], df['v1']

Convert the text to lowercase and remove all non-alphanumeric characters, leaving only letters, digits, and whitespace.

In [177]:
def preprocess_text(text):
    """Normalise one message for downstream text processing.

    The text is lower-cased and every character that is not an ASCII
    letter, a digit, or whitespace is removed.

    Parameters
    ----------
    text : str or missing value
        The raw message. ``None`` and float ``NaN`` (how pandas represents a
        missing cell) are treated as an empty message; any other non-string
        value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    # `text != text` is true only for NaN, so this catches missing cells
    # instead of raising AttributeError on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
# Apply the cleaning to every message.
# NOTE(review): none of the later cells visible here read `X`; the TF-IDF
# vectorizer is fit on the raw `v2` column of `data`, so this cleaned text
# does not appear to reach the model — confirm whether that is intended.
X = X.apply(preprocess_text)

In [178]:
def feature_engineering(df):
    """Add simple hand-crafted text statistics for the ``v2`` message column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``v2`` column holding the message text. It is not
        modified; the features are added to a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these extra columns:

        - ``text_length``: number of characters.
        - ``word_count``: number of whitespace-separated tokens.
        - ``special_chars_count``: occurrences of ``!@#$%^&*(),.?":{}|<>``.
        - ``uppercase_percentage``: uppercase characters divided by the text
          length, i.e. a fraction in [0, 1]; 0.0 for an empty message.
        - ``digit_count``: number of digit characters.
        - ``url_count``: occurrences of the substrings ``http`` or ``www``.

    Missing messages (``None``/``NaN``) are treated as empty strings.
    """
    df_fe = df.copy()
    # Normalise once so every feature is computed from the same string and
    # missing cells cannot break `len()` or be counted as the text 'nan'.
    texts = df_fe['v2'].fillna('').astype(str)

    def uppercase_fraction(message):
        # Guard the division: an empty message has no characters to divide by.
        if not message:
            return 0.0
        return sum(1 for c in message if c.isupper()) / len(message)

    # Feature: Text Length
    df_fe['text_length'] = texts.apply(len)
    # Feature: Word Count
    df_fe['word_count'] = texts.apply(lambda x: len(x.split()))
    # Feature: Special Characters Count
    df_fe['special_chars_count'] = texts.apply(lambda x: len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', x)))
    # Feature: Uppercase Percentage
    df_fe['uppercase_percentage'] = texts.apply(uppercase_fraction)
    # Feature: Digit Count
    df_fe['digit_count'] = texts.apply(lambda x: sum(1 for c in x if c.isdigit()))
    # Feature: URL Count
    df_fe['url_count'] = texts.apply(lambda x: len(re.findall(r'http|www', x)))
    return df_fe
# Build the feature-augmented frame; `df` itself is left unchanged.
# NOTE(review): in the cells visible here only the `v2` text is fed to the
# model — the engineered columns are computed but not used for training.
data = feature_engineering(df)

In [179]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='v1'),  # every column except the label
    data['v1'],               # the ham/spam label
    test_size=0.2,
    random_state=42,
)

Use Term Frequency-Inverse Document Frequency (TF-IDF) to convert the text data into numerical features.

In [180]:
# Vocabulary is capped at 3000 terms. It is learned on the training messages
# only; the already-fitted vectorizer is then reused on the test messages.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
train_messages, test_messages = X_train['v2'], X_test['v2']
X_train_tfidf = tfidf_vectorizer.fit_transform(train_messages)
X_test_tfidf = tfidf_vectorizer.transform(test_messages)


In [181]:
# Hyper-parameter candidates for the grid search: 3 values of C x 2 kernels.
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
)
# Base estimator; its C and kernel are set by the search.
svm = SVC()

In [182]:
# Try every C/kernel combination in `param_grid` with 5-fold cross-validation
# on the training split. `GridSearchCV` comes from sklearn.model_selection and
# must be imported in the notebook's import cell for this cell to run on a
# fresh kernel.
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

In [183]:
# Take the estimator the grid search selected as best and use it to predict
# labels for the held-out test messages.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

Evaluating the model

In [184]:
# Score the held-out predictions: overall accuracy plus the per-class report.
accuracy = accuracy_score(y_test, y_pred)
classification_report_result = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report_result)

Accuracy: 0.979372197309417
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.98      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



Printing classification results for new messages using a simple keyword-based filter (this cell does not use the trained SVM)

In [185]:
def custom_filter(text):
    """Label a message with a fixed keyword rule.

    Returns 'spam' when the text contains ``congratulations!``, ``free``,
    ``win`` or ``prize`` (case-insensitive), otherwise 'ham'. Matching is by
    substring, so a word such as "window" also counts as a hit for ``win``.
    """
    keyword_hit = re.search(r'congratulations!|free|win|prize', text, re.IGNORECASE)
    return 'spam' if keyword_hit else 'ham'

# Two hand-written examples: one that trips the keyword filter, one that does not.
new_sms = ["Congratulations! You won an iPhone", "Hi, when are we meeting for coffee?"]

# Label each message with the keyword rule.
custom_predictions = list(map(custom_filter, new_sms))

# Report each message next to its label.
for sms, label in zip(new_sms, custom_predictions):
    print(f"Spam: {sms}" if label == 'spam' else f"Legitimate: {sms}")

Spam: Congratulations! You won an iPhone
Legitimate: Hi, when are we meeting for coffee?
