**Spam/Ham SMS Classifier**

Import libraries

In [35]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Load the dataset

In [54]:
data = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')

# Display the first few rows
print(data.head())

# Check for missing values and basic info
print(data.info())
print(data['v1'].value_counts())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
m

Clean the dataset

In [33]:
# Drop unnecessary columns (if present)
data = data[['v1', 'v2']]

# Rename columns for clarity
data.columns = ['label', 'message']

# Convert labels to binary (ham: 0, spam: 1)
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Check the distribution of spam vs. ham
print(data['label'].value_counts())

label
0    4825
1     747
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].map({'ham': 0, 'spam': 1})


Preprocess the Text Data

In [36]:
# Initialize stemmer and stopwords
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and stem
    tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the message column
data['processed_message'] = data['message'].apply(preprocess_text)

# View a few processed messages
print(data[['message', 'processed_message']].head())

                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                   processed_message  
0  go jurong point crazi avail bugi n great world...  
1                              ok lar joke wif u oni  
2  free entri wkli comp win fa cup final tkt st m...  
3                u dun say earli hor u c alreadi say  
4               nah think goe usf live around though  


Feature Extraction

In [37]:
# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features

# Transform the processed text into TF-IDF features
X = tfidf.fit_transform(data['processed_message']).toarray()
y = data['label']

# Check the shape of the feature matrix
print(X.shape)

(5572, 5000)


Split the Data

In [38]:
# Split the data (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Training set size: (4457, 5000)
Testing set size: (1115, 5000)


Train a Model

In [39]:
# Initialize and train the model
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.968609865470852

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
 [[965   0]
 [ 35 115]]


Handle Imbalanced Data

In [40]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Retrain the model
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)
print("Accuracy after SMOTE:", accuracy_score(y_test, y_pred))
print("\nClassification Report after SMOTE:\n", classification_report(y_test, y_pred))

Accuracy after SMOTE: 0.9668161434977578

Classification Report after SMOTE:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       965
           1       0.83      0.95      0.88       150

    accuracy                           0.97      1115
   macro avg       0.91      0.96      0.93      1115
weighted avg       0.97      0.97      0.97      1115



Test with New Data

In [53]:
def predict_sms(text, model, tfidf):
    # Preprocess the input text
    processed_text = preprocess_text(text)
    # Transform using the same TF-IDF vectorizer
    text_vector = tfidf.transform([processed_text]).toarray()
    # Predict
    prediction = model.predict(text_vector)
    return 'Spam' if prediction[0] == 1 else 'Ham'

# Test with a sample message
sample_text = "Free entry to win $1000! Click now!"
print(predict_sms(sample_text, model, tfidf))

Spam


Save the Model and Vectorizer

In [52]:
import joblib

# Save the model and vectorizer
joblib.dump(model, 'spam_classifier_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Load them later (example)
# model = joblib.load('spam_classifier_model.pkl')
# tfidf = joblib.load('tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']