In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import string
import re

# Text Processing
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# ML Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Oversampling because of imbalanced dataset
from imblearn.over_sampling import SMOTE

# Step 1: Load the dataset and keep only the label and text columns.
# .copy() detaches the two-column frame from the raw one, so adding columns
# below cannot trigger pandas' SettingWithCopyWarning.
df = pd.read_csv('spam.csv')
df = df[['Category', 'Message']].copy()

# Step 2: Encode Labels
# LabelEncoder numbers the classes in sorted order; the fitted encoder is kept
# so predictions can be mapped back with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Category'])  # ham=0, spam=1 (assuming these are the only two categories)



# Step 3: Clean Text Messages
def clean_text(text, stop_words=None):
    """Normalise a raw message into lower-case, stop-word-free tokens.

    Processing order: lower-case, remove digit runs, remove ASCII punctuation
    (``string.punctuation``), trim surrounding whitespace, drop stop words.
    Punctuation is stripped *before* the stop-word filter, so contractions
    such as "don't" are compared as "dont".

    Parameters
    ----------
    text : str
        Raw message text.
    stop_words : collection of str, optional
        Words to remove. When omitted, NLTK's English stop-word list is used;
        it is loaded once and cached on the function object.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    if stop_words is None:
        # Load the NLTK list a single time. Re-reading it for every word (as a
        # list, searched linearly) dominated the runtime of the whole cell.
        cached = getattr(clean_text, '_english_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('english'))
            clean_text._english_stop_words = cached
        stop_words = cached

    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.strip()
    return ' '.join(word for word in text.split() if word not in stop_words)

df['Cleaned_Message'] = df['Message'].apply(clean_text)

# Step 4: Train-Test Split
# Split the cleaned text *before* fitting the vectorizer so that the test
# messages cannot influence the TF-IDF vocabulary or IDF weights (data
# leakage). Same test_size / stratify / random_state as before, so the same
# rows land in each partition.
y = df['Label']
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned_Message'], y, test_size=0.2, stratify=y, random_state=42
)

# Step 5: TF-IDF Vectorization
# Fit on the training messages only; the test messages are merely transformed
# with the vocabulary learned from the training set.
vectorizer = TfidfVectorizer(max_features=3000)
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# Full feature matrix (rows aligned with df), kept so that any code relying on
# `X` still works. It uses the train-fitted vocabulary; do not fit models on it.
X = vectorizer.transform(df['Cleaned_Message']).toarray()

# Step 6: Class balance of the training data
# The SMOTE sampler is instantiated but NOT applied: the *_resampled names
# below are plain aliases of the original training split, so the counts printed
# here are the raw, imbalanced class counts.
# NOTE(review): to actually oversample, replace the assignment with
#   X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = X_train, y_train

# The previous label ("After Oversampling:") was misleading because no
# resampling takes place.
print("Training class distribution (no oversampling applied):")
print(pd.Series(y_train_resampled).value_counts())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


After Oversampling:
Label
0    3859
1     598
Name: count, dtype: int64


In [None]:
# Import Models & Metrics
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Note: oversampling is not needed for the SVM -- it was found to work well on
# the imbalanced data. The model is therefore fitted explicitly on the original
# training split (X_train, y_train) rather than on the *_resampled variables,
# so it stays unaffected if oversampling is ever enabled for other models.
# Train SVM Model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predict on test data
svm_preds = svm_model.predict(X_test)

# Evaluate SVM on the held-out test split
print(" SVM Results:")
print("Accuracy:", accuracy_score(y_test, svm_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, svm_preds))
print("Classification Report:\n", classification_report(y_test, svm_preds))



 SVM Results:
Accuracy: 0.9829596412556054
Confusion Matrix:
 [[964   2]
 [ 17 132]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.93       149

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:

# Classify one hand-written message with the trained SVM.
# Requires the preprocessing cell and the SVM training cell to have been run
# first (clean_text, vectorizer and svm_model must exist in the kernel).
sample_message = "₹1.8-L Cr added in 14 days—are you Invested or planning yet?"

# Same pipeline as for the training data: clean -> TF-IDF -> dense array.
sample_features = vectorizer.transform([clean_text(sample_message)]).toarray()

# predict() returns one label per row; a truthy label (1) means spam.
sample_label = svm_model.predict(sample_features)[0]
print("🚫 SPAM" if sample_label else "✅ Not Spam")


NameError: name 'svm_model' is not defined

In [None]:
# ------------------------
# K-Nearest Neighbours classifier (k = 9), fitted on the same training
# variables produced by the preprocessing cell.
knn_model = KNeighborsClassifier(n_neighbors=9).fit(
    X_train_resampled, y_train_resampled
)

# Labels predicted for the held-out test split
knn_preds = knn_model.predict(X_test)

# Compute the metrics first, then report them
knn_accuracy = accuracy_score(y_test, knn_preds)
knn_confusion = confusion_matrix(y_test, knn_preds)
knn_report = classification_report(y_test, knn_preds)

print("\n KNN Results:")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", knn_confusion)
print("Classification Report:\n", knn_report)



 KNN Results:
Accuracy: 0.9049327354260089
Confusion Matrix:
 [[966   0]
 [106  43]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95       966
           1       1.00      0.29      0.45       149

    accuracy                           0.90      1115
   macro avg       0.95      0.64      0.70      1115
weighted avg       0.91      0.90      0.88      1115

