# SPAM CLASSIFICATION MODEL

### Importing libraries and Dataset

In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.base import clone

In [None]:
# Source of the SMS dataset: a .tsv file in the pycon-2016-tutorial GitHub repository.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

In [None]:
# Load the SMS dataset. The file is parsed as tab-separated and the two column
# names are supplied explicitly via `names`.
df = pd.read_csv(url, sep='\t', names=['label', 'message'])

# Encode the text label as an integer target: ham -> 0, spam -> 1.
LABEL_TO_NUM = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(LABEL_TO_NUM)

# Series.map yields NaN for any value that is not a key of the mapping. Fail
# here with a clear message rather than passing NaN targets to the models.
unmapped_labels = df.loc[df['label_num'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected label values in dataset: {list(unmapped_labels)}")

# Features (raw message text) and integer target used by every model below.
X = df['message']
y = df['label_num']

In [None]:
# Preview the first rows to confirm the label, message and label_num columns look as expected.
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


### Testing different models and checking which one is the best

    Models Tested

    1.   Logistic Regression
    2.   Linear SVM
    3.   NaiveBayes
    4.   SGD Classifier

    




In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate
# Fixed seed for the estimators that use randomness during fitting, so the
# cross-validation scores and the final model are reproducible across runs.
RANDOM_STATE = 42


def build_text_pipeline(model):
    """Return a two-step pipeline: TF-IDF vectorisation followed by `model`.

    Parameters
    ----------
    model : estimator
        An unfitted scikit-learn classifier that is trained on the TF-IDF
        features.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps named 'tfidf' and 'model'.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('model', model),
    ])


# Pipeline for Logistic Regression (max_iter raised to 1000 iterations).
Pipeline_lr = build_text_pipeline(LogisticRegression(max_iter=1000))

# Pipeline for Linear SVM
Pipeline_svm = build_text_pipeline(LinearSVC(random_state=RANDOM_STATE))

# Pipeline for NaiveBayes
Pipeline_NB = build_text_pipeline(MultinomialNB())

# Pipeline for SGDClassifier (shuffles the training data, hence the seed)
Pipeline_SGD = build_text_pipeline(SGDClassifier(random_state=RANDOM_STATE))

In [None]:
# Candidate models, keyed by the display name printed in the report below.
pipelines = dict([
    ('Logistic Regression', Pipeline_lr),
    ('Linear SVM', Pipeline_svm),
    ('Naive Bayes', Pipeline_NB),
    ('SGD Classifier', Pipeline_SGD),
])

# Metrics requested from cross_validate, and how each one is labelled when printed.
SCORING = ['precision', 'recall', 'f1']
REPORT_ROWS = [
    ("  Precision: ", 'test_precision'),
    ("  Recall:    ", 'test_recall'),
    ("  F1 Score:  ", 'test_f1'),
]

# Compare the candidates with 4-fold cross-validation and print mean ± std per metric.
# NOTE(review): this runs on the full X, y, which includes the rows that are
# later held out as the test set, so the final test score is not fully
# independent of the model choice. Consider splitting first.
for name, pipeline in pipelines.items():
    scores = cross_validate(
        pipeline,
        X,
        y,
        cv=4,
        scoring=SCORING,
        return_train_score=False,
    )
    print(f"\n🔍 {name}")
    for prefix, key in REPORT_ROWS:
        fold_scores = scores[key]
        print(f"{prefix}{fold_scores.mean():.4f} ± {fold_scores.std():.4f}")


🔍 Logistic Regression
  Precision: 0.9950 ± 0.0056
  Recall:    0.7778 ± 0.0107
  F1 Score:  0.8730 ± 0.0048

🔍 Linear SVM
  Precision: 0.9871 ± 0.0084
  Recall:    0.9077 ± 0.0182
  F1 Score:  0.9455 ± 0.0064

🔍 Naive Bayes
  Precision: 1.0000 ± 0.0000
  Recall:    0.6774 ± 0.0073
  F1 Score:  0.8076 ± 0.0052

🔍 SGD Classifier
  Precision: 0.9858 ± 0.0093
  Recall:    0.9103 ± 0.0178
  F1 Score:  0.9464 ± 0.0056


Because the SGD classifier has the highest recall of the models tested, which is a very important aspect of a spam classifier, together with relatively good precision, we choose it as our final model.

In [None]:
# Hold out 20% of the messages as a test set. `stratify=y` keeps the ham/spam
# ratio the same in both splits, which matters because spam is the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit an independent, unfitted copy of the chosen pipeline. Assigning
# Pipeline_SGD directly would only create an alias, so fitting would also
# mutate the candidate stored in `pipelines`.
Final_model = clone(Pipeline_SGD)
Final_model.fit(x_train, y_train)

# Evaluate on the held-out test split (label 0 = ham, 1 = spam).
y_pred = Final_model.predict(x_test)
print("\n Final Stats of the model")
print("\n", classification_report(y_test, y_pred))


 Final Stats of the model

               precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.97      0.95      0.96       149

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



### Saving the Model for further use


In [None]:
import joblib

# Persist the fitted pipeline (vectoriser + classifier) to a single file.
# Path is relative, so the file is written to the current working directory.
MODEL_PATH = 'spam_classifier.pkl'

# Kept as the last expression so the list of written filenames is displayed.
joblib.dump(Final_model, MODEL_PATH)


['spam_classifier.pkl']

In [None]:
# `google.colab` can only be imported inside a Colab runtime. Guard the import
# so the notebook still runs top to bottom elsewhere (e.g. local Jupyter).
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Trigger a browser download of the saved model from the Colab VM.
    files.download('spam_classifier.pkl')
else:
    print("Not running in Google Colab; skipping browser download of 'spam_classifier.pkl'.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>