<a href="https://colab.research.google.com/github/myndsol-sarvagya/ML-SmsClassification-Python/blob/main/SPAM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing the required libraries

import numpy as np
import xlrd
import re
import pandas as pd
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.pipeline import Pipeline

In [None]:
# Data cleaning step
import pandas as pd
import re

def clean_text(text):
    """Normalise a raw message string.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is replaced by a single space, the result is
    lower-cased, and runs of whitespace are collapsed to one space with
    leading/trailing whitespace removed.

    Note that apostrophes count as punctuation, so "don't" becomes "don t".

    Args:
        text: The message to normalise; must be a ``str``.

    Returns:
        The normalised string.
    """
    # Punctuation is swapped for spaces (not deleted) so that adjacent words
    # separated only by a symbol do not get glued together.
    without_symbols = re.sub(r'[^\w\s]', ' ', text)

    # split() with no argument drops empty pieces, which collapses repeated
    # whitespace and trims both ends in one go.
    return ' '.join(without_symbols.lower().split())

def clean_and_preprocess(df):
    """Drop incomplete rows and add a normalised text column.

    This operates on ``df`` itself rather than on a copy: rows with a missing
    'SMS' or 'Target' value are removed in place and a 'cleaned_SMS' column is
    added to the same object.

    Args:
        df: DataFrame containing at least the columns 'SMS' and 'Target'.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    required_columns = ['SMS', 'Target']

    # Null rows must go first: clean_text expects a string and would fail on NaN.
    df.dropna(subset=required_columns, inplace=True)

    # Element-wise normalisation of every remaining message.
    df['cleaned_SMS'] = df['SMS'].map(clean_text)

    return df

# Example usage on a tiny hand-made frame. The third row has no message text,
# so it is expected to be dropped by clean_and_preprocess.
data = {
    'SMS': [
        "Hello, this is a spam message!",
        "Please confirm your appointment.",
        None,
        "Important news!",
    ],
    'Target': ["spam", "not spam", "not spam", "spam"],
}
df = pd.DataFrame(data)

cleaned_df = clean_and_preprocess(df)
print(cleaned_df)


In [None]:
# Load the labelled SMS dataset from the CSV file
df = pd.read_csv('spam.csv')

# References to the label and message columns, taken before cleaning runs
target_column = df['Target']
sms_column = df['SMS']

# Drops rows with a missing SMS/Target and adds 'cleaned_SMS' to df itself
cleaned_df = clean_and_preprocess(df)



In [None]:
# Preview the first rows; df already carries the 'cleaned_SMS' column because
# clean_and_preprocess modified it in place.
df.head()

Unnamed: 0,Target,SMS,Unnamed: 2,Unnamed: 3,Unnamed: 4,cleaned_SMS
0,ham,"Go until jurong point, crazy.. Available only ...",,,,go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,,,,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,,,,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,,nah i don t think he goes to usf he lives arou...


In [None]:
# Count missing (NaN) values per column
df.isnull().sum()

Target            0
SMS               0
Unnamed: 2     5522
Unnamed: 3     5560
Unnamed: 4     5566
cleaned_SMS       0
dtype: int64

In [None]:
# Class balance: number of messages per label in the 'Target' column
df['Target'].value_counts()

ham     4825
spam     747
Name: Target, dtype: int64

In [None]:
# Split the data into train and test sets

from sklearn.model_selection import train_test_split

# Features are the raw 'SMS' text (not 'cleaned_SMS'); labels are 'Target'.
X = df['SMS']
y = df['Target']

# 80/20 split, stratified on the label so both parts keep the same class
# ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# print(y_train.value_counts())
# print(y_test.value_counts())

In [None]:
# Create the model pipeline: TF-IDF features -> calibrated linear SVM

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

# Alternative model (disabled): multi-layer perceptron on the same features.
# mlp = Pipeline([('tfidf', TfidfVectorizer()),
#                      ('clf', MLPClassifier(solver='adam', hidden_layer_sizes=(200,), random_state=13,activation='relu',alpha=0.1)),
# ])

# Linear SVM with hinge loss and L2 penalty. Hinge loss is solved in the dual,
# which shuffles the data with a random number generator, so the seed is fixed
# to make repeated runs of the notebook reproduce the same model.
svm = LinearSVC(loss='hinge', penalty='l2', max_iter=1000, random_state=42)

# LinearSVC has no predict_proba; wrapping it in CalibratedClassifierCV adds
# calibrated class probabilities. The estimator is passed positionally because
# the keyword name differs between scikit-learn versions.
clf = CalibratedClassifierCV(svm)

svc_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # ("scaler", StandardScaler(with_mean=False)),
    ('clf', clf),
])

In [None]:
# Fit the pipeline on the training split so performance can be checked on the
# held-out test split
# mlp.fit(X_train, y_train)
svc_clf.fit(X_train, y_train)


In [None]:
# Predict labels for the held-out test messages
# prediction = mlp.predict(X_test)
prediction = svc_clf.predict(X_test)

In [None]:
# Report the confusion matrix.
# Rows are the true labels and columns the predicted labels (scikit-learn
# convention), so the off-diagonal entries are the misclassified messages.
from sklearn import metrics
print(metrics.confusion_matrix(y_test,prediction))

[[962   4]
 [ 12 137]]


In [None]:
# Print per-class precision, recall and F1 on the test split
print(metrics.classification_report(y_test,prediction))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       966
        spam       0.97      0.92      0.94       149

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [None]:
# Print the overall accuracy (fraction of test messages labelled correctly)
print(metrics.accuracy_score(y_test,prediction))

0.9856502242152466


In [None]:
# Refit the pipeline on the entire dataset before exporting it.
# NOTE: X includes the rows of X_test, so the metrics printed earlier describe
# the train-split fit, not this final model.
# mlp.fit(X, y)
svc_clf.fit(X, y)


In [None]:
# Serialise the fitted pipeline (vectoriser + classifier) to 'spammed.pkl'

with open('spammed.pkl', 'wb') as handle:
    pickle.dump(svc_clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Load the pickled pipeline back from disk to test it
# NOTE(security): pickle.load can execute arbitrary code; only load files
# that were produced by a trusted source (here: the cell that wrote it).

import pickle
import numpy as np

with open('spammed.pkl','rb') as handle:
    loaded_model=pickle.load(handle)




In [None]:
# Sanity check with an ordinary (non-spam) message
message = """U don't know how stubborn I am. I didn't even want to go to the hospital. I kept telling Mark I'm not a weak sucker. Hospitals are for weak suckers."""

# The pipeline expects an iterable of documents, hence the one-element list
estimation = loaded_model.predict([message])[0]
con = loaded_model.predict_proba([message])[0]

# Predicted label followed by the highest class probability
print(estimation, max(con))

ham 0.9917965868381566


In [None]:
# Sanity check with a typical promotional (spam) message
message = """Congrats! 1 year special cinema pass for 2 is yours. call 09061209465 now! C Suprman V, Matrix3, StarWars3, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out! """

# The pipeline expects an iterable of documents, hence the one-element list
estimation = loaded_model.predict([message])[0]
con = loaded_model.predict_proba([message])[0]

# Predicted label followed by the highest class probability
print(estimation, max(con))

spam 0.9895514863129561
