In [1]:
# import the libraries
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk

In [2]:
ls

 Volume in drive C is Windows 
 Volume Serial Number is C47D-F2FC

 Directory of C:\Users\COMD\Desktop\AI-ML Project\Spam-detect

09/19/2024  01:49 PM    <DIR>          .
09/19/2024  01:49 PM    <DIR>          ..
09/19/2024  12:56 PM    <DIR>          .ipynb_checkpoints
09/19/2024  01:33 PM             1,733 app.py
09/19/2024  11:30 AM           215,934 archive.zip
09/19/2024  01:32 PM               102 requirements.txt
09/19/2024  01:04 PM            16,632 SMS Spam Detection Analysis - NLP.ipynb
09/19/2024  01:49 PM            24,125 Spam Detection.ipynb
09/20/2019  02:54 AM           503,663 spam.csv
09/19/2024  01:42 PM                 0 spam_classifier_model.pkl
09/19/2024  01:41 PM                 0 spam_classifier_random_forest.pkl
09/20/2019  02:54 AM           503,663 spam1.csv
               9 File(s)      1,265,852 bytes
               3 Dir(s)  33,263,153,152 bytes free


In [3]:
# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin1')


In [4]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [6]:
df.nunique()

v1               2
v2            5169
Unnamed: 2      43
Unnamed: 3      10
Unnamed: 4       5
dtype: int64

In [7]:
df = df[['v1','v2']]
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# rename the columns
df = df.rename(columns={'v1':'label', 'v2':'message'})
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [10]:
df.isnull().sum()

label      0
message    0
dtype: int64

In [11]:
df.duplicated().sum()

403

In [26]:
# Download the stopwords resource
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\COMD\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [12]:
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalize a raw message into lowercase, punctuation-free, stopword-free text.

    The text is lowercased, every character that is not an ASCII letter, digit
    or whitespace is deleted, whitespace runs are collapsed, and stopwords are
    removed.

    Parameters
    ----------
    text : str
        Raw message. ``None`` and float NaN are treated as an empty message;
        any other non-string value is converted with ``str()``.
    stop_words : collection of str, optional
        Words to drop. Defaults to the module-level ``STOPWORDS`` set.

    Returns
    -------
    str
        The remaining words joined by single spaces ('' if nothing is left).
    """
    if stop_words is None:
        stop_words = STOPWORDS

    if not isinstance(text, str):
        # Missing values read from a CSV arrive as None / NaN, not as strings.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Convert the text to lowercase
    text = text.lower()
    # Remove special characters (deleted, not replaced by a space, so
    # "don't" becomes "dont")
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stopwords
    return " ".join(word for word in text.split() if word not in stop_words)

In [13]:
#clean the messages
df['clean_text'] = df['message'].apply(clean_text)
df.head()

Unnamed: 0,label,message,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


# Model Training

In [14]:
# Features are the cleaned message text; targets are the label column.
# NOTE(review): df.duplicated().sum() reported 403 duplicate rows earlier and no
# visible cell drops them, so identical messages may land in both the train and
# the test split inside classify() -- confirm whether that is intended.
X = df['clean_text']
y = df['label']

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [16]:
def classify(model, X, y, test_size=0.25, random_state=42):
    """Fit a CountVectorizer -> TF-IDF -> ``model`` pipeline and print hold-out metrics.

    The data is divided with a shuffled, stratified train/test split, the
    pipeline is fitted on the training part, and the test accuracy (in percent)
    plus a classification report are printed.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style classifier used as the last pipeline step.
    X : sequence of str
        Text documents; they are vectorized inside the pipeline.
    y : array-like
        Class label for every document in ``X`` (also used for stratification).
    test_size : float, default 0.25
        Fraction of the data held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    Pipeline
        The fitted pipeline (vectorizer, TF-IDF transformer and classifier).
    """
    # Local import: accuracy_score is not among the names imported at notebook level.
    from sklearn.metrics import accuracy_score

    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=y,
    )

    # Create the pipeline
    pipeline_model = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model)
    ])

    # Train the model
    pipeline_model.fit(x_train, y_train)

    # Evaluate the model: predict once and derive both metrics from the same
    # predictions. pipeline.score() would repeat the whole prediction pass and
    # returns exactly accuracy_score(y_test, predict(x_test)).
    y_pred = pipeline_model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred) * 100
    print('Accuracy:', accuracy)
    print(classification_report(y_test, y_pred))

    return pipeline_model  # Return the trained pipeline model

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# classify() prints the metrics and returns the fitted pipeline, so despite its
# name logistic_score holds the trained pipeline, not a numeric score.
logistic_score = classify(model, X, y)
logistic_score

Accuracy: 96.12347451543431
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       0.99      0.72      0.83       187

    accuracy                           0.96      1393
   macro avg       0.98      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [18]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
# classify() prints the metrics and returns the fitted pipeline, so despite its
# name Multinomial_score holds the trained pipeline, not a numeric score.
Multinomial_score = classify(model, X, y)
Multinomial_score

Accuracy: 95.97989949748744
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.70      0.82       187

    accuracy                           0.96      1393
   macro avg       0.98      0.85      0.90      1393
weighted avg       0.96      0.96      0.96      1393



In [19]:
from sklearn.svm import SVC
model = SVC(C=3)
# classify() prints the metrics and returns the fitted pipeline, so despite its
# name svc_score holds the trained SVC pipeline (the object pickled further down).
svc_score = classify(model, X, y)
svc_score

Accuracy: 98.34888729361091
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       1.00      0.88      0.93       187

    accuracy                           0.98      1393
   macro avg       0.99      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



In [20]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest: without random_state the bootstrap samples and feature
# subsets change on every run, so the printed metrics would not be reproducible.
model = RandomForestClassifier(random_state=42)
# classify() returns the fitted pipeline, not a numeric score.
randonforest_score = classify(model, X, y)

Accuracy: 97.12849964106246
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       1.00      0.79      0.88       187

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393



In [21]:
import pickle

# Save the trained pipeline model to a file using pickle
model_filename = 'spam_classifier_model.pkl'
with open(model_filename, 'wb') as file:
    # svc_score is the fitted SVC pipeline returned by classify(); to persist
    # the Random Forest pipeline instead, dump randonforest_score.
    pickle.dump(svc_score, file)

In [22]:
df.head()

Unnamed: 0,label,message,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [23]:
X.head()

0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry 2 wkly comp win fa cup final tkts 2...
3                  u dun say early hor u c already say
4          nah dont think goes usf lives around though
Name: clean_text, dtype: object

In [24]:
y.head()

0     ham
1     ham
2    spam
3     ham
4     ham
Name: label, dtype: object

In [25]:
# Load the saved model
model_filename = 'spam_classifier_model.pkl'
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

In [27]:
# Build a one-row DataFrame for the message to classify.
# The model was trained on df['clean_text'] (output of clean_text), so new
# messages must go through the same preprocessing before prediction; feeding
# raw text would leave punctuation and stopwords the training data never had.
new_data = pd.DataFrame({'message': ['you won five million']})
new_data['clean_text'] = new_data['message'].apply(clean_text)

In [28]:

# Prepare your new dataset
X_new = new_data['clean_text']  # Use the cleaned text for predictions

In [29]:
# Make predictions
y_pred = loaded_model.predict(X_new)

In [31]:
# Display the predicted label(s)
y_pred

array(['ham'], dtype=object)

In [33]:
# Reload the raw dataset
df1 = pd.read_csv('spam.csv', encoding='latin1')
df.loc[2]

label                                                      spam
message       Free entry in 2 a wkly comp to win FA Cup fina...
clean_text    free entry 2 wkly comp win fa cup final tkts 2...
Name: 2, dtype: object