<a href="https://colab.research.google.com/github/oogu2020/spam_detection_app/blob/main/Spam_Detector_App.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB,GaussianNB
import zipfile
import io
import requests
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:

# Load the SMS Spam Collection dataset from the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"

# Download the archive. The timeout stops the cell from hanging forever on a
# stalled connection, and raise_for_status() surfaces an HTTP error (404/500)
# right here instead of as a confusing "not a zip file" error further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Extract the data file from the in-memory zip archive
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # The archive holds several files; the data itself is the tab-separated,
    # header-less file named 'SMSSpamCollection', so column names are supplied.
    # NOTE(review): default CSV quoting may merge rows whose text contains a
    # stray '"' -- confirm the row count against the dataset's documentation.
    with z.open('SMSSpamCollection') as f:
        df = pd.read_csv(f, sep='\t', names=["label", "message"])

df.head(20)


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
# Class balance: number of messages per label
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [4]:
# Print the first ten messages together with their labels
for row in range(10):
    text, tag = df.message[row], df.label[row]
    print(f'Message :{text}\n, Label: {tag} ')

Message :Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
, Label: ham 
Message :Ok lar... Joking wif u oni...
, Label: ham 
Message :Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
, Label: spam 
Message :U dun say so early hor... U c already then say...
, Label: ham 
Message :Nah I don't think he goes to usf, he lives around here though
, Label: ham 
Message :FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
, Label: spam 
Message :Even my brother is not like to speak with me. They treat me like aids patent.
, Label: ham 
Message :As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
, Label: ham 
Message :WINNER!!

In [None]:
## Convert labels to binary values



In [None]:
# Convert labels to binary values (ham -> 0, spam -> 1).
# The mapping also sends 0 and 1 to themselves so that re-running this cell is
# a no-op; with only the string keys, a second run would turn every label into
# NaN because map() yields NaN for values missing from the dictionary.
label_to_int = {'ham': 0, 'spam': 1, 0: 0, 1: 1}
df['label'] = df['label'].map(label_to_int)

# Fail fast if any label was not recognised instead of passing NaN downstream
assert df['label'].notna().all(), "unexpected label value in dataset"

df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [5]:
## Define Target and Predictors

# X is the message text column (predictor), y is the label column (target)
X, y = df['message'], df['label']

In [6]:
## Split Dataset

# Hold out 20% of the messages for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
## Transform Text

# Turn the raw text into TF-IDF features, ignoring English stop words.
# The vectorizer is fitted on the training messages only; the test messages
# are encoded with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [8]:
## Train Multiple models and measure accuracy

# Candidate Naive Bayes classifiers, keyed by their display name
models = dict(
    MultinomialNB=MultinomialNB(),
    GaussianNB=GaussianNB(),
)


In [9]:
# Inspect the (name, estimator) pairs stored in the models dictionary
models.items()

dict_items([('MultinomialNB', MultinomialNB()), ('GaussianNB', GaussianNB())])

In [10]:
# Initialize and train different Naive Bayes models
from sklearn.metrics import accuracy_score


models = {
    'MultinomialNB': MultinomialNB(),
    'GaussianNB': GaussianNB()
}

# Maps model name -> accuracy on the test split (as a fraction, not a percent).
# This dictionary used to be created but never filled, so the scores were only
# printed and could not be compared or reused by later cells.
results = {}

for model_name, model in models.items():
    if model_name == 'GaussianNB':
        # Convert the sparse TF-IDF matrices to dense arrays for GaussianNB
        X_train_dense = X_train_tfidf.toarray()
        X_test_dense = X_test_tfidf.toarray()
        model.fit(X_train_dense, y_train)
        y_pred = model.predict(X_test_dense)
    else:
        # The other model is trained directly on the sparse matrices
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy
    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy*100:.2f}%")

Model: MultinomialNB
Accuracy: 97.85%
Model: GaussianNB
Accuracy: 89.42%


In [11]:
## Predict Sample Text

# List the names of the available models. Iterating a dict yields its string
# keys, so the loop variable is deliberately not called `model`: that would
# rebind `model` in the notebook namespace from a fitted estimator to a string.
for trained_name in models:
    print(trained_name)

MultinomialNB
GaussianNB


In [12]:
# Define sample inputs
sample_inputs = [
    "good morning, the time for the meeting is rescheduled to 3pm",
    "Congratualtions, you won the big prize for this year's event, click here to claim your prize and also receive a 50% discount on your next order"
]

# Encode the samples with the already-fitted TF-IDF vectorizer
sample_text = vectorizer.transform(sample_inputs)

# Predict with every model in turn
for model_name, model in models.items():
    # GaussianNB is given a dense array; the other model takes the sparse matrix as is
    features = sample_text.toarray() if model_name == 'GaussianNB' else sample_text
    sample_preds = model.predict(features)

    # Print the results, one block per (model, message) pair
    for sample, pred in zip(sample_inputs, sample_preds):
        print(f"Model: {model_name}")
        print(f"Message: {sample}")
        print(f"Predicted: {'Spam' if pred == 1 else 'Ham'}")
        print("-" * 50)

Model: MultinomialNB
Message: good morning, the time for the meeting is rescheduled to 3pm
Predicted: Ham
--------------------------------------------------
Model: MultinomialNB
Message: Congratualtions, you won the big prize for this year's event, click here to claim your prize and also receive a 50% discount on your next order
Predicted: Ham
--------------------------------------------------
Model: GaussianNB
Message: good morning, the time for the meeting is rescheduled to 3pm
Predicted: Ham
--------------------------------------------------
Model: GaussianNB
Message: Congratualtions, you won the big prize for this year's event, click here to claim your prize and also receive a 50% discount on your next order
Predicted: Ham
--------------------------------------------------


In [14]:
#Save the model
import joblib

# Select the classifier to persist explicitly by name. The bare name `model`
# is only a leftover loop variable: it holds whatever the most recently run
# loop left in it (the last dict entry, GaussianNB), so which object got saved
# depended on cell execution order. MultinomialNB scored higher on the test
# split above and predicts directly on the vectorizer's sparse output.
best_model = models['MultinomialNB']

##Save the model and vectorizer
joblib.dump(best_model, 'spam_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']