<a href="https://colab.research.google.com/github/pkmariya/Python/blob/main/EMail_SpamClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install nltk --quiet

## 1. Import Libraries

In [2]:
import nltk
import random
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy

In [3]:
# Fetch the NLTK resources this notebook relies on: tokenizer data and the
# stop-word corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Kept as the cell's final expression so its return value is still echoed.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## 2. Data Collection

In [4]:
# Hand-labelled toy corpus: each entry is a (message text, label) pair,
# where the label is either "ham" or "spam".
data = [
    ("Congratulations, you have been selected for this AI position", "ham"),
    ("Congratulations, you are the winner of this lottery!", "spam"),
    ("Congratulations, You have won an iPhone!", "spam"),
    ("Congratulations, You have won a Samsung Galaxy!", "spam"),
    ("Congratulations, You have won a $1000 Amazon Gift Card!", "spam"),
    ("Please call me tomorrow", "ham"),
    ("Limited time offer", "spam"),
    ("can you send the notes from AI class", "ham"),
    ("Urgent your account has been compromised", "spam"),
]

In [6]:
# Tabular view of the labelled messages: one row per (message, label) pair.
df = pd.DataFrame(data=data, columns=["Message", "Label"])
df

Unnamed: 0,Message,Label
0,"Congratulations, you have been selected for th...",ham
1,"Congratulations, you are the winner of this lo...",spam
2,"Congratulations, You have won an iPhone!",spam
3,"Congratulations, You have won a Samsung Galaxy!",spam
4,"Congratulations, You have won a $1000 Amazon G...",spam
5,Please call me tomorrow,ham
6,Limited time offer,spam
7,can you send the notes from AI class,ham
8,Urgent your account has been compromised,spam


## 3. Data Preprocessing, Feature Extraction & Train/Test Split

In [8]:
# English stop words, held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))


def preprocess(message):
    """Convert a raw message into a bag-of-words feature dict.

    The message is lower-cased and tokenized with ``word_tokenize``. A token
    is kept only if it is purely alphabetic and is not in ``stop_words``, so
    punctuation and tokens containing digits or symbols are dropped.

    Args:
        message: The message text (a ``str``).

    Returns:
        A dict mapping each kept token to ``True``; repeated tokens collapse
        into a single key.
    """
    features = {}
    for token in word_tokenize(message.lower()):
        # Skip anything that is not a plain word, and skip stop words.
        if not token.isalpha() or token in stop_words:
            continue
        features[token] = True
    return features

# prepare feature sets: one (feature dict, label) pair per labelled message
feature_sets = [(preprocess(message), label) for message, label in data]

# Shuffle feature sets with a fixed seed so that re-running the notebook
# reproduces the same train/test split (and therefore the same results).
# A private Random instance is used so the global `random` state is untouched.
SEED = 42
random.Random(SEED).shuffle(feature_sets)

# Split into train and test data. The boundary is derived from the dataset
# size instead of a hard-coded index; for the 9 messages above it is still 7.
TRAIN_FRACTION = 0.8
split_at = int(len(feature_sets) * TRAIN_FRACTION)
train_data = feature_sets[:split_at]
test_data = feature_sets[split_at:]

# Fail loudly rather than train or evaluate on an empty list.
assert train_data and test_data, "need at least one training and one test example"

print("\n Sample preprocessed data:")
print(train_data[0])


 Sample preprocessed data:
({'urgent': True, 'account': True, 'compromised': True}, 'spam')


## 4. Model Building & Training

In [9]:
# Fit a Naive Bayes classifier on the (feature dict, label) training pairs.
model_clf = NaiveBayesClassifier.train(train_data)
print("\n Model Training Completed!")


 Model Trainining Completed!


## 5. Model Testing & Evaluation

In [10]:
# Score the classifier on the held-out pairs. With the 9-message dataset and
# a 7-item training slice, the test set has only 2 items, so this figure is
# very coarse.
acc = accuracy(model_clf, test_data)
print(f"\n Model Accuracy: {acc*100:.2f}%")

# List the five features the classifier ranks as most informative.
print("\n Most Informative Words are:")
model_clf.show_most_informative_features(n=5)


 Model Accuracy: 100.00%

 Most Informative Words are:
Most Informative Features
                      ai = None             spam : ham    =      1.8 : 1.0
                    call = None             spam : ham    =      1.8 : 1.0
                  please = None             spam : ham    =      1.8 : 1.0
                position = None             spam : ham    =      1.8 : 1.0
                selected = None             spam : ham    =      1.8 : 1.0


## 6. Model Inference (using new data)

In [12]:
# Unseen messages to classify with the trained model.
new_msgs = [
    "Congratulations, you won a free ticket to Switserland",
    "Can we go there tomorrow?",
    "Hey, are we still meeting this Friday?",
]

print("\nSample Predictions")
for text in new_msgs:
    # New text must go through the same preprocessing as the training data.
    predicted = model_clf.classify(preprocess(text))
    print(f"Message: {text}: \nPrediction: {predicted}\n")


Sample Predictions
Message: Congratulations, you won a free ticket to Switserland: 
Prediction: spam

Message: Can we go there tomorrow?: 
Prediction: ham

Message: Hey, are we still meeting this Friday?: 
Prediction: spam

