<a href="https://colab.research.google.com/github/mohmdumer/Spam_-_No_Spam_Emails/blob/main/Spam_%26_no_spam_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages this notebook relies on. When a package is
# already present the call just reports that it is up to date.
nltk.download('punkt')  # tokenizer data — presumably what word_tokenize needs; confirm
nltk.download('stopwords')  # data behind nltk.corpus.stopwords (imported above)
nltk.download('punkt_tab')  # NOTE(review): likely required by newer NLTK tokenizers — confirm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:


import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset from the Colab working directory.
df = pd.read_csv('/content/spam_or_not_spam.csv')

# Display basic info about the DataFrame.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   2999 non-null   object
 1   label   3000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 47.0+ KB
None


In [None]:
# Peek at the first five rows to sanity-check the load
print(df.head(n=5))


                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


In [None]:
# Count missing values per column
print(df.isnull().sum(axis=0))

email    1
label    0
dtype: int64


In [None]:
# Remove rows with missing values, then renumber the index so it is contiguous
# again. Without the reset, the dropped row leaves a gap in the index and
# position-style lookups on a column (e.g. labels[i]) fail or misalign.
df = df.dropna().reset_index(drop=True)
print(df.isnull().sum())

email    0
label    0
dtype: int64


In [None]:
# Class balance: number of rows per label value
print(df['label'].value_counts(sort=True))


label
0    2500
1     499
Name: count, dtype: int64


### Data Preprocessing

In [None]:
# Lowercasing
# Removing punctuation, special characters, and extra spaces.

# Preprocessing function
def preprocess_text(text):
    """Normalize text to lowercase alphabetic words separated by single spaces.

    Every character that is neither a letter nor whitespace is replaced by a
    space (not deleted), so words that were only separated by punctuation stay
    separate: ``"hello,world"`` -> ``"hello world"`` and ``"you've"`` ->
    ``"you ve"``. The sample rows of the dataset show the same convention
    (e.g. ``"won t die"``), so user-supplied sentences tokenize the same way
    as the training emails.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    text = text.lower()  # Convert to lowercase
    # After lowercasing only a-z can remain as letters, so A-Z is not needed
    # in the character class.
    text = re.sub(r'[^a-z\s]', ' ', text)  # Punctuation, digits, symbols -> space
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace
    return text

# Clean every email and store the result next to the raw text
df['cleaned_text'] = [preprocess_text(email) for email in df['email']]

print(df.head(n=5))


                                               email  label  \
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0   
1  martin a posted tassos papadopoulos the greek ...      0   
2  man threatens explosion in moscow thursday aug...      0   
3  klez the virus that won t die already the most...      0   
4   in adding cream to spaghetti carbonara which ...      0   

                                        cleaned_text  
0  date wed number aug number number number numbe...  
1  martin a posted tassos papadopoulos the greek ...  
2  man threatens explosion in moscow thursday aug...  
3  klez the virus that won t die already the most...  
4  in adding cream to spaghetti carbonara which h...  


### Unigrams and Bigrams

In [None]:
# Tokenization and n-grams generation
def generate_ngrams(text, n):
    """Tokenize ``text`` with ``word_tokenize`` and return its n-grams of order ``n`` as a list."""
    return list(ngrams(word_tokenize(text), n))

# Per-email n-gram lists: order 1 (unigrams) and order 2 (bigrams)
df['unigrams'] = df['cleaned_text'].apply(generate_ngrams, args=(1,))
df['bigrams'] = df['cleaned_text'].apply(generate_ngrams, args=(2,))


### Vectorize Text (TF-IDF) and Apply SMOTE

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
# Target labels for every email
y = df['label']

# TF-IDF feature vectors over the cleaned text, capped at 5000 features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text'])
X = X.toarray()

# Oversample with SMOTE (fixed seed for reproducibility)
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# Wrap the resampled matrix in a DataFrame, one column per TF-IDF feature.
# The values are TF-IDF weights, not raw text or n-gram counts.
resampled_data = pd.DataFrame(
    X_smote,
    columns=vectorizer.get_feature_names_out(),
)
resampled_data['label'] = y_smote

# Check the balanced distribution
print(resampled_data['label'].value_counts(sort=True))


label
0    2500
1    2500
Name: count, dtype: int64


### Calculate Probabilities

In [None]:
# Function to calculate n-gram probabilities
def calculate_ngram_probabilities(df, labels, ngram_type):
    """Estimate add-one-smoothed n-gram probabilities separately for each class.

    Parameters
    ----------
    df : iterable of str
        The cleaned texts, one per document. Despite the name, callers pass a
        sequence of strings such as ``df['cleaned_text']``; iterating a whole
        DataFrame would yield its column names instead of its rows.
    labels : iterable
        Class label of each text, in the same order as ``df``. ``1`` means
        spam; any other value is counted as non-spam.
    ngram_type : int
        Order of the n-grams to count (1 = unigrams, 2 = bigrams).

    Returns
    -------
    tuple of (dict, dict)
        ``(spam_probs, non_spam_probs)``. Each maps an n-gram seen in that
        class to ``(count + 1) / (total_count + distinct_ngrams)``. N-grams
        never seen in a class are absent from its dict.
    """
    spam_ngrams = Counter()
    non_spam_ngrams = Counter()

    # Pair texts and labels by position. The previous labels[i] lookup is
    # label-based on a pandas Series, so it raises or misaligns as soon as the
    # index is not exactly 0..n-1 (for example after dropna()).
    for text, label in zip(df, labels):
        ngrams_list = generate_ngrams(text, ngram_type)
        if label == 1:  # Spam
            spam_ngrams.update(ngrams_list)
        else:  # Non-Spam
            non_spam_ngrams.update(ngrams_list)

    def _smoothed(counts):
        # Add-one smoothing over the n-grams observed in this class.
        denominator = sum(counts.values()) + len(counts)
        return {ngram: (count + 1) / denominator for ngram, count in counts.items()}

    return _smoothed(spam_ngrams), _smoothed(non_spam_ngrams)

# Build the per-class probability tables for unigrams and bigrams
unigram_probs_spam, unigram_probs_non_spam = calculate_ngram_probabilities(
    df['cleaned_text'], df['label'], ngram_type=1
)
bigram_probs_spam, bigram_probs_non_spam = calculate_ngram_probabilities(
    df['cleaned_text'], df['label'], ngram_type=2
)


In [None]:
# Define Prediction Function
# The function will:

# Preprocess the input sentence.
# Generate unigrams and bigrams.
# Compute posterior probabilities.
# Predict the class with the highest posterior probability.

# Define the prediction function
def predict_class(sentence, unigram_probs_spam, unigram_probs_non_spam, bigram_probs_spam, bigram_probs_non_spam):
    """Classify a sentence as spam (1) or not spam (0) from n-gram probability tables.

    The sentence is cleaned with ``preprocess_text``, split into unigrams and
    bigrams with ``generate_ngrams``, and scored against each class by summing
    log-probabilities. The two scores are then normalized so they sum to 1.
    No class prior is applied, i.e. both classes are weighted equally.

    Parameters
    ----------
    sentence : str
        Raw text to classify.
    unigram_probs_spam, unigram_probs_non_spam : dict
        Unigram -> probability tables for the spam / non-spam class.
    bigram_probs_spam, bigram_probs_non_spam : dict
        Bigram -> probability tables for the spam / non-spam class.
        All table values must be strictly positive.

    Returns
    -------
    tuple of (int, float, float)
        ``(predicted_class, spam_posterior, non_spam_posterior)``. Ties are
        resolved as not spam (0), which is also the result for a sentence
        with no tokens (both posteriors are 0.5).
    """
    import math  # function-scope import: the notebook's import cell does not load math

    def _log_likelihood(grams, table):
        # An n-gram missing from a table must be *less* likely than anything
        # seen, not certain. Using 1 as the fallback (the old behaviour) gave
        # unseen n-grams the maximum probability and so penalized the class
        # that had actually observed the token. Half of the smallest stored
        # probability equals the add-one value 1 / (N + V) whenever the table
        # contains an n-gram that occurred once. An empty table carries no
        # information and stays neutral.
        floor = min(table.values()) / 2 if table else 1.0
        return sum(math.log(table.get(gram, floor)) for gram in grams)

    sentence = preprocess_text(sentence)
    unigrams = generate_ngrams(sentence, 1)
    bigrams = generate_ngrams(sentence, 2)

    # Sum logs instead of multiplying raw probabilities: the product underflows
    # to 0.0 on long inputs, which made the normalization divide by zero.
    spam_log = (_log_likelihood(unigrams, unigram_probs_spam)
                + _log_likelihood(bigrams, bigram_probs_spam))
    non_spam_log = (_log_likelihood(unigrams, unigram_probs_non_spam)
                    + _log_likelihood(bigrams, bigram_probs_non_spam))

    # Normalize relative to the larger score so that at least one exponent is
    # exactly 1 and the denominator can never be zero.
    peak = max(spam_log, non_spam_log)
    spam_weight = math.exp(spam_log - peak)
    non_spam_weight = math.exp(non_spam_log - peak)
    total_weight = spam_weight + non_spam_weight
    spam_posterior = spam_weight / total_weight
    non_spam_posterior = non_spam_weight / total_weight

    # Predict the class
    predicted_class = 1 if spam_posterior > non_spam_posterior else 0

    return predicted_class, spam_posterior, non_spam_posterior



In [None]:
# Test the Function
# Run the prediction function on several user-defined sentences.
# Each entry is one independent test sentence. Every string needs a trailing
# comma: adjacent string literals without one are silently concatenated into
# a single sentence.
sentences = [
    "Congratulations Mr. Muhammad Umer Naseem! Your visa for United States of America has been approved.",
    "Congratulations! You've won a free ticket.",
    "Can we schedule a meeting for tomorrow?",
    "This is your last chance to claim the offer.",
    "Mr. Muhammad Umer Naseem! you have win a free ticket. Congratulations!",
    "Hi John, can we meet for lunch tomorrow at 1 PM?"
]

# Classify each sentence and report the normalized score of both classes
for sentence in sentences:
    predicted_class, spam_prob, non_spam_prob = predict_class(
        sentence,
        unigram_probs_spam,
        unigram_probs_non_spam,
        bigram_probs_spam,
        bigram_probs_non_spam
    )
    print(f"Sentence: {sentence}")
    print(f"Predicted Class: {'Spam' if predicted_class == 1 else 'Not Spam'}")
    print(f"Spam Probability: {spam_prob:.4f}, Non-Spam Probability: {non_spam_prob:.4f}\n")



Sentence: Congratulations Mr. Muhammad Umer Naseem! Your visa for United States of America has been approved.Congratulations! You've won a free ticket.
Predicted Class: Spam
Spam Probability: 0.9027, Non-Spam Probability: 0.0973

Sentence: Can we schedule a meeting for tomorrow?
Predicted Class: Spam
Spam Probability: 0.9027, Non-Spam Probability: 0.0973

Sentence: This is your last chance to claim the offer.
Predicted Class: Not Spam
Spam Probability: 0.5000, Non-Spam Probability: 0.5000

Sentence: Mr. Muhammad Umer Naseem! you have win a free ticket. Congratulations!
Predicted Class: Spam
Spam Probability: 0.9027, Non-Spam Probability: 0.0973

Sentence: Hi John, can we meet for lunch tomorrow at 1 PM?
Predicted Class: Not Spam
Spam Probability: 0.5000, Non-Spam Probability: 0.5000

