In [64]:
import pandas as pd, numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [65]:
data = {
    'message': [
        "Win a brand new car! Text WIN to 99999 now!",
        "Hi John, are we meeting today?",
        "Exclusive offer just for you. Claim your free prize!",
        "Please call me when you reach office.",
        "Limited-time deal! Buy 1 get 1 free now!",
        "Can you send me the report by evening?",
        "You have been selected for a $1000 gift card!",
        "Lunch at 1 pm?",
        "Congratulations! You’ve won free tickets to Maldives!",
        "Hey, don't forget about the meeting tomorrow."
        "Hey, don't forget about me."
    ],
    'label': [
        "Spam", "Valid Email", "Spam", "Valid Email", "Spam",
        "Valid Email", "Spam", "Valid Email", "Spam", "Valid Email"
    ]
}

In [66]:
df = pd.DataFrame(data)
df.head()

Unnamed: 0,message,label
0,Win a brand new car! Text WIN to 99999 now!,Spam
1,"Hi John, are we meeting today?",Valid Email
2,Exclusive offer just for you. Claim your free ...,Spam
3,Please call me when you reach office.,Valid Email
4,Limited-time deal! Buy 1 get 1 free now!,Spam


In [67]:
def cleaning(text):
  text = text.lower()
  text=re.sub(r'http\S+','',text)
  text=re.sub(r'www\S+','',text)
  text=re.sub(r'[^a-z\s]','',text)
  text=" ".join([word for word in text.split() if word not in stopwords.words('english')])
  # text=" ".join([WordNetLemmatizer().lemmatize(word) for word in text.split
  return text

In [68]:
df['clean_message'] = df['message'].apply(cleaning)
df['clean_message']

Unnamed: 0,clean_message
0,win brand new car text win
1,hi john meeting today
2,exclusive offer claim free prize
3,please call reach office
4,limitedtime deal buy get free
5,send report evening
6,selected gift card
7,lunch pm
8,congratulations youve free tickets maldives
9,hey dont forget meeting tomorrowhey dont forget


In [69]:
x_train,x_test,y_train,y_test = train_test_split(df['clean_message'],df['label'],test_size=0.2,random_state=42)

In [70]:
#tokenizer
vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [71]:
model=MultinomialNB()
model.fit(x_train_vec,y_train)

In [72]:
y_pred=model.predict(x_test_vec)
accuracy=accuracy_score(y_test,y_pred)
conf_matrix=confusion_matrix(y_test,y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:\n",accuracy)
print("\nConfusion Matrix:\n",conf_matrix)
print("\nClassification Report:\n",report)

Accuracy:
 1.0

Confusion Matrix:
 [[1 0]
 [0 1]]

Classification Report:
               precision    recall  f1-score   support

        Spam       1.00      1.00      1.00         1
 Valid Email       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [73]:
new_email=input("Enter Email: ")
new_email_vec = vectorizer.transform([cleaning(new_email)])
prediction = model.predict(new_email_vec)
print("Prediction:",prediction[0])

Enter Email: hi
Prediction: Spam


In [79]:
new_msg=[
    "Hi John, are we meeting today?",
    "Congratulations! You’ve won free tickets to Maldives!",
    " Win a brand new car! Text WIN to 99999 now!"
]

new_msg_vec = vectorizer.transform(new_msg)
predictions = model.predict(new_msg_vec)
for msg, prediction in zip(new_msg, predictions):
    print(f"Message: {msg}\nPrediction: {prediction}\n")

Message: Hi John, are we meeting today?
Prediction: Valid Email

Message: Congratulations! You’ve won free tickets to Maldives!
Prediction: Spam

Message:  Win a brand new car! Text WIN to 99999 now!
Prediction: Spam



In [80]:
new_msg=[
    "Hi John, are we meeting today?",
    "Congratulations! You’ve won free tickets to Maldives!",
    " Win a brand new car! Text WIN to 99999 now!"
]
new_clean_msg=[cleaning(msg) for msg in new_msg]
new_msg_vec = vectorizer.transform(new_clean_msg)
predictions = model.predict(new_msg_vec)
print("Predictions are:\n")
for msg, prediction in zip(new_msg, predictions):
    print(f"Message: {msg}\nPrediction: {prediction}\n")

Predictions are:

Message: Hi John, are we meeting today?
Prediction: Valid Email

Message: Congratulations! You’ve won free tickets to Maldives!
Prediction: Spam

Message:  Win a brand new car! Text WIN to 99999 now!
Prediction: Spam

