<a href="https://colab.research.google.com/github/mebirtukan/NLP-Email-Spam-Detection/blob/main/NLP_Email_Spam_Detection_with_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Email Spam Detection Project
This project focuses on building an email spam detection system using natural language processing (NLP) and machine learning techniques.

# Project Overview
The goal is to classify messages as spam or ham (not spam). The model is trained on the SMSSpamCollection dataset of labelled text messages, and the process involves data preprocessing, vectorization, model training, and evaluation.

# Key Libraries and Tools
NumPy and Pandas for data manipulation

NLTK for text processing

Scikit-learn for machine learning models and evaluation

Regular Expressions for text cleaning

In [None]:
!pip install scikit-learn




In [None]:
# nltk.download('stopwords')

In [None]:
# @title Import libraries
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import WordNetLemmatizer as lm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Fetch the NLTK data this notebook reads: the stopword lists (nltk.corpus.stopwords)
# and the WordNet corpus used by the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Exploratory check: print every name exposed by sklearn.model_selection.
# The submodule attribute is available because the imports cell already
# imported train_test_split from sklearn.model_selection.
import sklearn
print(dir(sklearn.model_selection))

['BaseCrossValidator', 'BaseShuffleSplit', 'GridSearchCV', 'GroupKFold', 'GroupShuffleSplit', 'KFold', 'LearningCurveDisplay', 'LeaveOneGroupOut', 'LeaveOneOut', 'LeavePGroupsOut', 'LeavePOut', 'ParameterGrid', 'ParameterSampler', 'PredefinedSplit', 'RandomizedSearchCV', 'RepeatedKFold', 'RepeatedStratifiedKFold', 'ShuffleSplit', 'StratifiedGroupKFold', 'StratifiedKFold', 'StratifiedShuffleSplit', 'TimeSeriesSplit', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__getattr__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_plot', '_search', '_split', '_validation', 'check_cv', 'cross_val_predict', 'cross_val_score', 'cross_validate', 'learning_curve', 'permutation_test_score', 'train_test_split', 'typing', 'validation_curve']


In [None]:
# @title data loading
# The file is tab-separated and is read without a header row, so the two
# column names ('label', 'msg') are supplied explicitly.
df = pd.read_csv(
    '/content/SMSSpamCollection',
    sep='\t',
    names=['label', 'msg'],
)
df

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [None]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   msg     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [None]:
# Number of missing values in each column
df.isnull().sum()

label    0
msg      0
dtype: int64

In [None]:
# Class balance: how many messages carry each label
df['label'].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

# Text Cleaning and Lemmatization

Removal of non-alphanumeric characters

Conversion to lowercase

Removal of stopwords

Lemmatization of words

In [None]:
# @title Text Cleaning and Lemmatization:
# Build the lemmatizer instance and the stopword set once, outside the loop:
# lemmatize() is an instance method, and reloading the stopword list for
# every token is needlessly slow (set membership is O(1)).
lemmatizer = WordNetLemmatizer()
english_stopwords = set(stopwords.words('english'))

corpus = []
for message in df['msg']:
    # The character class [^a-zA-Z0-9] matches every non-alphanumeric character;
    # without the brackets the pattern only matches the literal text
    # "a-zA-Z0-9" at the start of the string and nothing is cleaned.
    review = re.sub('[^a-zA-Z0-9]', ' ', message)
    tokens = review.lower().split()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    corpus.append(" ".join(tokens))




In [None]:
# Show the first cleaned message. Read it from `corpus`: at this point
# df['msg'] still holds the raw text when the notebook is run top to bottom.
corpus[0]

'go jurong point crazy available bugis n great world la e buffet cine get amore wat'

In [None]:
# (characters in the first cleaned message, number of cleaned messages).
# Uses `corpus` because df['msg'] has not been overwritten yet at this point.
len(corpus[0]), len(corpus)

(82, 5572)

In [None]:
# Overwrite the 'msg' column with the cleaned text; the raw messages are no
# longer available in `df` after this cell runs.
df['msg']=corpus

In [None]:
# Preview the first rows now that 'msg' holds the cleaned text.
df.head()

Unnamed: 0,label,msg
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joke wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


# Model Building

In [None]:
# @title Data Splitting
# Features are the message texts, targets are the labels.
x, y = df['msg'], df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=10,
)


In [None]:
# Sizes of the four split parts, in the order train-x, train-y, test-x, test-y
tuple(len(part) for part in (x_train, y_train, x_test, y_test))



(3900, 3900, 1672, 1672)

In [None]:
# @title Vectorization (Convert Text Data Into The Vectors)

# Learn the vocabulary and IDF weights from the training messages, then
# transform those same messages and materialise the result as a dense array.
tf_obj = TfidfVectorizer().fit(x_train)
x_train_tfidf = tf_obj.transform(x_train).toarray()
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# @title Model Training: Naive Bayes Classifier

# Two-stage pipeline: TF-IDF features feeding a multinomial Naive Bayes model.
text_mnb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('mnb', MultinomialNB()),
    ]
)
text_mnb.fit(x_train, y_train)


In [None]:
# Pipeline(steps=[('tfidf', TfidfVectorizer()), ('mnb', MultinomialNB())])


In [None]:
# @title Model Evaluation: Accuracy Score: Evaluated on both training and testing datasets.
# Accuracy on the held-out test split, printed as a percentage.
y_pred_test = text_mnb.predict(x_test)
print(
    "Accuracy Score:",
    accuracy_score(y_true=y_test, y_pred=y_pred_test) * 100,
)

Accuracy Score: 95.8732057416268


In [None]:
# Accuracy on the training split, printed as a percentage
# (compare with the test figure to gauge overfitting).
y_pred_train = text_mnb.predict(x_train)
print(
    "Accuracy Score:",
    accuracy_score(y_true=y_train, y_pred=y_pred_train) * 100,
)

Accuracy Score: 98.28205128205128


In [None]:
# @title Confusion Matrix and Classification Report on Testing Data

# Predict once and reuse the result for both reports; the pipeline is
# deterministic, so repeating the prediction only costs time.
y_pred_test = text_mnb.predict(x_test)

print("Confusion Matrix on Test Data:\n", confusion_matrix(y_test, y_pred_test))
print("Classification Report on Test Data:\n", classification_report(y_test, y_pred_test))

Confusion Matrix on Test Data:
 [[1457    0]
 [  69  146]]


Classification Reportx on Test Data:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1457
        spam       1.00      0.68      0.81       215

    accuracy                           0.96      1672
   macro avg       0.98      0.84      0.89      1672
weighted avg       0.96      0.96      0.96      1672



# Prediction on user data


In [None]:
#Prediction on user data
def preprocess_data(text):
    """Clean one message so it can be passed to ``text_mnb.predict``.

    Steps: replace every non-alphanumeric character with a space, lowercase,
    split on whitespace, drop English stopwords, lemmatize each remaining
    token, and join the tokens back with single spaces.

    Args:
        text (str): the raw message.

    Returns:
        list[str]: a one-element list holding the cleaned message (the
        pipeline's ``predict`` expects an iterable of documents).
    """
    # lemmatize() is an instance method, so an instance is required.
    lemmatizer = WordNetLemmatizer()
    # Load the stopword list once per call instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    # [^a-zA-Z0-9] is a negated character class; without the brackets the
    # pattern would only match the literal text "a-zA-Z0-9" at the start.
    review = re.sub('[^a-zA-Z0-9]', ' ', text)
    tokens = review.lower().split()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in english_stopwords
    ]
    return [" ".join(tokens)]

In [None]:
# Take the message stored at row label 9, show it, then run it through
# preprocess_data; user_data ends up as the one-element list it returns.
user_data = df.loc[9, 'msg']
print(user_data)
user_data = preprocess_data(text=user_data)
user_data

mobile 11 month u r entitle update latest colour mobile camera free call mobile update co free 08002986030


['mobile 11 month u r entitle update latest colour mobile camera free call mobile update co free 08002986030']

In [None]:
# predict() returns one label per input document; user_data holds a single
# document here, so take element 0.
text_mnb.predict(user_data)[0]


'spam'

In [None]:
#    review = re.sub(r'\W', ' ', df['msg'][i])  # Corrected regular expression


In [None]:
class prediction:
    """Classify a single message as spam or ham with the fitted ``text_mnb`` pipeline."""

    def __init__(self,data):
        """Store the message to classify.

        Args:
            data (str): the message text.
        """
        self.data = data

    def user_data_preprocessing(self):
        """Clean ``self.data`` for the classifier.

        Steps: replace every non-alphanumeric character with a space,
        lowercase, split on whitespace, drop English stopwords, lemmatize
        each remaining token, and join the tokens with single spaces.

        Returns:
            list[str]: a one-element list holding the cleaned message.
        """
        lemmatizer = WordNetLemmatizer()
        # Load the stopword list once per call instead of once per token.
        english_stopwords = set(stopwords.words('english'))

        # [^a-zA-Z0-9] is a negated character class; without the brackets the
        # pattern would only match the literal text "a-zA-Z0-9" at the start.
        review = re.sub('[^a-zA-Z0-9]', ' ', self.data)
        tokens = review.lower().split()
        tokens = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in english_stopwords
        ]
        return [" ".join(tokens)]

    def user_data_prediction(self):
        """Return a human-readable verdict for the stored message.

        Returns:
            str: 'This Message is Spam' when the pipeline predicts 'spam',
            otherwise 'This Message is Ham'.
        """
        cleaned = self.user_data_preprocessing()

        if text_mnb.predict(cleaned)[0] == 'spam':
            return 'This Message is Spam'

        return 'This Message is Ham'

In [None]:
# Preview the first rows of the frame the examples below draw their messages from.
df.head()


Unnamed: 0,label,msg
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joke wif u oni
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor u c already say
4,ham,nah think go usf life around though


In [None]:
# Classify the message stored at row label 9 through the prediction wrapper.
user_data = df.loc[9, 'msg']
print(user_data)
prediction(data=user_data).user_data_prediction()

mobile 11 month u r entitle update latest colour mobile camera free call mobile update co free 08002986030


'This Message is Spam'

In [None]:
# Classify the message stored at row label 8 through the prediction wrapper.
user_data = df.loc[8, 'msg']
print(user_data)
prediction(data=user_data).user_data_prediction()

winner value network customer select receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hour


'This Message is Spam'