In [65]:
!pip install nltk scikit-learn regex numpy pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [66]:
import pandas as pd
import numpy as np
import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [67]:
df=pd.read_csv('/content/Spam Email raw text for NLP.csv')
df.head()

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6


In [68]:
df.drop('FILE_NAME',axis=1,inplace=True)

In [69]:
df.CATEGORY.value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [70]:
nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [72]:
nltk.download('wordnet')
lemmatizer=WordNetLemmatizer()
corpus=[]
for i in range(len(df)):
    # removing all non-alphanumeric characters
    message=re.sub('[^a-zA-Z0-9]',' ',df['MESSAGE'][i]) 
    # converting the message to lowercase
    message=message.lower() 
    # spliting the sentence into words for lemmatization                 
    message=message.split()      
    # removing stopwords and lemmatizing            
    message=[lemmatizer.lemmatize(word) for word in message
             if word not in set(stopwords.words('english'))] 
    # Converting the words back into sentences
    message=' '.join(message)    
    # Adding the preprocessed message to the corpus list            
    corpus.append(message)                 


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1:Using Bag of Words

In [73]:
 # Take the top 2500 features 
cv=CountVectorizer(max_features=2500,ngram_range=(1,3))
X=cv.fit_transform(corpus).toarray()
y=df['CATEGORY']

In [74]:
x_train,x_test,y_train,y_test=train_test_split(
    X,y,test_size=0.20,random_state=1,stratify=y)

## 2:Using TF-IDF Technique

In [75]:
tf=TfidfVectorizer(ngram_range=(1,3),max_features=2500)
X=tf.fit_transform(corpus).toarray()
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=1,stratify=y)

### Naive Bayes model

In [76]:
from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()

In [77]:
model.fit(x_train,y_train)

MultinomialNB()

In [78]:
train_pred=model.predict(x_train)
test_pred=model.predict(x_test)

In [79]:
print(classification_report(train_pred,y_train))
print(classification_report(test_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      3258
           1       0.89      0.98      0.94      1378

    accuracy                           0.96      4636
   macro avg       0.94      0.97      0.95      4636
weighted avg       0.96      0.96      0.96      4636

              precision    recall  f1-score   support

           0       0.99      0.96      0.97       812
           1       0.91      0.99      0.94       348

    accuracy                           0.96      1160
   macro avg       0.95      0.97      0.96      1160
weighted avg       0.97      0.96      0.97      1160



In [80]:
print('Predicting...')
message = ["You won 10000 dollars, please provide your account details,So that we can transfer the money"]
message_vector = tf.transform(message)
category = model.predict(message_vector)
print("The message is", "spam" if category == 1 else "not spam")


Predicting...
The message is spam


In [81]:
print('Predicting...')
message = ["hey Racheal, the meeting is postponed to Monday"]
message_vector = tf.transform(message)
category = model.predict(message_vector)
print("The message is", "spam" if category == 1 else "not spam")

Predicting...
The message is not spam
