In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/xperiment/spam_emails.csv', encoding= 'ISO-8859-1', encoding_errors = 'strict')

In [3]:
data.head()

Unnamed: 0,email,spam
0,on thu may escapenumber escapenumber at escape...,0
1,vip replica we offer a free gift box with ever...,1
2,on wed jun escapenumber escapenumber at escape...,0
3,author metze date escapenumber escapenumber es...,0
4,hi i' trying to learn how to use lme for linea...,0


In [4]:
data.shape

(31376, 2)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31376 entries, 0 to 31375
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   email   31376 non-null  object
 1   spam    31376 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 490.4+ KB


In [6]:
data.tail

<bound method NDFrame.tail of                                                    email  spam
0      on thu may escapenumber escapenumber at escape...     0
1      vip replica we offer a free gift box with ever...     1
2      on wed jun escapenumber escapenumber at escape...     0
3      author metze date escapenumber escapenumber es...     0
4      hi i' trying to learn how to use lme for linea...     0
...                                                  ...   ...
31371  dear list i have a dataset that provides sampl...     0
31372  sure , let me send you the details :\n- up to ...     0
31373  thee this people riches of shamgar the royal c...     1
31374  hi\nwould you reflnance if you knew you ' d sa...     1
31375  squirting liver christ dowie and others must b...     1

[31376 rows x 2 columns]>

In [7]:
test_data = data.select_dtypes([np.object]).apply(lambda x: x.str.contains('Â').any())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_data = data.select_dtypes([np.object]).apply(lambda x: x.str.contains('Â').any())


In [8]:
test_data = data[data['email'].str.contains('Â')]
print(test_data)

                                                   email  spam
1      vip replica we offer a free gift box with ever...     1
16     interestingly lenovo still redirects support r...     0
23     alert name bush bush reiterates vow to veto wa...     0
25     we present you the official results of the res...     1
29     nice to meet you visit our new online pharmacy...     1
...                                                  ...   ...
31326  you deserve it aerobacter attend just and for ...     1
31354  nice to meet you look at the assortment of our...     1
31355  alert name bush bush and putin set to meet esc...     0
31356  Ã¤ÃºÂºÃ£Â£Â¡Ã¶Ã¡Â¹Ã³Â¹Â«Ã«Â¾Â¸ÂºÃ´Ã°Ã¨Ã« Â²Ã¦Ã...     1
31367  anatrim Â the latest and most exciting produc...     1

[2744 rows x 2 columns]


In [9]:
data['spam'].value_counts()

1    16360
0    15016
Name: spam, dtype: int64

In [10]:
data.duplicated().sum()

0

In [11]:
ps = PorterStemmer()

def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))


    return " ".join(y)

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
data['after_transformation'] = data['email'].apply(transform_text)

In [21]:
def preprocess(data):
  data = data.str.replace('escapenumb', '')
  data = data.str.replace('\n', ' ')
  data = data.str.replace('Â', '')
  data = data.str.replace('\d', '')
  data = data.str.replace('[^\w\s]','')
  return data

In [17]:
data.head()

Unnamed: 0,email,spam,after_transformation
0,on thu may escapenumber escapenumber at escape...,0,thu may escapenumb escapenumb escapenumb escap...
1,vip replica we offer a free gift box with ever...,1,vip replica offer free gift box everi vip watc...
2,on wed jun escapenumber escapenumber at escape...,0,wed jun escapenumb escapenumb escapenumb escap...
3,author metze date escapenumber escapenumber es...,0,author metz date escapenumb escapenumb escapen...
4,hi i' trying to learn how to use lme for linea...,0,hi tri learn use lme linear mix model problem ...


In [22]:
data["email"] = preprocess(data["after_transformation"])

  data = data.str.replace('\d', '')
  data = data.str.replace('[^\w\s]','')


In [23]:
data.head()

Unnamed: 0,email,spam,after_transformation
0,thu may erpm gerald jerri carter wrote co...,0,thu may escapenumb escapenumb escapenumb escap...
1,vip replica offer free gift box everi vip watc...,1,vip replica offer free gift box everi vip watc...
2,wed jun erpm bob roger wrote jonathan wor...,0,wed jun escapenumb escapenumb escapenumb escap...
3,author metz date mon may new revis w...,0,author metz date escapenumb escapenumb escapen...
4,hi tri learn use lme linear mix model problem ...,0,hi tri learn use lme linear mix model problem ...


In [24]:
data = data.drop('after_transformation', axis=1)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [27]:
X = tfidf.fit_transform(data['email']).toarray()

In [28]:
y = data['spam'].values

In [30]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [31]:
def acc_report(actual,predicted):
    acc_score=accuracy_score(actual,predicted)
    cm_matrix=confusion_matrix(actual,predicted)
    class_rep=classification_report(actual,predicted)
    print('the accuracy of tha model is ',acc_score)
    print(cm_matrix)
    print(class_rep)

In [32]:
mnb = MultinomialNB()
dtree = DecisionTreeClassifier(max_depth=5)
ada = AdaBoostClassifier(n_estimators=50, random_state=2)
rf = RandomForestClassifier(n_estimators=50, random_state=2)

In [33]:
mnb.fit(X_train, y_train)
dtree.fit(X_train, y_train)
ada.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [35]:
mnbtrain = mnb.predict(X_train)
mnbtest = mnb.predict(X_test)

dtreetrain = dtree.predict(X_train)
dtreetest = dtree.predict(X_test)

adatrain = ada.predict(X_train)
adatest = ada.predict(X_test)

rftrain = rf.predict(X_train)
rftest = rf.predict(X_test)

In [36]:
print(acc_report(y_train, mnbtrain))
print(acc_report(y_test, mnbtest))

the accuracy of tha model is  0.9586852589641435
[[11452   543]
 [  494 12611]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.96     11995
           1       0.96      0.96      0.96     13105

    accuracy                           0.96     25100
   macro avg       0.96      0.96      0.96     25100
weighted avg       0.96      0.96      0.96     25100

None
the accuracy of tha model is  0.9612810707456979
[[2903  118]
 [ 125 3130]]
              precision    recall  f1-score   support

           0       0.96      0.96      0.96      3021
           1       0.96      0.96      0.96      3255

    accuracy                           0.96      6276
   macro avg       0.96      0.96      0.96      6276
weighted avg       0.96      0.96      0.96      6276

None


In [37]:
print(acc_report(y_train, dtreetrain))
print(acc_report(y_test, dtreetest))

the accuracy of tha model is  0.8535059760956175
[[ 8731  3264]
 [  413 12692]]
              precision    recall  f1-score   support

           0       0.95      0.73      0.83     11995
           1       0.80      0.97      0.87     13105

    accuracy                           0.85     25100
   macro avg       0.88      0.85      0.85     25100
weighted avg       0.87      0.85      0.85     25100

None
the accuracy of tha model is  0.8601019757807521
[[2244  777]
 [ 101 3154]]
              precision    recall  f1-score   support

           0       0.96      0.74      0.84      3021
           1       0.80      0.97      0.88      3255

    accuracy                           0.86      6276
   macro avg       0.88      0.86      0.86      6276
weighted avg       0.88      0.86      0.86      6276

None


In [38]:
print(acc_report(y_train, adatrain))
print(acc_report(y_test, adatest))

the accuracy of tha model is  0.9397609561752988
[[10953  1042]
 [  470 12635]]
              precision    recall  f1-score   support

           0       0.96      0.91      0.94     11995
           1       0.92      0.96      0.94     13105

    accuracy                           0.94     25100
   macro avg       0.94      0.94      0.94     25100
weighted avg       0.94      0.94      0.94     25100

None
the accuracy of tha model is  0.9426386233269598
[[2779  242]
 [ 118 3137]]
              precision    recall  f1-score   support

           0       0.96      0.92      0.94      3021
           1       0.93      0.96      0.95      3255

    accuracy                           0.94      6276
   macro avg       0.94      0.94      0.94      6276
weighted avg       0.94      0.94      0.94      6276

None


In [39]:
print(acc_report(y_train, rftrain))
print(acc_report(y_test, rftest))

the accuracy of tha model is  0.9994422310756972
[[11982    13]
 [    1 13104]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11995
           1       1.00      1.00      1.00     13105

    accuracy                           1.00     25100
   macro avg       1.00      1.00      1.00     25100
weighted avg       1.00      1.00      1.00     25100

None
the accuracy of tha model is  0.9788081580624601
[[2945   76]
 [  57 3198]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3021
           1       0.98      0.98      0.98      3255

    accuracy                           0.98      6276
   macro avg       0.98      0.98      0.98      6276
weighted avg       0.98      0.98      0.98      6276

None


In [40]:
import pickle

In [41]:
pickle.dump(mnb, open('spam.pkl','wb'))

In [42]:
pickle.dump(cv, open("vectorizer.pkl","wb"))

In [43]:
clf = pickle.load(open("spam.pkl","rb"))

In [44]:
clf