In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
import string
from nltk.corpus import stopwords

In [8]:
# Load the labelled e-mail corpus, then show its column index and first rows.
email_df = pd.read_csv('files/spam_or_not_spam.csv')
print(email_df.columns, email_df.head(), sep='\n')

Index(['email', 'label'], dtype='object')
                                               email  label
0   date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [10]:
def get_clean_data(text):
    """Split ``text`` into tokens with punctuation and English stopwords removed.

    Every character found in ``string.punctuation`` is deleted, the remaining
    text is split on whitespace, and any word whose lowercase form appears in
    ``stopwords.words('english')`` is discarded. Words that survive keep their
    original casing.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of the remaining words, in their original order.
    """
    # Fetch the stopword list once per call instead of once per word, and keep
    # it in a set so each membership test is a hash lookup rather than a scan.
    english_stopwords = set(stopwords.words('english'))

    # Delete all punctuation characters in a single pass over the string.
    no_punctuation = text.translate(str.maketrans('', '', string.punctuation))

    return [
        word
        for word in no_punctuation.split()
        if word.lower() not in english_stopwords
    ]

In [11]:
# Remove exact duplicate rows first, then any row that still has a missing value.
email_df = email_df.drop_duplicates().dropna()

In [12]:
# Bag-of-words features: get_clean_data is used as the analyzer, so it alone
# decides how each e-mail is turned into tokens.
# NOTE(review): the vocabulary is learned from every e-mail here; if a
# train/test split happens afterwards, test-set words influence the vocabulary.
vectorizer = CountVectorizer(analyzer=get_clean_data)
emails = email_df['email']
vectored_emails = vectorizer.fit_transform(emails)

In [13]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    vectored_emails,
    email_df['label'],
    train_size=0.80,
    random_state=42,
)

In [14]:
# Train a multinomial naive Bayes model on the training split (fit returns the
# estimator itself), then predict a label for every held-out row.
classifier = MultinomialNB().fit(x_train, y_train)
prediction = classifier.predict(x_test)
print(prediction)

[0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1
 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1
 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 0 1 0 0 

In [15]:
from sklearn.metrics import (
    mean_squared_error,
    classification_report,
    confusion_matrix,
    accuracy_score,
)

# Print each metric for the held-out predictions, in a fixed order.
# With 0/1 labels a squared error is 1 for a wrong prediction and 0 otherwise,
# so the mean squared error is the misclassification rate (1 - accuracy).
for metric in (
    mean_squared_error,
    classification_report,
    confusion_matrix,
    accuracy_score,
):
    print(metric(y_test, prediction))

0.01217391304347826
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       489
           1       0.96      0.95      0.96        86

    accuracy                           0.99       575
   macro avg       0.98      0.97      0.98       575
weighted avg       0.99      0.99      0.99       575

[[486   3]
 [  4  82]]
0.9878260869565217


In [17]:
# Classify one unseen message with the already-fitted vectorizer and model.
new_data = ['hey how are you']
vectored_new_data = vectorizer.transform(new_data)
result = classifier.predict(vectored_new_data)
# The message is reported as spam only when the predicted label equals 1.
print("spam" if result[0] == 1 else "not a spam")

not a spam
