In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import nltk
import string
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

In [2]:
def load_data(test, spam):
    """Load two message CSV files and return them as one shuffled frame.

    Parameters
    ----------
    test : str or path-like
        Path of the first CSV file (the notebook passes 'INBOX.csv').
    spam : str or path-like
        Path of the second CSV file (the notebook passes 'SPAM.csv').

    Returns
    -------
    pandas.DataFrame
        The rows of both files concatenated, in random order, with a fresh
        0..n-1 index.
    """
    # Read the files named by the arguments. The arguments used to be
    # ignored in favour of two hard-coded file names.
    frames = [pd.read_csv(path, encoding = 'utf-8') for path in (test, spam)]
    df = pd.concat(frames)
    # sample(frac = 1) returns every row exactly once, in random order.
    df = df.sample(frac = 1).reset_index(drop = True)

    return(df)

In [3]:
def my_train_test_split(df, test_size = 0.3, random_state = None):
    """Split a frame into train and test parts.

    Column ``v2`` is used as the inputs and column ``v1`` as the targets
    (presumably message text and ham/spam label -- confirm against the CSVs).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``v1`` and ``v2`` columns.
    test_size : float, optional
        Fraction of rows placed in the test part. Defaults to 0.3, the value
        that used to be hard-coded.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. Pass an int for a
        reproducible split; the default ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)``.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        df.v2, df.v1, test_size = test_size, random_state = random_state
    )

    return(x_train, x_test, y_train, y_test)

In [4]:
def build_dictionary(x_train):
    """Count the purely alphabetic words found in a collection of messages.

    Each message is split on whitespace. A token is kept only when it
    consists solely of alphabetic characters and is at least two characters
    long, which drops punctuation, numbers and single letters.

    Parameters
    ----------
    x_train : iterable of str
        The messages (for example a pandas Series of text).

    Returns
    -------
    collections.Counter
        Mapping of each kept word to its number of occurrences, in order of
        first appearance.
    """
    dictionary = Counter()

    # Iterate over the values directly: Series.iteritems() no longer exists
    # in current pandas, and the index labels are not needed here.
    for line in x_train:
        # Filter while counting. Deleting from a list while enumerating it
        # skips the element after every deletion, which let unwanted tokens
        # slip through.
        dictionary.update(
            word for word in line.split()
            if word.isalpha() and len(word) >= 2
        )

    return(dictionary)

In [5]:
def build_features(x_train, dictionary):
    """Build a word-count matrix for a collection of messages.

    Parameters
    ----------
    x_train : sized iterable of str
        The messages (for example a pandas Series of text).
    dictionary : mapping or iterable of str
        The vocabulary; its iteration order defines the column order.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(x_train), len(dictionary))`` where entry
        ``[i, j]`` is how many times the j-th vocabulary word occurs among
        the whitespace-separated tokens of the i-th message.
    """
    feature_matrix = np.zeros((len(x_train), len(dictionary)))

    # Column lookup, so each message costs one pass over its own words
    # instead of one words.count() scan per vocabulary entry.
    column_of = {word: column for column, word in enumerate(dictionary)}

    # enumerate() gives the row by position, so the result does not depend
    # on the Series index being 0..n-1 (and Series.iteritems() is gone from
    # current pandas).
    for row, line in enumerate(x_train):
        for word, count in Counter(line.split()).items():
            column = column_of.get(word)
            if column is not None:
                feature_matrix[row, column] = count

    return(feature_matrix)

In [6]:
def roc_curve_plot(classifier, X_test, y_test):
    """Draw the ROC curve of a fitted classifier on a test set.

    The score for each sample is the second column of
    ``classifier.predict_proba(X_test)``. The true labels are binarized with
    the class list ``['spam', 'ham']`` and the curve is computed with
    ``pos_label = 0``.

    NOTE(review): this presumably makes 'spam' the positive class and lines
    up with column 1 of ``predict_proba`` when ``classifier.classes_`` is
    ``['ham', 'spam']`` -- confirm against the actual label values.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``
    X_test : feature matrix passed to ``predict_proba``
    y_test : true labels for ``X_test``

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Scores and binarized ground truth.
    scores = classifier.predict_proba(X_test)[:, 1]
    binary_truth = label_binarize(y_test, classes = ['spam', 'ham'])

    # Curve points and the area under them; the thresholds are not used.
    fpr, tpr, _ = metrics.roc_curve(binary_truth, scores, pos_label = 0)
    area = metrics.auc(fpr, tpr)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % area)
    # The legend is drawn before the diagonal so only the ROC line is listed.
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [7]:
# Load the inbox and spam CSV files into one shuffled DataFrame.
df = load_data('INBOX.csv', 'SPAM.csv')

In [8]:
# Split once, then replace the row labels each part inherited from df with a
# fresh 0..n-1 index.
x_train, x_test, y_train, y_test = (
    part.reset_index(drop = True) for part in my_train_test_split(df)
)

In [None]:
# Build the vocabulary from the training messages only, then encode each
# training message as a row of per-word counts.
dictionary = build_dictionary(x_train)
feature_matrix = build_features(x_train, dictionary)

In [None]:
# Fit a multinomial Naive Bayes model on the training word counts.
classifier_MNB = MultinomialNB()
classifier_MNB.fit(feature_matrix, y_train)

In [None]:
# Encode the test messages with the training vocabulary so the columns match
# the ones the model was fitted on, then predict their labels.
test_features = build_features(x_test, dictionary)
result_MNB = classifier_MNB.predict(test_features)

In [None]:
# Report the confusion matrix and the accuracy of the multinomial Naive Bayes
# model on the test set.
print('Confusion Matrix of Naive Bayes Algorithm is\n{0}'.format(confusion_matrix(y_test, result_MNB)))
print('Accuracy of Naive Bayes Algorithm is {0}'.format(classifier_MNB.score(test_features, y_test)))

In [None]:
# Plot the ROC curve of the multinomial Naive Bayes model on the test set.
roc_curve_plot(classifier_MNB, test_features, y_test)

In [None]:
# Logistic regression: fit on the training word counts, then report the
# confusion matrix, accuracy and ROC curve on the test set.
classifier_LR = LogisticRegression()
classifier_LR.fit(feature_matrix, y_train)
result_LR = classifier_LR.predict(test_features)
# The messages name the model that is actually fitted here; they used to say
# "Support Vector Machine".
print('Confusion Matrix of Logistic Regression is\n{0}'.format(confusion_matrix(y_test, result_LR)))
print('Accuracy of Logistic Regression Algorithm is {0}'.format(classifier_LR.score(test_features, y_test)))
roc_curve_plot(classifier_LR, test_features, y_test)

In [None]:
# Bernoulli Naive Bayes: fit on the training word counts, then report the
# confusion matrix, accuracy and ROC curve on the test set.
classifier_BNB = BernoulliNB()
classifier_BNB.fit(feature_matrix, y_train)
result_BNB = classifier_BNB.predict(test_features)

print(
    "Confusion Matrix of Bernoulli's Naive Bayes Algorithm is\n{0}".format(
        confusion_matrix(y_test, result_BNB)
    )
)
print(
    "Accuracy of Bernoulli's Naive Bayes Algorithm is {0}".format(
        classifier_BNB.score(test_features, y_test)
    )
)

roc_curve_plot(classifier_BNB, test_features, y_test)

In [None]:
# Most frequent vocabulary word: k is the word and v its count. Ties on the
# count go to the word that compares greatest.
k, v = max(dictionary.items(), key = lambda item: (item[1], item[0]))