## HAM vs. SPAM Text Classification
In this example we perform text classification: given the text of a message, we predict whether it is HAM (legitimate) or SPAM. We use the SMS Spam Collection dataset of labelled messages (referred to as "emails" throughout this notebook), which you can download from: https://www.kaggle.com/uciml/sms-spam-collection-dataset
This is a binary classification task on which several classification algorithms reach more than 90% accuracy. We will try some of them.

In [None]:
import random, os
import pandas as pd
import numpy as np

# Seed every random source used in this notebook so results are repeatable.
sd = 7
np.random.seed(sd)
random.seed(sd)
# NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up,
# so setting it here likely affects child processes only -- confirm.
os.environ['PYTHONHASHSEED'] = str(sd)

# Read the CSV file into a pandas dataframe.
filepath = './spam.csv'
df = pd.read_csv(filepath, encoding='latin-1')

# Report how many rows (documents) were loaded.
print("Number of documents: ", df.shape[0])

In [None]:
# Drop the three unnamed filler columns and give the remaining two descriptive
# names: 'labels' holds the category (ham or spam), 'email' holds the text.
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
      .rename(columns={'v1': 'labels', 'v2': 'email'})
)
# Display the first 5 rows of the cleaned dataset.
df.head()

In [None]:
# Store the character count of every email in an extra "length" column.
df['length'] = df['email'].map(len)

# Pick the row labelled 6 (the 7th email) as a running illustration example
# and show its text and category.
email7 = df.loc[6, 'email']
label7 = df.loc[6, 'labels']
print("Example text:  ", email7)
print("Example class: ", label7)

In [None]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# text preprocessing: implement tokenization and stopword removal in a function
# no lowercasing or punctuation removal since capitalization and symbols are important
# features of SPAM emails
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def text_preprocess(sample):
    """Tokenize ``sample`` and drop English stopwords.

    Args:
        sample: The text to preprocess, as a single string.

    Returns:
        A list with the tokens produced by ``word_tokenize`` whose lowercase
        form is not an English stopword. Kept tokens are returned unchanged
        (original casing, punctuation tokens included).
    """
    # The stopword set is cached so it is not reloaded for every email, and a
    # set makes each membership test constant-time instead of a list scan.
    stops = _english_stopwords()
    return [word for word in word_tokenize(sample) if word.lower() not in stops]

# print original email7 example
print("Original example:\n", email7)

# apply the preprocessing function to all emails and keep the result; it is
# stored in its own 'tokens' column so that df['email'] still holds the raw
# strings (previously the computed result was silently discarded)
df['tokens'] = df['email'].apply(text_preprocess)

# print preprocessed email7 example
# (from here on email7 is a list of tokens, no longer a single string)
email7 = text_preprocess(email7)
print("\nProcessed example:\n", ' '.join(email7))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# vectorize example email and whole set with count and tfidf vectorizers
# NOTE(review): both transformers are fitted on the full dataset before it is
# split below, so the vocabulary and IDF weights also reflect dev/test emails.
cv_transf = CountVectorizer()
tfidf_transf = TfidfTransformer()
cv_metrix = cv_transf.fit_transform(df['email'])
tfidf_matrix = tfidf_transf.fit_transform(cv_metrix)

# transform() expects an iterable of documents. email7 may be a list of tokens
# at this point; passing that list directly would vectorize every token as a
# separate document, so it is joined back into one string and wrapped in a
# one-element list to obtain exactly one feature row for the example.
email7_doc = email7 if isinstance(email7, str) else ' '.join(email7)
cv_email7 = cv_transf.transform([email7_doc])
tfidf_email7 = tfidf_transf.transform(cv_email7)

# split the entire dataset in train and test cuts: 85% and 15% respectively
# (random_state is fixed so that re-running this cell yields the same split)
from sklearn.model_selection import train_test_split
email_train, email_test, label_train, label_test = train_test_split(
    tfidf_matrix, df['labels'], test_size=0.15, random_state=sd)

# 85 % training data and 15 % test data
print("Train size: ", len(label_train))
print("Test size: ", len(label_test))

# carve a development cut (10 % of the training cut) out of the training data
email_train, email_dev, label_train, label_dev = train_test_split(
    email_train, label_train, test_size=0.10, random_state=sd)

# 8.5 % development part and 76.5 % training set and 15 % test set

# print size of train, dev and test cut
print("\nTrain size: ", len(label_train))
print("Dev size: ", len(label_dev))
print("Test size: ", len(label_test))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

# Fit one classifier per algorithm on the training cut. The fitting order is
# kept as-is: multinomial naive Bayes, k-nearest neighbours, decision tree,
# support vector classifier.
mnb = MultinomialNB().fit(email_train, label_train)

knn = KNeighborsClassifier()
knn.fit(email_train, label_train)

dct = DecisionTreeClassifier()
dct.fit(email_train, label_train)

svc = svm.SVC()
svc.fit(email_train, label_train)

# Choose which of the fitted models is used from here on.
model = svc

# Show the predicted and the actual category of email7.
email7_prediction = model.predict(tfidf_email7)[0]
print('Email7 Prediction:', email7_prediction)
print('Email7 Expectation:', label7)

In [None]:
from sklearn.metrics import accuracy_score

# Predict a label for every email of the test cut.
label_pred = model.predict(email_test)

# Share of test emails whose predicted label equals the true label.
acc = accuracy_score(label_test, label_pred)

# Report the score together with the class name of the selected model.
model_name = type(model).__name__
report = "Accuracy of model {} is: {:.4f}".format(model_name, acc)
print(report)