### Email-Text classification with hyper-P tuning

In [1]:
# IMPORTS

from timeit import default_timer as timer
import pandas as pd
from collections import Counter

# import the news dataset
from sklearn.datasets import fetch_20newsgroups 

# NLKT imports
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize

# Sckit learn imports
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# LOAD

news = fetch_20newsgroups(subset='all') # we load both "train" and "test" subsets to use later in classification

print('There are {} raw texts'.format(len(news.data))) # number of raw texts
print('There are {} news categories'.format(len(news.target_names))) # number of categories
print('The categories names are {}'.format(news.target_names)) # names of categories

# Print the above selections
for text, num_label in zip(news.data[:10], news.target[:10]):
    print('[%s]:\t\t "%s ..."' % (news.target_names[num_label], text[:100].split('\n')[0]))

There are 18846 raw texts
There are 20 news categories
The categories names are ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
[rec.sport.hockey]:		 "From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu> ..."
[comp.sys.ibm.pc.hardware]:		 "From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson) ..."
[talk.politics.mideast]:		 "From: hilmi-er@dsv.su.se (Hilmi Eren) ..."
[comp.sys.ibm.pc.hardware]:		 "From: guyd@austin.ibm.com (Guy Dawson) ..."
[comp.sys.mac.hardware]:		 "From: Alexander Samuel McDiarmid <am2o+@andrew.cmu.edu> ..."
[sci.electronics]:		 "From: tell@cs.unc.edu (Stephen Tell) ..."
[comp.sys.mac.hardware]:		 "From: lpa8921@tamuts.tamu.

In [20]:
# checking the propotion of every category
print(Counter(news.target))

Counter({10: 999, 15: 997, 8: 996, 9: 994, 11: 991, 7: 990, 13: 990, 5: 988, 14: 987, 2: 985, 12: 984, 3: 982, 6: 975, 1: 973, 4: 963, 17: 940, 16: 910, 0: 799, 18: 775, 19: 628})


In [3]:
# a function to train and fit the model
def train_model(clf, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)
    clf.fit(X_train, y_train)
    print("Accuracy: {:.6f}".format(clf.score(X_test, y_test))) # since there no class imballance, Accuracy can be used as performance metric
    return clf.score(X_test, y_test)

In [4]:
# create a dictionary for execution time when chaning parameters
tests_dic = {}

# baseline
test1 = Pipeline([
                ('vectorizer', TfidfVectorizer()),
                ('classifier', MultinomialNB()),
                ])
start = timer()
accuracy = train_model(test1, news.data, news.target)
end  = timer()

run_time = end - start
tests_dic['test'] = ['baseline']
tests_dic['time'] = [run_time]
tests_dic['accuracy'] = [accuracy]

Accuracy: 0.846350


In [5]:
# include stopwords
test2 = Pipeline([
                ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
                ('classifier', MultinomialNB()),
                ])
start = timer()
accuracy = train_model(test2, news.data, news.target)
end  = timer()

run_time = end - start

run_time = end - start
tests_dic['test'].append('stopwords')
tests_dic['time'].append(run_time)
tests_dic['accuracy'].append(accuracy)

Accuracy: 0.877759


In [6]:
# change alpha
test3 = Pipeline([
                ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
                ('classifier', MultinomialNB(alpha=0.05)),
                ])
start = timer()
accuracy = train_model(test3, news.data, news.target)
end  = timer()

run_time = end - start

tests_dic['test'].append('alpha')
tests_dic['time'].append(run_time)
tests_dic['accuracy'].append(accuracy)

Accuracy: 0.910229


In [7]:
# minimum frequency
test4 = Pipeline([
                ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'),
                                               min_df=3)),
                ('classifier', MultinomialNB(alpha=0.05)),
                ])
start = timer()
accuracy = train_model(test3, news.data, news.target)
end = timer()
run_time = end-start

tests_dic['test'].append('minimum_freq')
tests_dic['time'].append(run_time)
tests_dic['accuracy'].append(accuracy)


Accuracy: 0.910229


In [8]:
# stemmer
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
 
test5 = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                             stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=0.05)),
])
 
start = timer()
accuracy = train_model(test5, news.data, news.target)
end = timer()
run_time = end - start

tests_dic['test'].append('stemmer')
tests_dic['time'].append(run_time)
tests_dic['accuracy'].append(accuracy)


Accuracy: 0.910866


In [9]:
df = pd.DataFrame(tests_dic)

In [10]:
# create improvement features that calculates the differences between different changes in the parameters and the baseline
df['improvement'] = df['accuracy'].apply(lambda x: x-df['accuracy'][0]) 

In [11]:
# it looks like the Stemmer test shows the best performance however it comes with greater costs in execution time
# with small improvement compared to previous tests
df

Unnamed: 0,accuracy,test,time,improvement
0,0.84635,baseline,14.974293,0.0
1,0.877759,stopwords,14.306695,0.031409
2,0.910229,alpha,14.548735,0.063879
3,0.910229,minimum_freq,13.9301,0.063879
4,0.910866,stemmer,567.831676,0.064516
