In [1]:

import xml.etree.ElementTree as ET
import re
import os
import pandas as pd
import numpy as np
import string
from gensim.parsing.porter import PorterStemmer 
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
def get_documents(tmpdir,label,df):
    for filename in sorted(os.listdir(tmpdir)):
        with open(tmpdir+filename, encoding="utf8", errors='ignore') as f:
            lines = f.read()
            df.loc[len(df)] = [label,lines]
            


In [6]:
# generate corpus for 20N
path = 'Datasets/20news-18828/'
labels = {}
df = df = pd.DataFrame({"label": [], "documents": [] })
def get_senteces_from_path_20N(path)->list:
    for dirs in sorted(os.listdir(path)):
        labels[dirs] = len(labels)
        tmpdir = path+dirs+'/'
        if not dirs.startswith('.'):
            get_documents(tmpdir,labels[dirs],df)

get_senteces_from_path_20N(path)
df.head()     

Unnamed: 0,label,documents
0,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...
1,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...
2,0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3,0,From: mathew <mathew@mantis.co.uk>\nSubject: R...
4,0,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...


In [7]:
def preprocessing(document: str) -> list:
    """
    clean data by removing non-latin characters
    stem data sentences
    remove stop words from a document
    """
    document = document.lower()
    document = remove_stopwords(document)
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    p = PorterStemmer()
    document = p.stem_sentence(document)
    return document

In [8]:
df['documents_processed'] = df.documents.apply(preprocessing)

display(df)


Unnamed: 0,label,documents,documents_processed
0,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,from mathew mathew manti co uk subject alt ath...
1,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,from mathew mathew manti co uk subject alt ath...
2,0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,from i dbstu rz tu bs de benedikt rosenau subj...
3,0,From: mathew <mathew@mantis.co.uk>\nSubject: R...,from mathew mathew manti co uk subject re univ...
4,0,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,from strom watson ibm com rob strom subject re...
...,...,...,...
18823,19,From: sbuckley@fraser.sfu.ca (Stephen Buckley)...,from sbucklei fraser sfu ca stephen bucklei su...
18824,19,From: bakerj@gtephx.UUCP (Jon Baker)\nSubject:...,from bakerj gtephx uucp jon baker subject re m...
18825,19,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...,from pharvei quack kfu com paul harvei subject...
18826,19,From: <KEVXU@CUNYVM.BITNET>\nSubject: Re: Info...,from kevxu cunyvm bitnet subject re info new a...


In [9]:
train, validation, test = np.split(df.sample(frac=1, random_state=42), 
                            [int(.6*len(df)), int(.7*len(df))])

print("Original size: {}".format(df.shape))
print("train shape: {}".format(train.shape))
print("validation shape: {}".format(validation.shape))
print("test shape: {}".format(test.shape))


Original size: (18828, 3)
train shape: (11296, 3)
validation shape: (1883, 3)
test shape: (5649, 3)


In [10]:
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(train['documents_processed'].values)
validation_data_tf = count_vector.transform(validation['documents_processed'].values)
testing_data_tf = count_vector.transform(test['documents_processed'].values)

training_validation_x_tf = np.concatenate((train['documents_processed'].values,validation['documents_processed'].values))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((train['label'].values,validation['label'].values))

In [11]:
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(train['documents_processed'].values)
validation_data_tfidf = count_vector_tfidf.transform(validation['documents_processed'].values)
testing_data_tfidf = count_vector_tfidf.transform(test['documents_processed'].values)

training_validation_x_tfidf = np.concatenate((train['documents_processed'].values,validation['documents_processed'].values))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((train['label'].values,validation['label'].values))

## Naive Bayes

In [12]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data_tf,train['label'].values)
scores = cross_validate(naive_bayes, cross_validation_x_tf, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.09873724, 0.10571623, 0.09873128, 0.10671091, 0.12266803,
        0.09873509, 0.14361453, 0.10870862, 0.09476423, 0.14956546]),
 'score_time': array([0.01296473, 0.01296043, 0.01299524, 0.02992153, 0.01495957,
        0.01595807, 0.01994658, 0.01498842, 0.01394463, 0.0179522 ]),
 'test_accuracy': array([0.85356601, 0.88391502, 0.84522003, 0.87101669, 0.85356601,
        0.86418816, 0.86191199, 0.87025797, 0.87329287, 0.86028853]),
 'test_precision_macro': array([0.86966458, 0.89371184, 0.85394464, 0.88582491, 0.86616487,
        0.87887166, 0.88133039, 0.88018104, 0.88705615, 0.87112448]),
 'test_recall_macro': array([0.84680464, 0.87620575, 0.83841742, 0.86517072, 0.84550489,
        0.85616903, 0.85325946, 0.86471207, 0.86678354, 0.85269931]),
 'test_f1_macro': array([0.84245793, 0.87479524, 0.83265917, 0.86478761, 0.83979475,
        0.85544158, 0.84608133, 0.8622628 , 0.86262036, 0.84594671])}

In [13]:
predictions = naive_bayes.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(test['label'].values, predictions)))
print('Precision score: ', format(precision_score(test['label'].values, predictions,average='macro')))
print('Recall score: ', format(recall_score(test['label'].values, predictions,average='macro')))
print('F1 score: ', format(f1_score(test['label'].values, predictions,average='macro')))

Accuracy score:  0.8557266772880155
Precision score:  0.87326708830818
Recall score:  0.8534836602339386
F1 score:  0.847581436359263


In [14]:
naive_bayes_2 = MultinomialNB()
naive_bayes_2.fit(training_data_tfidf,train['label'].values)
scores = cross_validate(naive_bayes, cross_validation_x_tfidf, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.13364244, 0.09674072, 0.10671711, 0.0907495 , 0.09574413,
        0.09973407, 0.09674501, 0.09674144, 0.09674168, 0.1266613 ]),
 'score_time': array([0.01595783, 0.01495957, 0.01395988, 0.01696324, 0.01598883,
        0.01496148, 0.01798391, 0.01399589, 0.01698756, 0.01695514]),
 'test_accuracy': array([0.85356601, 0.88391502, 0.84522003, 0.87101669, 0.85356601,
        0.86418816, 0.86191199, 0.87025797, 0.87329287, 0.86028853]),
 'test_precision_macro': array([0.86966458, 0.89371184, 0.85394464, 0.88582491, 0.86616487,
        0.87887166, 0.88133039, 0.88018104, 0.88705615, 0.87112448]),
 'test_recall_macro': array([0.84680464, 0.87620575, 0.83841742, 0.86517072, 0.84550489,
        0.85616903, 0.85325946, 0.86471207, 0.86678354, 0.85269931]),
 'test_f1_macro': array([0.84245793, 0.87479524, 0.83265917, 0.86478761, 0.83979475,
        0.85544158, 0.84608133, 0.8622628 , 0.86262036, 0.84594671])}

In [15]:
predictions = naive_bayes_2.predict(testing_data_tfidf)
print('Accuracy score: ', format(accuracy_score(test['label'].values, predictions)))
print('Precision score: ', format(precision_score(test['label'].values, predictions,average='macro')))
print('Recall score: ', format(recall_score(test['label'].values, predictions,average='macro')))
print('F1 score: ', format(f1_score(test['label'].values, predictions,average='macro')))

Accuracy score:  0.8629845990440786
Precision score:  0.8842414508084777
Recall score:  0.8510266495885734
F1 score:  0.8474723928859029


## Logistic Regresion

In [28]:
clf_log_0 = LogisticRegression(random_state=0,multi_class='multinomial').fit(training_data_tf, train['label'].values)
scores_0 = cross_validate(clf_log_0, cross_validation_x_tf, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores_0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'fit_time': array([45.50137115, 46.14166117, 41.7095294 , 42.11690855, 40.38604426,
        46.77610946, 43.41295457, 44.59412193, 45.69684792, 44.03130102]),
 'score_time': array([0.01795316, 0.01695395, 0.01693082, 0.01995349, 0.02094388,
        0.01994753, 0.01795101, 0.01496172, 0.02194381, 0.01495886]),
 'test_accuracy': array([0.88543247, 0.88846737, 0.87253414, 0.88239757, 0.8801214 ,
        0.89150228, 0.89301973, 0.87481032, 0.89757208, 0.89066059]),
 'test_precision_macro': array([0.88755238, 0.89167206, 0.87205017, 0.88468477, 0.8773183 ,
        0.89377206, 0.890412  , 0.87670491, 0.89851077, 0.89334661]),
 'test_recall_macro': array([0.8823568 , 0.88456409, 0.86938008, 0.88150316, 0.87416059,
        0.88889432, 0.88792547, 0.87392769, 0.8932437 , 0.88903774]),
 'test_f1_macro': array([0.88421627, 0.88679451, 0.86938292, 0.88207696, 0.87470656,
        0.88998523, 0.88863844, 0.87498501, 0.89366201, 0.89015005])}

In [29]:
clf_log = LogisticRegression(random_state=0, max_iter=250, multi_class='multinomial').fit(training_data_tf, train['label'].values)
scores = cross_validate(clf_log, cross_validation_x_tf, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([ 93.45119953,  98.64232278,  91.14887762,  81.11816621,
         91.78565168,  97.13934159,  87.05890322,  97.85941529,
        100.65893221,  98.8966434 ]),
 'score_time': array([0.01695466, 0.01894951, 0.02393508, 0.019943  , 0.01695299,
        0.01795173, 0.01496053, 0.01496124, 0.01894903, 0.01495981]),
 'test_accuracy': array([0.88694992, 0.88770865, 0.87253414, 0.88239757, 0.87936267,
        0.89150228, 0.89301973, 0.87481032, 0.89757208, 0.88990129]),
 'test_precision_macro': array([0.88885929, 0.89053554, 0.87191653, 0.88468477, 0.87632181,
        0.89377206, 0.89035845, 0.87680551, 0.89851077, 0.89236727]),
 'test_recall_macro': array([0.88425489, 0.88340129, 0.86938008, 0.88150316, 0.87297012,
        0.88889432, 0.88805639, 0.87392769, 0.8932437 , 0.88823965]),
 'test_f1_macro': array([0.88595803, 0.88549605, 0.86938499, 0.88207696, 0.8735194 ,
        0.88998523, 0.88869109, 0.87502677, 0.89366201, 0.88921305])}

In [30]:
predictions = clf_log.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(test['label'].values, predictions)))
print('Precision score: ', format(precision_score(test['label'].values, predictions,average='macro')))
print('Recall score: ', format(recall_score(test['label'].values, predictions,average='macro')))
print('F1 score: ', format(f1_score(test['label'].values, predictions,average='macro')))

Accuracy score:  0.8773234200743495
Precision score:  0.8781323314007391
Recall score:  0.8753214903344413
F1 score:  0.8763249880558377


In [31]:
clf_log_2 = LogisticRegression(random_state=0, max_iter=250, multi_class='multinomial').fit(training_data_tfidf, train['label'].values)
scores = cross_validate(clf_log, cross_validation_x_tfidf, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([89.03699827, 95.82884336, 92.87672639, 83.81794858, 94.37820458,
        98.33614302, 87.72251344, 96.61374426, 99.14449477, 99.78358293]),
 'score_time': array([0.01496029, 0.01894927, 0.02294254, 0.01994562, 0.01994658,
        0.02393508, 0.02094603, 0.01695538, 0.02393126, 0.01595926]),
 'test_accuracy': array([0.88694992, 0.88770865, 0.87253414, 0.88239757, 0.87936267,
        0.89150228, 0.89301973, 0.87481032, 0.89757208, 0.88990129]),
 'test_precision_macro': array([0.88885929, 0.89053554, 0.87191653, 0.88468477, 0.87632181,
        0.89377206, 0.89035845, 0.87680551, 0.89851077, 0.89236727]),
 'test_recall_macro': array([0.88425489, 0.88340129, 0.86938008, 0.88150316, 0.87297012,
        0.88889432, 0.88805639, 0.87392769, 0.8932437 , 0.88823965]),
 'test_f1_macro': array([0.88595803, 0.88549605, 0.86938499, 0.88207696, 0.8735194 ,
        0.88998523, 0.88869109, 0.87502677, 0.89366201, 0.88921305])}

In [32]:
predictions = clf_log_2.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(test['label'].values, predictions)))
print('Precision score: ', format(precision_score(test['label'].values, predictions,average='macro')))
print('Recall score: ', format(recall_score(test['label'].values, predictions,average='macro')))
print('F1 score: ', format(f1_score(test['label'].values, predictions,average='macro')))

Accuracy score:  0.8413878562577447
Precision score:  0.8511797899772114
Recall score:  0.8382191237793639
F1 score:  0.8374518856871092
