In [69]:

import xml.etree.ElementTree as ET
import re
import os
import pandas as pd
import numpy as np
import string
from gensim.parsing.porter import PorterStemmer 
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [70]:
def get_documents(tmpdir,label,df):
    for filename in sorted(os.listdir(tmpdir)):
        with open(tmpdir+filename, encoding="utf8", errors='ignore') as f:
            lines = f.read()
            df.loc[len(df)] = [label,lines]
            


In [71]:
# generate corpus for 20N
path = 'Datasets/20news-18828/'
labels = {}
df = df = pd.DataFrame({"label": [], "documents": [] })
def get_senteces_from_path_20N(path)->list:
    for dirs in sorted(os.listdir(path)):
        labels[dirs] = len(labels)
        tmpdir = path+dirs+'/'
        if not dirs.startswith('.'):
            get_documents(tmpdir,labels[dirs],df)

get_senteces_from_path_20N(path)
df.head()     

Unnamed: 0,label,documents
0,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...
1,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...
2,0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3,0,From: mathew <mathew@mantis.co.uk>\nSubject: R...
4,0,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...


In [72]:
def preprocessing(document: str) -> list:
    """
    clean data by removing non-latin characters
    stem data sentences
    remove stop words from a document
    """
    document = document.lower()
    document = remove_stopwords(document)
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    p = PorterStemmer()
    document = p.stem_sentence(document)
    return document

In [73]:
df['documents_processed'] = df.documents.apply(preprocessing)

display(df)


Unnamed: 0,label,documents,documents_processed
0,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,from mathew mathew manti co uk subject alt ath...
1,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,from mathew mathew manti co uk subject alt ath...
2,0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,from i dbstu rz tu bs de benedikt rosenau subj...
3,0,From: mathew <mathew@mantis.co.uk>\nSubject: R...,from mathew mathew manti co uk subject re univ...
4,0,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,from strom watson ibm com rob strom subject re...
...,...,...,...
18823,19,From: sbuckley@fraser.sfu.ca (Stephen Buckley)...,from sbucklei fraser sfu ca stephen bucklei su...
18824,19,From: bakerj@gtephx.UUCP (Jon Baker)\nSubject:...,from bakerj gtephx uucp jon baker subject re m...
18825,19,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...,from pharvei quack kfu com paul harvei subject...
18826,19,From: <KEVXU@CUNYVM.BITNET>\nSubject: Re: Info...,from kevxu cunyvm bitnet subject re info new a...


In [74]:
X_train_validation, X_test, y_train_validation, y_test = train_test_split(df['documents_processed'], 
                                                    df['label'], 
                                                    random_state=1,test_size=0.3)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=1,test_size=0.143)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))


Number of rows in the total set: 18828
Number of rows in the training set: 11294
Number of rows in the validation set: 1885
Number of rows in the test set: 5649


In [75]:
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [76]:
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [77]:
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [78]:
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Naive Bayes

In [79]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.11566401, 0.09676361, 0.10223269, 0.11998487, 0.10903692,
        0.10533309, 0.11847925, 0.10412741, 0.10813546, 0.09676623]),
 'score_time': array([0.02333117, 0.01296496, 0.01705003, 0.01529741, 0.01595592,
        0.01595926, 0.01563501, 0.01795244, 0.01568174, 0.01395631]),
 'test_accuracy': array([0.83915023, 0.8262519 , 0.81335357, 0.83687405, 0.82245827,
        0.83080425, 0.82549317, 0.83383915, 0.82397572, 0.81852696]),
 'test_precision_macro': array([0.86546707, 0.86060998, 0.84402195, 0.85992666, 0.85951652,
        0.86058952, 0.85894189, 0.85916675, 0.85461907, 0.84838882]),
 'test_recall_macro': array([0.8215002 , 0.80866256, 0.79435201, 0.81977415, 0.80106467,
        0.8093632 , 0.806046  , 0.81672769, 0.80640455, 0.80158858]),
 'test_f1_macro': array([0.81738756, 0.80678346, 0.78902361, 0.81302181, 0.79173655,
        0.80003935, 0.80011176, 0.80976988, 0.80305998, 0.79427992])}

In [80]:
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.8979108465263049
best parameter :  {'alpha': 0.01}


In [81]:
predictions = naive_bayes.predict(testing_data_tf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('F1 score: ', format(f1_score(y_test, predictions,average='macro')))

Accuracy score:  0.8148344839794653
Precision score:  0.8497235627763603
Recall score:  0.7992941108197578
F1 score:  0.7893371741391044


In [82]:
naive_bayes_2 = MultinomialNB()
naive_bayes_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.10323739, 0.11960053, 0.09362483, 0.16005301, 0.12537265,
        0.09412718, 0.11175966, 0.14100099, 0.12577128, 0.13204122]),
 'score_time': array([0.02035427, 0.        , 0.05270576, 0.01564074, 0.03124404,
        0.0156188 , 0.01400661, 0.01562309, 0.01563025, 0.01562428]),
 'test_accuracy': array([0.83915023, 0.8262519 , 0.81335357, 0.83687405, 0.82245827,
        0.83080425, 0.82549317, 0.83383915, 0.82397572, 0.81852696]),
 'test_precision_macro': array([0.86546707, 0.86060998, 0.84402195, 0.85992666, 0.85951652,
        0.86058952, 0.85894189, 0.85916675, 0.85461907, 0.84838882]),
 'test_recall_macro': array([0.8215002 , 0.80866256, 0.79435201, 0.81977415, 0.80106467,
        0.8093632 , 0.806046  , 0.81672769, 0.80640455, 0.80158858]),
 'test_f1_macro': array([0.81738756, 0.80678346, 0.78902361, 0.81302181, 0.79173655,
        0.80003935, 0.80011176, 0.80976988, 0.80305998, 0.79427992])}

In [83]:
params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tfidf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.8991506859090554
best parameter :  {'alpha': 0.01}


In [84]:
predictions = naive_bayes_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('F1 score: ', format(f1_score(y_test, predictions,average='macro')))

Accuracy score:  0.85519560984245
Precision score:  0.874226295065692
Recall score:  0.8429497747880884
F1 score:  0.8378240079223621


## Logistic Regresion

In [86]:
clf_log = LogisticRegression(random_state=0,multi_class='multinomial').fit(training_data_tf_norm, y_train)
scores = cross_validate(clf_log, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

KeyboardInterrupt: 

In [None]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(training_data_tf_norm,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [30]:
predictions = clf_log.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('F1 score: ', format(f1_score(y_test, predictions,average='macro')))

Accuracy score:  0.8773234200743495
Precision score:  0.8781323314007391
Recall score:  0.8753214903344413
F1 score:  0.8763249880558377


In [23]:
clf_log_2 = LogisticRegression(random_state=0, max_iter=250, multi_class='multinomial').fit(training_data_tfidf_norm, y_train)
scores = cross_validate(clf_log_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([58.21049595, 60.42855811, 56.58727145, 60.68442798, 63.94622207,
        62.15066838, 57.19216442, 59.63363886, 61.07420874, 66.0457294 ]),
 'score_time': array([0.03468251, 0.03705406, 0.02597952, 0.02541804, 0.0214572 ,
        0.03434634, 0.0279274 , 0.02872157, 0.03333807, 0.03268409]),
 'test_accuracy': array([0.86494689, 0.86342944, 0.85887709, 0.86191199, 0.85735964,
        0.87177542, 0.85963581, 0.86646434, 0.84673748, 0.85649203]),
 'test_precision_macro': array([0.87123872, 0.86500708, 0.86225159, 0.86523053, 0.85725527,
        0.86981803, 0.86117677, 0.86845336, 0.84594393, 0.86026744]),
 'test_recall_macro': array([0.85923957, 0.8593614 , 0.85321705, 0.85756171, 0.84756283,
        0.86276777, 0.85359453, 0.86014987, 0.84047093, 0.84954364]),
 'test_f1_macro': array([0.86171751, 0.86100895, 0.85525114, 0.85950587, 0.84916469,
        0.86376013, 0.85525203, 0.86138409, 0.84125054, 0.8517437 ])}

In [None]:
grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(training_data_tfidf_norm,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [32]:
predictions = clf_log_2.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(test['label'].values, predictions)))
print('Precision score: ', format(precision_score(test['label'].values, predictions,average='macro')))
print('Recall score: ', format(recall_score(test['label'].values, predictions,average='macro')))
print('F1 score: ', format(f1_score(test['label'].values, predictions,average='macro')))

Accuracy score:  0.8413878562577447
Precision score:  0.8511797899772114
Recall score:  0.8382191237793639
F1 score:  0.8374518856871092
