#### Importación de librerias

In [98]:

import xml.etree.ElementTree as ET
import re
import os
import pandas as pd
import numpy as np
import string
from gensim.parsing.porter import PorterStemmer 
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [99]:
def get_documents(tmpdir,label,df):
    """
        lee cada uno de los documentos y crea un registro en el dataframe en el que una fila es el label,
        y la siguiente es el corpus del documento. Modifica el dataframe ingresado como parametro.
    """
    for filename in sorted(os.listdir(tmpdir)):
        with open(tmpdir+filename, encoding="utf8", errors='ignore') as f:
            lines = f.read()
            df.loc[len(df)] = [label,lines]
            


In [100]:
# generate corpus for 20N
path = 'Datasets/20news-18828/'
labels = {}
df = df = pd.DataFrame({"label": [], "documents": [] })

def get_documents_from_path_20N(path)->list:
    """
       Obtiene las rutas de los archivos, ademas de leer cada carpeta de las 
       categorias de los archivos y crear un label para los documentos leidos. 
    """
    for dirs in sorted(os.listdir(path)):
        labels[dirs] = len(labels)
        tmpdir = path+dirs+'/'
        if not dirs.startswith('.'):
            get_documents(tmpdir,labels[dirs],df)

get_documents_from_path_20N(path)
df.head()     

Unnamed: 0,label,documents
0,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...
1,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...
2,0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...
3,0,From: mathew <mathew@mantis.co.uk>\nSubject: R...
4,0,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...


In [101]:
def preprocessing(document: str) -> list:
    """
    pasa el texto a minusculas
    remueve caracteres no latinos
    elimina los saltos de linea
    elimina los espacios extra
    hace steming
    """
    document = document.lower()
    document = remove_stopwords(document)
    document = re.sub('\n', ' ', document)
    document = re.sub('[^a-zA-Z]|[0-9]', ' ', document)
    document = re.sub('\s+', ' ', document)
    p = PorterStemmer()
    document = p.stem_sentence(document)
    return document

In [102]:
df['documents_processed'] = df.documents.apply(preprocessing)

display(df)


Unnamed: 0,label,documents,documents_processed
0,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,from mathew mathew manti co uk subject alt ath...
1,0,From: mathew <mathew@mantis.co.uk>\nSubject: A...,from mathew mathew manti co uk subject alt ath...
2,0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,from i dbstu rz tu bs de benedikt rosenau subj...
3,0,From: mathew <mathew@mantis.co.uk>\nSubject: R...,from mathew mathew manti co uk subject re univ...
4,0,From: strom@Watson.Ibm.Com (Rob Strom)\nSubjec...,from strom watson ibm com rob strom subject re...
...,...,...,...
18823,19,From: sbuckley@fraser.sfu.ca (Stephen Buckley)...,from sbucklei fraser sfu ca stephen bucklei su...
18824,19,From: bakerj@gtephx.UUCP (Jon Baker)\nSubject:...,from bakerj gtephx uucp jon baker subject re m...
18825,19,From: pharvey@quack.kfu.com (Paul Harvey)\nSub...,from pharvei quack kfu com paul harvei subject...
18826,19,From: <KEVXU@CUNYVM.BITNET>\nSubject: Re: Info...,from kevxu cunyvm bitnet subject re info new a...


In [103]:
# generar los data set de entrenamiento, validación y test correspondientes al 60, 10 y 30 por ciento
# respectivamente 

X_train_validation, X_test, y_train_validation, y_test = train_test_split(df['documents_processed'], 
                                                    df['label'], 
                                                    random_state=1,test_size=0.3)

X_train, X_validation, y_train, y_validation = train_test_split(X_train_validation, 
                                                    y_train_validation, 
                                                    random_state=1,test_size=0.143)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the validation set: {}'.format(X_validation.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))


Number of rows in the total set: 18828
Number of rows in the training set: 11294
Number of rows in the validation set: 1885
Number of rows in the test set: 5649


In [104]:
#Genera la representación tf del vocabulario
count_vector = CountVectorizer()
training_data_tf = count_vector.fit_transform(X_train)
validation_data_tf = count_vector.transform(X_validation)
testing_data_tf = count_vector.transform(X_test)

training_validation_x_tf = np.concatenate((X_train,X_validation))
cross_validation_x_tf = count_vector.transform(training_validation_x_tf)
cross_validation_y_tf = np.concatenate((y_train,y_validation))

In [105]:
# se normaliza la representación vectorial
training_data_tf_norm = Normalizer().fit_transform(training_data_tf)
validation_data_tf_norm = Normalizer().fit_transform(validation_data_tf)
testing_data_tf_norm = Normalizer().fit_transform(testing_data_tf)
cross_validation_data_tf_norm = Normalizer().fit_transform(cross_validation_x_tf)

In [106]:
#Genera la representación tfidf del vocabulario
count_vector_tfidf = TfidfVectorizer()
training_data_tfidf = count_vector_tfidf.fit_transform(X_train)
validation_data_tfidf = count_vector_tfidf.transform(X_validation)
testing_data_tfidf = count_vector_tfidf.transform(X_test)

training_validation_x_tfidf = np.concatenate((X_train,X_validation))
cross_validation_x_tfidf = count_vector.transform(training_validation_x_tfidf)
cross_validation_y_tfidf = np.concatenate((y_train,y_validation))

In [107]:
# se normaliza la representación vectorial
training_data_tfidf_norm = Normalizer().fit_transform(training_data_tfidf) 
validation_data_tfidf_norm = Normalizer().fit_transform(validation_data_tfidf)
testing_data_tfidf_norm = Normalizer().fit_transform(testing_data_tfidf)
cross_validation_data_tfidf_norm = Normalizer().fit_transform(cross_validation_x_tfidf)

## Naive Bayes

In [108]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tf
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data_tf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.11108851, 0.12026143, 0.12538719, 0.0941658 , 0.11488318,
        0.2203052 , 0.19093871, 0.10986996, 0.1027317 , 0.11037683]),
 'score_time': array([0.02100134, 0.02263021, 0.01561832, 0.01562262, 0.02637482,
        0.03124356, 0.01357245, 0.02397037, 0.        , 0.00528955]),
 'test_accuracy': array([0.83915023, 0.8262519 , 0.81335357, 0.83687405, 0.82245827,
        0.83080425, 0.82549317, 0.83383915, 0.82397572, 0.81852696]),
 'test_precision_macro': array([0.86546707, 0.86060998, 0.84402195, 0.85992666, 0.85951652,
        0.86058952, 0.85894189, 0.85916675, 0.85461907, 0.84838882]),
 'test_recall_macro': array([0.8215002 , 0.80866256, 0.79435201, 0.81977415, 0.80106467,
        0.8093632 , 0.806046  , 0.81672769, 0.80640455, 0.80158858]),
 'test_f1_macro': array([0.81738756, 0.80678346, 0.78902361, 0.81302181, 0.79173655,
        0.80003935, 0.80011176, 0.80976988, 0.80305998, 0.79427992])}

In [109]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.8979108465263049
best parameter :  {'alpha': 0.01}


In [110]:
# Evaluación del modelo
predictions = naive_bayes.predict(testing_data_tf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Macro-Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Macro-Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('Macro-F1 score: ', format(f1_score(y_test, predictions,average='macro')))

print('Micro-Precision score: ', format(precision_score(y_test, predictions,average='micro')))
print('Micro-Recall score: ', format(recall_score(y_test, predictions,average='micro')))
print('Micro-F1 score: ', format(f1_score(y_test, predictions,average='micro')))

Accuracy score:  0.8148344839794653
Macro-Precision score:  0.8497235627763603
Macro-Recall score:  0.7992941108197578
Macro-F1 score:  0.7893371741391044
Micro-Precision score:  0.8148344839794653
Micro-Recall score:  0.8148344839794653
Micro-F1 score:  0.8148344839794653


In [111]:
# Entrenamiento del algoritmo de naive bayes y cross validation para la
# representación vectoria de tfidf

naive_bayes_2 = MultinomialNB()
naive_bayes_2.fit(training_data_tfidf_norm,y_train)
scores = cross_validate(naive_bayes, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([0.21950388, 0.12575865, 0.15664077, 0.10976577, 0.10974836,
        0.15705442, 0.10977006, 0.10966754, 0.15661693, 0.15663671]),
 'score_time': array([0.01562095, 0.03124857, 0.01601887, 0.01562428, 0.03124428,
        0.01561761, 0.0156219 , 0.01563692, 0.01602459, 0.03163981]),
 'test_accuracy': array([0.83915023, 0.8262519 , 0.81335357, 0.83687405, 0.82245827,
        0.83080425, 0.82549317, 0.83383915, 0.82397572, 0.81852696]),
 'test_precision_macro': array([0.86546707, 0.86060998, 0.84402195, 0.85992666, 0.85951652,
        0.86058952, 0.85894189, 0.85916675, 0.85461907, 0.84838882]),
 'test_recall_macro': array([0.8215002 , 0.80866256, 0.79435201, 0.81977415, 0.80106467,
        0.8093632 , 0.806046  , 0.81672769, 0.80640455, 0.80158858]),
 'test_f1_macro': array([0.81738756, 0.80678346, 0.78902361, 0.81302181, 0.79173655,
        0.80003935, 0.80011176, 0.80976988, 0.80305998, 0.79427992])}

In [112]:
# Busqueda de hiperaparametros variando el alpha

params = {'alpha': [0.01, 0.1, 0.5, 1.0, 10.0, ],}

multinomial_nb_grid = GridSearchCV(MultinomialNB(), param_grid=params, n_jobs=-1, cv=5, verbose=5)
multinomial_nb_grid.fit(training_data_tfidf_norm,y_train)

print(f'Best Accuracy Through Grid Search : {multinomial_nb_grid.best_score_}')
print('best parameter : ', multinomial_nb_grid.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Accuracy Through Grid Search : 0.8991506859090554
best parameter :  {'alpha': 0.01}


In [113]:
# Evaluación del modelo

predictions = naive_bayes_2.predict(testing_data_tfidf_norm)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Macro-Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Macro-Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('Macro-F1 score: ', format(f1_score(y_test, predictions,average='macro')))

print('Micro-Precision score: ', format(precision_score(y_test, predictions,average='micro')))
print('Micro-Recall score: ', format(recall_score(y_test, predictions,average='micro')))
print('Micro-F1 score: ', format(f1_score(y_test, predictions,average='micro')))

Accuracy score:  0.85519560984245
Macro-Precision score:  0.874226295065692
Macro-Recall score:  0.8429497747880884
Macro-F1 score:  0.8378240079223621
Micro-Precision score:  0.85519560984245
Micro-Recall score:  0.85519560984245
Micro-F1 score:  0.85519560984245


## Logistic Regresion

In [87]:
# Entrenamiento del algoritmo de logisrtic regression y cross validation para la
# representación vectoria de tf

clf_log = LogisticRegression(random_state=0,multi_class='multinomial').fit(training_data_tf_norm, y_train)
scores = cross_validate(clf_log, cross_validation_data_tf_norm, cross_validation_y_tf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'fit_time': array([42.34047914, 42.92745209, 40.99378872, 41.27219057, 41.24794364,
        42.91863942, 39.38292599, 40.77265573, 41.74754548, 46.80591846]),
 'score_time': array([0.01585126, 0.01016736, 0.03091812, 0.02097106, 0.01795244,
        0.01496029, 0.01538253, 0.01695466, 0.03291249, 0.01894832]),
 'test_accuracy': array([0.86494689, 0.86342944, 0.85887709, 0.86191199, 0.85735964,
        0.87177542, 0.85963581, 0.86646434, 0.84673748, 0.85649203]),
 'test_precision_macro': array([0.87123872, 0.86500708, 0.86225159, 0.86523053, 0.85725527,
        0.86981803, 0.86117677, 0.86845336, 0.84594393, 0.86026744]),
 'test_recall_macro': array([0.85923957, 0.8593614 , 0.85321705, 0.85756171, 0.84756283,
        0.86276777, 0.85359453, 0.86014987, 0.84047093, 0.84954364]),
 'test_f1_macro': array([0.86171751, 0.86100895, 0.85525114, 0.85950587, 0.84916469,
        0.86376013, 0.85525203, 0.86138409, 0.84125054, 0.8517437 ])}

In [88]:
# Busqueda de hiperaparametros variando la regularización y el penalty

grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(training_data_tf_norm,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in

tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}
accuracy : 0.8910044914051906


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [93]:
# Evaluación del modelo

predictions = clf_log.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Macro-Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Macro-Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('Macro-F1 score: ', format(f1_score(y_test, predictions,average='macro')))

print('Micro-Precision score: ', format(precision_score(y_test, predictions,average='micro')))
print('Micro-Recall score: ', format(recall_score(y_test, predictions,average='micro')))
print('Micro-F1 score: ', format(f1_score(y_test, predictions,average='micro')))

Accuracy score:  0.8366082492476544
Macro-Precision score:  0.8427869790558236
Macro-Recall score:  0.8336493934105711
Macro-F1 score:  0.832086303582134
Micro-Precision score:  0.8366082492476544
Micro-Recall score:  0.8366082492476544
Micro-F1 score:  0.8366082492476544


In [94]:
# Entrenamiento del algoritmo de logisrtic regression y cross validation para la
# representación vectoria de tfidf

clf_log_2 = LogisticRegression(random_state=0, max_iter=250, multi_class='multinomial').fit(training_data_tfidf_norm, y_train)
scores = cross_validate(clf_log_2, cross_validation_data_tfidf_norm, cross_validation_y_tfidf, cv=10, scoring=('accuracy','precision_macro','recall_macro','f1_macro'))
scores

{'fit_time': array([39.15028811, 39.02197862, 41.14971733, 41.85209155, 41.10620785,
        39.59886289, 37.70591044, 40.3715117 , 39.606004  , 40.76563263]),
 'score_time': array([0.01602077, 0.01561713, 0.01601577, 0.01602793, 0.01562142,
        0.03125215, 0.01762986, 0.03124857, 0.01142073, 0.01561952]),
 'test_accuracy': array([0.86494689, 0.86342944, 0.85887709, 0.86191199, 0.85735964,
        0.87177542, 0.85963581, 0.86646434, 0.84673748, 0.85649203]),
 'test_precision_macro': array([0.87123872, 0.86500708, 0.86225159, 0.86523053, 0.85725527,
        0.86981803, 0.86117677, 0.86845336, 0.84594393, 0.86026744]),
 'test_recall_macro': array([0.85923957, 0.8593614 , 0.85321705, 0.85756171, 0.84756283,
        0.86276777, 0.85359453, 0.86014987, 0.84047093, 0.84954364]),
 'test_f1_macro': array([0.86171751, 0.86100895, 0.85525114, 0.85950587, 0.84916469,
        0.86376013, 0.85525203, 0.86138409, 0.84125054, 0.8517437 ])}

In [95]:
# Busqueda de hiperaparametros variando la regularización y el penalty

grid={"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg=LogisticRegression()
logreg_cv=GridSearchCV(logreg,grid,cv=10)
logreg_cv.fit(training_data_tfidf_norm,y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 443, in _check_solver
    raise ValueError("Solver %s supports only 'l2' or 'none' penalties, "
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\camilo\anaconda3\envs\nlp\lib\site-packages\sklearn\linear_model\_logistic.py", line 1306, in

tuned hpyerparameters :(best parameters)  {'C': 1000.0, 'penalty': 'l2'}
accuracy : 0.911723116235685


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [96]:
# Evaluación del modelo

predictions = clf_log_2.predict(testing_data_tf)
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Macro-Precision score: ', format(precision_score(y_test, predictions,average='macro')))
print('Macro-Recall score: ', format(recall_score(y_test, predictions,average='macro')))
print('Macro-F1 score: ', format(f1_score(y_test, predictions,average='macro')))

print('Micro-Precision score: ', format(precision_score(y_test, predictions,average='micro')))
print('Micro-Recall score: ', format(recall_score(y_test, predictions,average='micro')))
print('Micro-F1 score: ', format(f1_score(y_test, predictions,average='micro')))

Accuracy score:  0.8378474066206408
Macro-Precision score:  0.8484108119288869
Macro-Recall score:  0.835071538343235
Macro-F1 score:  0.8337097623345013
Micro-Precision score:  0.8378474066206408
Micro-Recall score:  0.8378474066206408
Micro-F1 score:  0.8378474066206408
