In [1]:
from glob import glob
from functions import read_file, build_preprocess_pipeline
from pandas import DataFrame

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.pipeline import Pipeline

## Topic Inference

In [2]:
cates_dir = glob('data/20news-18828/*')

topics_path = {c.split('/')[-1]: glob(f'{c}/*') for c in cates_dir}

print("Total topics (classes): ", len(topics_path)) 

df = DataFrame([(k, v) for k, v in topics_path.items()], columns=['topics', 'files'])

df = df.explode('files')

print("Total files: ", len(df))

df['text'] = df['files'].apply(read_file)

df.value_counts('topics')

Total topics (classes):  20
Total files:  18828


topics
rec.sport.hockey            999
soc.religion.christian      997
rec.motorcycles             994
rec.sport.baseball          994
sci.crypt                   991
rec.autos                   990
sci.med                     990
sci.space                   987
comp.os.ms-windows.misc     985
comp.sys.ibm.pc.hardware    982
sci.electronics             981
comp.windows.x              980
comp.graphics               973
misc.forsale                972
comp.sys.mac.hardware       961
talk.politics.mideast       940
talk.politics.guns          910
alt.atheism                 799
talk.politics.misc          775
talk.religion.misc          628
Name: count, dtype: int64

In [3]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['topics'], 
                                                    train_size=0.7, random_state=42)

print("Train: ", len(x_train))
print("Test: ", len(x_test))

Train:  13179
Test:  5649


In [4]:
cnt_pipeline = build_preprocess_pipeline('count').fit(x_train)
tfidf_pipeline = build_preprocess_pipeline('tfidf').fit(x_train)

In [5]:
cv = StratifiedShuffleSplit(n_splits=10, random_state=42, 
                            test_size=1/7) # Validation is around 10% of original dataset

In [6]:
X_train_tfidf_transformed = tfidf_pipeline.transform(x_train)
X_train_cnt_transformed = cnt_pipeline.transform(x_train)

## TF - IDF

#### Regresión Logística 

In [7]:
# Entrenar un clasificador
logistic_estimator = LogisticRegression(n_jobs=-1, random_state=42, 
                                        class_weight='balanced', solver='saga',
                                        max_iter=1000, penalty='l2',
                                        tol=1e-2,
                                        )

logistic_param_grid = {
    'C': [1, 10],
}


grid_search_best_tfidf_lr_estimator = GridSearchCV(
    estimator=logistic_estimator,
    param_grid=logistic_param_grid,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
    return_train_score=False,
    refit=True
).fit(X_train_tfidf_transformed, y_train)

grid_search_best_tfidf_lr_estimator.cv_results_

{'mean_fit_time': array([3.38539026, 2.51609988]),
 'std_fit_time': array([0.26490418, 0.77298931]),
 'mean_score_time': array([0.03264205, 0.01881719]),
 'std_score_time': array([0.00356123, 0.01154198]),
 'param_C': masked_array(data=[1, 10],
              mask=[False, False],
        fill_value=999999),
 'params': [{'C': 1}, {'C': 10}],
 'split0_test_score': array([0.87930025, 0.9001992 ]),
 'split1_test_score': array([0.87784715, 0.89756575]),
 'split2_test_score': array([0.86912299, 0.88878176]),
 'split3_test_score': array([0.88287883, 0.89907949]),
 'split4_test_score': array([0.86221038, 0.88897247]),
 'split5_test_score': array([0.86454146, 0.88715902]),
 'split6_test_score': array([0.87892903, 0.89912519]),
 'split7_test_score': array([0.89251666, 0.91182899]),
 'split8_test_score': array([0.874674  , 0.89981077]),
 'split9_test_score': array([0.88293258, 0.90564576]),
 'mean_test_score': array([0.87649533, 0.89781684]),
 'std_test_score': array([0.0087127, 0.0073763]),
 'ran

In [8]:
# Evaluar el modelo
X_test_transformed_tfidf = tfidf_pipeline.transform(x_test)
y_pred = grid_search_best_tfidf_lr_estimator.predict(X_test_transformed_tfidf)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.91      0.88      0.89       249
           comp.graphics       0.80      0.86      0.83       279
 comp.os.ms-windows.misc       0.86      0.80      0.83       280
comp.sys.ibm.pc.hardware       0.80      0.84      0.82       316
   comp.sys.mac.hardware       0.91      0.88      0.90       283
          comp.windows.x       0.86      0.87      0.87       292
            misc.forsale       0.85      0.89      0.87       298
               rec.autos       0.91      0.92      0.92       305
         rec.motorcycles       0.97      0.95      0.96       276
      rec.sport.baseball       0.96      0.96      0.96       302
        rec.sport.hockey       0.96      0.98      0.97       301
               sci.crypt       0.98      0.91      0.95       301
         sci.electronics       0.88      0.90      0.89       292
                 sci.med       0.92      0.94      0.93       320
         

#### Naive Bayes

In [9]:
nb_estimator = MultinomialNB()

nb_param_grid = {
    'alpha': [0.01, 0.1, 1],
}


grid_search_best_tfidf_nb_estimator = GridSearchCV(
    estimator=nb_estimator,
    param_grid=nb_param_grid,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
    return_train_score=False,
    refit=True
).fit(X_train_tfidf_transformed, y_train)

grid_search_best_tfidf_nb_estimator.cv_results_

{'mean_fit_time': array([0.09111302, 0.07599483, 0.07769463]),
 'std_fit_time': array([0.01943291, 0.01732374, 0.02015989]),
 'mean_score_time': array([0.0183924 , 0.01903548, 0.01129441]),
 'std_score_time': array([0.00230426, 0.00402743, 0.00407774]),
 'param_alpha': masked_array(data=[0.01, 0.1, 1.0],
              mask=[False, False, False],
        fill_value=1e+20),
 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}],
 'split0_test_score': array([0.88913242, 0.88264894, 0.84739137]),
 'split1_test_score': array([0.88891316, 0.88568149, 0.85248198]),
 'split2_test_score': array([0.88056591, 0.87790917, 0.84179237]),
 'split3_test_score': array([0.8941979 , 0.89112104, 0.86179421]),
 'split4_test_score': array([0.86861596, 0.86122972, 0.8322441 ]),
 'split5_test_score': array([0.87497521, 0.8756686 , 0.84596561]),
 'split6_test_score': array([0.87336639, 0.87381385, 0.84106798]),
 'split7_test_score': array([0.88968498, 0.89033201, 0.85584369]),
 'split8_test_score': array([

In [10]:
# Evaluar el modelo
y_pred = grid_search_best_tfidf_nb_estimator.predict(X_test_transformed_tfidf)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.93      0.90      0.91       249
           comp.graphics       0.73      0.87      0.80       279
 comp.os.ms-windows.misc       0.82      0.78      0.80       280
comp.sys.ibm.pc.hardware       0.80      0.83      0.81       316
   comp.sys.mac.hardware       0.89      0.90      0.90       283
          comp.windows.x       0.86      0.88      0.87       292
            misc.forsale       0.87      0.83      0.85       298
               rec.autos       0.94      0.93      0.93       305
         rec.motorcycles       0.95      0.95      0.95       276
      rec.sport.baseball       0.96      0.95      0.95       302
        rec.sport.hockey       0.96      0.98      0.97       301
               sci.crypt       0.97      0.91      0.94       301
         sci.electronics       0.90      0.85      0.88       292
                 sci.med       0.95      0.93      0.94       320
         

## Tf

#### Regresión Logística 

In [11]:
grid_search_best_cnt_lr_estimator = GridSearchCV(
    estimator=logistic_estimator,
    param_grid=logistic_param_grid,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
    return_train_score=False,
    refit=True
).fit(X_train_cnt_transformed, y_train)

grid_search_best_cnt_lr_estimator.cv_results_

{'mean_fit_time': array([11.91115923,  8.02337012]),
 'std_fit_time': array([0.63712802, 2.59289698]),
 'mean_score_time': array([0.03099525, 0.01737478]),
 'std_score_time': array([0.00334988, 0.01030024]),
 'param_C': masked_array(data=[1, 10],
              mask=[False, False],
        fill_value=999999),
 'params': [{'C': 1}, {'C': 10}],
 'split0_test_score': array([0.84074035, 0.84515215]),
 'split1_test_score': array([0.83339301, 0.83852541]),
 'split2_test_score': array([0.82657941, 0.83288213]),
 'split3_test_score': array([0.83085266, 0.83210439]),
 'split4_test_score': array([0.81295536, 0.8159114 ]),
 'split5_test_score': array([0.81231889, 0.81689367]),
 'split6_test_score': array([0.8210875 , 0.82483526]),
 'split7_test_score': array([0.83510687, 0.84122617]),
 'split8_test_score': array([0.81969603, 0.82091896]),
 'split9_test_score': array([0.82714804, 0.82899527]),
 'mean_test_score': array([0.82598781, 0.82974448]),
 'std_test_score': array([0.00895141, 0.0095958 ]),
 

In [12]:
# Evaluar el modelo
X_test_transformed_cnt = cnt_pipeline.transform(x_test)
y_pred = grid_search_best_cnt_lr_estimator.predict(X_test_transformed_cnt)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.92      0.87      0.89       249
           comp.graphics       0.54      0.72      0.62       279
 comp.os.ms-windows.misc       0.79      0.72      0.75       280
comp.sys.ibm.pc.hardware       0.72      0.72      0.72       316
   comp.sys.mac.hardware       0.77      0.78      0.78       283
          comp.windows.x       0.63      0.76      0.69       292
            misc.forsale       0.69      0.87      0.77       298
               rec.autos       0.89      0.86      0.87       305
         rec.motorcycles       0.94      0.91      0.93       276
      rec.sport.baseball       0.88      0.88      0.88       302
        rec.sport.hockey       0.94      0.92      0.93       301
               sci.crypt       0.97      0.89      0.93       301
         sci.electronics       0.83      0.78      0.80       292
                 sci.med       0.89      0.82      0.85       320
         

#### Naive Bayes

In [13]:
grid_search_best_cnt_nb_estimator = GridSearchCV(
    estimator=nb_estimator,
    param_grid=nb_param_grid,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
    return_train_score=False,
    refit=True
).fit(X_train_cnt_transformed, y_train)

grid_search_best_cnt_nb_estimator.cv_results_

{'mean_fit_time': array([0.08484521, 0.07945714, 0.0769522 ]),
 'std_fit_time': array([0.020741  , 0.01090585, 0.01583733]),
 'mean_score_time': array([0.01946864, 0.0156971 , 0.01218255]),
 'std_score_time': array([0.00250013, 0.00177999, 0.00416096]),
 'param_alpha': masked_array(data=[0.01, 0.1, 1.0],
              mask=[False, False, False],
        fill_value=1e+20),
 'params': [{'alpha': 0.01}, {'alpha': 0.1}, {'alpha': 1}],
 'split0_test_score': array([0.83808854, 0.84154247, 0.83470547]),
 'split1_test_score': array([0.81970393, 0.82954796, 0.81935783]),
 'split2_test_score': array([0.83195907, 0.83179974, 0.81658051]),
 'split3_test_score': array([0.82412385, 0.83357243, 0.81593027]),
 'split4_test_score': array([0.81230462, 0.81370069, 0.802438  ]),
 'split5_test_score': array([0.82081095, 0.82712503, 0.81868469]),
 'split6_test_score': array([0.82564492, 0.83281951, 0.81364724]),
 'split7_test_score': array([0.83058794, 0.83873596, 0.84143887]),
 'split8_test_score': array([

In [14]:
# Evaluar el modelo
y_pred = grid_search_best_cnt_nb_estimator.predict(X_test_transformed_cnt)
print(classification_report(y_test, y_pred))

                          precision    recall  f1-score   support

             alt.atheism       0.85      0.87      0.86       249
           comp.graphics       0.64      0.72      0.67       279
 comp.os.ms-windows.misc       0.84      0.60      0.70       280
comp.sys.ibm.pc.hardware       0.67      0.78      0.72       316
   comp.sys.mac.hardware       0.72      0.82      0.77       283
          comp.windows.x       0.73      0.79      0.76       292
            misc.forsale       0.81      0.79      0.80       298
               rec.autos       0.89      0.90      0.89       305
         rec.motorcycles       0.92      0.93      0.93       276
      rec.sport.baseball       0.91      0.91      0.91       302
        rec.sport.hockey       0.95      0.94      0.95       301
               sci.crypt       0.94      0.92      0.93       301
         sci.electronics       0.76      0.80      0.78       292
                 sci.med       0.91      0.82      0.86       320
         

### Best Model

In [20]:
best_model_pipeline = Pipeline([
                            ('preprocess',tfidf_pipeline),
                            ('classifier', grid_search_best_tfidf_lr_estimator)
                        ])

In [38]:
best_model_pipeline.predict(['hi! I suffer very painful stomach pains'])

array(['sci.med'], dtype=object)