In [1]:
import pandas as pd

In [2]:
main_path = '../Datasets/CSV/Clean/'

# English
data_training_en_path = f'{main_path}data_training_en.csv'
data_test_en_path = f'{main_path}data_test_en.csv'

# Spanish
data_training_es_path = f'{main_path}data_training_es.csv'
data_test_es_path = f'{main_path}data_test_en.csv'

In [3]:
# English
data_training_en = pd.read_csv(data_training_en_path).dropna()
data_test_en = pd.read_csv(data_test_en_path).dropna()

# Spanish
data_training_es = pd.read_csv(data_training_es_path).dropna()
data_test_es = pd.read_csv(data_test_es_path).dropna()

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp, fmin, tpe, Trials
from sklearn.model_selection import cross_val_score, KFold

# Cargar los datos preprocesados
train_data = data_training_en
test_data = data_test_en

# Definir la lista de modelos a evaluar y sus espacios de hiperparámetros para hyperopt
models = {
    'logistic_regression': {
        'model': LogisticRegression,
        'space': {
            'C': hp.loguniform('C', -10, 10),
            'penalty': hp.choice('penalty', ['l1', 'l2']),
            'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear', 'saga']),
            'max_iter': hp.choice('max_iter', [1000, 5000, 10000])
        }
    },
    'linear_svc': {
        'model': LinearSVC,
        'space': {
            'C': hp.loguniform('C', -10, 10),
            'penalty': hp.choice('penalty', ['l1', 'l2']),
            'max_iter': hp.choice('max_iter', [1000, 5000, 10000])
        }
    },
    'naive_bayes': {
        'model': MultinomialNB,
        'space': {
            'alpha': hp.loguniform('alpha', -10, 10)
        }
    },
    'random_forest': {
        'model': RandomForestClassifier,
        'space': {
            'n_estimators': hp.quniform('n_estimators', 50, 500, 10),
            'max_depth': hp.quniform('max_depth', 5, 50, 1)
        }
    }
}

# Definir el vectorizador TF-IDF con distintas frecuencias mínimas de aparición
min_df_values = [1, 2, 3, 4, 5]
vectorizers = {min_df: TfidfVectorizer(min_df=min_df) for min_df in min_df_values}

# Función para entrenar y evaluar un modelo con cross-validation
def evaluate_model(model_name, X, y):
    model_info = models[model_name]
    model_class = model_info['model']
    space = model_info['space']
    clf = fmin(
        fn=lambda params: -cross_val_score(model_class(**params), X, y, cv=KFold(10)).mean(),
        space=space,
        algo=tpe.suggest,
        max_evals=100,
        trials=Trials()
    )
    print('Best hyperparameters for {} model: {}'.format(model_name, clf))
    best_model = model_class(**clf)
    scores = cross_val_score(best_model, X, y, cv=KFold(10))
    print('Cross-validation scores for {} model: {}'.format(model_name, scores))
    print('Mean score: {}'.format(scores.mean()))
    print('Standard deviation: {}'.format(scores.std()))

# Entrenar y evaluar los modelos para cada frecuencia mínima de aparición en el vectorizador
for min_df, vectorizer in vectorizers.items():
    X_train = vectorizer.fit_transform(train_data['tweet'])
    y_train = train_data['author']
    print('Evaluating models for min_df={}'.format(min_df))
    for model_name in models:
        evaluate_model(model_name, X_train, y_train)

Evaluating models for min_df=1
  1%|          | 1/100 [00:00<00:23,  4.28trial/s, best loss=?]

10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
Value

  4%|▍         | 4/100 [00:32<12:33,  7.85s/trial, best loss: -0.759831782212711] 

10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
Value

  5%|▌         | 5/100 [02:04<1:00:04, 37.95s/trial, best loss: -0.8573831682675559]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver o

  6%|▌         | 6/100 [02:59<1:08:39, 43.83s/trial, best loss: -0.8573831682675559]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



  8%|▊         | 8/100 [03:18<38:37, 25.19s/trial, best loss: -0.8573831682675559]  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver o

  9%|▉         | 9/100 [04:12<51:58, 34.27s/trial, best loss: -0.8573831682675559]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

