In [1]:
!git clone https://github.com/CCogS-Mx/text-preprocessing.git

Cloning into 'text-preprocessing'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 54 (delta 16), reused 44 (delta 11), pack-reused 0[K
Unpacking objects: 100% (54/54), 21.99 KiB | 1.37 MiB/s, done.


## Imports

In [1]:
import sys
sys.path.append('text-preprocessing/script/')
sys.path.append('Scripts/')

In [2]:
import os
import pandas as pd
import numpy as np

# Import classes
import text_preprocessing as tp
import feature_extraction as fe
import robust_classic_model as rcm
import cleaning_twitter_data as ctd

# Algoritmos de ML
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Vectorizadores
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qk_le\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qk_le\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qk_le\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data

In [3]:
main_path = 'Data/'
training = pd.read_csv(f'{main_path}homomex_training.csv').drop('index', axis=1)

In [4]:
training.head()

Unnamed: 0,tweets,label
0,"Me quise ligar a una chava ayer y no me pelo, ...",P
1,"@papaya_rockera eres un puñal, Papayita.",P
2,Magnate ofrece 130 mdd al hombre que conquiste...,P
3,Los trolebuses del desgobierno de @EPN son idi...,P
4,"En época de Hitler no se decía ""eres gay"" y, s...",P


In [5]:
prep = tp.Preprocessing('spanish')

In [6]:
training_p = prep.main_preprocess(data=training, 
                                  column='tweets', 
                                  tweet=True, 
                                  tweet_tags=True, 
                                  remove_stop_words=False, 
                                  lemmatize=False, 
                                  translate_emojis=True)

In [7]:
training_p.head()

Unnamed: 0,tweets,label
0,me quise ligar a una chava ayer y no me pelo l...,P
1,eres un punal papayita,P
2,magnate ofrece mdd al hombre que conquiste a s...,P
3,los trolebuses del desgobierno de son idiotas ...,P
4,en epoca de hitler no se decia eres gay y si e...,P


In [8]:
training_p.label = training_p.label.fillna('NR')

In [9]:
dictionary_list = [{'P': 0, 'NP': 1, 'NR': 2}]

In [10]:
training_p.label.replace(list(dictionary_list[0].keys()), 
            list(dictionary_list[0].values()), 
            inplace=True)

In [11]:
training_p.label.value_counts()

label
1    4360
2    1778
0     862
Name: count, dtype: int64

## Finding best hyperparams (TP)


In [17]:
from sklearn.model_selection import KFold, GridSearchCV

In [18]:
models = [LogisticRegression(), RandomForestClassifier(), LinearSVC(), KNeighborsClassifier()]
model_names = ['Regresión logística', 'Random Forest', 'SVM Lineal', 'KNN']
x_label_column = 'tweets'
y_label_column = 'label'
folds = 5
grid_params  = [{"C":np.logspace(-3,3,7), "penalty":["l2"], "max_iter": [1000, 10000], "random_state":[42]},
                {"n_estimators": [10, 50, 100], "max_depth":[10, 50, 100], "random_state": [42]},
                {"C":np.logspace(-3,3,7), "penalty":["l2"], "max_iter": [1000, 10000], "random_state":[42]},
                {"n_neighbors":[3], "weights":['uniform', 'distance']}]
path_to_save_models = ''
path_to_save_vectorizers = ''
vectorizers = [CountVectorizer(ngram_range = (1,1), lowercase = False), TfidfVectorizer(ngram_range = (1,1), lowercase = False)]
vectorizers_names = ['CountVectorizer', 'TfidfVectorizer']

In [20]:
for vectorizer in vectorizers:
    print(f'Embedding: {vectorizers_names[vectorizers.index(vectorizer)]}')
    X_train = vectorizer.fit_transform(training_p[x_label_column])
    y_train = training_p[y_label_column].tolist()
    for j, model in enumerate(models):
        print(f'Model {j}: {model_names[j]}')
        kFold_cv = KFold(n_splits=folds,
                        shuffle=True, 
                        random_state=42)
        param_grid = grid_params
        grid_search = GridSearchCV(model,
                                param_grid[j],
                                cv=kFold_cv.split(X_train, y_train),
                                scoring='f1_macro',
                                verbose=3,
                                return_train_score=True)
        grid_search.fit(X_train, y_train)

        print("Best parameters: ", grid_search.best_params_)
        print("Accuracy :", grid_search.best_score_)

Embedding: CountVectorizer
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.306, test=0.297) total time=   0.1s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.316, test=0.306) total time=   0.3s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.302, test=0.293) total time=   0.1s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.311, test=0.295) total time=   0.2s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.287, test=0.294) total time=   0.2s
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.306, test=0.297) total time=   0.1s
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.316, test=0.306) total time=   0.2s
[CV 3/5] END C=0.001, max_iter=10000, penalty=l2, rando



[CV 1/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.960, test=0.737) total time=   0.5s




[CV 2/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.957, test=0.753) total time=   0.5s




[CV 3/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.959, test=0.762) total time=   0.5s




[CV 4/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.957, test=0.746) total time=   0.5s




[CV 5/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.957, test=0.749) total time=   0.5s
[CV 1/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.960, test=0.737) total time=   0.5s
[CV 2/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.957, test=0.753) total time=   0.6s
[CV 3/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.959, test=0.762) total time=   0.8s
[CV 4/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.957, test=0.746) total time=   0.7s
[CV 5/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.957, test=0.749) total time=   0.7s




[CV 1/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.995, test=0.712) total time=   0.4s




[CV 2/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.994, test=0.743) total time=   0.5s




[CV 3/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.994, test=0.757) total time=   0.5s




[CV 4/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.996, test=0.729) total time=   0.5s




[CV 5/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.993, test=0.739) total time=   0.5s
[CV 1/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.995, test=0.712) total time=   1.9s
[CV 2/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.994, test=0.743) total time=   2.3s
[CV 3/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.994, test=0.755) total time=   2.4s
[CV 4/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.996, test=0.732) total time=   2.4s
[CV 5/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.993, test=0.739) total time=   2.1s




[CV 1/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.670) total time=   0.4s




[CV 2/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.700) total time=   0.5s




[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.715) total time=   0.4s




[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.706) total time=   0.4s




[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.687) total time=   0.4s




[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.671) total time=   3.0s




[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.702) total time=   3.8s
[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.709) total time=   3.0s




[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.708) total time=   3.2s




[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.682) total time=   3.3s




[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.638) total time=   0.4s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.688) total time=   0.4s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.672) total time=   0.4s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.670) total time=   0.4s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.670) total time=   0.4s




[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.640) total time=   3.5s




[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.684) total time=   4.0s




[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.670) total time=   3.8s




[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.674) total time=   3.3s




[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.670) total time=   3.8s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.990, test=0.624) total time=   0.4s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.678) total time=   0.4s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.992, test=0.657) total time=   0.4s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.995, test=0.670) total time=   0.4s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.995, test=0.662) total time=   0.4s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.984, test=0.624) total time=   3.4s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.976, test=0.638) total time=   3.6s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.987, test=0.649) total time=   3.5s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.984, test=0.650) total time=   3.6s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.970, test=0.634) total time=   3.4s




Best parameters:  {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
Accuracy : 0.7493244480246294
Model 3: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.686, test=0.501) total time=   0.2s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.699, test=0.496) total time=   0.2s
[CV 3/5] END n_neighbors=3, weights=uniform;, score=(train=0.681, test=0.495) total time=   0.2s
[CV 4/5] END n_neighbors=3, weights=uniform;, score=(train=0.672, test=0.512) total time=   0.2s
[CV 5/5] END n_neighbors=3, weights=uniform;, score=(train=0.683, test=0.485) total time=   0.2s
[CV 1/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.523) total time=   0.2s
[CV 2/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.522) total time=   0.2s
[CV 3/5] END n_neighbors=3, weights=distance;, score=(train=1.000, test=0.500) total time=   0.2s
[CV 4/5] END n_neighbors=3, weight



[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.668) total time=   0.7s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.702) total time=   0.7s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.693) total time=   0.7s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.684) total time=   0.7s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.694) total time=   0.8s
[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.667) total time=   3.0s
[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.703) total time=   3.0s
[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.691) total time=   2.7s
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.681) total time=   3.0s
[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.691) total time=   3.3s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.656) total time=   0.7s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.692) total time=   0.7s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.684) total time=   0.7s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.660) total time=   0.7s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.675) total time=   0.7s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.650) total time=   6.1s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.696) total time=   6.6s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.685) total time=   6.1s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.663) total time=   6.1s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.674) total time=   6.3s
Best parameters:  {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
Accuracy : 0.7363699056755008
Model 3: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.782, test=0.583) total time=   0.8s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.777, test=0.625) total time=   0.6s
[CV 3/5] END n_neighbors=3, weights=uniform;, score=(train=0.773, test=0.587) total time=   0.6s
[CV 4/5] END n_neighbors=3, weights=uniform;, score=(train=0.772, test=0.615) total time=   0.6s
[CV 5/5] END n_neighbors=3, weights=uniform;, score=(train=0.771, test=0.576) total time=   0.6s
[CV 1/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.594) total time=   0.5s
[CV 2/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.630) total time=   0.5s
[CV 3/5] END

In [21]:
vectorizers = [CountVectorizer(ngram_range = (1,2), lowercase = False), TfidfVectorizer(ngram_range = (1,2), lowercase = False)]
for vectorizer in vectorizers:
    print(f'Embedding: {vectorizers_names[vectorizers.index(vectorizer)]}')
    X_train = vectorizer.fit_transform(training_p[x_label_column])
    y_train = training_p[y_label_column].tolist()
    for j, model in enumerate(models):
        print(f'Model {j}: {model_names[j]}')
        kFold_cv = KFold(n_splits=folds,
                        shuffle=True, 
                        random_state=42)
        param_grid = grid_params
        grid_search = GridSearchCV(model,
                                param_grid[j],
                                cv=kFold_cv.split(X_train, y_train),
                                scoring='f1_macro',
                                verbose=3,
                                return_train_score=True)
        grid_search.fit(X_train, y_train)

        print("Best parameters: ", grid_search.best_params_)
        print("Accuracy :", grid_search.best_score_)

Embedding: CountVectorizer
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.317, test=0.308) total time=   1.2s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.325, test=0.307) total time=   1.1s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.311, test=0.297) total time=   1.1s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.323, test=0.299) total time=   1.1s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.302, test=0.295) total time=   1.2s
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.317, test=0.308) total time=   1.2s
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.325, test=0.307) total time=   1.2s
[CV 3/5] END C=0.001, max_iter=10000, penalty=l2, rando



[CV 2/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.996, test=0.755) total time=   0.9s




[CV 3/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.765) total time=   1.0s




[CV 4/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.749) total time=   1.0s




[CV 5/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.752) total time=   1.0s
[CV 1/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.740) total time=   0.6s
[CV 2/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.996, test=0.755) total time=   1.4s
[CV 3/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.765) total time=   1.1s
[CV 4/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.749) total time=   1.2s
[CV 5/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.752) total time=   1.3s




[CV 1/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.736) total time=   0.9s




[CV 2/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.756) total time=   0.9s




[CV 3/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.752) total time=   1.0s




[CV 4/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.741) total time=   0.9s




[CV 5/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.747) total time=   1.0s
[CV 1/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.736) total time=   1.7s
[CV 2/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.757) total time=   2.5s
[CV 3/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.752) total time=   2.7s
[CV 4/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.741) total time=   2.7s
[CV 5/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.748) total time=   3.9s




[CV 1/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.733) total time=   1.0s




[CV 2/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.752) total time=   1.0s




[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.738) total time=   1.0s




[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.744) total time=   1.0s




[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.741) total time=   1.0s
[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.734) total time=   4.9s
[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.754) total time=   8.3s




[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.738) total time=   6.7s
[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.744) total time=   4.9s
[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.740) total time=   8.7s




[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.729) total time=   1.1s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.749) total time=   1.1s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.733) total time=   1.0s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.741) total time=   1.0s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.739) total time=   1.1s




[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.728) total time=   9.3s




[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.751) total time=  10.0s




[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.735) total time=   8.8s
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.742) total time=   6.5s




[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.738) total time=  10.0s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.727) total time=   1.1s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.753) total time=   1.1s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.732) total time=   1.0s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.744) total time=   1.2s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.741) total time=   1.1s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.728) total time=  10.5s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.752) total time=  10.9s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.735) total time=   9.1s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.745) total time=   8.6s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.737) total time=  10.7s




Best parameters:  {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
Accuracy : 0.7522048391548936
Model 3: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.542, test=0.400) total time=   0.2s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.551, test=0.380) total time=   0.2s
[CV 3/5] END n_neighbors=3, weights=uniform;, score=(train=0.531, test=0.360) total time=   0.2s
[CV 4/5] END n_neighbors=3, weights=uniform;, score=(train=0.541, test=0.400) total time=   0.2s
[CV 5/5] END n_neighbors=3, weights=uniform;, score=(train=0.547, test=0.388) total time=   0.2s
[CV 1/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.420) total time=   0.2s
[CV 2/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.401) total time=   0.2s
[CV 3/5] END n_neighbors=3, weights=distance;, score=(train=1.000, test=0.375) total time=   0.2s
[CV 4/5] END n_neighbors=3, weight



[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.707) total time=   1.7s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.739) total time=   1.7s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.727) total time=   1.7s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.732) total time=   1.8s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=   1.7s
[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.707) total time=   5.7s
[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.738) total time=   5.6s
[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.728) total time=   4.5s
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.732) total time=   4.5s
[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=   5.6s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.709) total time=   1.9s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.739) total time=   1.8s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.726) total time=   1.8s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.736) total time=   1.7s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.729) total time=   1.8s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.709) total time=  17.3s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.741) total time=  17.4s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.726) total time=  15.0s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.733) total time=  14.4s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.729) total time=  17.5s
Best parameters:  {'C': 10.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
Accuracy : 0.7298221323032593
Model 3: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.783, test=0.593) total time=   1.8s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.781, test=0.588) total time=   1.8s
[CV 3/5] END n_neighbors=3, weights=uniform;, score=(train=0.782, test=0.568) total time=   1.8s
[CV 4/5] END n_neighbors=3, weights=uniform;, score=(train=0.773, test=0.612) total time=   1.8s
[CV 5/5] END n_neighbors=3, weights=uniform;, score=(train=0.776, test=0.587) total time=   1.9s
[CV 1/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.611) total time=   1.7s
[CV 2/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.592) total time=   1.8s
[CV 3/5] EN

In [22]:
vectorizers = [CountVectorizer(ngram_range = (1,3), lowercase = False), TfidfVectorizer(ngram_range = (1,3), lowercase = False)]
for vectorizer in vectorizers:
    print(f'Embedding: {vectorizers_names[vectorizers.index(vectorizer)]}')
    X_train = vectorizer.fit_transform(training_p[x_label_column])
    y_train = training_p[y_label_column].tolist()
    for j, model in enumerate(models):
        print(f'Model {j}: {model_names[j]}')
        kFold_cv = KFold(n_splits=folds,
                        shuffle=True, 
                        random_state=42)
        param_grid = grid_params
        grid_search = GridSearchCV(model,
                                param_grid[j],
                                cv=kFold_cv.split(X_train, y_train),
                                scoring='f1_macro',
                                verbose=3,
                                return_train_score=True)
        grid_search.fit(X_train, y_train)

        print("Best parameters: ", grid_search.best_params_)
        print("Accuracy :", grid_search.best_score_)

Embedding: CountVectorizer
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.326, test=0.300) total time=   2.7s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.334, test=0.314) total time=   2.6s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.318, test=0.294) total time=   2.5s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.331, test=0.306) total time=   2.7s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.305, test=0.293) total time=   2.5s
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.326, test=0.300) total time=   2.8s
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.334, test=0.314) total time=   2.6s
[CV 3/5] END C=0.001, max_iter=10000, penalty=l2, rando



[CV 2/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.755) total time=   1.5s




[CV 3/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.762) total time=   1.8s




[CV 4/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.753) total time=   1.6s




[CV 5/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.756) total time=   1.7s
[CV 1/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.742) total time=   0.8s
[CV 2/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.755) total time=   2.5s
[CV 3/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.762) total time=   2.1s
[CV 4/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.753) total time=   1.9s
[CV 5/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.756) total time=   3.9s




[CV 1/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.737) total time=   1.4s




[CV 2/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.763) total time=   1.8s




[CV 3/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.749) total time=   1.7s




[CV 4/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.742) total time=   1.7s




[CV 5/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.752) total time=   1.8s
[CV 1/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.737) total time=   2.0s
[CV 2/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.763) total time=   3.9s
[CV 3/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.749) total time=   3.8s
[CV 4/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.743) total time=   3.7s
[CV 5/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.752) total time=   4.4s




[CV 1/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=   1.7s




[CV 2/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.754) total time=   1.8s




[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.734) total time=   1.8s




[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.739) total time=   1.7s




[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.740) total time=   1.8s
[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.727) total time=  10.4s




[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.755) total time=  15.1s




[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.734) total time=  12.1s
[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.740) total time=   9.8s




[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.740) total time=  13.6s




[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.729) total time=   1.9s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.751) total time=   1.9s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.726) total time=   1.7s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.741) total time=   1.9s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.735) total time=   1.9s




[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.727) total time=  15.3s




[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.753) total time=  16.6s




[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.727) total time=  14.9s
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.741) total time=  10.4s




[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.736) total time=  16.6s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.728) total time=   2.0s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.755) total time=   2.0s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.725) total time=   1.8s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.743) total time=   1.9s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.738) total time=   2.0s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.729) total time=  18.0s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.753) total time=  18.2s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.725) total time=  14.5s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.743) total time=  14.4s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.736) total time=  17.9s




Best parameters:  {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
Accuracy : 0.7537197811879056
Model 3: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.503, test=0.354) total time=   0.2s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.512, test=0.336) total time=   0.2s
[CV 3/5] END n_neighbors=3, weights=uniform;, score=(train=0.492, test=0.311) total time=   0.2s
[CV 4/5] END n_neighbors=3, weights=uniform;, score=(train=0.508, test=0.350) total time=   0.2s
[CV 5/5] END n_neighbors=3, weights=uniform;, score=(train=0.511, test=0.344) total time=   0.2s
[CV 1/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.378) total time=   0.2s
[CV 2/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.363) total time=   0.2s
[CV 3/5] END n_neighbors=3, weights=distance;, score=(train=1.000, test=0.322) total time=   0.2s
[CV 4/5] END n_neighbors=3, weight



[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.686) total time=   3.8s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.715) total time=   3.4s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.707) total time=   3.7s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.714) total time=   3.4s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.704) total time=   3.4s
[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.686) total time=   9.3s
[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.715) total time=   9.1s
[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.707) total time=   7.0s
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.714) total time=   7.0s
[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.704) total time=   9.2s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.689) total time=   3.0s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.719) total time=   3.1s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.708) total time=   3.4s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.717) total time=   3.5s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.703) total time=   3.1s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.687) total time=  30.7s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.719) total time=  28.1s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.706) total time=  21.9s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.718) total time=  22.1s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.704) total time=  27.8s




Best parameters:  {'C': 1000.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
Accuracy : 0.7072538338806782
Model 3: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.783, test=0.596) total time=   3.3s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.775, test=0.601) total time=   3.3s
[CV 3/5] END n_neighbors=3, weights=uniform;, score=(train=0.771, test=0.593) total time=   3.3s
[CV 4/5] END n_neighbors=3, weights=uniform;, score=(train=0.772, test=0.620) total time=   3.4s
[CV 5/5] END n_neighbors=3, weights=uniform;, score=(train=0.775, test=0.596) total time=   3.6s
[CV 1/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.610) total time=   3.4s
[CV 2/5] END n_neighbors=3, weights=distance;, score=(train=0.999, test=0.606) total time=   3.4s
[CV 3/5] END n_neighbors=3, weights=distance;, score=(train=1.000, test=0.607) total time=   3.5s
[CV 4/5] END n_neighbors=3, wei

## Finding best hyperparams (TP-LPTO)

In [13]:
models = [LogisticRegression(), RandomForestClassifier(), LinearSVC(), MultinomialNB(), KNeighborsClassifier()]
model_names = ['Regresión logística', 'Random Forest', 'SVM Lineal', 'NB Multinomial', 'KNN']
x_label_column = 'tweets'
y_label_column = 'label'
folds = 5
grid_params  = [{"C":np.logspace(-3,3,7), "penalty":["l2"], "max_iter": [1000, 10000], "random_state":[42]},
                {"n_estimators": [10, 50, 100], "max_depth":[10, 50, 100], "random_state": [42]},
                {"C":np.logspace(-3,3,7), "penalty":["l2"], "max_iter": [1000, 10000], "random_state":[42]},
                {},
                {"n_neighbors":[3], "weights":['uniform', 'distance']}]
path_to_save_models = ''
path_to_save_vectorizers = ''

In [None]:
for i in range(2):
  if i == 0:
    print('Count vectorizer')
    vectorizer = CountVectorizer(ngram_range = (1,1), lowercase = False)
    oth_feats_vectorizer = CountVectorizer(ngram_range = (1,1), lowercase = False)
  else:
    print('TF-IDF')
    vectorizer = TfidfVectorizer(ngram_range = (1,1), lowercase = False)
    oth_feats_vectorizer = TfidfVectorizer(ngram_range = (1,1), lowercase = False)

  ml_homo = rcm.RobustMachineLearningModel(models=models,
                                       model_names=model_names,
                                       train_data=training_p,
                                       x_label_column=x_label_column,
                                       y_label_column=y_label_column,
                                       folds=folds,
                                       grid_params=grid_params,
                                       path_to_save_models=path_to_save_models,
                                       path_to_save_vectorizer=path_to_save_vectorizers,
                                       lemma=True,
                                       pos=True,
                                       tag=True,
                                       other_features=True,
                                       vectorizer=vectorizer,
                                       oth_feats_vectorizer=oth_feats_vectorizer,
                                       language='es')
  ml_homo.get_best_cross_validation()

n-gram range: (1, 1)
Count vectorizer


  model (Model): The parent model.


Lemma: (7000, 13822)
POS: (7000, 16)
TAG: (7000, 15)
Other features: (7000, 760)
Final vector shape: (7000, 14613)
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.335, test=0.331) total time=  26.6s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.348, test=0.336) total time=  26.2s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.329, test=0.307) total time=  25.0s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.342, test=0.340) total time=  24.1s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.327, test=0.323) total time=  24.1s
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.335, test=0.331) total time=  27.2s
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.348

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time= 5.5min
[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.714) total time= 5.3min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.716) total time= 5.5min
[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.698) total time= 5.0min
[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.712) total time= 5.4min
[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time= 5.5min
[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.714) total time= 5.3min
[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.716) total time= 5.9min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.682) total time= 5.5min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.696) total time= 5.5min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.709) total time= 5.5min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.698) total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.695) total time= 5.5min
[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.682) total time= 6.4min
[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.698) total time= 7.5min
[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.709) total time= 6.2min
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.699) total time= 6.8min
[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.695) total time= 7.3min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.680) total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.685) total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.694) total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.690) total time= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.682) total time= 5.4min
[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.679) total time= 9.3min
[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.685) total time= 9.7min
[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.695) total time= 9.0min
[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.691) total time= 8.2min
[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.678) total time= 9.7min
tuned hpyerparameters :(best parameters)  {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
accuracy : 0.7221968716840138
Model 1: Random Forest
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=10, n_estimators=10, random_state=42;



[CV 4/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.826, test=0.713) total time=   2.7s
[CV 5/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.831, test=0.711) total time=   2.6s
[CV 1/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.832, test=0.726) total time=   2.6s
[CV 2/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.828, test=0.708) total time=   2.4s
[CV 3/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.831, test=0.711) total time=   2.4s
[CV 4/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.826, test=0.713) total time=   2.7s
[CV 5/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.831, test=0.711) total time=   2.6s




[CV 1/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.965, test=0.734) total time=   2.7s




[CV 2/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.959, test=0.730) total time=   2.9s




[CV 3/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.964, test=0.756) total time=   2.8s




[CV 4/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.967, test=0.725) total time=   2.7s




[CV 5/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.964, test=0.718) total time=   2.8s
[CV 1/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.965, test=0.734) total time=   7.1s
[CV 2/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.960, test=0.730) total time=   7.2s
[CV 3/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.965, test=0.758) total time=   6.3s
[CV 4/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.966, test=0.725) total time=   6.5s
[CV 5/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.964, test=0.718) total time=   6.2s




[CV 1/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.694) total time=   2.5s




[CV 2/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.996, test=0.702) total time=   2.4s




[CV 3/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.721) total time=   2.4s




[CV 4/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.708) total time=   2.4s




[CV 5/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.997, test=0.694) total time=   2.5s
[CV 1/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.694) total time=   6.6s
[CV 2/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.996, test=0.701) total time=  10.6s
[CV 3/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.722) total time=   9.2s
[CV 4/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.708) total time=  11.5s
[CV 5/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.997, test=0.696) total time=  11.2s




[CV 1/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.666) total time=   2.3s




[CV 2/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.684) total time=   2.4s




[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.689) total time=   2.4s




[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.682) total time=   2.5s




[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.672) total time=   2.3s




[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.667) total time=  10.0s




[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.683) total time=  12.1s




[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.691) total time=  11.6s




[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.681) total time=  11.3s




[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.673) total time=  11.9s




[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.655) total time=   2.3s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.671) total time=   2.4s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.672) total time=   2.4s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.675) total time=   2.5s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.655) total time=   2.4s




[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.663) total time=  11.0s




[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.669) total time=  11.3s




[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.666) total time=  11.4s




[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.663) total time=  11.1s




[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.647) total time=  11.0s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.660) total time=   2.3s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.672) total time=   2.4s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.670) total time=   2.3s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.671) total time=   2.4s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.647) total time=   2.6s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.993, test=0.657) total time=  11.0s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.995, test=0.662) total time=  11.3s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.660) total time=  11.1s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.994, test=0.640) total time=  10.9s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.994, test=0.637) total time=  10.9s
tuned hpyerparameters :(best parameters)  {'C': 0.1, 'max_iter': 10000, 'penalty': 'l2', 'random_state': 42}
accuracy : 0.7330444551839149
Model 3: NB Multinomial
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..............., score=(train=0.741, test=0.602) total time=   1.7s
[CV 2/5] END ..............., score=(train=0.736, test=0.612) total time=   1.8s
[CV 3/5] END ..............., score=(train=0.738, test=0.607) total time=   1.7s
[CV 4/5] END ..............., score=(train=0.735, test=0.615) total time=   1.8s
[CV 5/5] END ..............., score=(train=0.743, test=0.614) total time=   1.7s
tuned hpyerparameters :(best parameters)  {}
accuracy : 0.6099182209230831
Model 4: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.675, test=0.435) total time=   4.5s
[CV 2/5]

  model (Model): The parent model.


Lemma: (7000, 13822)
POS: (7000, 16)
TAG: (7000, 15)
Other features: (7000, 760)
Final vector shape: (7000, 14613)
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.256, test=0.255) total time=   4.4s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.256, test=0.257) total time=   5.7s
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.256, test=0.255) total time=   4.4s
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.256, test=0.255) total time=   4.0s
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.255, test=0.258) total time=   5.4s
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.256, test=0.255) total time=   4.9s
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.256



[CV 1/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.668) total time=   2.3s




[CV 2/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.678) total time=   2.2s




[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.680) total time=   2.1s




[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.690) total time=   2.2s




[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.998, test=0.668) total time=   2.2s
[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.668) total time=   2.4s
[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.678) total time=   2.4s
[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.680) total time=   2.4s
[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.690) total time=   2.3s
[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.998, test=0.668) total time=   2.4s




[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.656) total time=   2.3s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.670) total time=   2.4s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.657) total time=   2.4s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.669) total time=   2.3s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.653) total time=   2.3s




[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.656) total time=  14.4s




[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.671) total time=  14.4s
[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.662) total time=  12.7s




[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.668) total time=  12.2s




[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.647) total time=  14.2s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.656) total time=   2.5s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.668) total time=   2.3s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.648) total time=   2.3s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.661) total time=   2.2s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.643) total time=   2.4s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.656) total time=  14.7s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.673) total time=  14.8s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.654) total time=  13.6s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.661) total time=  14.7s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.638) total time=  14.5s
tuned hpyerparameters :(best parameters)  {'C': 1.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
accuracy : 0.7015631730644489
Model 3: NB Multinomial
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..............., score=(train=0.369, test=0.311) total time=   0.6s
[CV 2/5] END ..............., score=(train=0.373, test=0.327) total time=   0.6s
[CV 3/5] END ..............., score=(train=0.365, test=0.318) total time=   0.7s
[CV 4/5] END ..............., score=(train=0.370, test=0.321) total time=   0.5s
[CV 5/5] END ..............., score=(train=0.367, test=0.323) total time=   0.5s
tuned hpyerparameters :(best parameters)  {}
accuracy : 0.3199280850767515
Model 4: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.682, test=0.401) total time=   4.7s
[CV 2/5] 

  model (Model): The parent model.


Lemma: (7000, 79388)
POS: (7000, 216)
TAG: (7000, 204)
Other features: (7000, 7304)
Final vector shape: (7000, 87112)
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.385, test=0.365) total time= 2.2min
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.390, test=0.361) total time= 2.3min
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.382, test=0.347) total time= 2.0min
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.384, test=0.368) total time= 2.5min
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.369, test=0.372) total time= 2.2min
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.385, test=0.365) total time= 2.2min
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.701) total time=26.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.711) total time=26.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.712) total time=29.0min
[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.704) total time=28.9min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.721) total time=31.6min
[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.701) total time=35.5min
[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.708) total time=40.2min
[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.713) total time=39.1min


In [14]:
for i in range(2):
  if i == 0:
    print('Count vectorizer')
    vectorizer = CountVectorizer(ngram_range = (1,2), lowercase = False)
    oth_feats_vectorizer = CountVectorizer(ngram_range = (1,2), lowercase = False)
  else:
    print('TF-IDF')
    vectorizer = TfidfVectorizer(ngram_range = (1,2), lowercase = False)
    oth_feats_vectorizer = TfidfVectorizer(ngram_range = (1,2), lowercase = False)

  ml_homo = rcm.RobustMachineLearningModel(models=models,
                                       model_names=model_names,
                                       train_data=training_p,
                                       x_label_column=x_label_column,
                                       y_label_column=y_label_column,
                                       folds=folds,
                                       grid_params=grid_params,
                                       path_to_save_models=path_to_save_models,
                                       path_to_save_vectorizer=path_to_save_vectorizers,
                                       lemma=True,
                                       pos=True,
                                       tag=True,
                                       other_features=True,
                                       vectorizer=vectorizer,
                                       oth_feats_vectorizer=oth_feats_vectorizer,
                                       language='es')
  ml_homo.get_best_cross_validation()

Count vectorizer
Lemma: (7000, 79502)
POS: (7000, 218)
TAG: (7000, 200)
Other features: (7000, 7304)
Final vector shape: (7000, 87224)
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.387, test=0.368) total time= 1.1min
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.397, test=0.362) total time= 1.3min
[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.379, test=0.338) total time= 1.1min
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.388, test=0.377) total time= 1.2min
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.366, test=0.364) total time= 1.2min
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.387, test=0.368) total time= 1.2min
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.714) total time=12.7min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=13.2min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.723) total time=12.8min
[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.713) total time=12.1min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.719) total time=12.6min
[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.714) total time=19.0min
[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=19.0min
[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.721) total time=17.1min
[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.713) total time=11.7min
[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.721) total time=15.9min
tuned hpyerparameters :(best parameters)  {'C': 10.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
accuracy : 0.7201946986285808
Model 1: Random Forest
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_depth=10, n_estimators=10, random_state=42



[CV 2/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.939, test=0.717) total time=   9.6s
[CV 3/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.945, test=0.708) total time=   8.9s
[CV 4/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.945, test=0.707) total time=   9.4s




[CV 5/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.945, test=0.721) total time=   9.6s
[CV 1/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.946, test=0.709) total time=   8.9s
[CV 2/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.939, test=0.717) total time=   9.7s
[CV 3/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.945, test=0.708) total time=   9.0s
[CV 4/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.945, test=0.707) total time=   9.5s
[CV 5/5] END C=0.01, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.945, test=0.721) total time=   9.6s




[CV 1/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.724) total time=   9.1s




[CV 2/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.739) total time=   9.2s




[CV 3/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time=   9.1s




[CV 4/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=   9.0s




[CV 5/5] END C=0.1, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.738) total time=   9.0s
[CV 1/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.724) total time=  11.8s
[CV 2/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.741) total time=  12.8s
[CV 3/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time=  12.0s
[CV 4/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.730) total time=  13.1s
[CV 5/5] END C=0.1, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.736) total time=  12.9s




[CV 1/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.720) total time=   9.0s




[CV 2/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.726) total time=   9.2s




[CV 3/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.730) total time=   9.1s




[CV 4/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.724) total time=   8.8s




[CV 5/5] END C=1.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.727) total time=   8.9s
[CV 1/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.720) total time=  15.1s
[CV 2/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.726) total time=  17.7s
[CV 3/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.730) total time=  16.4s
[CV 4/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.724) total time=  16.9s
[CV 5/5] END C=1.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.726) total time=  16.0s




[CV 1/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.717) total time=   9.1s




[CV 2/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.724) total time=   9.1s




[CV 3/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.728) total time=   8.9s




[CV 4/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.719) total time=   8.7s




[CV 5/5] END C=10.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.720) total time=   9.3s




[CV 1/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.722) total time=  25.2s




[CV 2/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time=  27.0s




[CV 3/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.728) total time=  27.0s




[CV 4/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.718) total time=  25.1s




[CV 5/5] END C=10.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.722) total time=  28.1s




[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.719) total time=   9.1s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time=   9.2s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.729) total time=   9.3s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.714) total time=   9.0s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.724) total time=   9.2s




[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.721) total time=  28.6s




[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.725) total time=  29.7s




[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.726) total time=  28.1s




[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.717) total time=  26.8s




[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.721) total time=  29.4s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.722) total time=   9.2s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.724) total time=   9.2s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.727) total time=   9.0s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.717) total time=   9.1s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.718) total time=   9.2s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.726) total time=  30.6s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.726) total time=  29.8s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.726) total time=  24.9s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.714) total time=  27.0s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.720) total time=  30.5s




tuned hpyerparameters :(best parameters)  {'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
accuracy : 0.7313094362334888
Model 3: NB Multinomial
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..............., score=(train=0.680, test=0.497) total time=  32.2s
[CV 2/5] END ..............., score=(train=0.675, test=0.516) total time=  32.4s
[CV 3/5] END ..............., score=(train=0.679, test=0.500) total time=  32.3s
[CV 4/5] END ..............., score=(train=0.672, test=0.502) total time=  32.3s
[CV 5/5] END ..............., score=(train=0.682, test=0.508) total time=  32.3s
tuned hpyerparameters :(best parameters)  {}
accuracy : 0.5045548673976843
Model 4: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.678, test=0.451) total time=  13.6s
[CV 2/5] END n_neighbors=3, weights=uniform;, score=(train=0.678, test=0.445) total time=  14.2s
[CV 3/5] END n_neighbors=3, weig



[CV 1/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.663) total time=   7.8s




[CV 2/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.692) total time=   8.0s




[CV 3/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.684) total time=   7.6s




[CV 4/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.687) total time=   7.5s




[CV 5/5] END C=100.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.690) total time=   7.7s
[CV 1/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.664) total time=  33.3s
[CV 2/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.689) total time=  33.1s
[CV 3/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.684) total time=  26.6s
[CV 4/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.687) total time=  25.8s
[CV 5/5] END C=100.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.690) total time=  32.0s




[CV 1/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.662) total time=   8.1s




[CV 2/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.689) total time=   8.0s




[CV 3/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.685) total time=   7.8s




[CV 4/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=1.000, test=0.690) total time=   7.6s




[CV 5/5] END C=1000.0, max_iter=1000, penalty=l2, random_state=42;, score=(train=0.999, test=0.692) total time=   7.8s




[CV 1/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.663) total time=  39.4s




[CV 2/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.690) total time=  39.0s




[CV 3/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.682) total time=  36.5s




[CV 4/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=1.000, test=0.687) total time=  33.6s




[CV 5/5] END C=1000.0, max_iter=10000, penalty=l2, random_state=42;, score=(train=0.999, test=0.691) total time=  37.7s
tuned hpyerparameters :(best parameters)  {'C': 10.0, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
accuracy : 0.6863103279949436
Model 3: NB Multinomial
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END ..............., score=(train=0.285, test=0.267) total time=   3.1s
[CV 2/5] END ..............., score=(train=0.281, test=0.274) total time=   3.2s
[CV 3/5] END ..............., score=(train=0.286, test=0.268) total time=   3.2s
[CV 4/5] END ..............., score=(train=0.283, test=0.265) total time=   3.2s
[CV 5/5] END ..............., score=(train=0.283, test=0.273) total time=   3.1s
tuned hpyerparameters :(best parameters)  {}
accuracy : 0.2691606767822071
Model 4: KNN
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END n_neighbors=3, weights=uniform;, score=(train=0.690, test=0.420) total time=  11.9s
[CV 2/5]

In [15]:
for i in range(2):
  if i == 0:
    print('Count vectorizer')
    vectorizer = CountVectorizer(ngram_range = (1,3), lowercase = False)
    oth_feats_vectorizer = CountVectorizer(ngram_range = (1,3), lowercase = False)
  else:
    print('TF-IDF')
    vectorizer = TfidfVectorizer(ngram_range = (1,3), lowercase = False)
    oth_feats_vectorizer = TfidfVectorizer(ngram_range = (1,3), lowercase = False)

  ml_homo = rcm.RobustMachineLearningModel(models=models,
                                       model_names=model_names,
                                       train_data=training_p,
                                       x_label_column=x_label_column,
                                       y_label_column=y_label_column,
                                       folds=folds,
                                       grid_params=grid_params,
                                       path_to_save_models=path_to_save_models,
                                       path_to_save_vectorizer=path_to_save_vectorizers,
                                       lemma=True,
                                       pos=True,
                                       tag=True,
                                       other_features=True,
                                       vectorizer=vectorizer,
                                       oth_feats_vectorizer=oth_feats_vectorizer,
                                       language='es')
  ml_homo.get_best_cross_validation()

Count vectorizer
Lemma: (7000, 184433)
POS: (7000, 1986)
TAG: (7000, 1917)
Other features: (7000, 20581)
Final vector shape: (7000, 208917)
Model 0: Regresión logística
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV 1/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   7.1s
[CV 2/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   6.8s


Traceback (most recent call last):
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\metrics\_scorer.py", line 234, in __call__
    return self._score(
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\metrics\_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\metrics\_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **

[CV 3/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time= 3.3min
[CV 4/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time= 1.1min
[CV 5/5] END C=0.001, max_iter=1000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time= 1.2min
[CV 1/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   5.9s
[CV 2/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   4.2s
[CV 3/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   4.9s
[CV 4/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   4.5s
[CV 5/5] END C=0.001, max_iter=10000, penalty=l2, random_state=42;, score=(train=nan, test=nan) total time=   4.1s
[CV 1/5] END C=0.01, max_iter=1000, penalty=l2, random_state=42;, score=(train=nan,

69 fits failed out of a total of 70.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
69 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\sklearn\linear_model\_logistic.py", line 1196, in fit
    X, y = self._validate_data(
  File "C:\Users\qk_le\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfr

tuned hpyerparameters :(best parameters)  {'C': 0.001, 'max_iter': 1000, 'penalty': 'l2', 'random_state': 42}
accuracy : nan
Model 1: Random Forest
Fitting 5 folds for each of 9 candidates, totalling 45 fits


MemoryError: Unable to allocate 8.72 GiB for an array with shape (5600, 208917) and data type int64