#Bayes

In [7]:
# manejo de datos
import pandas as pd

# división train/test
from sklearn.model_selection import train_test_split

# vectorización de texto
from sklearn.feature_extraction.text import CountVectorizer

# nltk para tokenización y stop‑words
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import uniform, randint, loguniform
from sklearn.metrics import classification_report

df = pd.read_csv("./train.csv")

df_test = pd.read_csv("./test.csv")

X = df.review_es           # Serie de criticas
y = df.sentimiento         # Serie de etiquetas

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,             # mantiene el balance positivo/negativo
    test_size=0.20,
    random_state=1234
)

pipe = make_pipeline(
    CountVectorizer(),
    MultinomialNB()
)

param_dist = {
    "countvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "countvectorizer__min_df": randint(1, 11),
    "countvectorizer__max_df": uniform(0.5, 0.5),
    "countvectorizer__max_features": [5000, 10000, None],
    "multinomialnb__alpha": loguniform(1e-3, 5.0),
    "multinomialnb__fit_prior": [True, False]
}

cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

halving_search = HalvingRandomSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_candidates=1000,
    factor=2,
    resource="n_samples",
    max_resources="auto",
    min_resources="smallest",
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,
    random_state=42,
    verbose=2
)

halving_search.fit(X_train_text, y_train)

print("Mejores parámetros:", halving_search.best_params_)
print("Mejor f-score (CV): {:.3f}".format(halving_search.best_score_))

best_model = halving_search.best_estimator_
y_pred = best_model.predict(X_test_text)

print(classification_report(y_test, y_pred))


n_iterations: 10
n_required_iterations: 10
n_possible_iterations: 12
min_resources_: 16
max_resources_: 40000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1000
n_resources: 16
Fitting 4 folds for each of 1000 candidates, totalling 4000 fits


696 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
696 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib

----------
iter: 1
n_candidates: 500
n_resources: 32
Fitting 4 folds for each of 500 candidates, totalling 2000 fits




----------
iter: 2
n_candidates: 250
n_resources: 64
Fitting 4 folds for each of 250 candidates, totalling 1000 fits




----------
iter: 3
n_candidates: 125
n_resources: 128
Fitting 4 folds for each of 125 candidates, totalling 500 fits




----------
iter: 4
n_candidates: 63
n_resources: 256
Fitting 4 folds for each of 63 candidates, totalling 252 fits




----------
iter: 5
n_candidates: 32
n_resources: 512
Fitting 4 folds for each of 32 candidates, totalling 128 fits




----------
iter: 6
n_candidates: 16
n_resources: 1024
Fitting 4 folds for each of 16 candidates, totalling 64 fits




----------
iter: 7
n_candidates: 8
n_resources: 2048
Fitting 4 folds for each of 8 candidates, totalling 32 fits




----------
iter: 8
n_candidates: 4
n_resources: 4096
Fitting 4 folds for each of 4 candidates, totalling 16 fits




----------
iter: 9
n_candidates: 2
n_resources: 8192
Fitting 4 folds for each of 2 candidates, totalling 8 fits




Mejores parámetros: {'countvectorizer__max_df': np.float64(0.5274413073232375), 'countvectorizer__max_features': None, 'countvectorizer__min_df': 2, 'countvectorizer__ngram_range': (1, 3), 'multinomialnb__alpha': np.float64(1.1644838640676913), 'multinomialnb__fit_prior': True}
Mejor f-score (CV): 0.859
              precision    recall  f1-score   support

    negativo       0.89      0.88      0.88      5000
    positivo       0.88      0.89      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



#Random Forest

In [None]:
# manejo de datos
import pandas as pd

# división train/test
from sklearn.model_selection import train_test_split

# vectorización de texto
from sklearn.feature_extraction.text import CountVectorizer

# nltk para tokenización y stop‑words
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')


In [None]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import HalvingRandomSearchCV, StratifiedKFold
from scipy.stats import randint, uniform
from sklearn.metrics import classification_report
import joblib

X = df.review_es           # Serie de criticas
y = df.sentimiento         # Serie de etiquetas

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,             # mantiene el balance positivo/negativo
    test_size=0.20,
    random_state=1234
)

pipeline_rf = make_pipeline(
    CountVectorizer(),
    RandomForestClassifier(random_state=42)
)

param_dist_rf = {
    # ngram_range: unigram, bigrama o trigramas
    "countvectorizer__ngram_range": [(1, 1), (1, 2), (1, 3)],
    "countvectorizer__min_df": randint(1, 11),
    "countvectorizer__max_df": uniform(0.5, 0.5),
    "countvectorizer__max_features": [5000, 10000, None],
    # hiperparámetros del RandomForest
    "randomforestclassifier__n_estimators": randint(50, 150),
    "randomforestclassifier__max_depth": randint(5, 30),
    "randomforestclassifier__min_samples_split": randint(2, 10),
    "randomforestclassifier__min_samples_leaf": randint(1, 10),
    "randomforestclassifier__max_features": ["sqrt", "log2"],
    "randomforestclassifier__bootstrap": [True, False]
}

cv_rf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

#HalvingRandomSearchCV
halving_search_rf = HalvingRandomSearchCV(
    estimator=pipeline_rf,
    param_distributions=param_dist_rf,
    n_candidates=1000,
    factor=2,
    resource="n_samples",
    max_resources="auto",
    min_resources="smallest",
    cv=cv_rf,
    scoring="f1_macro",
    n_jobs=-1,
    random_state=42,
    verbose=2
)

halving_search_rf.fit(X_train_text, y_train)

#Resultados
print("Mejores parámetros RF:", halving_search_rf.best_params_)
print(f"Mejor F1 macro (CV): {halving_search_rf.best_score_:.3f}")

#Evaluación en test
best_rf = halving_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_text)
print(classification_report(y_test, y_pred_rf))

ruta_modelo = "/content/mejor_modelo_rf.pkl"
joblib.dump(best_rf, ruta_modelo)
print(f"≫ Modelo guardado en {ruta_modelo}")

n_iterations: 10
n_required_iterations: 10
n_possible_iterations: 12
min_resources_: 16
max_resources_: 40000
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 1000
n_resources: 16
Fitting 4 folds for each of 1000 candidates, totalling 4000 fits


744 fits failed out of a total of 4000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
744 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 654, in fit
    Xt = self._fit(X, y, routed_params, raw_params=params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib

----------
iter: 1
n_candidates: 500
n_resources: 32
Fitting 4 folds for each of 500 candidates, totalling 2000 fits




----------
iter: 2
n_candidates: 250
n_resources: 64
Fitting 4 folds for each of 250 candidates, totalling 1000 fits




----------
iter: 3
n_candidates: 125
n_resources: 128
Fitting 4 folds for each of 125 candidates, totalling 500 fits




----------
iter: 4
n_candidates: 63
n_resources: 256
Fitting 4 folds for each of 63 candidates, totalling 252 fits




----------
iter: 5
n_candidates: 32
n_resources: 512
Fitting 4 folds for each of 32 candidates, totalling 128 fits




----------
iter: 6
n_candidates: 16
n_resources: 1024
Fitting 4 folds for each of 16 candidates, totalling 64 fits




----------
iter: 7
n_candidates: 8
n_resources: 2048
Fitting 4 folds for each of 8 candidates, totalling 32 fits




----------
iter: 8
n_candidates: 4
n_resources: 4096
Fitting 4 folds for each of 4 candidates, totalling 16 fits




----------
iter: 9
n_candidates: 2
n_resources: 8192
Fitting 4 folds for each of 2 candidates, totalling 8 fits




Mejores parámetros RF: {'countvectorizer__max_df': np.float64(0.7360334725549996), 'countvectorizer__max_features': None, 'countvectorizer__min_df': 2, 'countvectorizer__ngram_range': (1, 1), 'randomforestclassifier__bootstrap': False, 'randomforestclassifier__max_depth': 23, 'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__min_samples_leaf': 2, 'randomforestclassifier__min_samples_split': 3, 'randomforestclassifier__n_estimators': 141}
Mejor F1 macro (CV): 0.820
              precision    recall  f1-score   support

    negativo       0.87      0.80      0.83      5000
    positivo       0.81      0.88      0.85      5000

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

≫ Modelo guardado en /content/mejor_modelo_rf.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#XGBoost

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from scipy.stats import randint, loguniform
from google.colab import files
import nltk
from nltk.tokenize import word_tokenize
import re

nltk.download('punkt')
nltk.download('stopwords')

df = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

def clean_text(text):
    return text.strip().lower()


def tokenizer(text):
    return re.findall(r'\b\w+\b', text.lower())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
X = df['review_es'].astype(str)
y = df['sentimiento']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Label-encoding
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)   # 'negativa':0, 'positiva':1
y_test_enc  = le.transform(y_test)

# Pipeline
pipe = make_pipeline(
    CountVectorizer(
        preprocessor=clean_text,
        tokenizer=tokenizer,
        min_df=5
    ),
    XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=1234
    )
)

# Búsqueda de hiperparámetros
param_dist = {
    # vectorizador
    "countvectorizer__ngram_range": [(1,1), (1,2)],
    "countvectorizer__max_df": [0.75, 1.0],
    "countvectorizer__min_df": [1, 5],
    # XGBoost
    "xgbclassifier__n_estimators": randint(100, 500),
    "xgbclassifier__learning_rate": loguniform(0.01, 0.3),
    "xgbclassifier__max_depth": randint(3, 10),
    "xgbclassifier__subsample": [0.8, 1.0],
    "xgbclassifier__colsample_bytree": [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1234)

rand_search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=5,
    scoring='f1_macro',
    cv=cv,
    verbose=1,
    random_state=1234,
    n_jobs=-1,
    error_score='raise'
)

# Ajuste de la búsqueda
rand_search.fit(X_train_text, y_train_enc)

# Evaluación en test
print("Mejores parámetros (CV):", rand_search.best_params_)
print(f"Mejor F1-macro (CV): {rand_search.best_score_:.4f}\n")

best_pipe = rand_search.best_estimator_

# Predicción en test
y_pred_enc = best_pipe.predict(X_test_text)
y_pred = le.inverse_transform(y_pred_enc)

print(" Classification Report (test)")
print(classification_report(y_test, y_pred))

print("Confusion Matrix (test)")
print(confusion_matrix(y_test, y_pred))

# Predicción final
pred_test_enc = best_pipe.predict(df_test['review_es'])
pred_test = le.inverse_transform(pred_test_enc)

df_test['sentimiento'] = pred_test
submission_filename = "secondSubmission_xgb.csv"
df_test[['ID', 'sentimiento']].to_csv(submission_filename, index=False)
print(f"\Archivo guardado como '{submission_filename}'")

files.download(submission_filename)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


Parameters: { "use_label_encoder" } are not used.



Mejores parámetros (CV): {'countvectorizer__max_df': 1.0, 'countvectorizer__min_df': 5, 'countvectorizer__ngram_range': (1, 1), 'xgbclassifier__colsample_bytree': 0.8, 'xgbclassifier__learning_rate': np.float64(0.15291934777093275), 'xgbclassifier__max_depth': 7, 'xgbclassifier__n_estimators': 480, 'xgbclassifier__subsample': 0.8}
Mejor F1-macro (CV): 0.8667

 Classification Report (test)
              precision    recall  f1-score   support

    negativo       0.88      0.86      0.87      5000
    positivo       0.87      0.89      0.88      5000

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

Confusion Matrix (test)
[[4325  675]
 [ 568 4432]]
\Archivo guardado como 'secondSubmission_xgb.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
import joblib

# Accuracy
acc = accuracy_score(y_test, y_pred)

# F1-score macro (promedio simple entre clases)
f1 = f1_score(y_test, y_pred, average='macro')

# Precision macro
precision = precision_score(y_test, y_pred, average='macro')

# Recall macro
recall = recall_score(y_test, y_pred, average='macro')

# Mostrar resultados
print(f"Accuracy:  {acc:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")

joblib.dump(best_pipe, "/content/mejor_modelo_xgb.pkl")

Accuracy:  0.8757
F1-score:  0.8757
Precision: 0.8759
Recall:    0.8757


['/content/mejor_modelo_xgb.pkl']

#Ensamble

In [None]:
import joblib
import pandas as pd
from unicodedata import normalize

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from xgboost import XGBClassifier

# Descarga de recursos de NLTK
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
df = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

X = df.review_es   # Serie de criticas
y = df.sentimiento # Serie de etiquetas

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,             # mantiene el balance positivo/negativo
    test_size=0.20,
    random_state=1234
)

##Xgboost

In [None]:
nltk.download('stopwords')

# stop-words sin tildes
_stoplist_raw = set(stopwords.words("spanish"))
stoplist = {
    normalize("NFKD", w).encode("ascii", "ignore").decode("ascii")
    for w in _stoplist_raw
}

# Codificar etiquetas
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

print(f"Mapping de etiquetas: {dict(zip(le.classes_, le.transform(le.classes_)))}")

pipe_xgb = Pipeline([
    ("tfidf", TfidfVectorizer(
        stop_words=list(stoplist),
        strip_accents="unicode",
        lowercase=True,
        token_pattern=r"(?u)\b\w\w+\b",
        sublinear_tf=True,
        norm="l2",
        smooth_idf=True,
        max_df=0.5586019528401167,
        min_df=0.043909556525082855,
        max_features=5000,
        ngram_range=(1,2)
    )),
    ("xgb", XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        device="cuda",
        random_state=42,
        n_estimators=274,
        max_depth=3,
        learning_rate=0.1275578119564078,
        subsample=0.709635830583011,
        colsample_bytree=0.8460368235121845,
        gamma=1.031260569180763,
        reg_alpha=0.2043163833989703,
        reg_lambda=0.5078050639587791
    ))
], memory="/tmp/skl_cache")

Mapping de etiquetas: {'negativo': np.int64(0), 'positivo': np.int64(1)}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


##Bayes

In [None]:
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

pipe_nb = make_pipeline(
    CountVectorizer(
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.5274413073232375,
        max_features=None
    ),
    MultinomialNB(
        alpha=1.1644838640676913,
        fit_prior=True
    )
)


##Random Forest

In [None]:
import joblib
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

pipeline_rf = make_pipeline(
    CountVectorizer(
        ngram_range=(1, 1),           # unigrama
        min_df=2,
        max_df=0.7360334725549996,
        max_features=None
    ),
    RandomForestClassifier(
        n_estimators=141,             # número de árboles
        max_depth=23,
        min_samples_split=3,
        min_samples_leaf=2,
        max_features="sqrt",
        bootstrap=False,              # muestreo sin reemplazo
        random_state=42
    )
)

##**Voting**

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report
import joblib

voting_soft = VotingClassifier(
    estimators=[
        ('xgb', pipe_xgb),
        ('rf', pipeline_rf),
        ('nb', pipe_nb)
    ],
    voting='soft',
    n_jobs=-1
)

voting_soft.fit(X_train_text, y_train_enc)

# Evalua en test
y_pred_enc = voting_soft.predict(X_test_text)
y_pred = le.inverse_transform(y_pred_enc)   # si usaste y_train_enc
print("\n--- Reporte de clasificación Voting Soft ---")
print(classification_report(y_test, y_pred))

# Guardo ensemble
ruta_ensemble = "/content/voting_soft_ensemble.pkl"
joblib.dump(voting_soft, ruta_ensemble)
print(f"\n≫ Ensemble guardado en {ruta_ensemble}")


--- Reporte de clasificación Voting Soft ---
              precision    recall  f1-score   support

    negativo       0.89      0.88      0.89      5000
    positivo       0.88      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000


≫ Ensemble guardado en /content/voting_soft_ensemble.pkl


##**Stacking**

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import pandas as pd

stack = StackingClassifier(
    estimators=[
        ('xgb', pipe_xgb),
        ('rf', pipeline_rf),
        ('nb', pipe_nb)
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    passthrough=False,
    n_jobs=-1
)

stack.fit(X_train_text, y_train_enc)

# Evaluo en test
y_test_enc = stack.predict(X_test_text)
y_test_pred = le.inverse_transform(y_test_enc)
print("\n--- Reporte de clasificación Stacking ---")
print(classification_report(y_test, y_test_pred))

y_pred_enc = stack.predict(df_test['review_es'])

y_pred = le.inverse_transform(y_pred_enc)

df_test['sentimiento'] = y_pred

df_test[['ID', 'sentimiento']].to_csv("predicciones_stacking.csv", index=False)

# Guardo ensemble
joblib.dump(stack, "/content/stacking_ensemble.pkl")



--- Reporte de clasificación Stacking ---
              precision    recall  f1-score   support

    negativo       0.89      0.88      0.89      5000
    positivo       0.88      0.89      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



['/content/stacking_ensemble.pkl']

#Redes

In [10]:
import tensorflow as tf
from tensorflow.keras import mixed_precision

device_name = tf.test.gpu_device_name()
if not device_name:
    raise SystemError('GPU no encontrada. Activala en Entorno de ejecución > Cambiar tipo de entorno > GPU')
print('GPU detectada:', device_name)

mixed_precision.set_global_policy('mixed_float16')


GPU detectada: /device:GPU:0


In [11]:
%pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m69.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.4
    Uninstalling transformers-4.52.4:
      Successfully uninstalled transformers-4.52.4
Successfully installed transformers-4.53.0


In [12]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm

df = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Usando:", device)

# Dataset
df['label'] = LabelEncoder().fit_transform(df['sentimiento'])  # 0 = negativo, 1 = positivo

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['review_es'].astype(str).tolist(), df['label'].tolist(),
    test_size=0.2, stratify=df['label'], random_state=42
)

# Tokenizer
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dataset
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'label': torch.tensor(self.labels[idx])
        }
    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model.train()
    losses = []
    correct_predictions = 0

    for batch in tqdm(data_loader, desc="Entrenando"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, preds = torch.max(logits, dim=1)
        loss = loss_fn(logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()
    losses = []
    correct_predictions = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validando"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, preds = torch.max(logits, dim=1)
            loss = loss_fn(logits, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1 = f1_score(all_labels, all_preds)
    return correct_predictions.double() / n_examples, np.mean(losses), f1

Usando: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 2
total_steps = len(train_loader) * epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss().to(device)

for epoch in range(epochs):
    print(f"\n Epoch {epoch+1}/{epochs}")
    train_acc, train_loss = train_model(model, train_loader, loss_fn, optimizer, device, scheduler, len(train_dataset))
    print(f"Train | Loss: {train_loss:.4f} | Acc: {train_acc:.4f}")

    val_acc, val_loss, val_f1 = eval_model(model, val_loader, loss_fn, device, len(val_dataset))
    print(f"Val   | Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}")


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
class SentimentDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx])
        }
    def __len__(self):
        return len(self.encodings['input_ids'])

df_test_dataset = SentimentDataset(df_test['review_es'].astype(str).tolist(), tokenizer, max_len=128)
df_test_loader = DataLoader(df_test_dataset, batch_size=32)

model.eval()
all_preds = []

with torch.no_grad():
    for batch in tqdm(df_test_loader, desc="Prediciendo"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().numpy())

df_test['sentimiento'] = pd.Series(all_preds).map({0: 'negativo', 1: 'positivo'})