In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
from gensim.models import Word2Vec
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path_to_json = 'full_format_recipes.json'
data = pd.read_json(path_to_json)

In [3]:
data.replace(r'^\s*$', np.nan, regex=True, inplace=True)  # Reemplazar cadenas vacías por NaN
data_clean = data.dropna()

descripciones = data_clean['desc'].tolist()
direcciones = data_clean['directions'].tolist()

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import pandas as pd

import os
os.environ["WANDB_DISABLED"] = "true"

# 1. Cargamos los datos

texts = data_clean['desc']  # Usar 'desc' como entrada, reemplazar NaN por cadenas vacías
labels = data_clean['rating']  # Usar 'rating' como salida

# 2. Dividimos los datos en entrenamiento y prueba
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# 3. Tokenizamos los datos
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=64)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=64)

# 4. Creamos el Dataset compatible con PyTorch
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = Dataset(train_encodings, train_labels)
test_dataset = Dataset(test_encodings, test_labels)

# 5. Cargamos el modelo preentrenado para regresión

# 6. Configuraramos el entrenamiento
training_args = TrainingArguments(
    output_dir='./results',          # Directorio de salida
    num_train_epochs=1,              # Número de épocas
    per_device_train_batch_size=8,  # Tamaño del batch
    per_device_eval_batch_size=32,   # Tamaño del batch para evaluación
    warmup_steps = 10,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps=10,

)

# 7. Entrenamos el modelo
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# 8. Evaluaramos y guardamos el modelo
trainer.evaluate()
model.save_pretrained('./fine-tuned-bert')
tokenizer.save_pretrained('./fine-tuned-bert')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  1%|          | 10/1061 [00:16<28:37,  1.63s/it]

{'loss': 13.6635, 'grad_norm': 132.43153381347656, 'learning_rate': 5e-05, 'epoch': 0.01}


  2%|▏         | 20/1061 [00:33<28:38,  1.65s/it]

{'loss': 5.04, 'grad_norm': 36.22388458251953, 'learning_rate': 4.952426260704092e-05, 'epoch': 0.02}


  3%|▎         | 30/1061 [00:49<28:21,  1.65s/it]

{'loss': 1.4383, 'grad_norm': 10.067452430725098, 'learning_rate': 4.904852521408183e-05, 'epoch': 0.03}


  4%|▍         | 40/1061 [01:06<28:24,  1.67s/it]

{'loss': 1.1546, 'grad_norm': 4.803162574768066, 'learning_rate': 4.8572787821122746e-05, 'epoch': 0.04}


  5%|▍         | 50/1061 [01:23<28:18,  1.68s/it]

{'loss': 1.8377, 'grad_norm': 21.665586471557617, 'learning_rate': 4.8097050428163656e-05, 'epoch': 0.05}


  6%|▌         | 60/1061 [01:39<27:37,  1.66s/it]

{'loss': 2.2096, 'grad_norm': 9.642337799072266, 'learning_rate': 4.7621313035204566e-05, 'epoch': 0.06}


  7%|▋         | 70/1061 [01:56<27:22,  1.66s/it]

{'loss': 1.5278, 'grad_norm': 4.690475940704346, 'learning_rate': 4.714557564224548e-05, 'epoch': 0.07}


  8%|▊         | 80/1061 [02:12<26:51,  1.64s/it]

{'loss': 2.1836, 'grad_norm': 10.131820678710938, 'learning_rate': 4.666983824928639e-05, 'epoch': 0.08}


  8%|▊         | 90/1061 [02:29<26:23,  1.63s/it]

{'loss': 1.8263, 'grad_norm': 18.705177307128906, 'learning_rate': 4.619410085632731e-05, 'epoch': 0.08}


  9%|▉         | 100/1061 [02:45<26:15,  1.64s/it]

{'loss': 1.6328, 'grad_norm': 5.273403167724609, 'learning_rate': 4.5718363463368227e-05, 'epoch': 0.09}


 10%|█         | 110/1061 [03:02<25:54,  1.63s/it]

{'loss': 2.1727, 'grad_norm': 9.609236717224121, 'learning_rate': 4.524262607040914e-05, 'epoch': 0.1}


 11%|█▏        | 120/1061 [03:18<25:31,  1.63s/it]

{'loss': 2.0338, 'grad_norm': 22.64391326904297, 'learning_rate': 4.476688867745005e-05, 'epoch': 0.11}


 12%|█▏        | 130/1061 [03:34<25:15,  1.63s/it]

{'loss': 1.8218, 'grad_norm': 22.91121482849121, 'learning_rate': 4.4291151284490963e-05, 'epoch': 0.12}


 13%|█▎        | 140/1061 [03:50<25:05,  1.63s/it]

{'loss': 1.5179, 'grad_norm': 2.5691187381744385, 'learning_rate': 4.3815413891531873e-05, 'epoch': 0.13}


 14%|█▍        | 150/1061 [04:07<24:55,  1.64s/it]

{'loss': 1.6752, 'grad_norm': 29.19272232055664, 'learning_rate': 4.333967649857279e-05, 'epoch': 0.14}


 15%|█▌        | 160/1061 [04:23<24:44,  1.65s/it]

{'loss': 2.414, 'grad_norm': 14.106046676635742, 'learning_rate': 4.286393910561371e-05, 'epoch': 0.15}


 16%|█▌        | 170/1061 [04:40<24:21,  1.64s/it]

{'loss': 1.4363, 'grad_norm': 3.2831244468688965, 'learning_rate': 4.238820171265462e-05, 'epoch': 0.16}


 17%|█▋        | 180/1061 [04:56<23:58,  1.63s/it]

{'loss': 1.602, 'grad_norm': 3.275811195373535, 'learning_rate': 4.1912464319695534e-05, 'epoch': 0.17}


 18%|█▊        | 190/1061 [05:12<23:32,  1.62s/it]

{'loss': 1.9689, 'grad_norm': 18.545124053955078, 'learning_rate': 4.1436726926736444e-05, 'epoch': 0.18}


 19%|█▉        | 200/1061 [05:29<23:10,  1.62s/it]

{'loss': 1.0013, 'grad_norm': 12.05911922454834, 'learning_rate': 4.0960989533777354e-05, 'epoch': 0.19}


 20%|█▉        | 210/1061 [05:45<23:09,  1.63s/it]

{'loss': 2.4142, 'grad_norm': 28.815481185913086, 'learning_rate': 4.048525214081827e-05, 'epoch': 0.2}


 21%|██        | 220/1061 [06:01<22:43,  1.62s/it]

{'loss': 1.6045, 'grad_norm': 11.386752128601074, 'learning_rate': 4.000951474785918e-05, 'epoch': 0.21}


 22%|██▏       | 230/1061 [06:18<22:26,  1.62s/it]

{'loss': 1.7467, 'grad_norm': 14.776989936828613, 'learning_rate': 3.95337773549001e-05, 'epoch': 0.22}


 23%|██▎       | 240/1061 [06:34<22:21,  1.63s/it]

{'loss': 1.6537, 'grad_norm': 28.03081703186035, 'learning_rate': 3.9058039961941014e-05, 'epoch': 0.23}


 24%|██▎       | 250/1061 [06:50<21:55,  1.62s/it]

{'loss': 1.6401, 'grad_norm': 8.471131324768066, 'learning_rate': 3.8582302568981925e-05, 'epoch': 0.24}


 25%|██▍       | 260/1061 [07:07<21:51,  1.64s/it]

{'loss': 2.0362, 'grad_norm': 9.26048469543457, 'learning_rate': 3.8106565176022835e-05, 'epoch': 0.25}


 25%|██▌       | 270/1061 [07:23<21:23,  1.62s/it]

{'loss': 1.3115, 'grad_norm': 6.096595287322998, 'learning_rate': 3.763082778306375e-05, 'epoch': 0.25}


 26%|██▋       | 280/1061 [07:39<21:07,  1.62s/it]

{'loss': 2.3086, 'grad_norm': 34.36260223388672, 'learning_rate': 3.715509039010466e-05, 'epoch': 0.26}


 27%|██▋       | 290/1061 [07:55<20:51,  1.62s/it]

{'loss': 0.9555, 'grad_norm': 7.435389041900635, 'learning_rate': 3.667935299714558e-05, 'epoch': 0.27}


 28%|██▊       | 300/1061 [08:12<20:36,  1.62s/it]

{'loss': 1.484, 'grad_norm': 17.066041946411133, 'learning_rate': 3.620361560418649e-05, 'epoch': 0.28}


 29%|██▉       | 310/1061 [08:29<21:43,  1.74s/it]

{'loss': 1.4282, 'grad_norm': 15.281428337097168, 'learning_rate': 3.5727878211227405e-05, 'epoch': 0.29}


 30%|███       | 320/1061 [08:45<20:04,  1.63s/it]

{'loss': 1.3644, 'grad_norm': 25.662216186523438, 'learning_rate': 3.525214081826832e-05, 'epoch': 0.3}


 31%|███       | 330/1061 [09:01<19:42,  1.62s/it]

{'loss': 1.2113, 'grad_norm': 16.846323013305664, 'learning_rate': 3.477640342530923e-05, 'epoch': 0.31}


 32%|███▏      | 340/1061 [09:17<19:30,  1.62s/it]

{'loss': 1.8277, 'grad_norm': 10.636877059936523, 'learning_rate': 3.430066603235014e-05, 'epoch': 0.32}


 33%|███▎      | 350/1061 [09:34<19:09,  1.62s/it]

{'loss': 2.1319, 'grad_norm': 27.511198043823242, 'learning_rate': 3.382492863939106e-05, 'epoch': 0.33}


 34%|███▍      | 360/1061 [09:50<18:50,  1.61s/it]

{'loss': 1.9575, 'grad_norm': 24.745065689086914, 'learning_rate': 3.334919124643197e-05, 'epoch': 0.34}


 35%|███▍      | 370/1061 [10:06<18:37,  1.62s/it]

{'loss': 0.9843, 'grad_norm': 4.587541580200195, 'learning_rate': 3.2873453853472886e-05, 'epoch': 0.35}


 36%|███▌      | 380/1061 [10:22<18:25,  1.62s/it]

{'loss': 2.41, 'grad_norm': 21.282426834106445, 'learning_rate': 3.23977164605138e-05, 'epoch': 0.36}


 37%|███▋      | 390/1061 [10:35<14:25,  1.29s/it]

{'loss': 1.663, 'grad_norm': 12.733410835266113, 'learning_rate': 3.192197906755471e-05, 'epoch': 0.37}


 38%|███▊      | 400/1061 [10:48<14:01,  1.27s/it]

{'loss': 2.4118, 'grad_norm': 18.67781639099121, 'learning_rate': 3.144624167459562e-05, 'epoch': 0.38}


 39%|███▊      | 410/1061 [11:01<13:49,  1.27s/it]

{'loss': 1.9258, 'grad_norm': 5.867391586303711, 'learning_rate': 3.097050428163654e-05, 'epoch': 0.39}


 40%|███▉      | 420/1061 [11:14<14:08,  1.32s/it]

{'loss': 1.6656, 'grad_norm': 24.951547622680664, 'learning_rate': 3.049476688867745e-05, 'epoch': 0.4}


 41%|████      | 430/1061 [11:27<14:06,  1.34s/it]

{'loss': 1.9796, 'grad_norm': 33.739891052246094, 'learning_rate': 3.0019029495718366e-05, 'epoch': 0.41}


 41%|████▏     | 440/1061 [11:41<13:11,  1.27s/it]

{'loss': 1.9097, 'grad_norm': 21.048171997070312, 'learning_rate': 2.954329210275928e-05, 'epoch': 0.41}


 42%|████▏     | 450/1061 [11:53<12:28,  1.23s/it]

{'loss': 2.1541, 'grad_norm': 24.830135345458984, 'learning_rate': 2.9067554709800193e-05, 'epoch': 0.42}


 43%|████▎     | 460/1061 [12:05<12:20,  1.23s/it]

{'loss': 1.5806, 'grad_norm': 68.99971008300781, 'learning_rate': 2.8591817316841106e-05, 'epoch': 0.43}


 44%|████▍     | 470/1061 [12:17<12:05,  1.23s/it]

{'loss': 2.1451, 'grad_norm': 21.589874267578125, 'learning_rate': 2.8116079923882016e-05, 'epoch': 0.44}


 45%|████▌     | 480/1061 [12:30<11:55,  1.23s/it]

{'loss': 1.9487, 'grad_norm': 8.669411659240723, 'learning_rate': 2.764034253092293e-05, 'epoch': 0.45}


 46%|████▌     | 490/1061 [12:42<11:41,  1.23s/it]

{'loss': 1.4128, 'grad_norm': 10.61294174194336, 'learning_rate': 2.7164605137963843e-05, 'epoch': 0.46}


 47%|████▋     | 500/1061 [12:54<11:27,  1.23s/it]

{'loss': 1.5221, 'grad_norm': 13.719630241394043, 'learning_rate': 2.668886774500476e-05, 'epoch': 0.47}


 48%|████▊     | 510/1061 [13:08<11:26,  1.25s/it]

{'loss': 0.7721, 'grad_norm': 25.15228843688965, 'learning_rate': 2.6213130352045673e-05, 'epoch': 0.48}


 49%|████▉     | 520/1061 [13:20<11:06,  1.23s/it]

{'loss': 2.1148, 'grad_norm': 25.630935668945312, 'learning_rate': 2.5737392959086587e-05, 'epoch': 0.49}


 50%|████▉     | 530/1061 [13:32<10:50,  1.23s/it]

{'loss': 1.2754, 'grad_norm': 29.371397018432617, 'learning_rate': 2.52616555661275e-05, 'epoch': 0.5}


 51%|█████     | 540/1061 [13:45<10:43,  1.24s/it]

{'loss': 1.7177, 'grad_norm': 11.8334321975708, 'learning_rate': 2.4785918173168414e-05, 'epoch': 0.51}


 52%|█████▏    | 550/1061 [13:57<10:27,  1.23s/it]

{'loss': 1.679, 'grad_norm': 9.533391952514648, 'learning_rate': 2.4310180780209327e-05, 'epoch': 0.52}


 53%|█████▎    | 560/1061 [14:09<10:14,  1.23s/it]

{'loss': 0.9409, 'grad_norm': 23.22347640991211, 'learning_rate': 2.3834443387250237e-05, 'epoch': 0.53}


 54%|█████▎    | 570/1061 [14:22<10:04,  1.23s/it]

{'loss': 1.0628, 'grad_norm': 1.6757489442825317, 'learning_rate': 2.3358705994291154e-05, 'epoch': 0.54}


 55%|█████▍    | 580/1061 [14:34<09:48,  1.22s/it]

{'loss': 1.8491, 'grad_norm': 8.43675422668457, 'learning_rate': 2.2882968601332067e-05, 'epoch': 0.55}


 56%|█████▌    | 590/1061 [14:46<09:37,  1.23s/it]

{'loss': 2.0278, 'grad_norm': 33.28837585449219, 'learning_rate': 2.2407231208372977e-05, 'epoch': 0.56}


 57%|█████▋    | 600/1061 [14:59<09:24,  1.22s/it]

{'loss': 2.032, 'grad_norm': 34.37202835083008, 'learning_rate': 2.193149381541389e-05, 'epoch': 0.57}


 57%|█████▋    | 610/1061 [15:11<09:15,  1.23s/it]

{'loss': 2.1238, 'grad_norm': 8.662797927856445, 'learning_rate': 2.1455756422454808e-05, 'epoch': 0.57}


 58%|█████▊    | 620/1061 [15:23<09:03,  1.23s/it]

{'loss': 1.3815, 'grad_norm': 17.08553123474121, 'learning_rate': 2.098001902949572e-05, 'epoch': 0.58}


 59%|█████▉    | 630/1061 [15:36<08:56,  1.24s/it]

{'loss': 1.7924, 'grad_norm': 31.020246505737305, 'learning_rate': 2.050428163653663e-05, 'epoch': 0.59}


 60%|██████    | 640/1061 [15:48<08:40,  1.24s/it]

{'loss': 2.4033, 'grad_norm': 31.072303771972656, 'learning_rate': 2.0028544243577545e-05, 'epoch': 0.6}


 61%|██████▏   | 650/1061 [16:01<09:19,  1.36s/it]

{'loss': 1.4374, 'grad_norm': 28.822362899780273, 'learning_rate': 1.955280685061846e-05, 'epoch': 0.61}


 62%|██████▏   | 660/1061 [16:13<07:32,  1.13s/it]

{'loss': 1.1246, 'grad_norm': 21.106542587280273, 'learning_rate': 1.907706945765937e-05, 'epoch': 0.62}


 63%|██████▎   | 670/1061 [16:24<07:17,  1.12s/it]

{'loss': 1.3572, 'grad_norm': 8.12912654876709, 'learning_rate': 1.8601332064700285e-05, 'epoch': 0.63}


 64%|██████▍   | 680/1061 [16:36<07:30,  1.18s/it]

{'loss': 2.6281, 'grad_norm': 29.339580535888672, 'learning_rate': 1.81255946717412e-05, 'epoch': 0.64}


 65%|██████▌   | 690/1061 [16:47<06:26,  1.04s/it]

{'loss': 1.238, 'grad_norm': 10.136873245239258, 'learning_rate': 1.7649857278782115e-05, 'epoch': 0.65}


 66%|██████▌   | 700/1061 [16:57<06:31,  1.09s/it]

{'loss': 1.2985, 'grad_norm': 13.888323783874512, 'learning_rate': 1.7174119885823025e-05, 'epoch': 0.66}


 67%|██████▋   | 710/1061 [17:07<05:47,  1.01it/s]

{'loss': 1.287, 'grad_norm': 34.208213806152344, 'learning_rate': 1.669838249286394e-05, 'epoch': 0.67}


 68%|██████▊   | 720/1061 [17:18<05:57,  1.05s/it]

{'loss': 1.6592, 'grad_norm': 33.23435974121094, 'learning_rate': 1.6222645099904855e-05, 'epoch': 0.68}


 69%|██████▉   | 730/1061 [17:28<05:41,  1.03s/it]

{'loss': 1.5527, 'grad_norm': 19.97476577758789, 'learning_rate': 1.5746907706945765e-05, 'epoch': 0.69}


 70%|██████▉   | 740/1061 [17:39<05:56,  1.11s/it]

{'loss': 1.3424, 'grad_norm': 7.361581325531006, 'learning_rate': 1.527117031398668e-05, 'epoch': 0.7}


 71%|███████   | 750/1061 [17:52<06:52,  1.33s/it]

{'loss': 1.5938, 'grad_norm': 22.877412796020508, 'learning_rate': 1.4795432921027594e-05, 'epoch': 0.71}


 72%|███████▏  | 760/1061 [18:03<05:20,  1.06s/it]

{'loss': 1.2572, 'grad_norm': 16.156583786010742, 'learning_rate': 1.4319695528068507e-05, 'epoch': 0.72}


 73%|███████▎  | 770/1061 [18:14<05:15,  1.08s/it]

{'loss': 0.9261, 'grad_norm': 3.5303611755371094, 'learning_rate': 1.3843958135109419e-05, 'epoch': 0.73}


 74%|███████▎  | 780/1061 [18:25<04:48,  1.03s/it]

{'loss': 1.7529, 'grad_norm': 14.281705856323242, 'learning_rate': 1.3368220742150334e-05, 'epoch': 0.74}


 74%|███████▍  | 790/1061 [18:35<04:59,  1.10s/it]

{'loss': 1.6849, 'grad_norm': 3.44814395904541, 'learning_rate': 1.2892483349191248e-05, 'epoch': 0.74}


 75%|███████▌  | 800/1061 [18:47<04:52,  1.12s/it]

{'loss': 1.5788, 'grad_norm': 13.211359977722168, 'learning_rate': 1.2416745956232161e-05, 'epoch': 0.75}


 76%|███████▋  | 810/1061 [18:57<04:23,  1.05s/it]

{'loss': 1.8446, 'grad_norm': 11.011384010314941, 'learning_rate': 1.1941008563273073e-05, 'epoch': 0.76}


 77%|███████▋  | 820/1061 [19:08<04:26,  1.11s/it]

{'loss': 1.2704, 'grad_norm': 11.410271644592285, 'learning_rate': 1.1465271170313988e-05, 'epoch': 0.77}


 78%|███████▊  | 830/1061 [19:21<05:05,  1.32s/it]

{'loss': 1.5516, 'grad_norm': 30.113908767700195, 'learning_rate': 1.09895337773549e-05, 'epoch': 0.78}


 79%|███████▉  | 840/1061 [19:32<03:56,  1.07s/it]

{'loss': 1.3209, 'grad_norm': 29.09003257751465, 'learning_rate': 1.0513796384395815e-05, 'epoch': 0.79}


 80%|████████  | 850/1061 [19:44<03:50,  1.09s/it]

{'loss': 0.8404, 'grad_norm': 22.458105087280273, 'learning_rate': 1.0038058991436728e-05, 'epoch': 0.8}


 81%|████████  | 860/1061 [19:56<04:24,  1.31s/it]

{'loss': 1.3393, 'grad_norm': 19.404251098632812, 'learning_rate': 9.56232159847764e-06, 'epoch': 0.81}


 82%|████████▏ | 870/1061 [20:09<03:40,  1.15s/it]

{'loss': 1.5686, 'grad_norm': 31.99281883239746, 'learning_rate': 9.086584205518555e-06, 'epoch': 0.82}


 83%|████████▎ | 880/1061 [20:20<03:22,  1.12s/it]

{'loss': 1.7247, 'grad_norm': 4.311183929443359, 'learning_rate': 8.610846812559467e-06, 'epoch': 0.83}


 84%|████████▍ | 890/1061 [20:31<03:18,  1.16s/it]

{'loss': 1.2101, 'grad_norm': 16.816936492919922, 'learning_rate': 8.135109419600382e-06, 'epoch': 0.84}


 85%|████████▍ | 900/1061 [20:43<03:05,  1.15s/it]

{'loss': 1.199, 'grad_norm': 45.016685485839844, 'learning_rate': 7.659372026641294e-06, 'epoch': 0.85}


 86%|████████▌ | 910/1061 [20:54<02:50,  1.13s/it]

{'loss': 1.4994, 'grad_norm': 9.338199615478516, 'learning_rate': 7.183634633682209e-06, 'epoch': 0.86}


 87%|████████▋ | 920/1061 [21:06<02:41,  1.15s/it]

{'loss': 0.5483, 'grad_norm': 14.815545082092285, 'learning_rate': 6.707897240723121e-06, 'epoch': 0.87}


 88%|████████▊ | 930/1061 [21:17<02:33,  1.17s/it]

{'loss': 1.6225, 'grad_norm': 19.71107292175293, 'learning_rate': 6.232159847764035e-06, 'epoch': 0.88}


 89%|████████▊ | 940/1061 [21:29<02:22,  1.18s/it]

{'loss': 1.692, 'grad_norm': 4.710208415985107, 'learning_rate': 5.756422454804948e-06, 'epoch': 0.89}


 90%|████████▉ | 950/1061 [21:40<02:04,  1.12s/it]

{'loss': 1.5251, 'grad_norm': 11.773701667785645, 'learning_rate': 5.2806850618458615e-06, 'epoch': 0.9}


 90%|█████████ | 960/1061 [21:52<02:03,  1.23s/it]

{'loss': 1.1056, 'grad_norm': 9.140661239624023, 'learning_rate': 4.804947668886775e-06, 'epoch': 0.9}


 91%|█████████▏| 970/1061 [22:04<01:51,  1.23s/it]

{'loss': 2.6742, 'grad_norm': 5.537801265716553, 'learning_rate': 4.3292102759276875e-06, 'epoch': 0.91}


 92%|█████████▏| 980/1061 [22:17<01:39,  1.23s/it]

{'loss': 1.3457, 'grad_norm': 8.456594467163086, 'learning_rate': 3.853472882968601e-06, 'epoch': 0.92}


 93%|█████████▎| 990/1061 [22:29<01:22,  1.16s/it]

{'loss': 1.7929, 'grad_norm': 34.4231071472168, 'learning_rate': 3.377735490009515e-06, 'epoch': 0.93}


 94%|█████████▍| 1000/1061 [22:40<01:13,  1.20s/it]

{'loss': 1.8664, 'grad_norm': 3.3932361602783203, 'learning_rate': 2.901998097050428e-06, 'epoch': 0.94}


 95%|█████████▌| 1010/1061 [22:53<01:02,  1.23s/it]

{'loss': 1.8372, 'grad_norm': 8.625835418701172, 'learning_rate': 2.4262607040913416e-06, 'epoch': 0.95}


 96%|█████████▌| 1020/1061 [23:05<00:47,  1.16s/it]

{'loss': 1.8462, 'grad_norm': 14.878179550170898, 'learning_rate': 1.950523311132255e-06, 'epoch': 0.96}


 97%|█████████▋| 1030/1061 [23:17<00:35,  1.15s/it]

{'loss': 1.8411, 'grad_norm': 17.88118553161621, 'learning_rate': 1.4747859181731685e-06, 'epoch': 0.97}


 98%|█████████▊| 1040/1061 [23:28<00:24,  1.15s/it]

{'loss': 0.9965, 'grad_norm': 19.92991065979004, 'learning_rate': 9.99048525214082e-07, 'epoch': 0.98}


 99%|█████████▉| 1050/1061 [23:40<00:12,  1.14s/it]

{'loss': 1.1699, 'grad_norm': 12.564977645874023, 'learning_rate': 5.233111322549952e-07, 'epoch': 0.99}


100%|█████████▉| 1060/1061 [23:51<00:01,  1.15s/it]

{'loss': 1.4611, 'grad_norm': 30.085477828979492, 'learning_rate': 4.757373929590866e-08, 'epoch': 1.0}


100%|██████████| 1061/1061 [23:53<00:00,  1.35s/it]


{'train_runtime': 1433.3727, 'train_samples_per_second': 5.92, 'train_steps_per_second': 0.74, 'train_loss': 1.7795007240086878, 'epoch': 1.0}


100%|██████████| 67/67 [01:03<00:00,  1.05it/s]


('./fine-tuned-bert\\tokenizer_config.json',
 './fine-tuned-bert\\special_tokens_map.json',
 './fine-tuned-bert\\vocab.txt',
 './fine-tuned-bert\\added_tokens.json')

In [5]:
import torch
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert');
model = BertModel.from_pretrained('./fine-tuned-bert', output_hidden_states=True)

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
def tokenize_and_embed(text_list):

    embeddings = []
    for idx, text in enumerate(text_list):

        if isinstance(text, list):
            text = " ".join(text)

        # Prepararamos el texto con los tokens especiales de BERT
        marked_text = "[CLS] " + text + " [SEP]"

        # Tokenizamos el texto
        tokenized_text = tokenizer.tokenize(marked_text)

        if len(tokenized_text) > 512:
            tokenized_text = tokenized_text[:511] + ["[SEP]"]

        # Mapeamos los tokens a sus índices en el vocabulario
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Creamos el segmento alternando entre 0 y 1 por frase
        segment_id = idx % 2  # Alterna entre 0 y 1
        segments_ids = [segment_id] * len(tokenized_text)

        # Creamos tensores para los tokens y los segmentos
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segments_ids])

        # Generaramos embeddings con BERT
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensor)
            hidden_states = outputs.last_hidden_state

        # Promediamos las representaciones de todos los tokens para obtener el embedding del texto
        text_embedding = torch.mean(hidden_states, dim=1).squeeze().tolist()
        embeddings.append(text_embedding)

    return embeddings




In [7]:
desc_embeddings = tokenize_and_embed(descripciones)
data_clean['desc_embeddings'] = desc_embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['desc_embeddings'] = desc_embeddings


In [8]:
from scipy.sparse import hstack
from gensim.matutils import corpus2csc

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MultiLabelBinarizer

# Seleccionamos las K mejores características del TF-IDF
selector = SelectKBest(f_regression, k=100)  # Selecciona 100 características más relevantes

x_data = data_clean['desc_embeddings'].tolist()

# Reducimos dimensionalidad con PCA
pca = PCA(n_components=100)  # Reducir a 100 componentes principales
data_reduced = csr_matrix(pca.fit_transform(x_data))

numerical_features = data_clean[['calories', 'protein', 'fat', 'sodium']].values

print(numerical_features.shape)  

from scipy.sparse import csr_matrix

numerical_features_sparse = csr_matrix(numerical_features)
# Combinamos TF-IDF reducido con las columnas numéricas

x_data_combined = hstack([data_reduced, numerical_features_sparse])


# Convertimos las categorías en variables binarias
mlb = MultiLabelBinarizer()
categories_encoded = pd.DataFrame(
    mlb.fit_transform(data_clean['categories']),
    columns=mlb.classes_,
    index=data_clean.index
)

# Añadimos las categorías codificadas al DataFrame original
categories_sparse = csr_matrix(categories_encoded.values)

x_data_combined = hstack([x_data_combined, categories_sparse])


(10608, 4)


In [9]:
#RANDOM-FOREST FINE-TUNNING
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score as r2
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(42)

# Obtenemos x_data (TF-IDF corpus) y y_data
y_data = data_clean['rating']

# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(x_data_combined, y_data, test_size=0.20, random_state=42)

# Dividimos el conjunto de entrenamiento en train_val y val
X_train_val, X_val, Y_train_val, Y_val = train_test_split(X_train, Y_train, test_size=0.30, random_state=42)

# Convertimos a matriz densa para usar con StandardScaler
X_train_val = X_train_val.toarray()
X_val = X_val.toarray()
X_test = X_test.toarray()

# Escalamos los datos
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Configuramos el modelo Random Forest y la búsqueda en cuadrícula
rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [ 'sqrt']
}

#grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Entrenar el modelo con la búsqueda de hiperparámetros
#grid_search.fit(X_train_val_scaled, Y_train_val)
#best_params = grid_search.best_params_

#print("Best parameters found:", best_params)

# Entrenamos el modelo con los mejores parámetros
model_original = RandomForestRegressor(
    n_estimators=500,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42)
model_original.fit(X_train_val_scaled, Y_train_val)
Y_predict = model_original.predict(X_test_scaled)

# Evaluamos el modelo
MSE_original = mse(Y_test, Y_predict)
MAE_original = mae(Y_test, Y_predict)
R2_original = r2(Y_test, Y_predict)

print(f"MSE: {MSE_original}")
print(f"MAE: {MAE_original}")
print(f"R2: {R2_original}")


MSE: 1.4051807917003667
MAE: 0.796064842972369
R2: 0.11734697898491231


In [11]:
#RED NEURONAL FINE-TUNNGING
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import random


random_state = 42
# Fijar semillas para reproducibilidad
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(random_state)

# Definición del modelo
class RegressionNN(nn.Module):
    def __init__(self, input_dim):
        super(RegressionNN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 50),
            nn.ReLU(),
            nn.Linear(50, 25),
            nn.ReLU(),
            nn.Linear(25, 1)
        )
    def forward(self, x):
        return self.fc(x)

# Función principal
def NN_eval(X, y):
    # Estandarizar los datos
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Convertir a tensores
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

    # Crear DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Inicializar modelo, función de pérdida y optimizador
    model = RegressionNN(input_dim=X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    # Ciclo de entrenamiento
    for epoch in range(20):
        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Training Loss: {epoch_loss:.4f}")

    # Evaluación del modelo
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)
        predictions = model(X_test_tensor)
        test_loss = criterion(predictions, y_test_tensor)
        print(f"Test loss: {test_loss.item()}")

        # Conversión a NumPy y cálculo de R^2
        predictions_np = predictions.cpu().numpy()
        r2 = r2_score(y_test, predictions_np)
        print(f"R^2 Score: {r2:.4f}")

NN_eval(x_data_combined.toarray(), data_clean['rating'].tolist())


Epoch 1, Training Loss: 3730.4982
Epoch 2, Training Loss: 1859.0912
Epoch 3, Training Loss: 497.3343
Epoch 4, Training Loss: 359.6677
Epoch 5, Training Loss: 330.5455
Epoch 6, Training Loss: 314.4824
Epoch 7, Training Loss: 300.7600
Epoch 8, Training Loss: 288.1559
Epoch 9, Training Loss: 278.6870
Epoch 10, Training Loss: 270.4228
Epoch 11, Training Loss: 260.3098
Epoch 12, Training Loss: 252.6835
Epoch 13, Training Loss: 244.8431
Epoch 14, Training Loss: 235.6489
Epoch 15, Training Loss: 228.4026
Epoch 16, Training Loss: 220.3535
Epoch 17, Training Loss: 212.0309
Epoch 18, Training Loss: 204.3385
Epoch 19, Training Loss: 196.1488
Epoch 20, Training Loss: 188.3479
Test loss: 1.4507546424865723
R^2 Score: 0.0887
