In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import gensim
from gensim.models import Word2Vec
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
path_to_json = 'full_format_recipes.json'
data = pd.read_json(path_to_json)


In [3]:
# Descargamos recursos de NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\casa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\casa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\casa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\casa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if isinstance(text, list):
        text = ' '.join(text)
    # Tokenización
    tokens = word_tokenize(text)
    # Convertimos a minúsculas y eliminamos caracteres especiales
    tokens = [token.lower() for token in tokens if token.isalnum()]
    # Eliminamos números y números seguidos de un punto (ej. 1., 2.)
    tokens = [token for token in tokens if not re.match(r'^\d+\.?$', token)]

    # Eliminamos stopwords
    tokens = [token for token in tokens if token not in stop_words]
    # Lematización
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens


In [5]:
import torch
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased');
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)  # Devuelve todos los estados ocultos

model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [6]:
def tokenize_and_embed(text_list):

    embeddings = []
    for idx, text in enumerate(text_list):

        if isinstance(text, list):
            text = " ".join(text)

        # Preparamos el texto con los tokens especiales de BERT
        marked_text = "[CLS] " + text + " [SEP]"

        # Tokenizamos el texto
        tokenized_text = tokenizer.tokenize(marked_text)

        if len(tokenized_text) > 512:
            tokenized_text = tokenized_text[:511] + ["[SEP]"]

        # Mapeamos los tokens a sus índices en el vocabulario
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

        # Creamos el segmento alternando entre 0 y 1 por frase
        segment_id = idx % 2  # Alterna entre 0 y 1
        segments_ids = [segment_id] * len(tokenized_text)

        # Creamos tensores para los tokens y los segmentos
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensor = torch.tensor([segments_ids])

        # Generamos embeddings con BERT
        with torch.no_grad():
            outputs = model(tokens_tensor, segments_tensor)
            hidden_states = outputs.last_hidden_state

        # Promediamos las representaciones de todos los tokens para obtener el embedding del texto
        text_embedding = torch.mean(hidden_states, dim=1).squeeze().tolist()
        embeddings.append(text_embedding)

    return embeddings




In [7]:
data.replace(r'^\s*$', np.nan, regex=True, inplace=True)  # Reemplazamos cadenas vacías por NaN
data_clean = data.dropna()

descripciones = data_clean['desc'].tolist()
direcciones = data_clean['directions'].tolist()

In [8]:
desc_embeddings = tokenize_and_embed(descripciones)
data_clean['desc_embeddings'] = desc_embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['desc_embeddings'] = desc_embeddings


In [9]:
from scipy.sparse import hstack
from gensim.matutils import corpus2csc

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MultiLabelBinarizer

# Seleccionamos las K mejores características del TF-IDF
selector = SelectKBest(f_regression, k=100)  # Selecciona 100 características más relevantes

x_data = data_clean['desc_embeddings'].tolist()

# Reducimos dimensionalidad con PCA
pca = PCA(n_components=100)  # Reducimos a 100 componentes principales
data_reduced = csr_matrix(pca.fit_transform(x_data))

numerical_features = data_clean[['calories', 'protein', 'fat', 'sodium']].values

print(numerical_features.shape)  

from scipy.sparse import csr_matrix

numerical_features_sparse = csr_matrix(numerical_features)
# Combinamos TF-IDF reducido con las columnas numéricas

x_data_combined = hstack([data_reduced, numerical_features_sparse])


# Convertimos las categorías en variables binarias
mlb = MultiLabelBinarizer()
categories_encoded = pd.DataFrame(
    mlb.fit_transform(data_clean['categories']),
    columns=mlb.classes_,
    index=data_clean.index
)

# Añadimos las categorías codificadas al DataFrame original
categories_sparse = csr_matrix(categories_encoded.values)

x_data_combined = hstack([x_data_combined, categories_sparse])


(10608, 4)


In [10]:
#RANDOM-FOREST BERT
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score as r2
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np

np.random.seed(42)

# Obtenemos x_data (TF-IDF corpus) y y_data
y_data = data_clean['rating']

# Dividimos los datos en conjuntos de entrenamiento y prueba
X_train, X_test, Y_train, Y_test = train_test_split(x_data_combined, y_data, test_size=0.20, random_state=42)

# Dividimos el conjunto de entrenamiento en train_val y val
X_train_val, X_val, Y_train_val, Y_val = train_test_split(X_train, Y_train, test_size=0.30, random_state=42)

# Convertimos a matriz densa para usar con StandardScaler
X_train_val = X_train_val.toarray()
X_val = X_val.toarray()
X_test = X_test.toarray()

# Escalamos los datos
scaler = StandardScaler()
X_train_val_scaled = scaler.fit_transform(X_train_val)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Configuramos el modelo Random Forest y la búsqueda en cuadrícula
rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [ 'sqrt']
}

#grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Entrenar el modelo con la búsqueda de hiperparámetros
#grid_search.fit(X_train_val_scaled, Y_train_val)
#best_params = grid_search.best_params_

#print("Best parameters found:", best_params)

# Entrenamos el modelo con los mejores parámetros
model_original = RandomForestRegressor(
    n_estimators=500,
    max_depth=50,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42)
model_original.fit(X_train_val_scaled, Y_train_val)
Y_predict = model_original.predict(X_test_scaled)

# Evaluamos el modelo
MSE_original = mse(Y_test, Y_predict)
MAE_original = mae(Y_test, Y_predict)
R2_original = r2(Y_test, Y_predict)

print(f"MSE: {MSE_original}")
print(f"MAE: {MAE_original}")
print(f"R2: {R2_original}")
#Best parameters found: {'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

MSE: 1.3989905838755572
MAE: 0.7879021055802463
R2: 0.12123530828001161


In [15]:
#RED NEURONAL BERT
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import random


random_state = 42
# Fijamos las semillas para reproducibilidad
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

set_seed(random_state)

# Definición del modelo
class RegressionNN(nn.Module):
    def __init__(self, input_dim):
        super(RegressionNN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 50),
            nn.ReLU(),
            nn.Linear(50, 25),
            nn.ReLU(),
            nn.Linear(25, 1)
        )
    def forward(self, x):
        return self.fc(x)

# Función principal
def NN_eval(X, y):
    # Estandarizamos los datos
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Convertimos a tensores
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

    # Creamos DataLoader
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Inicializamos modelo, función de pérdida y optimizador
    model = RegressionNN(input_dim=X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    # Ciclo de entrenamiento
    for epoch in range(20):
        model.train()
        epoch_loss = 0
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}, Training Loss: {epoch_loss:.4f}")

    # Evaluación del modelo
    model.eval()
    with torch.no_grad():
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
        y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)
        predictions = model(X_test_tensor)
        test_loss = criterion(predictions, y_test_tensor)
        print(f"Test loss: {test_loss.item()}")

        # Convertimos a NumPy y calculamos R^2
        predictions_np = predictions.cpu().numpy()
        r2 = r2_score(y_test, predictions_np)
        print(f"R^2 Score: {r2:.4f}")

NN_eval(x_data_combined.toarray(), data_clean['rating'].tolist())


Epoch 1, Training Loss: 3727.0065
Epoch 2, Training Loss: 1848.1549
Epoch 3, Training Loss: 501.3654
Epoch 4, Training Loss: 369.2608
Epoch 5, Training Loss: 340.3003
Epoch 6, Training Loss: 324.0240
Epoch 7, Training Loss: 309.2646
Epoch 8, Training Loss: 296.8918
Epoch 9, Training Loss: 286.7914
Epoch 10, Training Loss: 278.4535
Epoch 11, Training Loss: 267.8661
Epoch 12, Training Loss: 260.0252
Epoch 13, Training Loss: 251.9780
Epoch 14, Training Loss: 242.3833
Epoch 15, Training Loss: 235.0107
Epoch 16, Training Loss: 226.8097
Epoch 17, Training Loss: 217.9929
Epoch 18, Training Loss: 210.3486
Epoch 19, Training Loss: 202.0996
Epoch 20, Training Loss: 193.9895
Test loss: 1.4931845664978027
R^2 Score: 0.0621
