In [2]:
# Mount Google Drive so the project folder stored there is reachable from the notebook
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import sys
# Repository folder inside the mounted Drive
path ='/content/drive/MyDrive/cod/LEA3_Marketing'
os.chdir(path) ## make the repository folder the working directory (relative paths such as 'data/db_movies' are resolved from here)
sys.path.append(path) ## add it to sys.path so the project's own files can be imported as packages

In [6]:
import numpy as np
import pandas as pd
import sqlite3 as sql
import a_funciones as fn
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib
import ipywidgets as widgets
from IPython.display import display

# Para filtros colaborativos
#!pip install lightfm

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score
from sklearn.model_selection import train_test_split


In [8]:
# Open the connection to the SQLite database db_movies
con = sql.connect('data/db_movies')

# Create the cursor
cur = con.cursor() ## cursor on the same connection, used to execute queries directly

# <font color=4CBB17>**3. Sistema de recomendación basado en contenido KNN, con base en todo lo visto por el usuario**</font>

In [9]:
# Load all the data once from full_ratings
df_full = pd.read_sql('SELECT * FROM full_ratings', con)

# Min-max scale the numeric variables
# NOTE(review): the same scaler object is re-fitted by the second call, so after this cell
# `scaler` only remembers the movie_rating fit — confirm nothing needs the movie_year fit later.
scaler = MinMaxScaler()
df_full['movie_year'] = scaler.fit_transform(df_full[['movie_year']])
df_full['movie_rating'] = scaler.fit_transform(df_full[['movie_rating']])

# Columns fed to the nearest-neighbours model: everything except the identifiers and the title
# NOTE(review): movie_rating stays in the feature set; it appears to be a per-user rating rather
# than a movie attribute — confirm that this is intended.
feature_cols = df_full.drop(columns=['movie_id', 'movie_title', 'user_id']).columns

# Detect the genre dummy columns: any column outside the excluded set whose non-null values are all 0 or 1
columnas_excluir = {'movie_id', 'movie_title', 'user_id', 'movie_rating','movie_title','movie_year'}
genre_cols = [col for col in df_full.columns if col not in columnas_excluir and df_full[col].dropna().isin([0, 1]).all()]

# Interactive dropdown listing every detected genre
dropdown_genero = widgets.Dropdown(
    options=genre_cols,
    description='Género:',
    layout=widgets.Layout(width='300px')
)

def recomendar_con_genero(user_id, genero, n_recomendaciones=10):
    """Recommend unseen movies of one genre that are closest to the user's average profile.

    The profile is the mean of ``feature_cols`` over every row of ``df_full`` that belongs
    to ``user_id``. Candidates are the movies the user has no row for and whose ``genero``
    column equals 1; they are ranked by cosine similarity to the profile.

    Args:
        user_id: value matched against ``df_full['user_id']``.
        genero: name of a genre dummy column of ``df_full``.
        n_recomendaciones: maximum number of movies to return.

    Returns:
        DataFrame with columns ``movie_title``, ``movie_id`` and ``similitud``, sorted by
        ``similitud`` in descending order. It has fewer than ``n_recomendaciones`` rows when
        there are not enough candidates, and it is empty when the user has no ratings, when
        no candidate exists or when ``n_recomendaciones`` is not positive.
    """
    columnas_salida = ['movie_title', 'movie_id']
    ratings_user = df_full[df_full['user_id'] == user_id]
    rated_ids = ratings_user['movie_id'].unique()

    # Keep only unseen movies that belong to the selected genre, one row per movie
    df_no_rated = df_full[(~df_full['movie_id'].isin(rated_ids)) & (df_full[genero] == 1)]
    df_no_rated = df_no_rated.drop_duplicates('movie_id')

    # NearestNeighbors raises when asked for more neighbours than fitted samples,
    # so never request more than the number of candidates available.
    n_vecinos = min(n_recomendaciones, len(df_no_rated))
    if ratings_user.empty or n_vecinos < 1:
        # No profile can be built or nothing to recommend: return an empty, well-formed result
        return df_no_rated.iloc[:0][columnas_salida].assign(similitud=pd.Series(dtype=float))

    # User profile: one row holding the mean of every feature over the user's rated movies
    perfil = ratings_user[feature_cols].mean().to_frame().T

    X_no_rated = df_no_rated[feature_cols]

    # KNN model
    model = neighbors.NearestNeighbors(n_neighbors=n_vecinos, metric='cosine')
    model.fit(X_no_rated)
    dist, idx = model.kneighbors(perfil)

    recs = df_no_rated.iloc[idx[0]][columnas_salida].copy()
    recs['similitud'] = 1 - dist[0]  # similarity instead of distance: easier to read for the user
    return recs.sort_values(by='similitud', ascending=False)

def mostrar_recomendaciones(genero):
    """Display the genre-filtered recommendations for user 609 (callback for the dropdown)."""
    recomendaciones = recomendar_con_genero(user_id=609, genero=genero)
    display(recomendaciones)

# Create the interactive widget: mostrar_recomendaciones is called with the genre chosen in the dropdown
widgets.interact(mostrar_recomendaciones, genero=dropdown_genero)


interactive(children=(Dropdown(description='Género:', layout=Layout(width='300px'), options=('Action', 'Advent…

# <font color=4CBB17>**4. Sistemas de recomendación basados en filtro colaborativo**</font>

In [10]:
#instalar lightfm para sistema de recomendación de filtro colaborativo
#!pip install lightfm

In [11]:
# Read full_ratings again from the database (a fresh copy, independent of the scaled df_full) for the collaborative-filtering models
ratings = pd.read_sql('SELECT * FROM full_ratings ', con)
ratings

Unnamed: 0,movie_id,user_id,movie_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,movie_title,movie_year
0,1,1,4.0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,Toy Story,1995
1,3,1,4.0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Grumpier Old Men,1995
2,6,1,4.0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,Heat,1995
3,47,1,5.0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,Seven (a.k.a. Se7en),1995
4,50,1,5.0,0,0,0,0,0,1,0,...,0,0,1,0,0,1,0,0,"Usual Suspects, The",1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39693,592,609,3.0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,Batman,1989
39694,742,609,3.0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Thinner,1996
39695,786,609,3.0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,Eraser,1996
39696,892,609,3.0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Twelfth Night,1996


In [12]:
# Create the datasets in LightFM format; first collect the unique users and the unique items
dataset_train = Dataset()
dataset_test = Dataset()

all_unique_users = ratings['user_id'].unique()
all_unique_items = ratings['movie_id'].unique()

# Both datasets are fitted on the same user and item lists
dataset_train.fit(users=all_unique_users, items=all_unique_items)
dataset_test.fit(users=all_unique_users, items=all_unique_items)

In [13]:
## check that the dataset created has the same number of users and items as the original table
cur.execute('SELECT count(distinct user_id), count (distinct movie_id) FROM full_ratings ')
print(cur.fetchall())

# Compare the counts from the original database with those of dataset_train
num_users, num_items = dataset_train.interactions_shape()

print(num_users, num_items)


[(526, 2250)]
526 2250


In [14]:
# Split the rating rows into train and test
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=123)

# LightFM is fed (user, item, weight) tuples; the Dataset objects translate the original
# ids into the consecutive internal indices the library works with.
interaction_cols = ['user_id', 'movie_id', 'movie_rating']

# itertuples over just the three needed columns yields plain tuples directly and avoids
# building one Series per row, which is what iterrows() does.
train_interactions_list = list(train_df[interaction_cols].itertuples(index=False, name=None))
train_interactions, train_weights = dataset_train.build_interactions(train_interactions_list)

test_interactions_list = list(test_df[interaction_cols].itertuples(index=False, name=None))
test_interactions, test_weights = dataset_test.build_interactions(test_interactions_list)

# First pair of lines: train matrices; second pair: test matrices
print(f"Shape of Interactions Matrix: {train_interactions.shape}")
print(f"Shape of Weights Matrix: {train_weights.shape}")
print(f"Shape of Interactions Matrix: {test_interactions.shape}")
print(f"Shape of Weights Matrix: {test_weights.shape}")

Shape of Interactions Matrix: (526, 2250)
Shape of Weights Matrix: (526, 2250)
Shape of Interactions Matrix: (526, 2250)
Shape of Weights Matrix: (526, 2250)


In [15]:
# Verification-only code to inspect how the data is structured; it is kept disabled inside a string literal
'''train_int_array=train_interactions.toarray()
train_weights_array=train_weights.toarray()

test_int_array=test_interactions.toarray()
test_weights_array=test_weights.toarray()

print(train_int_array)
print(test_int_array)

print(train_weights_array)
print(test_weights_array)'''

'train_int_array=train_interactions.toarray()\ntrain_weights_array=train_weights.toarray()\n\ntest_int_array=test_interactions.toarray()\ntest_weights_array=test_weights.toarray()\n\nprint(train_int_array)\nprint(test_int_array)\n\nprint(train_weights_array)\nprint(test_weights_array)'

In [16]:
import time

# Manually configured baseline model
loss_model='logistic'  ## 'logistic', 'bpr', 'warp', 'warp-kos': the last one only works for implicit feedback

model=LightFM(loss=loss_model, random_state=123)
model.fit(train_interactions, epochs=10, verbose=True, sample_weight=train_weights)

# Per-user AUC
# NOTE(review): auc_score also accepts train_interactions=...; consider passing it for the test
# score so that training positives are left out of the ranking — confirm against the LightFM docs.
train_auc = auc_score(model, train_interactions)
test_auc = auc_score(model, test_interactions)

print('AUC: train %.2f, test %.2f.' % (train_auc.mean(), test_auc.mean()))

Epoch: 100%|██████████| 10/10 [00:00<00:00, 39.64it/s]


AUC: train 0.81, test 0.79.


In [19]:
!pip install --quiet optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/386.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m266.2/386.6 kB[0m [31m8.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/231.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import optuna

# Afinamiento de hiperparámetros con optuna

def objective(trial):
    """Optuna objective: fit a LightFM model with sampled hyperparameters and score it.

    Samples the number of latent components, the learning rate (log scale), the loss
    function and the number of epochs from ``trial``, fits on ``train_interactions``
    weighted by ``train_weights`` and returns the mean AUC on ``test_interactions``.
    """
    # Search space (sampling order: components, learning rate, loss, epochs)
    latent_dim = trial.suggest_int('no_components', 10, 100)
    step_size = trial.suggest_float('learning_rate', 0.001, 0.1, log=True)
    loss_name = trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
    n_epochs = trial.suggest_int('epochs', 10, 50)

    print(f"  Params: n_components={latent_dim}, lr={step_size:.4f}, loss={loss_name},  epochs={n_epochs}")

    # Candidate model built from the sampled values
    candidate = LightFM(no_components=latent_dim, learning_rate=step_size, loss=loss_name, random_state=42)
    candidate.fit(train_interactions, epochs=n_epochs, verbose=False, sample_weight=train_weights)

    return auc_score(candidate, test_interactions).mean()


# Seed the sampler so the hyperparameter search proposes the same trials on every run
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)

# Results
# NOTE(review): the trials are scored on test_interactions, so the test AUC reported for the
# final model below is optimistic; an unbiased estimate would need a separate validation split.
print("Best trial:")
print(f"  Value (Test AUC): {study.best_value:.4f}")
print(f"  Params: {study.best_params}")

# Train the final model with the best hyperparameters found
best_params = study.best_params


model = LightFM(
    no_components=best_params['no_components'],
    learning_rate=best_params['learning_rate'],
    loss=best_params['loss'],
    random_state=42
)

model.fit(train_interactions,
                epochs=best_params['epochs'],
                verbose=True,
                sample_weight=train_weights)


# Final result
train_auc = auc_score(model, train_interactions).mean()
test_auc = auc_score(model, test_interactions).mean()

print('Final Model AUC: train %.4f, test %.4f.' % (train_auc, test_auc))


[I 2025-05-13 00:53:14,370] A new study created in memory with name: no-name-e5084d48-2069-4014-8ab5-0c650a03f17c


  Params: n_components=52, lr=0.0841, loss=bpr,  epochs=35


[I 2025-05-13 00:53:23,966] Trial 0 finished with value: 0.8057624697685242 and parameters: {'no_components': 52, 'learning_rate': 0.08414915403654875, 'loss': 'bpr', 'epochs': 35}. Best is trial 0 with value: 0.8057624697685242.


  Params: n_components=67, lr=0.0017, loss=bpr,  epochs=35


[I 2025-05-13 00:53:45,255] Trial 1 finished with value: 0.7039970755577087 and parameters: {'no_components': 67, 'learning_rate': 0.0016939671468572687, 'loss': 'bpr', 'epochs': 35}. Best is trial 0 with value: 0.8057624697685242.


  Params: n_components=59, lr=0.0138, loss=bpr,  epochs=23


[I 2025-05-13 00:53:50,516] Trial 2 finished with value: 0.8037230968475342 and parameters: {'no_components': 59, 'learning_rate': 0.013846716066246734, 'loss': 'bpr', 'epochs': 23}. Best is trial 0 with value: 0.8057624697685242.


  Params: n_components=61, lr=0.0013, loss=warp,  epochs=39


[I 2025-05-13 00:53:54,314] Trial 3 finished with value: 0.7899662852287292 and parameters: {'no_components': 61, 'learning_rate': 0.0013315492460390036, 'loss': 'warp', 'epochs': 39}. Best is trial 0 with value: 0.8057624697685242.


  Params: n_components=93, lr=0.0760, loss=logistic,  epochs=29


In [None]:
def recommendation(model, data, original_user_id, conn, k):
    """Return the ``k`` movies with the highest predicted score that the user has not rated.

    Args:
        model: fitted model exposing ``predict(user_index, item_indices)``.
        data: object whose ``mapping()`` returns a tuple holding the user-id -> index dict
            at position 0 and the item-id -> index dict at position 2.
        original_user_id: user id as stored in ``full_ratings.user_id``.
        conn: open database connection that contains the ``full_ratings`` table.
        k: maximum number of movies to return.

    Returns:
        DataFrame with columns ``movie_id`` and ``movie_title``, one row per movie, ordered
        from the highest to the lowest predicted score. Empty when there is nothing left to
        recommend or ``k`` is not positive.

    Raises:
        KeyError: if ``original_user_id`` is not present in the user mapping.
    """
    columnas = ['movie_id', 'movie_title']

    mappings = data.mapping()
    user_map, item_map = mappings[0], mappings[2]
    uid_index = user_map[original_user_id]  # internal index of the user in the model

    # sqlite3 cannot bind NumPy scalars, so unwrap them into plain Python values
    if isinstance(original_user_id, np.generic):
        user_param = original_user_id.item()
    else:
        user_param = original_user_id

    # Movies the user has already rated (parameterised query: no string-built SQL)
    seen = pd.read_sql(
        'SELECT DISTINCT movie_id FROM full_ratings WHERE user_id = ?',
        conn,
        params=(user_param,),
    )
    seen_ids = set(seen['movie_id'].tolist())

    # Candidates keep the original movie id next to the internal index used by the model
    candidates = [
        (movie_id, item_index)
        for movie_id, item_index in item_map.items()
        if movie_id not in seen_ids
    ]
    if not candidates or k <= 0:
        return pd.DataFrame(columns=columnas)

    candidate_ids = [movie_id for movie_id, _ in candidates]
    candidate_idx = np.array([item_index for _, item_index in candidates], dtype=np.int32)

    # argsort returns positions inside the candidate arrays, so map them back through
    # candidate_ids instead of treating them as item indices
    scores = np.asarray(model.predict(uid_index, candidate_idx))
    best_positions = np.argsort(-scores, kind='stable')[:k]
    top_ids = [candidate_ids[position] for position in best_positions]
    rank = {movie_id: order for order, movie_id in enumerate(top_ids)}

    # Attach titles, one row per movie, and restore the score ordering
    titles = pd.read_sql('SELECT DISTINCT movie_id, movie_title FROM full_ratings', conn)
    recommended = titles[titles['movie_id'].isin(top_ids)].drop_duplicates('movie_id')
    recommended = recommended.assign(_rank=recommended['movie_id'].map(rank)).sort_values('_rank')

    return recommended[columnas].reset_index(drop=True)

In [None]:
#dataset_train.mapping()[0]

In [None]:
# Example: up to 10 recommendations for user 38, using the current `model` and the id mappings of dataset_train
recommendation(model, dataset_train, 38, con, 10)

In [None]:
import nbformat

# Notebook to clean and path of the cleaned copy
input_notebook = 'd_modelos.ipynb'
output_notebook = 'd_modelos2.ipynb'

# Load the notebook. The encoding is given explicitly because the notebook contains
# non-ASCII text and the platform default encoding is not always UTF-8.
with open(input_notebook, 'r', encoding='utf-8') as f:
    notebook = nbformat.read(f, as_version=4)

# Check if the notebook has 'metadata.widgets' and remove it
if 'widgets' in notebook.metadata:
    del notebook.metadata['widgets']  # Removes widgets metadata completely

# Alternatively, if you want to add a 'state' key inside 'widgets', do this:
# if 'widgets' in notebook.metadata:
#     notebook.metadata['widgets']['state'] = {}

# Save the modified notebook with the same explicit encoding
with open(output_notebook, 'w', encoding='utf-8') as f:
    nbformat.write(notebook, f)

print(f"Fixed notebook saved as {output_notebook}")