# Recommender basado en memoria — Notebook para Google Colab

**Sumario de contenido**:

- Normalización de columnas.
- Implementación **item-based** y **user-based** (similitud coseno).
- Funciones para la adición de usuarios del equipo y obtención de Top-10.
- Búsqueda de películas similares (ej. "Toy Story").
- Evaluación (RMSE y MAE) con split train/test.

In [11]:
import os
# Show which CSV files sit in the working directory, sorted by name.
print('Ficheros CSV presentes en el directorio actual:')
print(sorted(name for name in os.listdir('.') if name.endswith('.csv')))

Ficheros CSV presentes en el directorio actual:
['links.csv', 'movies.csv', 'ratings.csv', 'tags.csv']


In [12]:
# 2) Cargamos CSVs (robusto a variantes de encoding/separador)
import pandas as pd
import os

def load_csv_try(path):
    """Load a CSV into a DataFrame, tolerating common encoding/separator variants.

    The file is parsed with, in order: pandas defaults (UTF-8, comma),
    Latin-1 with comma, UTF-8 with ';' and Latin-1 with ';'. The first
    attempt that parses into a plausible table wins.

    Args:
        path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame, or None when ``path`` does not exist.

    Raises:
        UnicodeDecodeError, pandas.errors.ParserError: when every attempt
            fails (the error of the last attempt is raised). Any other
            error, e.g. an empty file, propagates from the first attempt.
    """
    if not os.path.exists(path):
        return None

    attempts = (
        {},
        {'encoding': 'latin-1'},
        {'sep': ';'},
        {'sep': ';', 'encoding': 'latin-1'},
    )
    comma_parsed = None
    last_error = None
    for options in attempts:
        try:
            frame = pd.read_csv(path, **options)
        except (UnicodeDecodeError, pd.errors.ParserError) as exc:
            last_error = exc
            continue
        # A wrong separator does not raise: a ';'-delimited file parsed with
        # commas comes back as a single column whose header still holds the
        # ';'. Keep that result as a fallback and move on to the ';' attempts.
        if 'sep' not in options and frame.shape[1] == 1 and ';' in str(frame.columns[0]):
            if comma_parsed is None:
                comma_parsed = frame
            continue
        return frame

    if comma_parsed is not None:
        return comma_parsed
    raise last_error

# File names of the dataset; they are looked up in the working directory.
ratings_path, movies_path, tags_path, links_path = (
    'ratings.csv', 'movies.csv', 'tags.csv', 'links.csv',
)

# load_csv_try returns None for a path that does not exist, so a missing
# file simply leaves the corresponding variable set to None.
ratings, movies, tags, links = (
    load_csv_try(csv_path)
    for csv_path in (ratings_path, movies_path, tags_path, links_path)
)

print('ratings:', ratings.shape if ratings is not None else None)
print('movies :', movies.shape if movies is not None else None)

ratings: (100836, 4)
movies : (9742, 3)


In [13]:
# 3) Normalize the expected columns (userId, movieId, rating; movieId, title, genres)
# by renaming the common spelling variants to their canonical names.
if ratings is None or movies is None:
    raise ValueError('No se han encontrado ratings.csv o movies.csv en el directorio. Sube los archivos y vuelve a ejecutar.')


def _rename_known_columns(df, aliases):
    """Return ``df`` with every column spelled like a known alias renamed.

    Args:
        df: DataFrame whose column names are strings.
        aliases: Maps each canonical column name to the lower-case spellings
            accepted for it. Matching is case-insensitive.

    Returns:
        A renamed copy of ``df``; columns without a known alias are untouched.
    """
    canonical_of = {
        alias: canonical
        for canonical, spellings in aliases.items()
        for alias in spellings
    }
    renames = {
        col: canonical_of[col.lower()]
        for col in df.columns
        if col.lower() in canonical_of
    }
    return df.rename(columns=renames)


# Ratings: keep only the three required columns, drop incomplete rows and
# force the dtypes the rest of the notebook relies on.
ratings = _rename_known_columns(ratings, {
    'userId': ['userid', 'user_id', 'user id', 'user'],
    'movieId': ['movieid', 'movie_id', 'movie id', 'movie'],
    'rating': ['rating', 'ratings', 'score'],
})
if not {'userId','movieId','rating'}.issubset(ratings.columns):
    raise ValueError('ratings.csv debe contener userId, movieId y rating')
ratings = ratings[['userId','movieId','rating']].dropna()
ratings = ratings.astype({'userId': int, 'movieId': int, 'rating': float})

# Movies: movieId and title are mandatory, genres is kept when present.
movies = _rename_known_columns(movies, {
    'movieId': ['movieid', 'movie_id', 'movie id', 'movie'],
    'title': ['title', 'name'],
    'genres': ['genres', 'genre'],
})
if not {'movieId','title'}.issubset(movies.columns):
    raise ValueError('movies.csv debe contener al menos movieId y title')
movie_columns = ['movieId','title'] + (['genres'] if 'genres' in movies.columns else [])
# .copy() so the assignment below writes into an independent frame instead of
# a column subset of the loaded one (avoids SettingWithCopyWarning when
# movies.csv carries extra columns).
movies = movies[movie_columns].copy()
movies['movieId'] = movies['movieId'].astype(int)

print('Datos normalizados OK. ratings rows =', len(ratings), 'movies rows =', len(movies))

Datos normalizados OK. ratings rows = 100836 movies rows = 9742


In [14]:
# 4) Construimos matrices de utilidad (users x items) y matrices de similitud
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Utility matrix: one row per user, one column per movie, NaN where the user
# has not rated the movie.
ratings_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')
user_ids = ratings_matrix.index.tolist()
item_ids = ratings_matrix.columns.tolist()

# Cosine similarity between movies (columns) and between users (rows);
# missing ratings are treated as 0 for the computation.
item_sim = pd.DataFrame(
    cosine_similarity(ratings_matrix.T.fillna(0)),
    index=item_ids,
    columns=item_ids,
)
user_sim = pd.DataFrame(
    cosine_similarity(ratings_matrix.fillna(0)),
    index=user_ids,
    columns=user_ids,
)

for label, matrix in (
    ('ratings matrix shape:', ratings_matrix),
    ('item_sim shape:', item_sim),
    ('user_sim shape:', user_sim),
):
    print(label, matrix.shape)

ratings matrix shape: (610, 9724)
item_sim shape: (9724, 9724)
user_sim shape: (610, 610)


In [15]:
# 5) Funciones: predict y recommend (item-based y user-based)
from math import sqrt

def predict_item_based(user_id, target_item):
    """Predict the rating ``user_id`` would give ``target_item`` (item-based).

    The prediction is the average of the user's own ratings, each weighted by
    the ``item_sim`` similarity between the rated movie and ``target_item``.

    Args:
        user_id: Row label of ``ratings_matrix``.
        target_item: Column label (movieId) of ``ratings_matrix``.

    Returns:
        The weighted average, or None when the user or the movie is not in
        ``ratings_matrix``, the user has no ratings, or the absolute
        similarities add up to zero.
    """
    if not (user_id in ratings_matrix.index and target_item in ratings_matrix.columns):
        return None

    own_ratings = ratings_matrix.loc[user_id].dropna()
    if own_ratings.empty:
        return None

    weights = item_sim.loc[target_item, own_ratings.index]
    total_weight = weights.abs().sum()
    if total_weight == 0:
        # No rated movie is similar to the target: nothing to average.
        return None
    return (weights * own_ratings).sum() / total_weight

def recommend_item_based(user_id, n=10):
    """Return the ``n`` best item-based recommendations for ``user_id``.

    Every movie in ``item_ids`` the user has not rated is scored with
    ``predict_item_based``; movies without a prediction are skipped.

    Args:
        user_id: Row label of ``ratings_matrix``.
        n: Maximum number of recommendations to return.

    Returns:
        A list of dicts with keys ``movieId``, ``title`` (None when the movie
        is missing from ``movies``) and ``predicted_rating``, ordered from the
        highest to the lowest prediction. Empty when the user is unknown.
    """
    if user_id not in ratings_matrix.index:
        return []

    already_rated = set(ratings_matrix.loc[user_id].dropna().index)
    scored = []
    for candidate in item_ids:
        if candidate in already_rated:
            continue
        estimate = predict_item_based(user_id, candidate)
        if estimate is not None:
            scored.append((candidate, estimate))

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    recommendations = []
    for movie_id, score in ranked[:n]:
        title_rows = movies.loc[movies.movieId == movie_id, 'title']
        title = None if title_rows.empty else title_rows.iloc[0]
        recommendations.append(
            {'movieId': int(movie_id), 'title': title, 'predicted_rating': float(score)}
        )
    return recommendations

def predict_user_based(user_id, target_item):
    """Predict the rating ``user_id`` would give ``target_item`` (user-based).

    The prediction is the average of the ratings other users gave to
    ``target_item``, each weighted by that user's ``user_sim`` similarity to
    ``user_id``.

    Args:
        user_id: Label of ``user_sim``.
        target_item: Column label (movieId) of ``ratings_matrix``.

    Returns:
        The weighted average, or None when the user is not in ``user_sim``,
        the movie is not in ``ratings_matrix``, nobody rated the movie, or
        the absolute similarities add up to zero.
    """
    if not (user_id in user_sim.index and target_item in ratings_matrix.columns):
        return None

    raters = ratings_matrix[target_item].dropna()
    if raters.empty:
        return None

    weights = user_sim.loc[user_id, raters.index]
    total_weight = weights.abs().sum()
    if total_weight == 0:
        # None of the raters is similar to this user: nothing to average.
        return None
    return (weights * raters).sum() / total_weight

def recommend_user_based(user_id, n=10):
    """Return the ``n`` best user-based recommendations for ``user_id``.

    Every movie in ``item_ids`` the user has not rated is scored with
    ``predict_user_based``; movies without a prediction are skipped.

    Args:
        user_id: Row label of ``ratings_matrix``.
        n: Maximum number of recommendations to return.

    Returns:
        A list of dicts with keys ``movieId``, ``title`` (None when the movie
        is missing from ``movies``) and ``predicted_rating``, ordered from the
        highest to the lowest prediction. Empty when the user is unknown.
    """
    if user_id not in ratings_matrix.index:
        return []

    seen = set(ratings_matrix.loc[user_id].dropna().index)
    scored = []
    for candidate in item_ids:
        if candidate in seen:
            continue
        estimate = predict_user_based(user_id, candidate)
        if estimate is not None:
            scored.append((candidate, estimate))

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    recommendations = []
    for movie_id, score in ranked[:n]:
        title_rows = movies.loc[movies.movieId == movie_id, 'title']
        title = None if title_rows.empty else title_rows.iloc[0]
        recommendations.append(
            {'movieId': int(movie_id), 'title': title, 'predicted_rating': float(score)}
        )
    return recommendations

def most_similar_movies(movie_title, topn=10):
    """Return the movies most similar to the first title containing ``movie_title``.

    The search is a case-insensitive, literal substring match on
    ``movies.title``; only movies that have a row in ``item_sim`` are
    considered, and the first such match is used as the reference movie.

    Args:
        movie_title: Text to look for inside the movie titles.
        topn: Maximum number of similar movies to return.

    Returns:
        A list of dicts with keys ``movieId``, ``title`` (None when the movie
        is missing from ``movies``) and ``similarity``, ordered from most to
        least similar and never containing the reference movie itself.
        Empty when no title matches.
    """
    # regex=False: titles contain regex metacharacters such as "(1995)", so a
    # full title like "Toy Story (1995)" must be matched literally.
    matches = movies[movies.title.str.contains(movie_title, case=False, na=False, regex=False)]
    # A movie nobody rated has no row in item_sim; looking it up would raise
    # KeyError, so skip such matches.
    matches = matches[matches.movieId.isin(item_sim.index)]
    if matches.empty:
        return []
    movieId = matches['movieId'].iloc[0]
    ranked = item_sim.loc[movieId].sort_values(ascending=False)
    # Drop the reference movie by label instead of assuming it sorted first
    # (another movie can tie with it at similarity 1.0).
    sims = ranked.drop(movieId).head(max(topn, 0))
    out = []
    for mid, sim in sims.items():
        title_rows = movies.loc[movies.movieId == mid, 'title']
        title = None if title_rows.empty else title_rows.iloc[0]
        out.append({'movieId': int(mid), 'title': title, 'similarity': float(sim)})
    return out

# Printed after the definitions above have run, as a visible confirmation in the cell output.
print('Funciones de recomendación creadas.')

Funciones de recomendación creadas.


In [16]:
# 6) Add the team's users and generate their recommendations
# Edit the movieIds so that they exist in your movies.csv
# Maps each new userId to its {movieId: rating} dictionary.
team_users = {
    999001: {1:5.0, 2:4.0, 32:3.5},
    999002: {2:5.0, 10:4.0, 250:4.5},
}

# Flatten the nested dictionary into one row per (userId, movieId, rating).
new_rows = [
    {'userId': uid, 'movieId': mid, 'rating': r}
    for uid, rd in team_users.items()
    for mid, r in rd.items()
]
if new_rows:
    new_df = pd.DataFrame(new_rows)
    # Re-running this cell must not stack the same rows again: any rating
    # already stored for one of these (userId, movieId) pairs is replaced by
    # the new one instead of being duplicated.
    new_keys = pd.MultiIndex.from_frame(new_df[['userId', 'movieId']])
    old_keys = pd.MultiIndex.from_frame(ratings[['userId', 'movieId']])
    ratings = pd.concat([ratings[~old_keys.isin(new_keys)], new_df], ignore_index=True)
    # Rebuild the utility and similarity matrices so they include the new users.
    ratings_matrix = ratings.pivot_table(index='userId', columns='movieId', values='rating')
    item_ids = ratings_matrix.columns.tolist()
    user_ids = ratings_matrix.index.tolist()
    item_sim = pd.DataFrame(cosine_similarity(ratings_matrix.T.fillna(0)), index=item_ids, columns=item_ids)
    user_sim = pd.DataFrame(cosine_similarity(ratings_matrix.fillna(0)), index=user_ids, columns=user_ids)

from pprint import pprint
for uid in team_users:
    print('\n--- Recomendaciones para usuario', uid, '(Item-based) ---')
    pprint(recommend_item_based(uid, n=10))
    print('\n--- Recomendaciones para usuario', uid, '(User-based) ---')
    pprint(recommend_user_based(uid, n=10))


--- Recomendaciones para usuario 999001 (Item-based) ---
[{'movieId': 129, 'predicted_rating': 5.0, 'title': 'Pie in the Sky (1996)'},
 {'movieId': 241, 'predicted_rating': 5.0, 'title': 'Fluke (1995)'},
 {'movieId': 341, 'predicted_rating': 5.0, 'title': 'Double Happiness (1994)'},
 {'movieId': 449,
  'predicted_rating': 5.0,
  'title': 'Fear of a Black Hat (1994)'},
 {'movieId': 615,
  'predicted_rating': 5.0,
  'title': 'Bread and Chocolate (Pane e cioccolata) (1973)'},
 {'movieId': 1162,
  'predicted_rating': 5.0,
  'title': 'Ruling Class, The (1972)'},
 {'movieId': 1335, 'predicted_rating': 5.0, 'title': 'Blood Beach (1981)'},
 {'movieId': 1473, 'predicted_rating': 5.0, 'title': 'Best Men (1997)'},
 {'movieId': 1565, 'predicted_rating': 5.0, 'title': 'Head Above Water (1996)'},
 {'movieId': 1685,
  'predicted_rating': 5.0,
  'title': 'I Love You, I Love You Not (1996)'}]

--- Recomendaciones para usuario 999001 (User-based) ---
[{'movieId': 633,
  'predicted_rating': 5.0000000000

In [17]:
# 7) Movies similar to 'Toy Story'
# most_similar_movies returns a list of dicts; it is wrapped in a DataFrame and
# left as the last expression of the cell so the notebook displays it.
sim_to_toy = most_similar_movies('Toy Story', topn=10)
import pandas as pd
pd.DataFrame(sim_to_toy)

Unnamed: 0,movieId,title,similarity
0,3114,Toy Story 2 (1999),0.57054
1,480,Jurassic Park (1993),0.563601
2,780,Independence Day (a.k.a. ID4) (1996),0.562231
3,260,Star Wars: Episode IV - A New Hope (1977),0.555382
4,356,Forrest Gump (1994),0.545127
5,364,"Lion King, The (1994)",0.539198
6,1210,Star Wars: Episode VI - Return of the Jedi (1983),0.539142
7,648,Mission: Impossible (1996),0.536973
8,1265,Groundhog Day (1993),0.532246
9,1270,Back to the Future (1985),0.528473


In [18]:
# 8) Evaluación: partición train/test y métricas RMSE/MAE
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt

# Hold out 20% of the ratings for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

def build_from_df(ratings_df):
    """Build the utility matrix and both cosine-similarity matrices.

    Args:
        ratings_df: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.

    Returns:
        A tuple ``(utility, item_similarity, user_similarity)``: the
        users x movies pivot of the ratings, the movie-to-movie similarity
        and the user-to-user similarity, the latter two labelled with the
        movie ids and user ids respectively.
    """
    utility = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')

    def _cosine_frame(matrix):
        # Similarity between the rows of `matrix`; missing ratings count as 0.
        labels = matrix.index.tolist()
        return pd.DataFrame(cosine_similarity(matrix.fillna(0)), index=labels, columns=labels)

    return utility, _cosine_frame(utility.T), _cosine_frame(utility)

# Matrices built from the training split only, so test ratings never feed the predictions.
train_rm, train_item_sim, train_user_sim = build_from_df(train)

def predict_item_based_df(user, item):
    """Item-based prediction computed from the training matrices.

    Averages the ratings ``user`` has in ``train_rm``, each weighted by the
    ``train_item_sim`` similarity between the rated movie and ``item``.

    Args:
        user: Row label of ``train_rm``.
        item: Column label (movieId) of ``train_rm``.

    Returns:
        The weighted average, or None when the user or the movie is not in
        ``train_rm``, the user has no training ratings, or the absolute
        similarities add up to zero.
    """
    if not (user in train_rm.index and item in train_rm.columns):
        return None

    own_ratings = train_rm.loc[user].dropna()
    if own_ratings.empty:
        return None

    weights = train_item_sim.loc[item, own_ratings.index]
    total_weight = weights.abs().sum()
    if total_weight == 0:
        return None
    return (weights * own_ratings).sum() / total_weight

def predict_user_based_df(user, item):
    """User-based prediction computed from the training matrices.

    Averages the ratings given to ``item`` in ``train_rm``, each weighted by
    the ``train_user_sim`` similarity between the rater and ``user``.

    Args:
        user: Row label of ``train_rm``.
        item: Column label (movieId) of ``train_rm``.

    Returns:
        The weighted average, or None when the movie or the user is not in
        ``train_rm``, nobody rated the movie in training, or the absolute
        similarities add up to zero.
    """
    if not (item in train_rm.columns and user in train_rm.index):
        return None

    raters = train_rm[item].dropna()
    if raters.empty:
        return None

    weights = train_user_sim.loc[user, raters.index]
    total_weight = weights.abs().sum()
    if total_weight == 0:
        return None
    return (weights * raters).sum() / total_weight

# Evaluate on the test set: collect (true rating, prediction) pairs for each
# method, skipping the rows for which the method cannot produce a prediction.
y_true_item, y_pred_item, y_true_user, y_pred_user = [], [], [], []
# itertuples keeps each column's own dtype (iterrows would upcast the integer
# ids to float, making every lookup depend on int/float hash equality) and
# avoids building one Series per row.
for u, m, r in test[['userId', 'movieId', 'rating']].itertuples(index=False, name=None):
    pi = predict_item_based_df(u, m)
    pu = predict_user_based_df(u, m)
    if pi is not None:
        y_true_item.append(r)
        y_pred_item.append(pi)
    if pu is not None:
        y_true_user.append(r)
        y_pred_user.append(pu)

def safe_metrics(y_true, y_pred):
    """Compute RMSE and MAE, tolerating an empty evaluation set.

    Args:
        y_true: Sequence of observed ratings.
        y_pred: Sequence of predicted ratings, aligned with ``y_true``.

    Returns:
        A tuple ``(rmse, mae)``, or ``(None, None)`` when ``y_true`` is empty.
    """
    if len(y_true) > 0:
        root_mse = sqrt(mean_squared_error(y_true, y_pred))
        abs_error = mean_absolute_error(y_true, y_pred)
        return root_mse, abs_error
    return None, None

# Each pair is (None, None) when the corresponding method produced no prediction on the test set.
rmse_item, mae_item = safe_metrics(y_true_item, y_pred_item)
rmse_user, mae_user = safe_metrics(y_true_user, y_pred_user)

# Report both error metrics per method.
print('Item-based on test -> RMSE:', rmse_item, ' MAE:', mae_item)
print('User-based on test -> RMSE:', rmse_user, ' MAE:', mae_user)

Item-based on test -> RMSE: 0.9071088653851868  MAE: 0.7025946227701853
User-based on test -> RMSE: 0.9608990107439946  MAE: 0.7429318423534959


In [19]:
# 9) Save results and final notes
# Gather the top-10 of both methods for every team user, one row per
# recommendation, tagged with the method that produced it.
all_recs = []
for uid in team_users:
    for method, recommender in (('item', recommend_item_based), ('user', recommend_user_based)):
        all_recs.extend(
            {
                'userId': uid,
                'movieId': r['movieId'],
                'title': r['title'],
                'predicted_rating': r['predicted_rating'],
                'method': method,
            }
            for r in recommender(uid, n=10)
        )

# Only write the file when there is something to save.
if all_recs:
    pd.DataFrame(all_recs).to_csv('recomendaciones_equipo.csv', index=False)
    print('Saved recomendaciones_equipo.csv')

print('\nNotebook listo para ejecutar en Colab. Ejecuta las celdas en orden y revisa las secciones de evaluación y conclusiones.')

Saved recomendaciones_equipo.csv

Notebook listo para ejecutar en Colab. Ejecuta las celdas en orden y revisa las secciones de evaluación y conclusiones.
