# Modelo LightFM

In [1]:
pip install pickle

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle


In [2]:
import pandas as pd

In [3]:
df_reviews= pd.read_parquet('reviews.parquet', engine='pyarrow')

In [4]:
df_business= pd.read_parquet('business.parquet', engine='pyarrow')

In [5]:
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23734 entries, 0 to 23733
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_id    23734 non-null  object        
 1   user_id      23734 non-null  object        
 2   business_id  23734 non-null  object        
 3   stars        23734 non-null  int64         
 4   useful       23734 non-null  int64         
 5   funny        23734 non-null  int64         
 6   cool         23734 non-null  int64         
 7   text         23734 non-null  object        
 8   date         23734 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 1.6+ MB


In [6]:
df_reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,Ltl-K9qNkYZfceRtu6Vr4A,nkN_do3fJ9xekchVC-v68A,8A5LSwsKK5vXDH1M3_Tf1w,5,6,1,6,Bacara is amazing and makes for an indulgent r...,2005-03-29 19:29:36
1,9qnI6MbB82Fy_67eYBAoMg,nkN_do3fJ9xekchVC-v68A,U3grYFIeu6RgAAQgdriHww,5,9,2,3,"Not the easiest place to find, but just what I...",2005-03-29 19:42:12
2,d4-VjTAvvmT1GVV40hR1rQ,nkN_do3fJ9xekchVC-v68A,UjOq8dBVNv9weHrVDqm3kQ,1,4,0,0,Times like these I wish I could give negative ...,2005-03-29 19:53:58
3,v3YVV8C6S46Yq5mHSLvt8g,nkN_do3fJ9xekchVC-v68A,CoZ2mpsMBP8HUG1ymKoZTg,3,2,0,1,Started with ceviche which was fresh and excel...,2005-03-31 01:45:05
4,h-ACmkD5yYpD_AW5kRs9zA,Bf87HcPERF9yiSjb2tQBqw,ld_H5-FpZOWm_tkzwkPYQQ,5,0,1,6,"Three pulls, a total of $1.25 wagered, payout ...",2005-05-15 09:58:17


In [7]:
# Suponiendo que `reviews` es tu dataframe de Yelp
data = df_reviews.groupby("user_id").apply(
    lambda x: dict(zip(x["business_id"], x["stars"]))
).to_dict()


In [8]:
df_reviews['user_id'].unique()

array(['nkN_do3fJ9xekchVC-v68A', 'Bf87HcPERF9yiSjb2tQBqw',
       'q_QQ5kBBwlCcbL1s4NVK3g', ..., 'KLoEkHgC5wrORHdPpPq70Q',
       'SRqtqW96nDIxtDM7qc1YGA', 'ooHF-vRzlkyN9UEVMKocAQ'], dtype=object)

In [10]:
import math

# Funciones de similitud
def euclidean_similarity(person1, person2, data):
    common_ranked_items = [itm for itm in data[person1] if itm in data[person2]]
    rankings = [(data[person1][itm], data[person2][itm]) for itm in common_ranked_items]
    distance = [pow(rank[0] - rank[1], 2) for rank in rankings]
    return 1 / (1 + sum(distance))

def pearson_similarity(person1, person2, data):
    common_ranked_items = [itm for itm in data[person1] if itm in data[person2]]
    n = len(common_ranked_items)
    s1 = sum([data[person1][item] for item in common_ranked_items])
    s2 = sum([data[person2][item] for item in common_ranked_items])
    ss1 = sum([pow(data[person1][item], 2) for item in common_ranked_items])
    ss2 = sum([pow(data[person2][item], 2) for item in common_ranked_items])
    ps = sum([data[person1][item] * data[person2][item] for item in common_ranked_items])
    num = n * ps - (s1 * s2)
    den = math.sqrt((n * ss1 - math.pow(s1, 2)) * (n * ss2 - math.pow(s2, 2)))
    return (num / den) if den != 0 else 0

In [11]:
def normalize_rating(rating, min_rating=1, max_rating=5):
    """
    Normaliza el rating a un rango entre min_rating y max_rating.
    """
    min_possible_rating = 0  # El rating mínimo que podría dar la similitud (en el caso de que uses un puntaje de 0)
    max_possible_rating = 10  # El rating máximo que podrías obtener (ajusta según tus necesidades)

    # Normalización Min-Max
    normalized_rating = min_rating + (rating - min_possible_rating) * (max_rating - min_rating) / (max_possible_rating - min_possible_rating)
    return max(min_rating, min(max_rating, normalized_rating))  # Asegurarse de que esté en el rango [min_rating, max_rating]

In [12]:
def recommend(person, bound, data, df_business):
    # Calculamos la similitud con todos los demás usuarios
    scores = [(pearson_similarity(person, other, data), other) for other in data if other != person]

    # Ordenamos los puntajes en orden descendente (de mayor a menor similitud)
    scores.sort(reverse=True, key=lambda x: x[0])

    # Crear un diccionario de negocios recomendados
    recs = {}
    for sim, other in scores:
        ranked = data[other]  # Obtén los negocios recomendados
        for itm in ranked:
            if itm not in data[person]:  # Solo recomendar negocios no evaluados por la persona
                # Calcular el peso de la recomendación
                weight = sim * ranked[itm]
                if itm in recs:
                    recs[itm] += weight  # Acumular el puntaje
                else:
                    recs[itm] = weight

    # Ordenar los negocios recomendados por el puntaje (de mayor a menor)
    recs_sorted = sorted(recs.items(), key=lambda x: x[1], reverse=True)

    # Filtrar los negocios que están en df_business
    filtered_business_ids = [b_id for b_id, _ in recs_sorted if b_id in df_business['business_id'].values]

    # Asegurarse de que el número de negocios recomendados no exceda el 'bound'
    filtered_business_ids = filtered_business_ids[:bound]

    # Crear una lista para almacenar los datos recomendados
    recommended_business_data = []

    # Iterar sobre los business_id recomendados
    for business_id in filtered_business_ids:
        # Obtener el nombre, dirección y ciudad desde df_business
        business_info = df_business[df_business['business_id'] == business_id].iloc[0]
        
        # Obtener la recomendación (rating) desde recs
        rating = recs.get(business_id, 0)
        
        # Normalizar el rating al rango de 1 a 5
        normalized_rating = normalize_rating(rating)
        
        # Añadir la información a la lista
        recommended_business_data.append({
            'business_id': business_id,
            'name': business_info['name'],
            'address': business_info['address'],
            'city': business_info['city'],
            'rating': normalized_rating
        })

    # Convertir la lista en un DataFrame
    recommended_business_info = pd.DataFrame(recommended_business_data)

    return recommended_business_info


In [13]:
# Ejemplo de uso
person = 'Bf87HcPERF9yiSjb2tQBqw'
bound = 5
recs = recommend(person, bound, data, df_business)
print(recs)


              business_id               name               address  \
0  6a4gLLFSgr-Q6CZXDLzBGQ             Cochon  930 Tchoupitoulas St   
1  hRskO0RDRW3Cq3k7_Kggcg  Le Pavillon Hotel        833 Poydras St   
2  _ab50qdWOk0DdB6XOrBitw  Acme Oyster House      724 Iberville St   
3  yf8a1DOlqoqlKHuLSKEfxg    Dante's Kitchen          736 Dante St   
4  MI7cUsPiQGYQRuQmn3MbtA       Boca - Tampa        901 W Platt St   

          city  rating  
0  New Orleans     4.6  
1  New Orleans     3.0  
2  New Orleans     3.0  
3  New Orleans     3.0  
4        Tampa     3.0  


In [15]:
import pickle

In [16]:
with open('recommend_model.pkl', 'wb') as f:
    pickle.dump({
        'recommend': recommend,
        'normalize_rating': normalize_rating,
        'df_business': df_business,
        'data': data
    }, f)

In [None]:
# # Cargar el modelo desde el archivo pickle
# with open('recommend_model.pkl', 'rb') as f:
#     model = pickle.load(f)

# # Acceder a las funciones y datos
# recommend = model['recommend']
# normalize_rating = model['normalize_rating']
# df_business = model['df_business']
# data = model['data']

# # Usar el modelo cargado
# # Ejemplo: Hacer recomendaciones
# user = 'user1'
# bound = 3
# recommendations = recommend(user, bound, data, df_business)


In [17]:
import math

def calculate_mse(person, data, df_business):
    """
    Calcula el Error Medio Cuadrático (MSE) para las predicciones de un usuario.
    
    :param person: ID del usuario para el que se evaluará el modelo.
    :param data: Diccionario con las calificaciones de usuarios para negocios.
    :param df_business: DataFrame con información de los negocios.
    :return: Error Medio Cuadrático (MSE).
    """
    # Obtenemos las predicciones para los negocios que el usuario ha calificado
    predictions = {}
    for other in data:
        if other != person:
            sim = pearson_similarity(person, other, data)
            for business in data[other]:
                if business not in predictions:
                    predictions[business] = 0
                predictions[business] += sim * data[other][business]
    
    # Normalizar predicciones (opcional, dependiendo de tus datos)
    normalized_predictions = {
        k: normalize_rating(v) for k, v in predictions.items()
    }
    
    # Calcular el MSE solo para negocios que el usuario calificó
    mse_sum = 0
    n = 0  # Contador de negocios evaluados
    for business, real_rating in data[person].items():
        if business in normalized_predictions:
            predicted_rating = normalized_predictions[business]
            mse_sum += (real_rating - predicted_rating) ** 2
            n += 1
    
    return mse_sum / n if n > 0 else None


In [18]:
# Ejemplo de uso
person = 'Bf87HcPERF9yiSjb2tQBqw'
calculate_mse(person, data, df_business)


9.5