# **Recomendacion basada en popularidad**
---

In [1]:
import numpy as np
import pandas as pd

#Recomendacion por popularidad

df=pd.read_csv('../../Dataset_Apart/Cleaned/DatasetAirbnb_Cleaned_v1.csv')
# Definir las columnas de calificaciones
rating_columns = ['Limpieza', 'Veracidad', 'Llegada', 'Comunicacion', 'Ubicacion', 'Calidad']

# Llenar valores faltantes en las columnas de calificaciones con la media
df[rating_columns] = df[rating_columns].fillna(df[rating_columns].mean())

# Calcular el promedio de las calificaciones para cada apartamento
df['Promedio_Ratings'] = df[rating_columns].mean(axis=1)

# Definir la puntuación de popularidad como una combinación de evaluaciones y promedio de ratings
# Ajusta los pesos según la importancia que quieras dar a cada factor
peso_evaluaciones = 0.6
peso_ratings = 0.4

df['Puntuacion_Popularidad'] = peso_evaluaciones * df['Evaluaciones'] + peso_ratings * df['Promedio_Ratings']

def recomendador_por_popularidad(df, top_n=5):
    return df.sort_values(by='Puntuacion_Popularidad', ascending=False).head(top_n)

# Obtener los 3 apartamentos más populares según la nueva métrica
recomendaciones_populares = recomendador_por_popularidad(df, top_n=3)
print(recomendaciones_populares[['ID', 'Titulo', 'Evaluaciones', 'Promedio_Ratings', 'Puntuacion_Popularidad']])


                      ID                                  Titulo  \
79   6119377963866422626      Estudio Biarritz, estudio, 2 pers.   
185  3262728650081535420               Estrella ii 2d Spaniahome   
120  2130813439338571442  Nido acogedor entre la bahía y el mar.   

     Evaluaciones  Promedio_Ratings  Puntuacion_Popularidad  
79         6854.0          4.794622             4114.317849  
185        2076.0          4.794622             1247.517849  
120        1394.0          4.750000              838.300000  


# **Recomendacion por similaridad**
---

La idea esque el usuario haga una preseleccion de aquellos que le gustan y recomendarle los mas similares a estos. A continuacion se hace una prueba simplificada midiendo la similaridad por coseno.

In [4]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Seleccionar características para calcular la similitud
features = ['ID','Titulo','Descripcion Simple','URL','url_img']

# Escalar características
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[features])

# Calcular la similitud del coseno
similarity_matrix = cosine_similarity(scaled_features)

# Función para recomendar apartamentos similares
def recomendador_por_similitud(df, apartment_id, top_n=5):
    # Encontrar el índice del apartamento
    idx = df.index[df['ID'] == apartment_id][0]
    
    # Obtener la similitud de los apartamentos
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    
    # Ordenar los apartamentos por similitud
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    
    # Obtener los índices de los apartamentos más similares
    similar_apartments_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    
    return df.iloc[similar_apartments_indices]

# Obtener apartamentos similares a un apartamento específico
apartamento_id = 1200688382914319681  # ID del apartamento de ejemplo
recomendaciones_similares = recomendador_por_similitud(df, apartamento_id, top_n=4)
recomendaciones_similares


Unnamed: 0,ID,Titulo,Descripcion Simple,Evaluaciones,Tipo,Precio,URL,Limpieza,Veracidad,Llegada,...,Camas,Baños,Dormitorios,Baño Compartido,Wifi,Mascotas,Piscina,Parking,Promedio_Ratings,Puntuacion_Popularidad
67,-7545000407210804405,"Frente al mar con piscina, 3dorm. Parking, Fibra",Alojamiento entero: apto. residencial en Daimu...,53.0,A pie de playa,87.0,https://www.airbnb.es/rooms/31642583?adults=1&...,4.9,4.9,4.9,...,5.0,2.0,3.0,0,1.0,1.0,1.0,1.0,4.9,33.76
421,6895992565345467060,Magnifico Chalet en la Sierra de Madrid,"Habitación en Valdencina, España",90.0,En el campo,172.0,https://www.airbnb.es/rooms/14770870?adults=1&...,4.9,4.9,5.0,...,5.0,2.033012,3.0,0,1.0,1.0,1.0,1.0,4.916667,55.966667
769,-2912441540787268215,Casa de Madera II,"Alojamiento entero: cabaña en Bicorp, España",70.0,Cabañas,37.0,https://www.airbnb.es/rooms/31512931?adults=1&...,5.0,4.9,4.9,...,6.0,1.0,3.0,0,1.0,1.0,1.0,1.0,4.933333,43.973333
596,-7843512998668970045,La cabaña de Flo,Alojamiento entero: chalet en Saint-Christophe...,115.0,Cabañas,150.0,https://www.airbnb.es/rooms/47299709?adults=1&...,4.9,5.0,5.0,...,5.0,2.0,3.0,0,1.0,1.0,1.0,1.0,4.933333,70.973333


# **1. Filtrado Colaborativo Basado en Matrices de Descomposición**

Usaremos scikit-learn para implementar SVD y NMF.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import NMF, TruncatedSVD

# Cargar los datos
apartments = pd.read_csv('/Users/mariolamas/Desktop/Social-Network-Analysis/Dataset_Apart/Cleaned/DatasetAirbnb_Cleaned_v1.csv')
comments = pd.read_csv('/Users/mariolamas/Desktop/Social-Network-Analysis/Src/Data_clean/CommentDataset_cleaned_v2.csv')

comments.rename(columns={'apart_id':'URL'},inplace=True)
# Unir los datos por la columna URL
data = pd.merge(apartments, comments, on='URL')

# Crear matriz de usuario-apartamento
ratings_matrix = data.pivot_table(index='user_id', columns='URL', values='rating').fillna(0)

# Dividir los datos en entrenamiento y prueba
train_data, test_data = train_test_split(ratings_matrix, test_size=0.2, random_state=42)

# Filtrado colaborativo basado en SVD
svd = TruncatedSVD(n_components=50)
train_svd = svd.fit_transform(train_data)
test_svd = svd.transform(test_data)

# Reconstruir la matriz de predicción
predicted_ratings_svd = pd.DataFrame(svd.inverse_transform(train_svd), index=train_data.index, columns=train_data.columns)

# Evaluar el modelo
train_rmse = mean_squared_error(train_data, predicted_ratings_svd, squared=False)
print(f'SVD Train RMSE: {train_rmse}')

# Filtrado colaborativo basado en NMF
nmf = NMF(n_components=50)
train_nmf = nmf.fit_transform(train_data)
test_nmf = nmf.transform(test_data)

# Reconstruir la matriz de predicción
predicted_ratings_nmf = pd.DataFrame(nmf.inverse_transform(train_nmf), index=train_data.index, columns=train_data.columns)

# Evaluar el modelo
train_rmse_nmf = mean_squared_error(train_data, predicted_ratings_nmf, squared=False)
print(f'NMF Train RMSE: {train_rmse_nmf}')




SVD Train RMSE: 0.19896361174479182
NMF Train RMSE: 0.19185358279804438




# **Reglas de asociacion**
---

In [8]:
apartamentos = pd.read_csv('/Users/mariolamas/Desktop/Social-Network-Analysis/Dataset_Apart/Cleaned/DatasetAirbnb_Cleaned_v1.csv')
comentarios = pd.read_csv('/Users/mariolamas/Desktop/Social-Network-Analysis/Src/Data_clean/CommentDataset_cleaned_v2.csv',index_col=0)

In [11]:
from sklearn.preprocessing import LabelEncoder
item_encoder = LabelEncoder()
apartamentos['ID'] = item_encoder.fit_transform(apartamentos['ID'])

In [16]:

data_mege=pd.merge(apartamentos,comentarios,left_on='URL',right_on='apart_id')
# Crear la matriz usuario-item usando pivot_table
matriz_usuario_item = data_mege.pivot_table(index='user_id', columns='ID', values='rating', fill_value=0).reset_index()
matriz_usuario_item.drop('user_id',axis=1,inplace=True)
matriz_usuario_item

ID,0,1,3,6,12,14,16,17,18,21,...,763,764,766,769,772,773,774,780,782,784
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13810,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13812,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
matriz_usuario_item[matriz_usuario_item != 0] = 1

In [35]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

matriz_usuario_item[matriz_usuario_item != 0] = 1

# Aplicar algoritmo Apriori para encontrar itemsets frecuentes
frequent_itemsets = apriori(matriz_usuario_item,min_support=.001, use_colnames=True)

# Generar reglas de asociación
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=1.2)

# Ordenar reglas por lift en orden descendente
rules = rules.sort_values(by='lift', ascending=False)

# Mostrar las reglas más relevantes
print("Reglas de Asociación:")
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head()




Reglas de Asociación:


Unnamed: 0,antecedents,consequents,support,confidence,lift


In [37]:
frequent_itemsets.sort_values(by='support',ascending=False)

Unnamed: 0,support,itemsets
0,0.013320,(0)
169,0.012306,(554)
166,0.011582,(544)
87,0.011582,(277)
25,0.011510,(90)
...,...,...
153,0.001158,(492)
133,0.001086,(429)
29,0.001013,(106)
31,0.001013,(108)
