In [1]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
import pickle
import gzip
import joblib

In [2]:
columnas = ['user_id', 'business_id', 'rating']

# Extraccion de archivos
df = pd.read_parquet('../Data Engineering/Unification/df_unified.parquet',columns=columnas)
df_rest = pd.read_parquet('../Data Engineering/Unification/df_restaurants.parquet')
#df = df[(df['avg_rating']>3.5)&(df['review_count']>50)]

In [3]:
# generacion del modelo y matriz de relaciones
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)

trainset = data.build_full_trainset()
knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}) 
knn.fit(trainset) # Modelo necesario (dump model)

inner_to_raw = {}
for raw_id, inner_id in trainset._raw2inner_id_items.items():
    inner_to_raw[inner_id] = raw_id

raw_to_inner = {v: k for k, v in inner_to_raw.items()}

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [5]:
# Funcion de recomendacion
def get_similar_businesses(business_id,cluster):
    # eleccion del top 5
    business_inner_id = raw_to_inner[business_id]
    business_similarities = knn.get_neighbors(business_inner_id, k=1000)
    
    # toma de datos de los ids seleccionados
    similar_businesses_ids = [inner_to_raw[inner_id] for inner_id in business_similarities]
    similar_businesses = df_rest[df_rest['business_id'].isin(similar_businesses_ids) & (df_rest['cluster'] == cluster)].head(5)
    similar_businesses.drop(columns=['%_competition','longitude','latitude','cluster','cluster_rating','cluster_name','review_count'],inplace=True)

    return similar_businesses

In [6]:
#Consulta
business_id = '0x88c2fd4b6db6ca95:0x5b414c5c84a4c5e0'  # business_id deseado
cluster = df_rest[df_rest['business_id'] == business_id]['cluster'].values[0]
top_recommendations = get_similar_businesses(business_id,cluster)
top_recommendations

Unnamed: 0,business_id,business_name,category,avg_rating,address,state,city,postal_code,county
283,Y_XyeCu8AkZyhcF0q5n8Dg,Pitas Republic,No Detail,5.0,13145 US Hwy 301 S,Pennsylvania,Riverview,33578,Hillsborough County
438,siwG4ZM7RjUDO52DI84m3w,Ray's Vegan Soul,Family,5.0,"341, 5th Street South, Downtown, Saint Petersb...",Pennsylvania,St. Petersburg,33701,Pinellas County
844,wMti5VLmoqnEce1VQILUNg,Eclipse Brewing,Family,5.0,25 E Park Ave,Pennsylvania,Merchantville,8109,Camden County
931,auH0BAgcZwxuQFKmvwy5PQ,Chef Tony Macaroni,Family,5.0,6918 N Florida Ave,Pennsylvania,Tampa,33604,Hillsborough County
932,7T1VAYHIS3cgsDj3iS6JWQ,The Frosted Fox Cake Shop,Family,5.0,6511 Germantown Ave,Pennsylvania,Philadelphia,19119,Philadelphia County


In [9]:
results = {'business_id':[],'related_ids':[]}
count = 0
businesses =df.business_id.unique()

for business_id in businesses:
    k = 5
    similar_businesses_ids = []
    cluster = df_rest[df_rest['business_id'] == business_id].cluster.values[0]
    lista = df_rest[df_rest['cluster']==cluster].business_id.tolist()
    while len(similar_businesses_ids) < 5:
        business_inner_id = raw_to_inner[business_id]
        business_similarities = knn.get_neighbors(business_inner_id, k=k)
        similar_businesses_ids = [inner_to_raw[inner_id] for inner_id in business_similarities]
        similar_businesses_ids = [j for j in similar_businesses_ids if j in lista]
        k +=10
    count+=1
    print(f'\r{count}, %: {count/33125*100}%',end='',flush=True)
    
    results['business_id'].append(business_id)
    results['related_ids'].append(similar_businesses_ids)


26272, %: 79.31169811320756%%

In [93]:
df_results = pd.DataFrame(results)
df_results.head()

Unnamed: 0,business_id,related_ids
0,MTSW4McQd7CbVtyjqoe9mw,"[5QefK7gbedeBiqsYJW54Bw, vCHNWdW-ys-nWUx3Cpvk8..."
1,0bPLkL0QhhPO5kt1_EXmNQ,"[kVjnI2z6bXKJU7JVCJe72A, TIoOs_emkTIjyTq7Lt_uH..."
2,kfNv-JZpuN6TVNSO6hHdkw,"[MTSW4McQd7CbVtyjqoe9mw, 2dlQX5sP9X6Dlm1MmNOlS..."


In [97]:
searched_id = '5QefK7gbedeBiqsYJW54Bw'
related = df_results[df_results['business_id'] == searched_id].related_ids.tolist()[0]
df_rest[df_rest['business_id'].isin(related)].drop(columns=['%_competition','longitude','latitude','cluster_rating','cluster_name','review_count'])

IndexError: list index out of range

In [None]:
df_results.to_parquet('df_results.parquet')