In [36]:
import pandas as pd
from surprise import Dataset, Reader, KNNBasic
import pickle
import gzip
import joblib

In [11]:
# Extraccion de archivos
df_rest = pd.read_parquet('../Data Engineering/Unification/Final Unifications/df_restaurants.parquet')
df_reviews = pd.read_parquet('../Data Engineering/Unification/Final Unifications/df_reviews.parquet')
df = pd.merge(df_rest, df_reviews, on='business_id', how='inner')

df = df[(df['avg_rating']>3.5)&(df['review_count']>50)]

unique_businesses = df['business_id'].unique()

In [24]:
# generacion del modelo y matriz de relaciones
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)

trainset = data.build_full_trainset()
knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}) 
knn.fit(trainset) # Modelo necesario (dump model)

inner_to_raw = {}
for raw_id, inner_id in trainset._raw2inner_id_items.items():
    inner_to_raw[inner_id] = raw_id

raw_to_inner = {v: k for k, v in inner_to_raw.items()}

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [26]:
# Funcion de recomendacion
def get_similar_businesses(business_id,cluster):
    # eleccion del top 5
    business_inner_id = raw_to_inner[business_id]
    business_similarities = knn.get_neighbors(business_inner_id, k=1000)
    
    # toma de datos de los ids seleccionados
    similar_businesses_ids = [inner_to_raw[inner_id] for inner_id in business_similarities]
    similar_businesses = df_rest[df_rest['business_id'].isin(similar_businesses_ids) & (df_rest['cluster'] == cluster)].head(5)
    similar_businesses.drop(columns=['%_competition','longitude','latitude','cluster','cluster_rating','cluster_name','review_count'],inplace=True)

    return similar_businesses

In [33]:
#Consulta
business_id = '0x88c2fd4b6db6ca95:0x5b414c5c84a4c5e0'  # business_id deseado
cluster = df_rest[df_rest['business_id'] == business_id]['cluster'].values[0]
top_recommendations = get_similar_businesses(business_id,cluster)
top_recommendations

Unnamed: 0,business_id,business_name,category,avg_rating,address,state,city,postal_code,county
844,wMti5VLmoqnEce1VQILUNg,Eclipse Brewing,Family,5.0,25 E Park Ave,Pennsylvania,Merchantville,8109,Camden County
932,7T1VAYHIS3cgsDj3iS6JWQ,The Frosted Fox Cake Shop,Family,5.0,6511 Germantown Ave,Pennsylvania,Philadelphia,19119,Philadelphia County
2555,vHqJerp8iZN1Lje3dLa4Eg,Uptown Eats,Family,5.0,"689 Dr Mlk Jr St N, Ste D",Pennsylvania,St. Petersburg,33701,Pinellas County
2714,cn9SECA9LNC01GRXCi3wsg,Sugar Sweet Bakery,Family,5.0,18 S Eastbourne Ave,California,Tucson,85716,Pima County
3979,Pg2ZKh-Ss7CCpaF8MwNWYw,Antonio's Deli,Family,5.0,1014 Federal St,Pennsylvania,Philadelphia,19147,Philadelphia County


In [34]:
df_rest[df_rest['business_id'] == business_id].drop(columns=['%_competition','longitude','latitude','cluster','cluster_rating','cluster_name','review_count'])

Unnamed: 0,business_id,business_name,category,avg_rating,address,state,city,postal_code,county
50683,0x88c2fd4b6db6ca95:0x5b414c5c84a4c5e0,"Grillin N Chillin St. Pete, Inc.",Family,4.9,"Grillin N Chillin St. Pete, Inc., 6708 Gulf Bl...",Florida,St Pete Beach,33706,Pinellas County


In [40]:
# Guardar el modelo sin compresión
joblib.dump(knn, 'modelo_knn.pkl')

# Comprimir el archivo utilizando gzip
with open('modelo_knn.pkl', 'rb') as f_in:
    with gzip.open('modelo_knn.gz', 'wb') as f_out:
        f_out.writelines(f_in)

In [37]:
with open('raw_to_inner.pkl', 'wb') as f:
    pickle.dump(raw_to_inner, f)

with open('inner_to_raw.pkl', 'wb') as f:
    pickle.dump(inner_to_raw, f)

In [41]:
business_id = 'wMti5VLmoqnEce1VQILUNg'

business_inner_id = raw_to_inner[business_id]
business_similarities = knn.get_neighbors(business_inner_id, k=5)
similar_businesses_ids = [inner_to_raw[inner_id] for inner_id in business_similarities]

similar_businesses_ids

['2dlQX5sP9X6Dlm1MmNOlSw',
 'SlLfWzeYOrVBxCd-0QILZQ',
 'pUVRCYL8rT4I5Ry5FYkRsA',
 '9Y5JPV0TFZpJXFokFxwewQ',
 'jULIPydhMj18KOI5OHTwfA']