In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import gc
from tqdm import tqdm
from collections import defaultdict
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn, cat, mean

# Read Data

In [3]:
# read users pool
involved_users = pd.read_pickle("/content/drive/MyDrive/yelp/sample_users.pkl")

In [4]:
involved_users.rename(columns={"review_count": "review_count_x"}, inplace=True)

In [5]:
# read restaurants pool
restaurants = pd.read_json("/content/drive/MyDrive/SampleYelpData/sample_business.json")

In [21]:
def inference_process_business_data(source: pd.DataFrame):
  # setup an array for writing each row in the csv file
  rows = []
  # setup an array for headers we are not using strictly
  removed_header = ['name', 'address', 'latitude', 'longitude', 'is_open']
  # headers that can be directly used
  useful_header = ['business_id', 'city', 'state', 'postal_code', 'stars', 'review_count']
  # setup an array for headers we are adding
  business_data = source
  # append the initial keys as csv headers
  header = source.columns
  business_data = business_data.drop(columns=removed_header).reset_index(drop=True)
  orig_header = sorted(business_data.columns)

  days_of_week = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
  attributes = ["RestaurantsTakeOut", 'RestaurantsReservations',
            'RestaurantsDelivery', 'Alcohol','RestaurantsPriceRange2',
            'OutdoorSeating','RestaurantsGoodForGroups',
            'HasTV', 'Caters', 'GoodForKids', 'BusinessAcceptsCreditCards',
            'WiFi']

  print('processing data in the business dataset...')
  # for every entry in the business data array
  final_features = useful_header.copy()
  final_features.extend(days_of_week)
  final_features.extend(attributes)

  for entry in tqdm(range(0, len(business_data))):
    row = []
    for item in useful_header:
      row.append(business_data.loc[entry, item])

    # iterate through the days of the week to extract the open and close times
    for time in days_of_week:
      flag = 0
      # if a time is available
      if business_data.loc[entry, 'hours'] is not None:
        if time in business_data.loc[entry, 'hours'].keys():
          # append the open time
          if "-" in business_data.loc[entry, 'hours'][time]:
            open_time, close_time = business_data.loc[entry, 'hours'][time].split('-')
            if open_time != close_time:
              flag = 1
              row.append(1)
      if flag == 0:
        row.append(0)

    # for each attribute that is not nested
    for attribute in attributes:
      # if there is an attribute
      if business_data.loc[entry, 'attributes'] is not None:
        if attribute in business_data.loc[entry, 'attributes'].keys():
          # if the attribute contains true
          if business_data.loc[entry, 'attributes'][attribute] == "none":
            row.append(np.nan)
          else:
            row.append(business_data.loc[entry, 'attributes'][attribute])
        else:
          # append NA for the attribute
          row.append(np.nan)
      else:
          row.append(np.nan)

    # remove stray text, such as "\n" form address
    # set up an array for the cleaned row entries
    row_clean = []
    # for every item in the row
    for item in row:
      # scan and replace for nasty text
      row_clean.append(str(item).replace('\n', ' '))
    # after all fields have been extracted and cleaned, append the row to the rows array for writing to csv
    rows.append(row_clean)

  new_df = pd.DataFrame(rows, columns=final_features)
  new_df.replace({"none": np.nan}, inplace=True)

  new_df.drop(columns = ["state", "postal_code"], inplace=True)
  new_df.rename(columns={"review_count": "review_count_y",
                         "stars": "business_stars"}, inplace=True)
  new_df["review_count_y"] = new_df["review_count_y"].astype(int)
  new_df["business_stars"] = new_df["business_stars"].astype(float)


  return new_df

In [22]:
parsed_restaurants = inference_process_business_data(restaurants)

processing data in the business dataset...


100%|██████████| 28028/28028 [00:16<00:00, 1674.94it/s]


In [63]:
class WideAndDeep(nn.Module):
    def __init__(
        self,
        embedding_table_shapes, # embeddings for sparse features
        wide_size,
        emb_size, # length of concat embeddings
        dense_feature_size, # length of dense features
        dropout=0.2, # dropout for embeddings
    ):
        super(WideAndDeep, self).__init__()
        self.initial_cat_layer = ConcatenatedEmbeddings(
            embedding_table_shapes, dropout=dropout
        )
        self.wide_linear_relu = nn.Sequential(
            nn.Linear(wide_size, 1),
        )
        self.deep_linear_relu_stack = nn.Sequential(
            nn.Linear(emb_size + dense_feature_size, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
        )

    def forward(self, wide_features, sparse_features, dense_features):
        wide_res = self.wide_linear_relu(wide_features)
        # get embeddings for sparse features
        concat_emb = self.initial_cat_layer(sparse_features)
        deep_input = torch.cat((concat_emb, dense_features), dim=1)
        deep_res = self.deep_linear_relu_stack(deep_input)
        total_res = wide_res + deep_res
        return total_res

In [54]:
dense_features = ["fans", "average_stars", "starting_year", "friends_num", "useful",
                  "funny", "cool", "elite_times", "business_stars"]

In [55]:
wide_features = ["review_count_x", "review_count_y", 'compliment_hot',
       'compliment_more', 'compliment_profile',
       'compliment_cute', 'compliment_list', 'compliment_note',
       'compliment_plain', 'compliment_cool', 'compliment_funny',
       'compliment_writer', 'compliment_photos', 'starting_year']

In [56]:
sparse_features = [
    "city",
    "RestaurantsTakeOut",
    'RestaurantsReservations',
    'RestaurantsDelivery',
    'Alcohol',
    'RestaurantsPriceRange2',
    'OutdoorSeating',
    'RestaurantsGoodForGroups',
    'HasTV',
    'Caters',
    'GoodForKids',
    'BusinessAcceptsCreditCards',
    'WiFi',
    'Sunday',
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday'
    ]

In [13]:
def build_sparse_features_idx_mapping(sparse_features, concat_df):
  sparce_features_to_idx = defaultdict(dict)
  idx_to_sparce_features = defaultdict(dict)
  for f in sparse_features:
    feature = list(concat_df[f].unique())
    feature_nums = len(feature)
    for i in range(feature_nums):
      sparce_features_to_idx[f][feature[i]] = i
      idx_to_sparce_features[f][i] = [feature[i]]
  return sparce_features_to_idx, idx_to_sparce_features

In [23]:
sparce_features_to_idx, idx_to_sparce_features = build_sparse_features_idx_mapping(sparse_features, parsed_restaurants)

In [15]:
def encode_concat_df(concat_df, sparce_features_to_idx, sparse_features):
  encoded_concat_df = concat_df.copy()
  for f in sparse_features:
    encoded_concat_df[f] = encoded_concat_df[f].map(sparce_features_to_idx[f])
  return encoded_concat_df

In [64]:
def get_embedding_shape(sparse_features, concat_df):
  embedding_table_shapes = {}
  for f in sparse_features:
    if f != "city":
      embedding_table_shapes[f] = (len(concat_df[f].unique()), 16)
    else:
      embedding_table_shapes[f] = (len(concat_df[f].unique()), 128)
  return embedding_table_shapes

In [66]:
embedding_table_shapes = get_embedding_shape(sparse_features, parsed_restaurants)

In [24]:
encoded_restaurants = encode_concat_df(parsed_restaurants, sparce_features_to_idx, sparse_features)

In [26]:
encoded_restaurants.to_pickle("/content/drive/MyDrive/yelp/encoded_restaurants.pkl")

In [None]:
!pip install nvtabular
from nvtabular.framework_utils.torch.layers import ConcatenatedEmbeddings

In [None]:
embedding_table_shapes = json.load(open("/content/drive/MyDrive/SampleYelpData/embedding_table_shapes.json"))

In [None]:
model = WideAndDeep(embedding_table_shapes, 14, 432, 9)
model.load_state_dict(torch.load("/content/drive/MyDrive/yelp/TrainedModel128/model_27.pt"))

In [122]:
encoded_restaurants = pd.read_pickle("/content/drive/MyDrive/yelp/encoded_restaurants.pkl")

def recommend_topK(user_id, encoded_restaurants, K=10):
  assert user_id in involved_users['user_id'].values
  user_data = involved_users[involved_users['user_id'] == user_id].drop(columns=['user_id'])
  assert user_data.isna().sum().sum() == 0
  assert encoded_restaurants.isna().sum().sum() == 0
  encoded_restaurants_ids = encoded_restaurants['business_id']
  encoded_business_mat = encoded_restaurants.drop(columns=['business_id'])
  user_and_business_mat = pd.concat([user_data, encoded_business_mat], axis=1)
  user_and_business_mat = user_and_business_mat.fillna(method="ffill")
  wide_features_val, sparse_features_val, dense_features_val = user_and_business_mat[wide_features].values, user_and_business_mat[sparse_features].values, user_and_business_mat[dense_features].values
  t_wide_features_val, t_sparse_features_val, t_dense_features_val = torch.tensor(wide_features_val).float(), torch.tensor(sparse_features_val).long(), torch.tensor(dense_features_val).float()
  with torch.no_grad():
    pred = model(t_wide_features_val, t_sparse_features_val, t_dense_features_val)
    top10_idx = torch.argsort(torch.squeeze(pred), descending=True)[:10]
    top10_idx = top10_idx.numpy()
    top10_business_ids = encoded_restaurants_ids.iloc[top10_idx]
    print(f"recommend 10 restaurants for user {user_id}")
    for i in range(len(top10_idx)):
      print(f"restaurant id is {top10_business_ids.values[i]}, predicted score is {torch.squeeze(pred)[top10_idx[i]]}")
  # user_and_business_mat.fillna(0, inplace=True)
  # user_and_business_mat.drop(columns=['business_id'], inplace=True
  # print(user_data)

In [123]:
recommend_topK("LwZJFLGxQwjjeOgpqTJnfw", encoded_restaurants, K=10)

recommend 10 restaurants for user LwZJFLGxQwjjeOgpqTJnfw
restaurant id is _ab50qdWOk0DdB6XOrBitw, predicted score is 6.0654120445251465
restaurant id is ac1AeYqs8Z4_e2X5M3if2A, predicted score is 6.036512851715088
restaurant id is GXFMD0Z4jEVZBCsbPf4CTQ, predicted score is 5.762081623077393
restaurant id is ytynqOUb3hjKeJfRj5Tshw, predicted score is 5.6587114334106445
restaurant id is oBNrLz4EDhiscSlbOl8uAw, predicted score is 5.459670066833496
restaurant id is VQcCL9PiNL_wkGf-uF3fjg, predicted score is 5.404573440551758
restaurant id is _C7QiQQc47AOEv4PE3Kong, predicted score is 5.391777992248535
restaurant id is I_3LMZ_1m2mzR0oLIOePIg, predicted score is 5.301872253417969
restaurant id is GBTPC53ZrG1ZBY3DT8Mbcw, predicted score is 5.269805908203125
restaurant id is gTC8IQ_i8zXytWSly3Ttvg, predicted score is 5.2371745109558105


In [125]:
recommend_topK("KVbMZV-XJPSH9wXEwuXIaA", encoded_restaurants, K=10)

recommend 10 restaurants for user KVbMZV-XJPSH9wXEwuXIaA
restaurant id is _ab50qdWOk0DdB6XOrBitw, predicted score is 5.976140022277832
restaurant id is ac1AeYqs8Z4_e2X5M3if2A, predicted score is 5.803963661193848
restaurant id is ytynqOUb3hjKeJfRj5Tshw, predicted score is 5.560360908508301
restaurant id is GXFMD0Z4jEVZBCsbPf4CTQ, predicted score is 5.216649055480957
restaurant id is oBNrLz4EDhiscSlbOl8uAw, predicted score is 5.112929344177246
restaurant id is GBTPC53ZrG1ZBY3DT8Mbcw, predicted score is 5.076563358306885
restaurant id is VQcCL9PiNL_wkGf-uF3fjg, predicted score is 5.0743489265441895
restaurant id is I_3LMZ_1m2mzR0oLIOePIg, predicted score is 4.960506916046143
restaurant id is _C7QiQQc47AOEv4PE3Kong, predicted score is 4.957914352416992
restaurant id is iSRTaT9WngzB8JJ2YKJUig, predicted score is 4.947678089141846


In [127]:
recommend_topK("BgZwJBhVWKq1Urs4rKBdiA", encoded_restaurants, K=10)

recommend 10 restaurants for user BgZwJBhVWKq1Urs4rKBdiA
restaurant id is _ab50qdWOk0DdB6XOrBitw, predicted score is 6.031673431396484
restaurant id is ac1AeYqs8Z4_e2X5M3if2A, predicted score is 5.9640069007873535
restaurant id is GXFMD0Z4jEVZBCsbPf4CTQ, predicted score is 5.585503101348877
restaurant id is ytynqOUb3hjKeJfRj5Tshw, predicted score is 5.518928527832031
restaurant id is oBNrLz4EDhiscSlbOl8uAw, predicted score is 5.36427640914917
restaurant id is iSRTaT9WngzB8JJ2YKJUig, predicted score is 5.334163665771484
restaurant id is VQcCL9PiNL_wkGf-uF3fjg, predicted score is 5.325367450714111
restaurant id is _C7QiQQc47AOEv4PE3Kong, predicted score is 5.264368534088135
restaurant id is GBTPC53ZrG1ZBY3DT8Mbcw, predicted score is 5.2140583992004395
restaurant id is 6a4gLLFSgr-Q6CZXDLzBGQ, predicted score is 5.155121803283691


In [92]:
# import json
# with open("/content/drive/MyDrive/SampleYelpData/embedding_table_shapes.json", "w") as outfile:
#     json.dump(embedding_table_shapes, outfile)

In [85]:
model = WideAndDeep(embedding_table_shapes, 14, 432, 9)
model.load_state_dict(torch.load("/content/drive/MyDrive/yelp/TrainedModel128/model_27.pt"))

<All keys matched successfully>