In [None]:
# Show the current working directory. The explicit %-form does not depend on
# automagic, so it keeps working if a variable named `pwd` is ever defined.
%pwd

'/content'

In [None]:
# Mount Google Drive at /content/drive and switch into the notebook folder,
# so that the relative dataset paths used by the loading cells resolve there.
from google.colab import drive 
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [None]:
# Install the dask extras used by this notebook (dask.array, dask.bag, dask.dataframe)
# in a single resolver run. %pip installs into the environment of the running kernel.
# TODO: pin a dask version here so the notebook stays reproducible.
%pip install "dask[array,bag,dataframe]"

Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 5.4 MB/s 
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: locket, partd, fsspec
Successfully installed fsspec-2021.11.0 locket-0.2.1 partd-1.2.0


In [None]:
import json

import dask.array as da
import dask.bag as db
import dask.dataframe as dd # substitute of pandas to load the data in streams
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

In [None]:
# Reviews: read the file as text in blocks of 5e6 bytes and parse every line as
# JSON lazily, so the whole dataset never has to be held in memory at once.
review_columns = ['user_id', 'business_id', 'stars', 'useful', 'date']
dict_bag = db.read_text(
    'yelp_academic_dataset_review.json',
    blocksize=int(5e6),
).map(json.loads)
# Keep only the listed fields and collapse the many small blocks into 10 partitions.
df_r = dict_bag.to_dataframe(columns=review_columns).repartition(npartitions=10)
df_r.head()

Unnamed: 0,user_id,business_id,stars,useful,date
0,ak0TdVmGKo4pwqdJSTLwWw,buF9druCkbuXLX526sGELQ,4.0,3,2014-10-11 03:34:02
1,YoVfDbnISlW0f7abNQACIg,RA4V8pr014UyUbDvI-LW2A,4.0,1,2015-07-03 20:38:25
2,eC5evKn1TWDyHCyQAwguUw,_sS2LBIGNT5NQb6PD1Vtjw,5.0,0,2013-05-28 20:38:06
3,SFQ1jcnGguO0LYWnbbftAA,0AzLzHfOJgL7ROwhdww2ew,2.0,1,2010-01-08 02:29:15
4,0kA0PAJ8QFMeveQWHFqz2A,8zehGz9jnxPqXtOc7KaJxA,4.0,0,2011-07-28 18:05:01


In [None]:
# Businesses: same lazy, block-wise JSON parsing as for the reviews.
business_columns = ['business_id', 'city', 'stars', 'review_count']
dict_bag = db.read_text(
    'yelp_academic_dataset_business.json',
    blocksize=int(5e6),
).map(json.loads)
# Keep only the listed fields and collapse the blocks into 10 partitions.
df_b = dict_bag.to_dataframe(columns=business_columns).repartition(npartitions=10)
df_b.head()

Unnamed: 0,business_id,city,stars,review_count
0,6iYb2HFDywm3zjuRg0shjw,Boulder,4.0,86
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,4.0,126
2,bvN78flM8NLprQ1a1y5dRg,Portland,4.5,13
3,oaepsyvc0J17qwi8cfrOWg,Orange City,3.0,8
4,PE9uqAjdw0E4-8mjGl3wVA,Atlanta,4.0,14


In [None]:
# Users: loaded block-wise as well, parsing one JSON document per line.
user_columns = ['user_id', 'friends', 'review_count']
dict_bag = db.read_text(
    'yelp_academic_dataset_user.json',
    blocksize=int(5e6),
).map(json.loads)
# Keep only the listed fields and collapse the blocks into 10 partitions.
df_u = dict_bag.to_dataframe(columns=user_columns).repartition(npartitions=10)
df_u.head()

Unnamed: 0,user_id,friends,review_count
0,q_QQ5kBBwlCcbL1s4NVK3g,"xBDpTUbai0DXrvxCe3X16Q, 7GPNBO496aecrjJfW6UWtg...",1220
1,dIIKEfOgo0KqUfGQvGikPg,"XPzYf9_mwG2eXYP2BAGSTA, 2LooM5dcIk2o01nftYdPIg...",2136
2,D6ErcUnFALnCQN4b1W_TlA,"GfB6sC4NJQvSI2ewbQrDNA, jhZtzZNNZJOU2YSZ6jPlXQ...",119
3,JnPIjvC0cmooNDfsa9BmXg,"HQZPQhKMwRAyS6BCselVWQ, kP2U1s_sjQfHO9grxiyDTA...",987
4,37Hc8hr3cw0iHLoPzLK6Ow,"-Q88pZUcrfN0BLBDp-bkAQ, etPn4Pv1Gc4cRZjRgB_BOw...",495


In [None]:
# Recommender system built on the star ratings of the reviews.
# We approach it with low-rank approximation techniques.
# We have 3 goals:
#   - Suggest new friends (to the users)
#   - Recommend new restaurants (to the users)
#   - Tell each restaurant which businesses are its possible competitors

### Data Cleaning ###
# Businesses: keep one city and only the active ones (more than 200 reviews);
# from here on only their id is needed.
area = 'Vancouver'
in_area = df_b.city == area
is_active_business = df_b.review_count > 200
df_b = df_b[in_area & is_active_business][['business_id']]

# Users: keep the active ones (more than 100 reviews), id only.
df_u = df_u[df_u.review_count > 100][['user_id']]

# Reviews: only the (user, business, rating) triple is used.
df_r = df_r[['user_id', 'business_id', 'stars']]

# Reviews written by active users ...
df_r = df_r.merge(df_u, how='inner', on='user_id')
# ... restricted to the active businesses of the selected city.
df = df_r.merge(df_b, how='inner', on='business_id')
df.head()


Unnamed: 0,user_id,business_id,stars
0,jKn_HycMvKa3yPHAUoCQAQ,eYmf3scmgHqJASKkRF0QCg,4.0
1,Z2x-sMYhTKmYEU5yuv75pA,eYmf3scmgHqJASKkRF0QCg,4.0
2,YWeHmeVtoc_RMmEfuEqBrA,eYmf3scmgHqJASKkRF0QCg,5.0
3,xC-q_yh0XwcjRLimkS3RNg,eYmf3scmgHqJASKkRF0QCg,5.0
4,mXRomu-YuObER_HJtpf_UA,eYmf3scmgHqJASKkRF0QCg,4.0


In [None]:
#df['stars'] = df['stars'].astype(np.int8)

In [None]:
# Re-express the ratings as a matrix: one row per user_id, one column per business_id.
# The dask pivot needs the column field to be categorical with known categories,
# hence the categorize pass first.
df = df.categorize(columns=['business_id'])
# (disabled) optional down-cast of the ratings:
#df.stars = df.stars.astype(np.int8)
ratings_wide = dd.reshape.pivot_table(
    df,
    values='stars',
    index='user_id',
    columns='business_id',
)
# (user, business) pairs without a review become 0.
M = ratings_wide.fillna(0)


In [None]:
# Materialise the lazy pivot table as a dense NumPy array and centre every user's row.
# M_df keeps the lazy dask frame; M becomes the ndarray.
# Guard for re-runs: after the first execution M is already an ndarray, and
# copying it into M_df would lose the dask frame (and ndarray has no .compute()).
if not isinstance(M, np.ndarray):
    M_df = M
M = M_df.compute().to_numpy()
# Per-user mean over ALL businesses: the 0-filled "no review" entries count too.
star_mean = M.mean(axis=1)
M_std = M - star_mean.reshape(-1, 1)


In [None]:
# Truncated SVD of the centred rating matrix.
# Target rank is 300, but svds needs k to be strictly smaller than both matrix
# dimensions, so cap it instead of letting svds raise on a smaller matrix
# (e.g. after selecting another city or stricter activity thresholds).
k = min(300, min(M_std.shape) - 1)
U, sigma, Vt = svds(M_std, k=k)

In [None]:
# Compute the predicted rating for every (user, business) pair.
# svds returns the singular values as a 1-D vector. Expand it to a diagonal matrix
# only once: np.diag applied to a 2-D matrix extracts the diagonal again, so an
# unconditional re-assignment would break the product when this cell is re-run.
if sigma.ndim == 1:
    sigma = np.diag(sigma)
# Low-rank reconstruction, with the per-user mean added back.
pred_ratings = U @ sigma @ Vt + star_mean.reshape(-1, 1)
# Label rows/columns like the rating matrix. M_df is lazy, so its index has to be
# computed to obtain concrete user ids; do it explicitly rather than letting
# pandas trigger the computation implicitly.
preds_df = pd.DataFrame(pred_ratings, index=M_df.index.compute(), columns=M_df.columns)

In [None]:

# user := user_id of the person we want recommendations for
# n := number of recommendations
user = 'YWeHmeVtoc_RMmEfuEqBrA'
n = 10

# Locate the user's row; fail with a clear message for an unknown id.
matches = np.flatnonzero(preds_df.index == user)
if matches.size == 0:
    raise KeyError(f"user_id {user!r} is not a row of the rating matrix")
row = matches[0]

# preds_df was built from M, so both share the same row and column order.
# Reading the row from M avoids recomputing the whole dask pipeline just to
# fetch one user's ratings.
business_ids = np.asarray(preds_df.columns)

# real reviews from user (0 is the fill value for "no review")
user_data = pd.DataFrame({'business_id': business_ids, 'stars': M[row]})
# predicted reviews from user
user_pred = pd.DataFrame({'business_id': business_ids, 'stars': preds_df.iloc[row].to_numpy()})

# Only recommend businesses the user has not rated yet.
# After the merge, stars_x is the actual rating and stars_y the predicted one.
# Both frames contain every business, so a merge indicator cannot tell rated
# from unrated; the 0 fill value of the actual rating does.
not_rated = pd.merge(user_data, user_pred, how='inner', on='business_id')
not_rated = not_rated[not_rated['stars_x'] == 0]
not_rated = not_rated.sort_values(by=['stars_y'], ascending=False)
not_rated['business_id'].head(n)

310    BdHvzCsxbLCErx36UcnMZQ
128    UebEhMTqHL1XHOtRDmq6PQ
0      eYmf3scmgHqJASKkRF0QCg
220    R1yQ3WO1DT0TMZQ1AVsjAw
407    HTJTTobu5hXM2Xgqj8OBgg
270    l2f2PU8Rtr5TPtzwaGQ_sA
225    DF60u_0flQzqTcBqaLL4Iw
83     VPqWLp9kMiZEbctCebIZUA
406    X0z6FAw48MIXuo6uhuwJQw
301    K1943yeGQELTUeiH6bDa2g
Name: business_id, dtype: object