In [1]:
pwd

'/content'

In [2]:
from google.colab import drive 
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [3]:
!pip install "dask[dataframe]"
!pip install "dask[bag]"
!pip install "dask[array]"

Collecting fsspec>=0.6.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[?25l[K     |██▌                             | 10 kB 23.4 MB/s eta 0:00:01[K     |█████                           | 20 kB 30.1 MB/s eta 0:00:01[K     |███████▍                        | 30 kB 35.3 MB/s eta 0:00:01[K     |██████████                      | 40 kB 39.1 MB/s eta 0:00:01[K     |████████████▍                   | 51 kB 29.7 MB/s eta 0:00:01[K     |██████████████▉                 | 61 kB 30.4 MB/s eta 0:00:01[K     |█████████████████▎              | 71 kB 26.3 MB/s eta 0:00:01[K     |███████████████████▉            | 81 kB 27.4 MB/s eta 0:00:01[K     |██████████████████████▎         | 92 kB 28.1 MB/s eta 0:00:01[K     |████████████████████████▊       | 102 kB 30.1 MB/s eta 0:00:01[K     |███████████████████████████▏    | 112 kB 30.1 MB/s eta 0:00:01[K     |█████████████████████████████▊  | 122 kB 30.1 MB/s eta 0:00:01[K     |████████████████████████████████| 132 kB 30.1 M

In [4]:
import pandas as pd
import json
import numpy as np
import dask.bag as db
import dask.dataframe as dd # substitute of pandas to load the data in streams
import dask.array as da

In [5]:
# Import the review dataset as streams to avoid running out of memory
dict_bag = db.read_text('yelp_academic_dataset_review.json', blocksize=int(5e6)).map(json.loads)
df_r = dict_bag.to_dataframe(columns=['user_id', 'business_id', 'stars', 'useful', 'date'])
df_r = df_r.repartition(npartitions=10)
#df_r.head()

In [6]:
# Business dataset
dict_bag = db.read_text('yelp_academic_dataset_business.json', blocksize=int(5e6)).map(json.loads)
df_b = dict_bag.to_dataframe(columns=['business_id', 'city', 'stars', 'review_count'])
df_b = df_b.repartition(npartitions=10)
#df_b.head()

In [7]:
# Users dataset (Import it in streams again)
dict_bag = db.read_text('yelp_academic_dataset_user.json', blocksize=int(5e6)).map(json.loads)
df_u = dict_bag.to_dataframe(columns=['user_id', 'friends', 'review_count'])
df_u = df_u.repartition(npartitions=10)
#df_u.head()

In [8]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 19.4 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619399 sha256=7bde521f5f4ebb84d16e74fb0460e3c7e5be1be3c0b433d0f153e4a67585b2fc
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [9]:
from surprise import Dataset
from surprise import Reader

In [11]:
# Recommender system using the rating of the reviews
# We appoach it using low rank approximation techniques
# We have 3 goals
#   - Suggest new friends (to the users)# Business dataset
dict_bag = db.read_text('yelp_academic_dataset_business.json', blocksize=int(5e6)).map(json.loads)
df_b = dict_bag.to_dataframe(columns=['business_id', 'city', 'stars', 'review_count'])
df_b = df_b.repartition(npartitions=10)
#df_b.head()
#   - Recommend new restaurants (to the users)
#   - Tell the restaurant which are possible competitors

### Data Cleaning ###
# Select a city
area = 'Richmond'
df_b = df_b[df_b.city == area]
df_b = df_b.drop('city', axis=1)
# Active users / business
df_b = df_b[df_b.review_count > 100]
df_u = df_u[df_u.review_count > 50]
# Select only the variables that we need now
df_r = df_r[['user_id', 'business_id', 'stars']]
df_u = df_u[['user_id']]
df_b = df_b[['business_id']]

# subset of users that reviewd in that city
df_r = dd.merge(df_r, df_u, how='inner', on='user_id')
df = dd.merge(df_r, df_b, how='inner', on='business_id')
#df.head()


In [12]:
df.head()

Unnamed: 0,user_id,business_id,stars
0,Fqd_7pQ2G2w7cD0Y3Dtbsw,lQlJvXi19RJs_q4xR6gcNg,3.0
1,xC-q_yh0XwcjRLimkS3RNg,lQlJvXi19RJs_q4xR6gcNg,4.0
2,ozXoLPVB0wtkZQvrHZIj9g,lQlJvXi19RJs_q4xR6gcNg,5.0
3,zKdDu546gjeSDsV3vcR9PQ,lQlJvXi19RJs_q4xR6gcNg,4.0
4,2t2HCw37rOKKWnT8C_yDng,lQlJvXi19RJs_q4xR6gcNg,5.0


In [13]:
#df_small = df.sample(frac=0.1, replace=False, random_state=1)
df_small = df

In [14]:
df_small = df_small.compute()

In [15]:
from surprise.model_selection import train_test_split

In [16]:
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_small, reader)

In [17]:
from surprise import SVD
from surprise.model_selection import cross_validate

In [18]:
svd = SVD(verbose=True, n_epochs=10)
#cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
#trainset = data.build_full_trainset()
trainset, testset = train_test_split(data, test_size=.25)
svd.fit(trainset)
pred = svd.test(testset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


In [19]:
from surprise import accuracy
#pred = svd.test(trainset)
accuracy.rmse(pred)

RMSE: 0.9661


0.9660627458255446

In [None]:
#pred = svd.predict("Z2x-sMYhTKmYEU5yuv75pA", "eYmf3scmgHqJASKkRF0QCg")

In [21]:
#df2 = pd.merge(df_small, df_u, how = "inner", )

TypeError: ignored

In [22]:

users = df_small["user_id"].unique()
business = df_small["business_id"].unique()

In [None]:
#len(users)

4009

In [23]:
from itertools import product

In [24]:
df_full = pd.DataFrame(list(product(users, business)), columns = ['user_id', 'business_id'])

In [25]:
df_full

Unnamed: 0,user_id,business_id
0,Fqd_7pQ2G2w7cD0Y3Dtbsw,lQlJvXi19RJs_q4xR6gcNg
1,Fqd_7pQ2G2w7cD0Y3Dtbsw,seDFTCwhSrzOnaj0iYd5Jw
2,Fqd_7pQ2G2w7cD0Y3Dtbsw,SQo3j_PrRgQlbR9c4ctjUA
3,Fqd_7pQ2G2w7cD0Y3Dtbsw,cpkyRpNWllRmOhM_qlsdiQ
4,Fqd_7pQ2G2w7cD0Y3Dtbsw,y1BLVJUWINOSMLeDaXqXBQ
...,...,...
400895,Y7Iet_ePqnhT8GYoAWJJtA,BZIHRWA_zqnCwAUif7d2Yw
400896,Y7Iet_ePqnhT8GYoAWJJtA,m_Tu4PBV7adWCsuwOrCT8g
400897,Y7Iet_ePqnhT8GYoAWJJtA,GUWoTn1SUNv9kCuSsXPXCw
400898,Y7Iet_ePqnhT8GYoAWJJtA,AGNyx4JBRbjvHfCDdib5_g


In [26]:
n = 400900
#df_full['rating'] = np.zeros(n)
usr = df_full["user_id"]
bsn = df_full["business_id"]
r = []
for i in range(n):
  a, b, c, rat, e = svd.predict(usr[i], bsn[i])
  r.append(rat)

In [27]:
df_full['rating'] = r

In [28]:
df_full.head()

Unnamed: 0,user_id,business_id,rating
0,Fqd_7pQ2G2w7cD0Y3Dtbsw,lQlJvXi19RJs_q4xR6gcNg,3.874791
1,Fqd_7pQ2G2w7cD0Y3Dtbsw,seDFTCwhSrzOnaj0iYd5Jw,4.478811
2,Fqd_7pQ2G2w7cD0Y3Dtbsw,SQo3j_PrRgQlbR9c4ctjUA,3.241875
3,Fqd_7pQ2G2w7cD0Y3Dtbsw,cpkyRpNWllRmOhM_qlsdiQ,3.584831
4,Fqd_7pQ2G2w7cD0Y3Dtbsw,y1BLVJUWINOSMLeDaXqXBQ,4.009142


In [29]:
n = 4009
df_p = pd.DataFrame(index = range(n), columns=['user', 'rating'])
r = []
for usr in users:
  df_aid = df_full[df_full.user_id == usr]
  rat_aid = df_aid["rating"].values
  r.append(rat_aid)

df_p['user'] = users
df_p['rating'] = r

In [30]:
df_p

Unnamed: 0,user,rating
0,Fqd_7pQ2G2w7cD0Y3Dtbsw,"[3.8747907604313445, 4.478811358487205, 3.2418..."
1,xC-q_yh0XwcjRLimkS3RNg,"[4.126127492894533, 4.0186065352731495, 3.4641..."
2,ozXoLPVB0wtkZQvrHZIj9g,"[4.135755786861839, 4.201073701513672, 3.48445..."
3,zKdDu546gjeSDsV3vcR9PQ,"[4.057550819142904, 4.432898681274764, 3.53111..."
4,2t2HCw37rOKKWnT8C_yDng,"[4.169037253867187, 4.074708953952024, 3.22235..."
...,...,...
4004,dOFyIGbyfB69VB4pnoKqQA,"[4.15065875364662, 4.284660278000494, 3.525793..."
4005,bvuxor346zDAen3poNl9qA,"[4.09205974726469, 4.281266344683393, 3.608856..."
4006,EXKAPY0klEaXXR_j_1pRTQ,"[4.079517030460323, 4.149342837077514, 3.57642..."
4007,tjGKVFhvKqoDSth3ri5X-A,"[4.213846756388873, 4.040080435967653, 3.62665..."


In [37]:
# Restaurant recommendation
user = "ozXoLPVB0wtkZQvrHZIj9g"
df_pr = df_full[df_full.user_id == user]
df_pr = df_pr.sort_values(by=['rating'], ascending=False)

In [38]:
df_pr

Unnamed: 0,user_id,business_id,rating
223,ozXoLPVB0wtkZQvrHZIj9g,PXoqSMlonM9qTcQnkNkE5A,4.437157
266,ozXoLPVB0wtkZQvrHZIj9g,BLITQJYfKJSQ9XNuk8pIrw,4.354710
249,ozXoLPVB0wtkZQvrHZIj9g,YdEZpDqqoWgYu-qwLcXpXA,4.243331
246,ozXoLPVB0wtkZQvrHZIj9g,x9RA_NPjcvQ8-EU28ppBqA,4.231062
247,ozXoLPVB0wtkZQvrHZIj9g,odUQDozVqxsUtrSX8ENHyA,4.221454
...,...,...,...
296,ozXoLPVB0wtkZQvrHZIj9g,m_Tu4PBV7adWCsuwOrCT8g,3.044403
218,ozXoLPVB0wtkZQvrHZIj9g,v6715hqLSp_bD5U5Cze2xg,3.009256
260,ozXoLPVB0wtkZQvrHZIj9g,yLjf1SFF3249j1T69VGj-Q,2.836519
298,ozXoLPVB0wtkZQvrHZIj9g,AGNyx4JBRbjvHfCDdib5_g,2.688197


In [51]:
df2 = df_small[df_small.user_id == user]
# removing previously visited restaurants
not_rated = pd.merge(df_pr, df2, how='outer', on='business_id', indicator=True)
not_rated = not_rated[not_rated['_merge'] == 'left_only']
not_rated = not_rated[["user_id_x", "business_id", "rating"]]

In [57]:
not_rated.to_latex()

'\\begin{tabular}{lllr}\n\\toprule\n{} &               user\\_id\\_x &             business\\_id &    rating \\\\\n\\midrule\n0  &  ozXoLPVB0wtkZQvrHZIj9g &  PXoqSMlonM9qTcQnkNkE5A &  4.437157 \\\\\n1  &  ozXoLPVB0wtkZQvrHZIj9g &  BLITQJYfKJSQ9XNuk8pIrw &  4.354710 \\\\\n2  &  ozXoLPVB0wtkZQvrHZIj9g &  YdEZpDqqoWgYu-qwLcXpXA &  4.243331 \\\\\n3  &  ozXoLPVB0wtkZQvrHZIj9g &  x9RA\\_NPjcvQ8-EU28ppBqA &  4.231062 \\\\\n4  &  ozXoLPVB0wtkZQvrHZIj9g &  odUQDozVqxsUtrSX8ENHyA &  4.221454 \\\\\n5  &  ozXoLPVB0wtkZQvrHZIj9g &  seDFTCwhSrzOnaj0iYd5Jw &  4.201074 \\\\\n6  &  ozXoLPVB0wtkZQvrHZIj9g &  UqqkI-YNmfpng4BKj8kxcg &  4.186991 \\\\\n7  &  ozXoLPVB0wtkZQvrHZIj9g &  zQCE8SFTuasTCqmH1NxeYQ &  4.181533 \\\\\n8  &  ozXoLPVB0wtkZQvrHZIj9g &  CBEdmS6N7NzgOPT5uESuCg &  4.179386 \\\\\n9  &  ozXoLPVB0wtkZQvrHZIj9g &  Hr5sm6NpE6K-98PrqHksbQ &  4.159879 \\\\\n10 &  ozXoLPVB0wtkZQvrHZIj9g &  eyw69sYMI5d1vDhC3wX3gw &  4.157706 \\\\\n11 &  ozXoLPVB0wtkZQvrHZIj9g &  dFzDHiN52S96A9rfp1-PoQ &  4.155407 \\

In [59]:
from scipy.stats import pearsonr

In [60]:
# Recommendation person-person
n = 4009
user = "ei7wfryXlvZ6OM9NK27cPQ"
v1 = df_p[df_p.user == user]
v1 = v1["rating"].values[0]
df_pp = pd.DataFrame(index = range(n), columns = ["user_id"])
rat_pp = df_p["rating"].values
corr = []
for i in range(n):
  v2 = rat_pp[i]
  c = pearsonr(v1, v2)[0]
  corr.append(c)

In [61]:
df_pp['user_id'] = users
df_pp['corr'] = corr

In [62]:
df_pp = df_pp.sort_values(by=['corr'], ascending=False)

In [64]:
print(df_pp.head(10).to_latex())

\begin{tabular}{llr}
\toprule
{} &                 user\_id &      corr \\
\midrule
1256 &  ei7wfryXlvZ6OM9NK27cPQ &  1.000000 \\
3995 &  l-f0w9wXPFChvVhUnxA9wQ &  0.967422 \\
2140 &  vCyvUmbQkCYct6JXlmXRaQ &  0.967190 \\
2206 &  ZGgheWqp5TbGsNLt5lze5w &  0.967190 \\
2194 &  RvR05MnF76MnIjo32zFiFQ &  0.967190 \\
2189 &  dOfABpD-PA94keA5LlHXNA &  0.967190 \\
2187 &  GGSfLyM88SbFGhnm46ykXQ &  0.967190 \\
2181 &  xipyrIBO-r1j6PLJ7sIduA &  0.967190 \\
2175 &  npOX1\_LiuayRus0SNavDcQ &  0.967190 \\
2147 &  C76\_BRi\_0N-L8mgxnBF2Aw &  0.967190 \\
\bottomrule
\end{tabular}



In [65]:
# Find competitors
n = 100
#bsn = "seDFTCwhSrzOnaj0iYd5Jw"
df_b = pd.DataFrame(index = range(n), columns=['business', 'rating'])
r = []
for bsn in business:
  df_aid = df_full[df_full.business_id == bsn]
  rat_aid = df_aid["rating"].values
  r.append(rat_aid)

df_b['business'] = business
df_b['rating'] = r


# v1 = df_p[df_p.business == user]
# v1 = v1["rating"].values[0]
# df_pp = pd.DataFrame(index = range(n), columns = ["user_id"])
# rat_pp = df_p["rating"].values
# corr = []
# for i in range(n):
#   v2 = rat_pp[i]
#   c = pearsonr(v1, v2)[0]
#   corr.append(c)

In [66]:
n = 100
bsn = "niUrhHoR9leK0lr5moyySQ"
v1 = df_b[df_b.business == bsn]
v1 = v1["rating"].values[0]
df_bb = pd.DataFrame(index = range(n), columns = ["business_id"])
rat_bb = df_b["rating"].values
corr = []
for i in range(n):
  v2 = rat_bb[i]
  c = pearsonr(v1, v2)[0]
  corr.append(c)

df_bb['business_id'] = business
df_bb['corr'] = corr
df_bb = df_bb.sort_values(by=['corr'], ascending=False)

In [67]:
df_bb.head()

Unnamed: 0,business_id,corr
92,niUrhHoR9leK0lr5moyySQ,1.0
5,ZzK99Z5oWteQVlBtdm9r1w,0.554133
62,p4Ifl07Rg8bAhXlcflxqng,0.530318
51,Q4u7W5grwSKZy-suUKhtqw,0.52975
54,-zYenEsSHXt8XfAy8lnAjg,0.521009


In [68]:
print(df_bb.head().to_latex())

\begin{tabular}{llr}
\toprule
{} &             business\_id &      corr \\
\midrule
92 &  niUrhHoR9leK0lr5moyySQ &  1.000000 \\
5  &  ZzK99Z5oWteQVlBtdm9r1w &  0.554133 \\
62 &  p4Ifl07Rg8bAhXlcflxqng &  0.530318 \\
51 &  Q4u7W5grwSKZy-suUKhtqw &  0.529750 \\
54 &  -zYenEsSHXt8XfAy8lnAjg &  0.521009 \\
\bottomrule
\end{tabular}

