In [None]:
pwd

'/content'

In [1]:
# Mount Google Drive inside the Colab VM and make the notebooks folder the
# working directory; later cells open the dataset files via relative paths.
from google.colab import drive 
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab\ Notebooks

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks


In [2]:
!pip install "dask[dataframe]"
!pip install "dask[bag]"
!pip install "dask[array]"

Collecting fsspec>=0.6.0
  Downloading fsspec-2021.11.0-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 30.8 MB/s 
Collecting partd>=0.3.10
  Downloading partd-1.2.0-py3-none-any.whl (19 kB)
Collecting locket
  Downloading locket-0.2.1-py2.py3-none-any.whl (4.1 kB)
Installing collected packages: locket, partd, fsspec
Successfully installed fsspec-2021.11.0 locket-0.2.1 partd-1.2.0


In [3]:
import pandas as pd
import json
import numpy as np
import dask.bag as db
import dask.dataframe as dd # substitute of pandas to load the data in streams
import dask.array as da

In [116]:
# Reviews dataset: every text line is parsed as one JSON record, lazily and
# block by block, so the whole file never has to sit in memory at once.
dict_bag = db.read_text(
    'yelp_academic_dataset_review.json',
    blocksize=int(5e6),
).map(json.loads)
df_r = (
    dict_bag
    .to_dataframe(columns=['user_id', 'business_id', 'stars', 'useful', 'date'])
    .repartition(npartitions=10)
)

In [117]:
# Business dataset, loaded the same lazy way (one JSON record per text line).
dict_bag = db.read_text(
    'yelp_academic_dataset_business.json',
    blocksize=int(5e6),
).map(json.loads)
df_b = (
    dict_bag
    .to_dataframe(columns=['business_id', 'city', 'stars', 'review_count'])
    .repartition(npartitions=10)
)

In [118]:
# Users dataset, again streamed block by block rather than read in one go.
dict_bag = db.read_text(
    'yelp_academic_dataset_user.json',
    blocksize=int(5e6),
).map(json.loads)
df_u = (
    dict_bag
    .to_dataframe(columns=['user_id', 'friends', 'review_count'])
    .repartition(npartitions=10)
)

In [7]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 29 kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1619427 sha256=896d88d419d1942015c627e2d5f714c04154d07cb30721f95b31c5f6ddfd8089
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [8]:
from surprise import Dataset
from surprise import Reader

In [119]:
# Recommender system using the rating of the reviews.
# We approach it using low-rank approximation techniques.
# We have 3 goals:
#   - Suggest new friends (to the users)
#   - Recommend new restaurants (to the users)
#   - Tell the restaurant which are possible competitors

### Data Cleaning ###
# Reload the business dataset so this cell always starts from the unfiltered
# table, whatever `df_b` was bound to before it ran.
dict_bag = db.read_text('yelp_academic_dataset_business.json', blocksize=int(5e6)).map(json.loads)
df_b = dict_bag.to_dataframe(columns=['business_id', 'city', 'stars', 'review_count'])
df_b = df_b.repartition(npartitions=10)

# Select a city
area = 'Richmond'
# Activity thresholds: a business / user is kept only with strictly more reviews than this
MIN_BUSINESS_REVIEWS = 100
MIN_USER_REVIEWS = 10

# The filtered tables get their own names instead of overwriting df_b / df_u / df_r.
# That keeps the cell re-runnable: previously df_u was reduced to its 'user_id'
# column here, so a second run failed on `df_u.review_count`.
businesses_in_area = df_b[(df_b.city == area) & (df_b.review_count > MIN_BUSINESS_REVIEWS)]
businesses_in_area = businesses_in_area[['business_id']]
active_users = df_u[df_u.review_count > MIN_USER_REVIEWS][['user_id']]
# Select only the variables that we need now
reviews = df_r[['user_id', 'business_id', 'stars']]

# Subset of reviews written about businesses in that city ...
reviews_in_area = dd.merge(reviews, businesses_in_area, how='inner', on='business_id')
# ... restricted to the active users
df = dd.merge(reviews_in_area, active_users, how='inner', on='user_id')
# Another way of subselecting the data to avoid interference between the inner joins



In [63]:
df.head()

Unnamed: 0,user_id,business_id,stars
0,bCZvBE0AY-edknhD64Of9A,j7VmHg9M-eWZuDw-p4AIaA,4.0
1,bCZvBE0AY-edknhD64Of9A,Xv21hWWM1XT25NWLBOle3Q,4.0
2,bCZvBE0AY-edknhD64Of9A,I4jytmnWi2m9qGnuqNxUTg,2.0
3,bCZvBE0AY-edknhD64Of9A,s43_Tw05YR_3nwHLBOj3SA,3.0
4,bCZvBE0AY-edknhD64Of9A,s43_Tw05YR_3nwHLBOj3SA,4.0


In [120]:
# Optional speed-up: the commented line would work on a 10% random sample of `df`.
# As written, the full (still lazy) frame is used.
#df_small = df.sample(frac=0.1, replace=False, random_state=1)
df_small = df

In [121]:
# Materialise the lazy dask frame as an in-memory pandas DataFrame.
# Guarded so the cell can be re-run: after the first run `df_small` is already
# a pandas frame, which has no `.compute()` method.
if hasattr(df_small, 'compute'):
    df_small = df_small.compute()

In [122]:
df_small

Unnamed: 0,user_id,business_id,stars
0,bCZvBE0AY-edknhD64Of9A,j7VmHg9M-eWZuDw-p4AIaA,4.0
1,bCZvBE0AY-edknhD64Of9A,sNDfYZP9OYypywbi33aSxw,4.0
2,bCZvBE0AY-edknhD64Of9A,KqJa1YxZM0PT4iHqhJJjFA,4.0
3,bCZvBE0AY-edknhD64Of9A,PXoqSMlonM9qTcQnkNkE5A,4.0
4,bCZvBE0AY-edknhD64Of9A,j0qVvKZEBdlkQHqyYE1iTw,3.0
...,...,...,...
1538,cDz4PbqLBkn2FLqMtNaXAg,nDGM13CCIVe1w4dyS2Bcgg,4.0
1539,bEQhe2pE7ukH04Kfnzdk5g,nDGM13CCIVe1w4dyS2Bcgg,4.0
1540,R1LiRX25tOdZ0vhIjhUBRg,nDGM13CCIVe1w4dyS2Bcgg,4.0
1541,kAucm0rGyi72fFHPErQPog,3_AK31Wj9-1dbPZ1fdX_-Q,5.0


In [124]:
# How many of the retained reviews each user wrote (most active users first)
active_user = df_small.user_id.value_counts()

In [125]:
# Turn the per-user review counts into a two-column table (user_id, count).
# Both column names are set explicitly, so the result does not depend on how
# `value_counts()` labels its output: from pandas 2.0 the counts are named
# 'count', and the former `active_user['user_id']` lookup raises KeyError.
actives = active_user.rename_axis('user_id').reset_index(name='count')

In [126]:
# Keep only the users with more than 5 reviews in the selected area
actives = actives.loc[actives['count'] > 5]

In [132]:
# Restrict the reviews to those written by the active users
df_small = actives.merge(df_small, how='inner', on='user_id')

In [134]:
# The helper 'count' column is no longer needed after the merge.
# errors='ignore' makes the cell safe to re-run once the column is already gone.
df_small = df_small.drop(columns='count', errors='ignore')

In [135]:
from surprise.model_selection import train_test_split

In [136]:
# Surprise's load_from_df expects exactly three columns in the order
# (user, item, rating), so select them explicitly instead of relying on
# whatever columns df_small happens to carry at this point.
reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_small[['user_id', 'business_id', 'stars']], reader)

In [137]:
from surprise import SVD
from surprise.model_selection import cross_validate

In [138]:
# Fixed seed: both the train/test shuffle and the random initialisation of the
# SVD factors are stochastic, so without it the RMSE and every downstream
# recommendation change on each run.
SEED = 0
svd = SVD(verbose=True, n_epochs=10, random_state=SEED)
# Hold out 25% of the ratings for evaluation
trainset, testset = train_test_split(data, test_size=.25, random_state=SEED)
svd.fit(trainset)
pred = svd.test(testset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


In [139]:
df_small

Unnamed: 0,user_id,business_id,stars
0,9EeEpkZg5jV18N_KvrcT6g,ZzK99Z5oWteQVlBtdm9r1w,4.0
1,9EeEpkZg5jV18N_KvrcT6g,ZzK99Z5oWteQVlBtdm9r1w,4.0
2,9EeEpkZg5jV18N_KvrcT6g,ZzK99Z5oWteQVlBtdm9r1w,4.0
3,9EeEpkZg5jV18N_KvrcT6g,ZzK99Z5oWteQVlBtdm9r1w,4.0
4,9EeEpkZg5jV18N_KvrcT6g,cpkyRpNWllRmOhM_qlsdiQ,4.0
...,...,...,...
4728,EIRBAYXCV3647N3ejEwkIA,VHvlre6m3-oUlixGLlOnRA,4.0
4729,EIRBAYXCV3647N3ejEwkIA,chucQ4h98spUmknWQHJ6lA,5.0
4730,EIRBAYXCV3647N3ejEwkIA,CCV_rtjUHKhJA0JLruwImw,3.0
4731,EIRBAYXCV3647N3ejEwkIA,odUQDozVqxsUtrSX8ENHyA,5.0


In [140]:
from surprise import accuracy
# Root-mean-square error of `pred`, the predictions made on the held-out test set.
#pred = svd.test(trainset)
accuracy.rmse(pred)

RMSE: 0.9072


0.9072129181943878

In [None]:
#pred = svd.predict("Z2x-sMYhTKmYEU5yuv75pA", "eYmf3scmgHqJASKkRF0QCg")

In [None]:
#df2 = pd.merge(df_small, df_u, how = "inner", )

TypeError: ignored

In [141]:

# Distinct user and business ids among the retained reviews, in order of first appearance
users = pd.unique(df_small["user_id"])
business = pd.unique(df_small["business_id"])

In [166]:
len(users)

392

In [145]:
from itertools import product

In [146]:
# Every (user, business) combination, with the user varying slowest:
# this is the full grid of pairs for which a rating will be predicted.
df_full = (
    pd.MultiIndex
    .from_product([users, business], names=['user_id', 'business_id'])
    .to_frame(index=False)
)

In [147]:
df_full

Unnamed: 0,user_id,business_id
0,9EeEpkZg5jV18N_KvrcT6g,ZzK99Z5oWteQVlBtdm9r1w
1,9EeEpkZg5jV18N_KvrcT6g,cpkyRpNWllRmOhM_qlsdiQ
2,9EeEpkZg5jV18N_KvrcT6g,DqxAArHOssu55PBlh7A5Wg
3,9EeEpkZg5jV18N_KvrcT6g,sd0PiR-44d1pL9cD5ULFug
4,9EeEpkZg5jV18N_KvrcT6g,1bP4v7kwOJ22YJmx3bqEqg
...,...,...
39195,EIRBAYXCV3647N3ejEwkIA,BZIHRWA_zqnCwAUif7d2Yw
39196,EIRBAYXCV3647N3ejEwkIA,z9Ys_7sMnShHQS7ArTMHHg
39197,EIRBAYXCV3647N3ejEwkIA,DU-2FkgvtChVDNOevPeteg
39198,EIRBAYXCV3647N3ejEwkIA,j7VmHg9M-eWZuDw-p4AIaA


In [148]:
# Predict a rating for every (user, business) pair of df_full.
# The number of pairs comes from the frame itself rather than a hard-coded
# 39200, which only matched one particular city / threshold choice.
n = len(df_full)
usr = df_full["user_id"]
bsn = df_full["business_id"]
# svd.predict returns a Prediction named tuple; `.est` is the estimated rating.
# zip iterates the two columns directly, avoiding a per-row Series lookup.
r = [svd.predict(uid, iid).est for uid, iid in zip(usr, bsn)]

In [149]:
# Attach the predicted ratings to the (user, business) grid
df_full = df_full.assign(rating=r)

In [150]:
df_full.head()

Unnamed: 0,user_id,business_id,rating
0,9EeEpkZg5jV18N_KvrcT6g,ZzK99Z5oWteQVlBtdm9r1w,3.91541
1,9EeEpkZg5jV18N_KvrcT6g,cpkyRpNWllRmOhM_qlsdiQ,3.451094
2,9EeEpkZg5jV18N_KvrcT6g,DqxAArHOssu55PBlh7A5Wg,3.43855
3,9EeEpkZg5jV18N_KvrcT6g,sd0PiR-44d1pL9cD5ULFug,3.901723
4,9EeEpkZg5jV18N_KvrcT6g,1bP4v7kwOJ22YJmx3bqEqg,3.699598


In [151]:
# One row per user, holding that user's vector of predicted ratings over all
# businesses (in the row order of df_full).
n = len(users)  # derived from the data instead of the hard-coded 392
# Group once instead of scanning the whole of df_full again for every user;
# sort=False keeps the rows of each group in their original order.
predicted_by_user = df_full.groupby("user_id", sort=False)["rating"]
r = [predicted_by_user.get_group(usr).values for usr in users]

df_p = pd.DataFrame({'user': users})
df_p['rating'] = r

In [152]:
df_p

Unnamed: 0,user,rating
0,9EeEpkZg5jV18N_KvrcT6g,"[3.9154100240198937, 3.45109433033383, 3.43854..."
1,Xwnf20FKuikiHcSpcEbpKQ,"[3.724427848916611, 3.249779688598004, 3.25949..."
2,RtGqdDBvvBCjcu5dUqwfzA,"[4.055292370900522, 3.7757071370342525, 3.7277..."
3,tgrs5VQ6thhouiEtMU-N_w,"[3.70486958799799, 3.5679279324271205, 3.53258..."
4,YyKL9NmOEokXUvgJHcC2hw,"[3.8621654077253225, 3.5341046664816416, 3.464..."
...,...,...
387,fk154iSzQpjilunR4je0HA,"[3.6789090120897328, 3.339533531830215, 3.6424..."
388,9XPcu19pmQ6JZ9YsATU_pw,"[3.896447512212429, 3.7548490399577634, 3.3690..."
389,t8cIgvaln7MbM51IS6MclA,"[3.9513504602388902, 3.4277524949658464, 3.501..."
390,lLO3Q3tqzfI5GENb2oTbrg,"[3.9493748872674748, 3.4098446603435093, 3.262..."


In [175]:
# Restaurant recommendation
user = "RtGqdDBvvBCjcu5dUqwfzA"
# All predicted ratings of this user, highest first
df_pr = df_full.loc[df_full["user_id"] == user].sort_values("rating", ascending=False)

In [176]:
df_pr

Unnamed: 0,user_id,business_id,rating
226,RtGqdDBvvBCjcu5dUqwfzA,Hr5sm6NpE6K-98PrqHksbQ,4.489781
234,RtGqdDBvvBCjcu5dUqwfzA,BLITQJYfKJSQ9XNuk8pIrw,4.434626
210,RtGqdDBvvBCjcu5dUqwfzA,PXoqSMlonM9qTcQnkNkE5A,4.389605
217,RtGqdDBvvBCjcu5dUqwfzA,VGq9wu_ZtvonP1Rke4CxYw,4.381972
280,RtGqdDBvvBCjcu5dUqwfzA,R1KeQwYWkHczmZjSbfY2XA,4.343036
...,...,...,...
213,RtGqdDBvvBCjcu5dUqwfzA,j0qVvKZEBdlkQHqyYE1iTw,3.296224
294,RtGqdDBvvBCjcu5dUqwfzA,m_Tu4PBV7adWCsuwOrCT8g,3.257720
291,RtGqdDBvvBCjcu5dUqwfzA,cLVwhzVhNdF-5sM56p3ODQ,3.197837
243,RtGqdDBvvBCjcu5dUqwfzA,Q4u7W5grwSKZy-suUKhtqw,3.011076


In [177]:
# Reviews the selected user has actually written
df2 = df_small[df_small.user_id == user]
# Removing previously visited restaurants: keep the predictions that have no
# matching review. A left join is used because it preserves the best-first row
# order of df_pr; an outer join is documented to sort the keys lexicographically
# (recent pandas versions do so), which would scramble the ranking.
not_rated = pd.merge(df_pr, df2, how='left', on='business_id', indicator=True)
not_rated = not_rated[not_rated['_merge'] == 'left_only']
# 'user_id_x' is the user id coming from df_pr (left side of the merge)
not_rated = not_rated[["user_id_x", "business_id", "rating"]]

In [178]:
not_rated.to_latex()

'\\begin{tabular}{lllr}\n\\toprule\n{} &               user\\_id\\_x &             business\\_id &    rating \\\\\n\\midrule\n11 &  RtGqdDBvvBCjcu5dUqwfzA &  dQjv55\\_rr1mdF18H6baRxg &  4.208295 \\\\\n13 &  RtGqdDBvvBCjcu5dUqwfzA &  x9RA\\_NPjcvQ8-EU28ppBqA &  4.196035 \\\\\n14 &  RtGqdDBvvBCjcu5dUqwfzA &  -rEA553gskIEHZteI5RExQ &  4.188387 \\\\\n19 &  RtGqdDBvvBCjcu5dUqwfzA &  aGEaKx1jbVyQxeMumKYMHQ &  4.091798 \\\\\n23 &  RtGqdDBvvBCjcu5dUqwfzA &  p1FElOKJUkFOpXx4dvTwEg &  4.069034 \\\\\n25 &  RtGqdDBvvBCjcu5dUqwfzA &  ZzK99Z5oWteQVlBtdm9r1w &  4.055292 \\\\\n31 &  RtGqdDBvvBCjcu5dUqwfzA &  lOA7o1IGnhx2YrbhYQHsLQ &  4.007474 \\\\\n32 &  RtGqdDBvvBCjcu5dUqwfzA &  j7VmHg9M-eWZuDw-p4AIaA &  3.987699 \\\\\n35 &  RtGqdDBvvBCjcu5dUqwfzA &  CCV\\_rtjUHKhJA0JLruwImw &  3.970877 \\\\\n38 &  RtGqdDBvvBCjcu5dUqwfzA &  t1RhVFdcOCA5Zrbzen0QXA &  3.955411 \\\\\n42 &  RtGqdDBvvBCjcu5dUqwfzA &  zux3l8BcJOdLcVlWg-1NGg &  3.938330 \\\\\n45 &  RtGqdDBvvBCjcu5dUqwfzA &  RdFbbpFq3ly-2R\\_CaLXXsg &  3.912

In [159]:
from scipy.stats import pearsonr

In [179]:
# Recommendation person-person:
# correlate the predicted-rating vector of one user with that of every user.
user = "ei7wfryXlvZ6OM9NK27cPQ"
n = len(df_p)  # derived from the data instead of the hard-coded 392
v1 = df_p[df_p.user == user]
v1 = v1["rating"].values[0]
df_pp = pd.DataFrame(index = range(n), columns = ["user_id"])
rat_pp = df_p["rating"].values
# pearsonr returns (coefficient, p-value); only the coefficient is kept
corr = [pearsonr(v1, v2)[0] for v2 in rat_pp]

In [180]:
# Pair every user id with its correlation to the selected user
df_pp = df_pp.assign(user_id=users, corr=corr)

In [181]:
# Most similar users first (the selected user itself comes out on top with corr = 1)
df_pp = df_pp.sort_values('corr', ascending=False)

In [182]:
df_pp.head(10)

Unnamed: 0,user_id,corr
329,ei7wfryXlvZ6OM9NK27cPQ,1.0
358,MIPPioT-rUJQwTu3_LjGpA,0.93957
47,GERRIWMslcP4-O6mLErPSQ,0.939338
366,z3FEHWGWSR5Z9ZVARQroHw,0.936907
36,ydbq7bGaMO_9M1GRO7VnAg,0.936231
90,yWzgaaze8dn-lss14MiZJw,0.934996
182,xNpb3uB9znmRq9UZ74UQ7w,0.934034
162,IP3DqE3W5Co2Ksam39Rasg,0.933455
140,9gytAeH_q3GecTAYA7ligA,0.932898
78,nSFHdQ4geNFThpM-rFxtbQ,0.932316


In [183]:
print(df_pp.head(10).to_latex())

\begin{tabular}{llr}
\toprule
{} &                 user\_id &      corr \\
\midrule
329 &  ei7wfryXlvZ6OM9NK27cPQ &  1.000000 \\
358 &  MIPPioT-rUJQwTu3\_LjGpA &  0.939570 \\
47  &  GERRIWMslcP4-O6mLErPSQ &  0.939338 \\
366 &  z3FEHWGWSR5Z9ZVARQroHw &  0.936907 \\
36  &  ydbq7bGaMO\_9M1GRO7VnAg &  0.936231 \\
90  &  yWzgaaze8dn-lss14MiZJw &  0.934996 \\
182 &  xNpb3uB9znmRq9UZ74UQ7w &  0.934034 \\
162 &  IP3DqE3W5Co2Ksam39Rasg &  0.933455 \\
140 &  9gytAeH\_q3GecTAYA7ligA &  0.932898 \\
78  &  nSFHdQ4geNFThpM-rFxtbQ &  0.932316 \\
\bottomrule
\end{tabular}



In [167]:
# Find competitors
# One row per business, holding the vector of ratings predicted for it across
# all users (in the row order of df_full).
n = len(business)  # derived from the data instead of the hard-coded 100
# Group once instead of scanning the whole of df_full again for every business;
# sort=False keeps the rows of each group in their original order.
predicted_by_business = df_full.groupby("business_id", sort=False)["rating"]
r = [predicted_by_business.get_group(bsn).values for bsn in business]

# NOTE: from here on `df_b` is this pandas table, no longer the dask business dataset.
df_b = pd.DataFrame({'business': business})
df_b['rating'] = r

In [168]:
# Correlate the predicted-rating vector of one business with that of every
# business; the most correlated ones are reported as possible competitors.
bsn = "niUrhHoR9leK0lr5moyySQ"
n = len(df_b)  # derived from the data instead of the hard-coded 100
v1 = df_b[df_b.business == bsn]
v1 = v1["rating"].values[0]
rat_bb = df_b["rating"].values
# pearsonr returns (coefficient, p-value); only the coefficient is kept
corr = [pearsonr(v1, v2)[0] for v2 in rat_bb]

# Built in one step, so no pre-sized frame has to agree with a manual row count
df_bb = pd.DataFrame({'business_id': business, 'corr': corr})
df_bb = df_bb.sort_values(by=['corr'], ascending=False)

In [184]:
df_bb.head(6)

Unnamed: 0,business_id,corr
50,niUrhHoR9leK0lr5moyySQ,1.0
13,j0qVvKZEBdlkQHqyYE1iTw,0.793876
88,vLbjVhNd73054UVdvOeg7Q,0.792549
47,aGEaKx1jbVyQxeMumKYMHQ,0.784495
65,y1BLVJUWINOSMLeDaXqXBQ,0.783197
19,EXfEzv1sg3smvXJo36hSmQ,0.782686


In [186]:
print(df_bb.head(6).to_latex())

\begin{tabular}{llr}
\toprule
{} &             business\_id &      corr \\
\midrule
50 &  niUrhHoR9leK0lr5moyySQ &  1.000000 \\
13 &  j0qVvKZEBdlkQHqyYE1iTw &  0.793876 \\
88 &  vLbjVhNd73054UVdvOeg7Q &  0.792549 \\
47 &  aGEaKx1jbVyQxeMumKYMHQ &  0.784495 \\
65 &  y1BLVJUWINOSMLeDaXqXBQ &  0.783197 \\
19 &  EXfEzv1sg3smvXJo36hSmQ &  0.782686 \\
\bottomrule
\end{tabular}

