# YELP Recommendation System

In [19]:
from fastai.collab import *
from time import time

In [2]:
path = Path('.')
path.ls()

[WindowsPath('.ipynb_checkpoints'),
 WindowsPath('Book Recommendation System.ipynb'),
 WindowsPath('business.json'),
 WindowsPath('competition.ipynb'),
 WindowsPath('models'),
 WindowsPath('Recommendation System.ipynb'),
 WindowsPath('review_train.json'),
 WindowsPath('yelp_train.csv'),
 WindowsPath('yelp_val.csv')]

In [3]:
train_df = pd.read_csv(path/'yelp_train.csv')
test_df = pd.read_csv(path/'yelp_val.csv')
# print(train_df.columns)
train_df["stars"]= train_df["stars"].astype(float)
test_df["stars"]= test_df["stars"].astype(float)
train_df.head()

Unnamed: 0,user_id,business_id,stars
0,vxR_YV0atFxIxfOnF9uHjQ,gTw6PENNGl68ZPUpYWP50A,5.0
1,o0p-iTC5yTBV5Yab_7es4g,iAuOpYDfOTuzQ6OPpEiGwA,4.0
2,-qj9ouN0bzMXz1vfEslG-A,5j7BnXXvlS69uLVHrY9Upw,2.0
3,E43QxgV87Ij6KxMCHcijKw,jUYp798M93Mpcjys_TTgsQ,5.0
4,T13IBpJITI32a1k41rc-tg,3MntE_HWbNNoyiLGxywjYA,5.0


In [4]:
data = CollabDataBunch.from_df(ratings=train_df, seed=5, valid_pct=0.1,user_name='user_id', 
                               item_name='business_id', rating_name='stars', test=test_df, bs=len(test_df))

In [5]:
data.show_batch()

user_id,business_id,target
4IVaASzU7yAYwPpLh8SMwg,sOgjPmk_1j1ldcVIBaVOjw,5.0
wjsyO5fBP7XXw-VyA-xqZA,vcxvQyAggPqxcHwvJXvjGg,5.0
Omj-5i3eVDGQ-mwocTyVrQ,m7PPpShEdfhtV-7NnPN1Nw,4.0
2oMkzQcRL7-d7URt3Xo_Xg,ZjivUlBDO-LblMIdQnT0OA,4.0
8KH0cKAXRaCf0pNG51waFw,iw9p7E4bMe1YW-APcFGjgQ,4.0


In [6]:
train_df.stars.min(), train_df.stars.max()

(1.0, 5.0)

In [7]:
def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())

In [29]:
wd=1e-4 
n_epochs=20
n_factors=50
lr=4e-3


## EmbeddingDotBias Model

In [30]:
learn = collab_learner(data, n_factors=n_factors, y_range=(1, 5), wd=wd, model_dir="./model/", path=path)

In [31]:
print(learn.summary())

Layer (type)         Output Shape         Param #    Trainable 
Embedding            [50]                 563,550    True      
______________________________________________________________________
Embedding            [50]                 1,231,000  True      
______________________________________________________________________
Embedding            [1]                  11,271     True      
______________________________________________________________________
Embedding            [1]                  24,620     True      
______________________________________________________________________

Total params: 1,830,441
Total trainable params: 1,830,441
Total non-trainable params: 0



In [32]:
# learn.lr_find()
# learn.lr_find2()
# learn.recorder.plot()

In [33]:
learn.fit(lr=lr, epochs=n_epochs, wd=wd)

epoch,train_loss,valid_loss,time
0,1.827066,1.805871,01:56
1,1.815598,1.784012,02:09
2,1.802888,1.762462,01:53
3,1.788999,1.741018,1:44:43
4,1.775209,1.719469,02:00
5,1.760422,1.697602,01:59
6,1.745056,1.675155,01:58
7,1.728511,1.651896,01:59
8,1.711292,1.627616,01:59
9,1.693156,1.602136,01:58


In [34]:
learn.save('yelp-dot-2.pkl')

In [35]:
(users,items), ratings = next(iter(data.test_dl))
preds = learn.model(users, items)

start_time = time()

test_time = time() - start_time
scores = pd.DataFrame(
    {"user_id": test_df["user_id"], "business_id": test_df["business_id"], "rating": preds}
)
scores = scores.sort_values(["user_id", "rating"], ascending=[True, False])

print("Took {} seconds for {} predictions.".format(test_time, len(test_df)))

Took 0.0 seconds for 142044 predictions.


In [36]:
y = test_df["stars"]
y1 = scores["rating"]

In [37]:
type(y), type(y1)

(pandas.core.series.Series, pandas.core.series.Series)

In [38]:
result =  math.sqrt(((y1-y)**2).mean())
print("RMSE: ", result)

RMSE:  1.1354478481345942


## EmbeddingNN Model

In [41]:
learn_nn = collab_learner(data, use_nn=True, emb_szs={'user_id': 40, 'business_id':40}, layers=[256, 128], y_range=(1, 5))

In [42]:
# learn.lr_find()

In [43]:
# learn.recorder.plot()

In [44]:
learn_nn.fit(lr=lr, epochs=n_epochs, wd=wd)

epoch,train_loss,valid_loss,time
0,1.926622,1.735182,02:05
1,1.718086,1.65822,02:10
2,1.583619,1.585022,02:09
3,1.475822,1.507602,02:09
4,1.385753,1.431786,02:08
5,1.30399,1.367105,02:04
6,1.227437,1.315652,02:07
7,1.154833,1.279102,01:59
8,1.0863,1.261212,02:01
9,1.022146,1.26249,02:00


In [45]:
learn_nn.save('yelp-nn-2.pkl')

In [46]:
# learner_nn = learn_nn.load("yelp-nn-1")
(users, items), ratings = next(iter(data.test_dl))
preds_nn = learn_nn.model(users, items)

In [47]:
scores_nn = pd.DataFrame(
    {"user_id": test_df["user_id"], "business_id": test_df["business_id"], "rating": preds_nn}
)
scores_nn = scores_nn.sort_values(["user_id"], ascending=True)

In [65]:
y = test_df.sort_values(["user_id"], ascending=True)["stars"]
y1 = pd.Series(preds_nn.detach().numpy().flatten())

In [66]:
type(preds_nn.detach().numpy().flatten()), preds_nn.detach().numpy().shape, preds_nn.detach().numpy().flatten()

(numpy.ndarray,
 (142044, 1),
 array([4.073997, 4.085638, 4.073994, 4.073997, ..., 4.066718, 4.073997, 4.07399 , 4.073997], dtype=float32))

In [67]:
type(y), type(y1)

(pandas.core.series.Series, pandas.core.series.Series)

In [68]:
result = rmse(y1, y)
print("NN RMSE: ", result)

AttributeError: 'Series' object has no attribute 'contiguous'

## Interpretation

In [None]:
learn = collab_learner(data, n_factors=40, y_range=(1, 5), wd=1e-1, model_dir="/tmp/model/", path="/tmp/")
learn.load('yelp-dot-1');

In [None]:
businesses = {}
businesses =  pd.read_json(path/"business.json", orient='columns',lines=True)
business_df =  pd.DataFrame.from_dict(businesses)
business_df.head()

In [None]:
g = train_df.groupby('business_id')['stars'].count()
top_business = g.sort_values(ascending=False).index.values[:1000]
top_business = top_business.astype(str)
top_business[:10]

In [None]:
top_businesses_with_name = []
for business in top_business:
    top_businesses_with_name.append(business_df[(business_df['business_id']==business)]['name'].iloc[0])
top_businesses_with_name = np.array(top_businesses_with_name)
top_businesses_with_name

## Restaurant Bias

In [None]:
learn.model

In [None]:
business_bias = learn.bias(top_business, is_item=True)

In [None]:
mean_ratings = train_df.groupby('business_id')['stars'].mean()
business_ratings = [(b, top_businesses_with_name[i], mean_ratings.loc[tb]) for i, (tb, b) in enumerate(zip(top_business, business_bias))]

In [None]:
item0 = lambda o:o[0]

In [None]:
sorted(business_ratings, key=item0)[:15]

In [None]:
sorted(business_ratings, key=item0, reverse=True)[:15]

In [None]:
business_w = learn.weight(top_business, is_item=True)
business_w.shape

In [None]:
business_pca = business_w.pca(3)
business_pca.shape

In [None]:
fac0,fac1,fac2 = business_pca.t()
business_comp = [(f, i) for f,i in zip(fac0, top_businesses_with_name)]

In [None]:
sorted(business_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(business_comp, key=itemgetter(0))[:10]

In [None]:
business_comp = [(f, i) for f,i in zip(fac1, top_businesses_with_name)]

In [None]:
sorted(business_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(business_comp, key=itemgetter(0))[:10]

In [None]:
idxs = np.random.choice(len(top_businesses_with_name), 50, replace=False)
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_businesses_with_name[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()