# YELP Recommendation System

In [1]:
from fastai.collab import *
from time import time

In [2]:
path = Path('.')
# path.ls()

In [3]:
train_df = pd.read_csv(path/'yelp_train.csv')
test_df = pd.read_csv(path/'yelp_val.csv')
# print(train_df.columns)
train_df["stars"]= train_df["stars"].astype(float)
test_df["stars"]= test_df["stars"].astype(float)
train_df.head()

Unnamed: 0,user_id,business_id,stars
0,vxR_YV0atFxIxfOnF9uHjQ,gTw6PENNGl68ZPUpYWP50A,5.0
1,o0p-iTC5yTBV5Yab_7es4g,iAuOpYDfOTuzQ6OPpEiGwA,4.0
2,-qj9ouN0bzMXz1vfEslG-A,5j7BnXXvlS69uLVHrY9Upw,2.0
3,E43QxgV87Ij6KxMCHcijKw,jUYp798M93Mpcjys_TTgsQ,5.0
4,T13IBpJITI32a1k41rc-tg,3MntE_HWbNNoyiLGxywjYA,5.0


In [4]:
data = CollabDataBunch.from_df(ratings=train_df, seed=5, valid_pct=0.1,user_name='user_id', 
                               item_name='business_id', rating_name='stars', test=test_df, bs=len(test_df))

In [5]:
# data.show_batch()

In [6]:
train_df.stars.min(), train_df.stars.max()

(1.0, 5.0)

In [7]:
def my_rmse(x,y):
    return math.sqrt(((x-y)**2).mean())

In [8]:
wd=1e-4 
n_epochs=50
n_factors=50
lr=4e-3

## EmbeddingNN Model

In [9]:
learn_nn = collab_learner(data, use_nn=True, emb_szs={'user_id': 10, 'business_id':10}, layers=[256, 128], y_range=(1, 5.5))

In [10]:
print(learn_nn.summary())

Layer (type)         Output Shape         Param #    Trainable 
Embedding            [10]                 112,710    True      
______________________________________________________________________
Embedding            [10]                 246,200    True      
______________________________________________________________________
Dropout              [20]                 0          False     
______________________________________________________________________
Linear               [256]                5,376      True      
______________________________________________________________________
ReLU                 [256]                0          False     
______________________________________________________________________
BatchNorm1d          [256]                512        True      
______________________________________________________________________
Linear               [128]                32,896     True      
______________________________________________________________

In [11]:
# learn.lr_find()

In [12]:
# learn.recorder.plot()

In [13]:
learn_nn.fit(lr=lr, epochs=n_epochs, wd=wd)

epoch,train_loss,valid_loss,time
0,1.65788,1.412518,02:14
1,1.496368,1.376096,02:14
2,1.388075,1.347229,02:23
3,1.305948,1.32134,02:18
4,1.236931,1.297149,02:21
5,1.178544,1.277925,02:31
6,1.129647,1.265255,02:36
7,1.087245,1.258946,02:32
8,1.050853,1.257989,02:34
9,1.019215,1.259803,02:37


In [15]:
learn_nn.save('yelp-nn-7.pkl')

In [16]:
# learner_nn = learn_nn.load("yelp-nn-1")
(users, items), ratings = next(iter(data.test_dl))
preds_nn = learn_nn.model(users, items)

In [17]:
scores_nn = pd.DataFrame(
    {"user_id": test_df["user_id"], "business_id": test_df["business_id"], "rating": preds_nn}
)
scores_nn = scores_nn.sort_values(["user_id"], ascending=True)

In [18]:
y = test_df.sort_values(["user_id"], ascending=True)["stars"]
y1 = pd.Series(preds_nn.detach().numpy().flatten())

In [19]:
# type(preds_nn.detach().numpy().flatten()), preds_nn.detach().numpy().shape, preds_nn.detach().numpy().flatten()

In [20]:
# type(y), type(y1)

In [21]:
result = my_rmse(y1, y)
print("NN RMSE: ", result)

NN RMSE:  1.2637447468932854


## Interpretation

In [None]:
learn = collab_learner(data, n_factors=40, y_range=(1, 5), wd=1e-1, model_dir="/tmp/model/", path="/tmp/")
learn.load('yelp-dot-1');

In [None]:
businesses = {}
businesses =  pd.read_json(path/"business.json", orient='columns',lines=True)
business_df =  pd.DataFrame.from_dict(businesses)
business_df.head()

In [None]:
g = train_df.groupby('business_id')['stars'].count()
top_business = g.sort_values(ascending=False).index.values[:1000]
top_business = top_business.astype(str)
top_business[:10]

In [None]:
top_businesses_with_name = []
for business in top_business:
    top_businesses_with_name.append(business_df[(business_df['business_id']==business)]['name'].iloc[0])
top_businesses_with_name = np.array(top_businesses_with_name)
top_businesses_with_name

## Restaurant Bias

In [None]:
learn.model

In [None]:
business_bias = learn.bias(top_business, is_item=True)

In [None]:
mean_ratings = train_df.groupby('business_id')['stars'].mean()
business_ratings = [(b, top_businesses_with_name[i], mean_ratings.loc[tb]) for i, (tb, b) in enumerate(zip(top_business, business_bias))]

In [None]:
item0 = lambda o:o[0]

In [None]:
sorted(business_ratings, key=item0)[:15]

In [None]:
sorted(business_ratings, key=item0, reverse=True)[:15]

In [None]:
business_w = learn.weight(top_business, is_item=True)
business_w.shape

In [None]:
business_pca = business_w.pca(3)
business_pca.shape

In [None]:
fac0,fac1,fac2 = business_pca.t()
business_comp = [(f, i) for f,i in zip(fac0, top_businesses_with_name)]

In [None]:
sorted(business_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(business_comp, key=itemgetter(0))[:10]

In [None]:
business_comp = [(f, i) for f,i in zip(fac1, top_businesses_with_name)]

In [None]:
sorted(business_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(business_comp, key=itemgetter(0))[:10]

In [None]:
idxs = np.random.choice(len(top_businesses_with_name), 50, replace=False)
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_businesses_with_name[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()