# YELP Recommendation System

In [1]:
from fastai.collab import *
from time import time

In [2]:
path = Path('.')
# path.ls()

In [3]:
train_df = pd.read_csv(path/'yelp_train.csv')
test_df = pd.read_csv(path/'yelp_val.csv')
# print(train_df.columns)
train_df["stars"]= train_df["stars"].astype(float)
test_df["stars"]= test_df["stars"].astype(float)
train_df.head()

Unnamed: 0,user_id,business_id,stars
0,vxR_YV0atFxIxfOnF9uHjQ,gTw6PENNGl68ZPUpYWP50A,5.0
1,o0p-iTC5yTBV5Yab_7es4g,iAuOpYDfOTuzQ6OPpEiGwA,4.0
2,-qj9ouN0bzMXz1vfEslG-A,5j7BnXXvlS69uLVHrY9Upw,2.0
3,E43QxgV87Ij6KxMCHcijKw,jUYp798M93Mpcjys_TTgsQ,5.0
4,T13IBpJITI32a1k41rc-tg,3MntE_HWbNNoyiLGxywjYA,5.0


In [4]:
data = CollabDataBunch.from_df(ratings=train_df, seed=5, valid_pct=0.1,user_name='user_id', 
                               item_name='business_id', rating_name='stars', test=test_df, bs=len(test_df))

In [5]:
# data.show_batch()

In [6]:
train_df.stars.min(), train_df.stars.max()

(1.0, 5.0)

In [7]:
def my_rmse(x,y):
    return math.sqrt(((x-y)**2).mean())

In [8]:
wd=1e-4 
n_epochs=30
n_factors=40
lr=4e-3 #calculated using lr_find() method

## EmbeddingDotBias Model

In [13]:
learn = collab_learner(data, n_factors=n_factors, y_range=(1, 5), wd=wd, model_dir="./model/", path=path)

In [14]:
print(learn.summary())

Layer (type)         Output Shape         Param #    Trainable 
Embedding            [40]                 450,840    True      
______________________________________________________________________
Embedding            [40]                 984,800    True      
______________________________________________________________________
Embedding            [1]                  11,271     True      
______________________________________________________________________
Embedding            [1]                  24,620     True      
______________________________________________________________________

Total params: 1,471,531
Total trainable params: 1,471,531
Total non-trainable params: 0



In [15]:
# learn.lr_find()
# learn.lr_find2()
# learn.recorder.plot()

In [16]:
learn.fit(lr=lr, epochs=n_epochs, wd=wd)

epoch,train_loss,valid_loss,time
0,1.829619,1.806162,02:57
1,1.816701,1.784372,02:53
2,1.804174,1.7629,02:52
3,1.79041,1.741606,02:51
4,1.777435,1.720321,02:52
5,1.763261,1.698885,02:50
6,1.747941,1.677105,02:55
7,1.73275,1.65478,02:52
8,1.716668,1.631731,02:52
9,1.699656,1.607805,02:43


In [17]:
learn.save('yelp-dot-4-5.pkl')

In [18]:
(users,items), ratings = next(iter(data.test_dl))
preds = learn.model(users, items)

start_time = time()

test_time = time() - start_time
scores = pd.DataFrame(
    {"user_id": test_df["user_id"], "business_id": test_df["business_id"], "rating": preds}
)
scores = scores.sort_values(["user_id", "rating"], ascending=[True, False])

print("Took {} seconds for {} predictions.".format(test_time, len(test_df)))

Took 0.0009970664978027344 seconds for 142044 predictions.


In [19]:
y = test_df["stars"]
y1 = scores["rating"]

In [20]:
# type(y), type(y1)

In [21]:
result =  math.sqrt(((y1-y)**2).mean())
print("RMSE: ", result)

RMSE:  1.0661550882163424


## Interpretation

In [None]:
learn = collab_learner(data, n_factors=40, y_range=(1, 5), wd=1e-1, model_dir="/tmp/model/", path="/tmp/")
learn.load('yelp-dot-1');

In [None]:
businesses = {}
businesses =  pd.read_json(path/"business.json", orient='columns',lines=True)
business_df =  pd.DataFrame.from_dict(businesses)
business_df.head()

In [None]:
g = train_df.groupby('business_id')['stars'].count()
top_business = g.sort_values(ascending=False).index.values[:1000]
top_business = top_business.astype(str)
top_business[:10]

In [None]:
top_businesses_with_name = []
for business in top_business:
    top_businesses_with_name.append(business_df[(business_df['business_id']==business)]['name'].iloc[0])
top_businesses_with_name = np.array(top_businesses_with_name)
top_businesses_with_name

## Restaurant Bias

In [None]:
learn.model

In [None]:
business_bias = learn.bias(top_business, is_item=True)

In [None]:
mean_ratings = train_df.groupby('business_id')['stars'].mean()
business_ratings = [(b, top_businesses_with_name[i], mean_ratings.loc[tb]) for i, (tb, b) in enumerate(zip(top_business, business_bias))]

In [None]:
item0 = lambda o:o[0]

In [None]:
sorted(business_ratings, key=item0)[:15]

In [None]:
sorted(business_ratings, key=item0, reverse=True)[:15]

In [None]:
business_w = learn.weight(top_business, is_item=True)
business_w.shape

In [None]:
business_pca = business_w.pca(3)
business_pca.shape

In [None]:
fac0,fac1,fac2 = business_pca.t()
business_comp = [(f, i) for f,i in zip(fac0, top_businesses_with_name)]

In [None]:
sorted(business_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(business_comp, key=itemgetter(0))[:10]

In [None]:
business_comp = [(f, i) for f,i in zip(fac1, top_businesses_with_name)]

In [None]:
sorted(business_comp, key=itemgetter(0), reverse=True)[:10]

In [None]:
sorted(business_comp, key=itemgetter(0))[:10]

In [None]:
idxs = np.random.choice(len(top_businesses_with_name), 50, replace=False)
idxs = list(range(50))
X = fac0[idxs]
Y = fac2[idxs]
plt.figure(figsize=(15,15))
plt.scatter(X, Y)
for i, x, y in zip(top_businesses_with_name[idxs], X, Y):
    plt.text(x,y,i, color=np.random.rand(3)*0.7, fontsize=11)
plt.show()