***

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Open Reduced JSON files 
business_df = pd.read_json('yelp_dataset/TX_restaurants.json')
reviews_df = pd.read_json('yelp_dataset/TX_reviews.json')
users_df = pd.read_json('yelp_dataset/TX_users.json')
print('Total number of business: ', len(business_df))
print('Total number of reviews: ', len(reviews_df))
print('Total number of users: ', len(users_df))


Total number of business:  3270
Total number of reviews:  62936
Total number of users:  798


In [None]:
business_df.head(3)

In [None]:
reviews_df.head(3)

In [None]:
users_df.head(3)

In [3]:
# Select onlt User ID, Restaurant ID & Text from the TX Reviews dataset.
reviews_Rdf = reviews_df[['business_id', 'user_id', 'stars', 'text']]
import string
from nltk.corpus import stopwords
stop = []
for word in stopwords.words('english'):
    s = [char for char in word if char not in string.punctuation]
    stop.append(''.join(s))
reviews_Rdf.head(3)    

Unnamed: 0,business_id,user_id,stars,text
0,75HV-KqCtn_oHeiLiGlO_w,wJHy7ZJG_EvLFQDRms5rXQ,4,Great place... delicious tapas and very nice w...
1,mrABhBpFvXTFrLiLiz286g,HHEXgBRDdkFSiDu1gDSdKg,4,"""I overheard you are out of the roast beef, ri..."
2,dQ80ktWSRj5UIk96bRUmWw,QdhsBITt2VqQzwNxRu4QjQ,4,We thought about going over to Brodie Oaks to ...


### Function to clean the reviews text.

In [4]:
def cleaning_text(mess):
    revmovePunctuation = [char for char in mess if char not in string.punctuation]
    revmovePunctuation = ''.join(revmovePunctuation)
    
    # Remove all stopwords
    return " ".join([word for word in revmovePunctuation.split() if word.lower() not in stop])
reviews_Rdf['text'] = reviews_Rdf['text'].apply(cleaning_text)

In [27]:
# Subsets DataFrames from the Original Reviews DataFrame. 
# 1) Users with Text. (62936 rows × 2 columns)
# 2) Business with text. (62936 rows × 2 columns)
user_text = reviews_Rdf[['user_id','text']]
business_text = reviews_Rdf[['business_id', 'text']]

In [28]:
# Check the DataFrame Users ID with Text.
user_text.head(2)

Unnamed: 0,user_id,text
0,wJHy7ZJG_EvLFQDRms5rXQ,Great place delicious tapas nice wine list try...
1,HHEXgBRDdkFSiDu1gDSdKg,overheard roast beef right Yes still make anot...


In [29]:
# Check the DataFrame Users ID with Text 
business_text.head(2)

Unnamed: 0,business_id,text
0,75HV-KqCtn_oHeiLiGlO_w,Great place delicious tapas nice wine list try...
1,mrABhBpFvXTFrLiLiz286g,overheard roast beef right Yes still make anot...


In [36]:
# Check all reviews from only One User 'QdhsBITt2VqQzwNxRu4QjQ'.

user_text[user_text['user_id']=='-OGWTHZng0QNhvc8dhIjyQ']['text']

1980     One favorite sushi spots Austin Never crowded ...
2830     3 12 Stars pretty good loved atmosphere waitre...
3582     Im love Thai Cuisine 5 reasons 1 Amazing pad t...
4466     feeling Ill back many many happy hours 47 pm d...
4765     happened place Several years ago Zen staple wa...
5914     first place go return Austin town 888 Since 2 ...
10218    first highend Austin sushi restaurant Ive sinc...
10588    go Pluckers know going get Consistently OK foo...
11520    Eh alright gotten sushi lunch specials go conv...
12051    lived Austin 3 12 years never Torchys Boy mist...
12698    Kome favorite restaurant Ive actually waited m...
16043    Possibly best bakery Austin croissants muffins...
16727    Thank god Yelp cant believe actually consideri...
16847    Epoch rocks 24 hour free wifi plenty seating g...
17707    agree people complained location almost parkin...
18618    Ive actually fried avocado taco restaurant bum...
24102    hate write 1 star review Veggie Heaven used on.

In [32]:
a= user_text[user_text['user_id']=='-OGWTHZng0QNhvc8dhIjyQ']['text']
a.count()

51

In [17]:
# Compile All corresponding Reviews to each User and to each Business.
user_text = user_text.groupby('user_id').agg({'text': ' '.join})
business_text = business_text.groupby('business_id').agg({'text': ' '.join})

In [22]:
# Combine review of User 'QdhsBITt2VqQzwNxRu4QjQ'

user_text.loc['-OGWTHZng0QNhvc8dhIjyQ']['text']



### Vectorization with Term Frequency Inverse Doc. Frequency (TF-IDF) to extract the features from the text.

In [18]:
# Sparse Matrix User Text vectorizer (798 x 5000 )
user_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
user_vectors = user_vectorizer.fit_transform(user_text['text'])

# Sparse mMtrix Business Text vectorizer (779 x 5000)
business_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
business_vectors = business_vectorizer.fit_transform(business_text['text'])

### Users / Businesses with Stars Matrix.

In [19]:
user_bussiness_stars = pd.pivot_table(reviews_Rdf, values='stars', index=['user_id'], columns=['business_id'])
user_bussiness_stars

business_id,-4ofMtrD7pSpZIX5pnDkig,-85pDrVcAdzNK55bFpintg,-BBSLCjzw3i2PHuwJ_dabA,-Ce8p148xb0-4dv_3zwm3A,-WA4tNsXZq5sxcGw8a5_IQ,-_GnwXmzC3DXsHR9nyaC2g,-jQOsyXnAMKu6ND7ongXFg,-qjkQHH-5O8BAztc6udOuw,-waa6lH9SmAkaKiLx74_FA,0-Y_m4TJZufHBVozbMOqbA,...,z3JW-nsdfC8ijst_PSeMIA,z4fQL2IHUR4ENonH7A_9tQ,zDVjN3dC9EfQno21m3O8Pg,zE0EGIsOdAO83xE3CypN-A,zFaHweOJ40jjtvpGTjlspw,zM98ZSIJyuBQabyYornLpw,zVILoCmmDSgcM5bc2BKVWg,zZIWHmITwqaC8OhnPkjEIQ,zerPYZ-lid5CG27a1-7YMw,zyHMtStYlKG67WRprp6GZQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0MQ4webH2uc1ZAsGsNENg,,,,,,1.0,,,4.0,2.0,...,,2.0,,,,,,,,
-2sNTzGyci98Mp9PmPRg8w,,,,,,,,,2.0,,...,,,,,5.0,,,,,3.0
-4RH83ibNRpwj8NlBwtMdQ,,,,,,,,,,,...,,,,,,,,,,
-8QoOIfvwwxJ4sY201WP5A,,,,,,,,,,,...,,,,,,,,,,
-OGWTHZng0QNhvc8dhIjyQ,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zgLUWGBKrSoIIH3E2XLvNQ,,,2.0,,,,,,,,...,,,,,,,,,,
zlL_PbFTXjrGXhTbPsjmlQ,,,,,3.5,,,,,5.0,...,,,,,,,,,,
zonT-KQFJiejFNUrs2KR0Q,,,,,,2.0,,,,,...,,,,,5.0,,,,,
zqMc0O7vDj82Sz5Fler1EA,,,,5.0,,,,5.0,,,...,,,,,,,,,,4.0


In [20]:
P = pd.DataFrame(user_vectors.toarray(), index=user_text.index, columns=user_vectorizer.get_feature_names())
Q = pd.DataFrame(business_vectors.toarray(), index=business_text.index, columns=business_vectorizer.get_feature_names())

In [37]:
P.shape, Q.shape

((798, 5000), (779, 5000))

In [None]:
P.head(2)

In [None]:
Q.head(2)

In [21]:
def matrix_factorization(R, P, Q, steps=50, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P,Q

* Takes __23 min 55s__ with the following parameters: __Steps = 05, gamma = 0.020 ,lamda=0.02__
* Takes __35 min 14s__ with the following parameters: __Steps = 10, gamma = 0.020 ,lamda=0.02__
* Takes __50 min 25s__ with the following parameters: __Steps = 25, gamma = 0.001 ,lamda=0.02__


### To Train New Data ONLY

In [None]:
#%%time
#P, Q = matrix_factorization(user_bussiness_stars, P, Q, steps=25, gamma=0.001,lamda=0.02)

In [None]:
# STORE P, Q and vectorizer in pickle file
import pickle
output = open('yelp_dataset/yelp_recommendation_model_LFM_v3.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(user_vectorizer,output)
output.close()

### To OPEN saved Model

In [24]:
# LOAD P, Q and vectorizer in pickle file
import pickle
input = open('yelp_dataset/yelp_recommendation_model_LFM_v3.pkl','rb')
P = pickle.load(input)
Q = pickle.load(input)
userid_vectorizer = pickle.load(input)
input.close()

### Testing the Moded: 
* Input: Preferences in Text form.  
* Output: Top Five Recommendations. 

In [25]:
words = "Best sushi, lemon aioli"
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(cleaning_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=userid_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:5]

for i in topRecommendations.index:
    print(business_df[business_df['business_id']==i]['name'].iloc[0])
    print(business_df[business_df['business_id']==i]['categories'].iloc[0])
    print(str(business_df[business_df['business_id']==i]['stars'].iloc[0])+ ' '+str(business_df[business_df['business_id']==i]['review_count'].iloc[0]))
    print('')

Bess Bistro
Cajun/Creole, Southern, Restaurants, American (New), French
3.5 520

Ego's
Dive Bars, Nightlife, Lounges, Karaoke, Bars, Pizza, Restaurants
3.5 249

Austin Diner
Restaurants, Diners
3.0 288

The Common Interest
Karaoke, American (New), American (Traditional), Nightlife, Sports Bars, Restaurants, Bars
4.0 157

Eberly
Restaurants, Breakfast & Brunch, Nightlife, Bars, Pubs, American (Traditional)
4.0 467



In [11]:
"""
Getting user_id and most relevant info for Kevan (Kevan is Not on users_train Set.)
"""
user_X = users_df[users_df['name']=='Kevan']
user_X

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
473,-OGWTHZng0QNhvc8dhIjyQ,Kevan,197,2007-07-14 17:16:27,329,85,97,201120122015201620172018,"s8nkbQAlrVPfg1u4anGKtw, o4ZZnp8ugpfhzLAa6jFenA...",4,...,5,0,0,0,6,11,6,6,3,2


***