***

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Open Reduced JSON files 
business_df = pd.read_json('yelp_dataset/TX_restaurants.json')
reviews_df = pd.read_json('yelp_dataset/TX_reviews.json')
users_df = pd.read_json('yelp_dataset/TX_users.json')
print('Total number of business: ', len(business_df))
print('Total number of reviews: ', len(reviews_df))
print('Total number of users: ', len(users_df))


Total number of business:  3270
Total number of reviews:  62936
Total number of users:  798


In [3]:
business_df.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,NRPemqVb4qpWFF0Avq_6OQ,Eurasia Sushi Bar & Seafood,"7101 W Hwy 71, Ste C-13",Austin,TX,78735,30.234533,-97.877262,4.5,395,1,"{'Ambience': '{'touristy': False, 'hipster': F...","Bars, Nightlife, Cocktail Bars, Seafood, Resta...","{'Monday': '0:0-0:0', 'Tuesday': '11:0-22:0', ..."
1,bRsDZ44CD3uhGnRY3NeQhQ,Wendy's,6247 Mcneil Drive,Austin,TX,78729,30.441875,-97.746581,2.0,46,1,"{'RestaurantsPriceRange2': '1', 'OutdoorSeatin...","Fast Food, Restaurants, Burgers","{'Monday': '6:30-1:0', 'Tuesday': '6:30-1:0', ..."
2,Pk4ZwXwUU50BDn5gqw_rKg,Johnny Carino's,9500 S IH-35 Service Rd,Austin,TX,78748,30.162081,-97.789132,3.0,136,1,"{'RestaurantsGoodForGroups': 'True', 'Business...","Italian, Salad, Pizza, Nightlife, Restaurants,...","{'Monday': '11:0-21:30', 'Tuesday': '11:0-21:3..."


In [4]:
reviews_df.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,m0MvtD8fjZVPWgI3y_wxBA,wJHy7ZJG_EvLFQDRms5rXQ,75HV-KqCtn_oHeiLiGlO_w,4,0,0,0,Great place... delicious tapas and very nice w...,2013-10-24 02:29:10
1,TSmmEz82y_74rL4XzgbGbg,HHEXgBRDdkFSiDu1gDSdKg,mrABhBpFvXTFrLiLiz286g,4,8,13,9,"""I overheard you are out of the roast beef, ri...",2010-08-20 23:31:16
2,xdhhuHg0xvsjy7aou3ArBg,QdhsBITt2VqQzwNxRu4QjQ,dQ80ktWSRj5UIk96bRUmWw,4,1,0,0,We thought about going over to Brodie Oaks to ...,2010-10-05 20:27:15


In [5]:
users_df.head(3)

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,-8QoOIfvwwxJ4sY201WP5A,Antoinette,288,2007-08-04 20:21:09,752,220,306,2012201320142015201620172018,"vePby1OhpTiQiX75XrN97A, UG8cewYtZdep2hzSekIqYg...",25,...,4,6,2,0,12,32,24,24,11,2
1,CQUDh80m48xnzUkx-X5NAw,David,4205,2008-12-29 21:03:01,21059,8906,14640,200920102011201220132014,"WnJlu4mpNtVxNQ2SM6GmvQ, 3BqKBuvY09lissdY_soI6w...",575,...,149,140,66,287,1034,1529,1572,1572,492,180
2,LmqbL60LuUBRlEhe6jIlJw,Jacob,260,2009-02-06 17:10:44,845,479,625,201020112012,"DOj9NanlJP3xntULCy5Uow, FmNUY7H2vZ8ZgzmZ5SRx1A...",19,...,10,2,1,7,15,41,70,70,18,3


In [6]:
# Select onlt User ID, Restaurant ID & Text from the TX Reviews dataset.
reviews_Rdf = reviews_df[['business_id', 'user_id', 'stars', 'text']]
import string
from nltk.corpus import stopwords
stop = []
for word in stopwords.words('english'):
    s = [char for char in word if char not in string.punctuation]
    stop.append(''.join(s))
reviews_Rdf.head(3)    

Unnamed: 0,business_id,user_id,stars,text
0,75HV-KqCtn_oHeiLiGlO_w,wJHy7ZJG_EvLFQDRms5rXQ,4,Great place... delicious tapas and very nice w...
1,mrABhBpFvXTFrLiLiz286g,HHEXgBRDdkFSiDu1gDSdKg,4,"""I overheard you are out of the roast beef, ri..."
2,dQ80ktWSRj5UIk96bRUmWw,QdhsBITt2VqQzwNxRu4QjQ,4,We thought about going over to Brodie Oaks to ...


### Function to clean the reviews text.

In [7]:
def cleaning_text(mess):
    revmovePunctuation = [char for char in mess if char not in string.punctuation]
    revmovePunctuation = ''.join(revmovePunctuation)
    
    # Remove all stopwords
    return " ".join([word for word in revmovePunctuation.split() if word.lower() not in stop])
reviews_Rdf['text'] = reviews_Rdf['text'].apply(cleaning_text)

In [16]:
# Subsets DataFrames from the Original Reviews DataFrame. 
# 1) Users with Text. (62936 rows × 2 columns)
# 2) Business with text. (62936 rows × 2 columns)
user_text = reviews_Rdf[['user_id','text']]
business_text = reviews_Rdf[['business_id', 'text']]

In [17]:
# Check the DataFrame Users ID with Text.
user_text.head(2)

Unnamed: 0,user_id,text
0,wJHy7ZJG_EvLFQDRms5rXQ,Great place delicious tapas nice wine list try...
1,HHEXgBRDdkFSiDu1gDSdKg,overheard roast beef right Yes still make anot...


In [18]:
# Check the DataFrame Users ID with Text 
business_text.head(2)

Unnamed: 0,business_id,text
0,75HV-KqCtn_oHeiLiGlO_w,Great place delicious tapas nice wine list try...
1,mrABhBpFvXTFrLiLiz286g,overheard roast beef right Yes still make anot...


In [19]:
# Check all reviews from only One User 'QdhsBITt2VqQzwNxRu4QjQ'.

user_text[user_text['user_id']=='QdhsBITt2VqQzwNxRu4QjQ']['text']

2        thought going Brodie Oaks see new Bombay Bistr...
431      like especially like Ranch 616 much Theyve sli...
455      made first trip dinner South Congress Cafe rec...
531      heard getting Gus Fried Chicken Another fried ...
829      Lil Brat burger provolone sweet peppers lettuc...
                               ...                        
60555    used love Magnolia aged well maybe menu pared ...
60703    realize long since Id Wheatsville looks like r...
60905    friend told Takoba felt compelled check lunch ...
61103    Green Mesquite covered BBQ chili burgers menu ...
62091    place made feel like old man hip youth service...
Name: text, Length: 160, dtype: object

In [20]:
# Compile All corresponding Reviews to each User and to each Business.
user_text = user_text.groupby('user_id').agg({'text': ' '.join})
business_text = business_text.groupby('business_id').agg({'text': ' '.join})

In [21]:
# Combine review of User 'QdhsBITt2VqQzwNxRu4QjQ'

#user_text.loc['QdhsBITt2VqQzwNxRu4QjQ']['text']

### Vectorization with Term Frequency Inverse Doc. Frequency (TF-IDF) to extract the features from the text.

In [22]:
# Sparse Matrix User Text vectorizer (798 x 5000 )
user_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
user_vectors = user_vectorizer.fit_transform(user_text['text'])

# Sparse mMtrix Business Text vectorizer (779 x 5000)
business_vectorizer = TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=5000)
business_vectors = business_vectorizer.fit_transform(business_text['text'])

### Users / Businesses with Stars Matrix.

In [23]:
user_bussiness_stars = pd.pivot_table(reviews_Rdf, values='stars', index=['user_id'], columns=['business_id'])
user_bussiness_stars

business_id,-4ofMtrD7pSpZIX5pnDkig,-85pDrVcAdzNK55bFpintg,-BBSLCjzw3i2PHuwJ_dabA,-Ce8p148xb0-4dv_3zwm3A,-WA4tNsXZq5sxcGw8a5_IQ,-_GnwXmzC3DXsHR9nyaC2g,-jQOsyXnAMKu6ND7ongXFg,-qjkQHH-5O8BAztc6udOuw,-waa6lH9SmAkaKiLx74_FA,0-Y_m4TJZufHBVozbMOqbA,...,z3JW-nsdfC8ijst_PSeMIA,z4fQL2IHUR4ENonH7A_9tQ,zDVjN3dC9EfQno21m3O8Pg,zE0EGIsOdAO83xE3CypN-A,zFaHweOJ40jjtvpGTjlspw,zM98ZSIJyuBQabyYornLpw,zVILoCmmDSgcM5bc2BKVWg,zZIWHmITwqaC8OhnPkjEIQ,zerPYZ-lid5CG27a1-7YMw,zyHMtStYlKG67WRprp6GZQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0MQ4webH2uc1ZAsGsNENg,,,,,,1.0,,,4.0,2.0,...,,2.0,,,,,,,,
-2sNTzGyci98Mp9PmPRg8w,,,,,,,,,2.0,,...,,,,,5.0,,,,,3.0
-4RH83ibNRpwj8NlBwtMdQ,,,,,,,,,,,...,,,,,,,,,,
-8QoOIfvwwxJ4sY201WP5A,,,,,,,,,,,...,,,,,,,,,,
-OGWTHZng0QNhvc8dhIjyQ,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zgLUWGBKrSoIIH3E2XLvNQ,,,2.0,,,,,,,,...,,,,,,,,,,
zlL_PbFTXjrGXhTbPsjmlQ,,,,,3.5,,,,,5.0,...,,,,,,,,,,
zonT-KQFJiejFNUrs2KR0Q,,,,,,2.0,,,,,...,,,,,5.0,,,,,
zqMc0O7vDj82Sz5Fler1EA,,,,5.0,,,,5.0,,,...,,,,,,,,,,4.0


In [24]:
P = pd.DataFrame(user_vectors.toarray(), index=user_text.index, columns=user_vectorizer.get_feature_names())
Q = pd.DataFrame(business_vectors.toarray(), index=business_text.index, columns=business_vectorizer.get_feature_names())

In [25]:
P.shape, Q.shape

((798, 5000), (779, 5000))

In [26]:
P.head(2)

Unnamed: 0_level_0,1,10,100,1000,1015,10am,10pm,11,1130,11am,...,yummy,yup,z,zee,zen,zero,zocalo,zone,ztejas,zucchini
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0MQ4webH2uc1ZAsGsNENg,0.029415,0.020075,0.017485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.06935,0.0,0.0,0.0,0.011886,0.0,0.0,0.0,0.0,0.0
-2sNTzGyci98Mp9PmPRg8w,0.026098,0.009698,0.010055,0.00769,0.0,0.0,0.0,0.010304,0.005585,0.0,...,0.00092,0.0,0.0,0.0,0.002051,0.007608,0.002212,0.00759,0.0,0.004496


In [27]:
Q.head(2)

Unnamed: 0_level_0,1,10,100,1000,1015,10am,10pm,11,1130,11am,...,yummy,yup,z,zee,zen,zero,zocalo,zone,ztejas,zucchini
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-4ofMtrD7pSpZIX5pnDkig,0.008234,0.002091,0.001526,0.0,0.0,0.0,0.003165,0.0,0.0,0.0,...,0.00539,0.0,0.004052,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-85pDrVcAdzNK55bFpintg,0.013003,0.010564,0.0,0.0,0.0,0.0,0.0,0.004311,0.0,0.0,...,0.005447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
def matrix_factorization(R, P, Q, steps=50, gamma=0.001,lamda=0.02):
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e= e + pow(R.loc[i,j]-np.dot(P.loc[i],Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        
    return P,Q

* Takes __23 min 55s__ with the following parameters: __Steps = 05, gamma = 0.020 ,lamda=0.02__
* Takes __35 min 14s__ with the following parameters: __Steps = 10, gamma = 0.020 ,lamda=0.02__
* Takes __50 min 25s__ with the following parameters: __Steps = 25, gamma = 0.001 ,lamda=0.02__


### To Train New Data ONLY

In [None]:
#%%time
#P, Q = matrix_factorization(user_bussiness_stars, P, Q, steps=25, gamma=0.001,lamda=0.02)

In [None]:
# STORE P, Q and vectorizer in pickle file
import pickle
output = open('yelp_dataset/yelp_recommendation_model_LFM_v3.pkl', 'wb')
pickle.dump(P,output)
pickle.dump(Q,output)
pickle.dump(user_vectorizer,output)
output.close()

### To OPEN saved Model

In [30]:
# LOAD P, Q and vectorizer in pickle file
import pickle
input = open('yelp_dataset/yelp_recommendation_model_LFM_v3.pkl','rb')
P = pickle.load(input)
Q = pickle.load(input)
userid_vectorizer = pickle.load(input)
input.close()

### Testing the Moded: 
* Input: Preferences in Text form.  
* Output: Top Five Recommendations. 

In [33]:
words = "Good wine selection with a fire place, best dessert menu"
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(cleaning_text)
test_vectors = userid_vectorizer.transform(test_df['text'])
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=userid_vectorizer.get_feature_names())

predictItemRating=pd.DataFrame(np.dot(test_v_df.loc[0],Q.T),index=Q.index,columns=['Rating'])
topRecommendations=pd.DataFrame.sort_values(predictItemRating,['Rating'],ascending=[0])[:5]

for i in topRecommendations.index:
    print(business_df[business_df['business_id']==i]['name'].iloc[0])
    print(business_df[business_df['business_id']==i]['categories'].iloc[0])
    print(str(business_df[business_df['business_id']==i]['stars'].iloc[0])+ ' '+str(business_df[business_df['business_id']==i]['review_count'].iloc[0]))
    print('')

Bess Bistro
Cajun/Creole, Southern, Restaurants, American (New), French
3.5 520

VOX Table
Tapas/Small Plates, Bars, French, Restaurants, Nightlife, American (New), Cocktail Bars
4.5 418

Lenoir
French, American (New), African, Nightlife, Bars, Wine Bars, Restaurants
4.0 462

Qui
Restaurants, Japanese, Sushi Bars, American (New)
4.0 402

Dai Due
Food, Grocery, Bars, Restaurants, Cocktail Bars, Nightlife, Breakfast & Brunch, Butcher, American (New), Local Flavor, Desserts
4.0 538



In [35]:
"""
Getting user_id and most relevant info for Kevan (Kevan is Not on users_train Set.)
"""
user_X = users_df[users_df['name']=='Kevan']
user_X

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
473,-OGWTHZng0QNhvc8dhIjyQ,Kevan,197,2007-07-14 17:16:27,329,85,97,201120122015201620172018,"s8nkbQAlrVPfg1u4anGKtw, o4ZZnp8ugpfhzLAa6jFenA...",4,...,5,0,0,0,6,11,6,6,3,2


***