# Latent Factor Model
### Libraries

In [3]:
%pylab inline
from __future__ import division
import numpy as np
import pandas as pd
import time
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import WordPunctTokenizer
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


### Load Data

In [4]:
# Load the three reduced Yelp tables (restaurants, reviews, users) from JSON.
# Paths are relative to the notebook's working directory.
business_df = pd.read_json('yelp_dataset/TX_restaurants.json')
reviews_df = pd.read_json('yelp_dataset/TX_reviews.json')
users_df = pd.read_json('yelp_dataset/TX_users.json')
# Print the row count of each table as a quick sanity check on the load.
print('Total number of business: ', len(business_df))
print('Total number of reviews: ', len(reviews_df))
print('Total number of users: ', len(users_df))


Total number of business:  3270
Total number of reviews:  62936
Total number of users:  798


In [5]:
# Keep only the columns used downstream: business ID, user ID, star rating
# and review text.
# `.copy()` makes the result an independent frame, so the later rewrite of its
# `text` column is not a write into a slice of the loaded table (pandas'
# SettingWithCopy case, which the global warnings filter would otherwise hide).
reviews_df = reviews_df[['business_id', 'user_id', 'stars', 'text']].copy()
reviews_df

Unnamed: 0,business_id,user_id,stars,text
0,75HV-KqCtn_oHeiLiGlO_w,wJHy7ZJG_EvLFQDRms5rXQ,4,Great place... delicious tapas and very nice w...
1,mrABhBpFvXTFrLiLiz286g,HHEXgBRDdkFSiDu1gDSdKg,4,"""I overheard you are out of the roast beef, ri..."
2,dQ80ktWSRj5UIk96bRUmWw,QdhsBITt2VqQzwNxRu4QjQ,4,We thought about going over to Brodie Oaks to ...
3,_jYt69Zx1SUo_V9z0m7Ljg,7jc9f2Nn2S--5b-G5cfJ-A,4,"If I could give 3.5, I would.\n\nI loved the t..."
4,IBK8opCBFSoZCSdURwBhJQ,8dXZLf8hM4JDIU0Rk3X_Tg,2,"Maybe 2.5 stars, if I can do that. For what i..."
...,...,...,...,...
62931,2vppZx0rTDZtCzw-NljdRQ,QcVB_54m6o8jbq5PJBFsHQ,5,"All I can say is, my girlfriend hit the birthd..."
62932,gBFtNT6fh8C-7Clo-aDLvA,DYONpUFtHQK_cYD9ylG_4Q,3,"Had a groupon that was expiring, so I guess it..."
62933,AWGRRRSaxAHI1zeWYb06sg,1-c5fkvmpp6RqOLlJYd98A,4,I went here on a date (classy times in the lif...
62934,zM98ZSIJyuBQabyYornLpw,2uV7zGYSqTXb-sDCeiZ1NA,4,I voted early this election and as soon as I w...


In [6]:
import string
from nltk.corpus import stopwords
# English stopwords with every punctuation character removed, so they can be
# compared against review words that have already had punctuation stripped.
# Stored as a set: cleaning_text() tests each review word for membership,
# which is a constant-time lookup in a set but a linear scan in a list.
stop = {
    ''.join(char for char in word if char not in string.punctuation)
    for word in stopwords.words('english')
}

    # Function to clean the reviews text.
# Translation table that deletes every ASCII punctuation character. It is
# built once, at definition time, so cleaning_text() keeps working even if the
# notebook-global name `string` is rebound by another cell.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def cleaning_text(mess):
    """Strip punctuation and stopwords from a review string.

    Parameters
    ----------
    mess : str
        Raw review text.

    Returns
    -------
    str
        The remaining words, in their original case and order, joined by
        single spaces. Punctuation characters are deleted rather than replaced
        by spaces (so "3.5" becomes "35"). A word is dropped when its
        lowercase form is in the module-level ``stop`` collection.
    """
    without_punctuation = mess.translate(_PUNCTUATION_TABLE)
    # Remove all stopwords
    return " ".join(word for word in without_punctuation.split() if word.lower() not in stop)

In [7]:
# Replace the raw review text with its cleaned form (punctuation and
# stopwords removed by cleaning_text).
# `assign` builds a new frame instead of writing into `reviews_df` in place,
# so this does not go through pandas' chained-assignment path when
# `reviews_df` is itself a column subset of the loaded table.
reviews_df = reviews_df.assign(text=reviews_df['text'].apply(cleaning_text))
reviews_df['text']

0        Great place delicious tapas nice wine list try...
1        overheard roast beef right Yes still make anot...
2        thought going Brodie Oaks see new Bombay Bistr...
3        could give 35 would loved theater Rolling back...
4        Maybe 25 stars El Chilitos badif Mi Madres clo...
                               ...                        
62931    say girlfriend hit birthday dinner choice spot...
62932    groupon expiring guess time check Satay shared...
62933    went date classy times life Annalise drinks sm...
62934    voted early election soon done went Casa Chapa...
62935    Sunday morning still dressed Saturday night cl...
Name: text, Length: 62936, dtype: object

In [8]:
"""
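- Subset tables taken from the reviews data.
- Used to create the text features. NOTE: no train/test split is applied in this notebook, so both tables cover every review (62936 rows, not the 50348 rows of an 80% train split).
1) Users with text. (62936 rows × 2 columns)
2) Businesses with text. (62936 rows × 2 columns)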
"""
# Two-column views of the reviews: (user, text) and (business, text).
user_text = reviews_df.loc[:, ['user_id', 'text']]
business_text = reviews_df.loc[:, ['business_id', 'text']]
# Show both shapes as the cell output.
user_text.shape, business_text.shape

((62936, 2), (62936, 2))

In [9]:
# Concatenate all review texts of each user, and of each business, into one
# space-separated document: one row per user_id / business_id.
user_text, business_text = (
    frame.groupby(key).agg({'text': ' '.join})
    for frame, key in ((user_text, 'user_id'), (business_text, 'business_id'))
)

# Shapes of the two aggregated tables (by user, by business).
user_text.shape, business_text.shape

((798, 1), (779, 1))

In [10]:
# Preview the first three aggregated user documents.
user_text.head(3)

Unnamed: 0_level_0,text
user_id,Unnamed: 1_level_1
-0MQ4webH2uc1ZAsGsNENg,Love patio Love coffee Love breakfast tacos Lo...
-2sNTzGyci98Mp9PmPRg8w,TaDa 300th review better way celebrate highlig...
-4RH83ibNRpwj8NlBwtMdQ,order vegetable lasagna unless expecting class...


In [11]:
"""
Vectorization with Term Frequency Inverse Doc. Frequency (TF-IDF) to extract the features from the text.
"""
def _make_vectorizer():
    """Return an unfitted TF-IDF vectorizer: WordPunct tokens, vocabulary capped at 1500 terms."""
    return TfidfVectorizer(tokenizer = WordPunctTokenizer().tokenize, max_features=1500)


# A separate vectorizer is fitted on each corpus.
user_vectorizer = _make_vectorizer()
business_vectorizer = _make_vectorizer()

# Sparse TF-IDF matrices: users (798 x 1500) and businesses (779 x 1500).
user_vectors = user_vectorizer.fit_transform(user_text['text'])
business_vectors = business_vectorizer.fit_transform(business_text['text'])
user_vectors, business_vectors

(<798x1500 sparse matrix of type '<class 'numpy.float64'>'
 	with 728119 stored elements in Compressed Sparse Row format>,
 <779x1500 sparse matrix of type '<class 'numpy.float64'>'
 	with 729206 stored elements in Compressed Sparse Row format>)

In [12]:
"""
Users / Businesses with Stars Matrix. (798 rows × 779 columns)
"""
# User x business rating table: rows are user_id, columns are business_id,
# cells hold the `stars` value aggregated by pivot_table, and NaN where the
# user has no review for that business. `R` is a second name for the same frame.
R = user_bussiness_stars = reviews_df.pivot_table(
    values='stars', index=['user_id'], columns=['business_id']
)
R

business_id,-4ofMtrD7pSpZIX5pnDkig,-85pDrVcAdzNK55bFpintg,-BBSLCjzw3i2PHuwJ_dabA,-Ce8p148xb0-4dv_3zwm3A,-WA4tNsXZq5sxcGw8a5_IQ,-_GnwXmzC3DXsHR9nyaC2g,-jQOsyXnAMKu6ND7ongXFg,-qjkQHH-5O8BAztc6udOuw,-waa6lH9SmAkaKiLx74_FA,0-Y_m4TJZufHBVozbMOqbA,...,z3JW-nsdfC8ijst_PSeMIA,z4fQL2IHUR4ENonH7A_9tQ,zDVjN3dC9EfQno21m3O8Pg,zE0EGIsOdAO83xE3CypN-A,zFaHweOJ40jjtvpGTjlspw,zM98ZSIJyuBQabyYornLpw,zVILoCmmDSgcM5bc2BKVWg,zZIWHmITwqaC8OhnPkjEIQ,zerPYZ-lid5CG27a1-7YMw,zyHMtStYlKG67WRprp6GZQ
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-0MQ4webH2uc1ZAsGsNENg,,,,,,1.0,,,4.0,2.0,...,,2.0,,,,,,,,
-2sNTzGyci98Mp9PmPRg8w,,,,,,,,,2.0,,...,,,,,5.0,,,,,3.0
-4RH83ibNRpwj8NlBwtMdQ,,,,,,,,,,,...,,,,,,,,,,
-8QoOIfvwwxJ4sY201WP5A,,,,,,,,,,,...,,,,,,,,,,
-OGWTHZng0QNhvc8dhIjyQ,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zgLUWGBKrSoIIH3E2XLvNQ,,,2.0,,,,,,,,...,,,,,,,,,,
zlL_PbFTXjrGXhTbPsjmlQ,,,,,3.5,,,,,5.0,...,,,,,,,,,,
zonT-KQFJiejFNUrs2KR0Q,,,,,,2.0,,,,,...,,,,,5.0,,,,,
zqMc0O7vDj82Sz5Fler1EA,,,,5.0,,,,5.0,,,...,,,,,,,,,,4.0


## Matrix Factorization

In [13]:
# Dense, labelled copies of the TF-IDF matrices: one row per user (Q) or per
# business (P), one column per vocabulary term.
# NOTE: these frames are distinct from the local Q / P factor matrices that
# matrix_factorization() creates internally.
def _vocabulary(vectorizer):
    """Return the fitted vocabulary in column order.

    `get_feature_names()` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out()`; use the new method when it exists and fall
    back to the old one on older installations.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    return list(getter())


Q = pd.DataFrame(user_vectors.toarray(), index=user_text.index, columns=_vocabulary(user_vectorizer))
P = pd.DataFrame(business_vectors.toarray(), index=business_text.index, columns=_vocabulary(business_vectorizer))
P

Unnamed: 0_level_0,1,10,100,11,12,14,15,2,20,24,...,years,yellow,yelp,yelpers,yes,yesterday,yet,young,yum,yummy
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-4ofMtrD7pSpZIX5pnDkig,0.012426,0.003155,0.002303,0.000000,0.005603,0.002656,0.007490,0.013383,0.005186,0.003523,...,0.024529,0.000000,0.000000,0.000000,0.015613,0.000000,0.010971,0.000000,0.003902,0.008134
-85pDrVcAdzNK55bFpintg,0.016013,0.013010,0.000000,0.005309,0.003851,0.010955,0.007722,0.018396,0.003564,0.000000,...,0.026974,0.000000,0.007065,0.005325,0.009657,0.000000,0.009695,0.005409,0.004023,0.006709
-BBSLCjzw3i2PHuwJ_dabA,0.008220,0.008348,0.000000,0.000000,0.000000,0.000000,0.009909,0.027542,0.000000,0.000000,...,0.012981,0.000000,0.000000,0.000000,0.008262,0.000000,0.008294,0.000000,0.000000,0.000000
-Ce8p148xb0-4dv_3zwm3A,0.010332,0.003498,0.000000,0.000000,0.000000,0.000000,0.012455,0.009891,0.003833,0.000000,...,0.010878,0.000000,0.011396,0.000000,0.013847,0.000000,0.003475,0.005816,0.000000,0.007214
-WA4tNsXZq5sxcGw8a5_IQ,0.002588,0.018395,0.000000,0.000000,0.006222,0.000000,0.012477,0.027249,0.005760,0.035211,...,0.019069,0.000000,0.011416,0.008605,0.020808,0.004086,0.005222,0.013110,0.006501,0.005420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zM98ZSIJyuBQabyYornLpw,0.015011,0.011433,0.011128,0.006221,0.022560,0.000000,0.013572,0.025148,0.012530,0.000000,...,0.019754,0.000000,0.004139,0.000000,0.007544,0.011851,0.015146,0.000000,0.000000,0.000000
zVILoCmmDSgcM5bc2BKVWg,0.026094,0.004417,0.006448,0.000000,0.005229,0.000000,0.005243,0.024980,0.009681,0.000000,...,0.004579,0.000000,0.014390,0.000000,0.013114,0.000000,0.021941,0.000000,0.000000,0.004555
zZIWHmITwqaC8OhnPkjEIQ,0.009494,0.016873,0.000000,0.007869,0.000000,0.000000,0.008584,0.036357,0.010567,0.005383,...,0.009995,0.004771,0.002618,0.003947,0.009543,0.000000,0.014370,0.000000,0.000000,0.004972
zerPYZ-lid5CG27a1-7YMw,0.026819,0.000000,0.005681,0.000000,0.000000,0.000000,0.004619,0.036678,0.000000,0.000000,...,0.004034,0.007700,0.004226,0.000000,0.003851,0.006050,0.003866,0.000000,0.019251,0.008026


In [14]:
# Calculate the Error for the Function
def funError(R, Q, P, K, lambd):
    """Regularized squared error of the factorization ``R ~ Q . P`` over observed ratings.

    Parameters
    ----------
    R : array-like, shape (N, M)
        Rating matrix. Entries that are NaN or not greater than zero are
        treated as unobserved and contribute nothing.
    Q : array-like, shape (N, K)
        Row factors (indexed ``Q[i, k]``).
    P : array-like, shape (K, M)
        Column factors (indexed ``P[k, j]``).
    K : int
        Number of leading factors included in the regularization term.
    lambd : float
        Regularization weight.

    Returns
    -------
    float
        Sum over observed (i, j) of ``(R[i, j] - Q[i, :] . P[:, j]) ** 2`` plus
        ``lambd / 2 * (sum_k Q[i, k] ** 2 + sum_k P[k, j] ** 2)``.
    """
    ratings = np.nan_to_num(np.asarray(R, dtype=float))  # NaN -> 0, i.e. unobserved
    if ratings.size == 0:
        return 0.0
    Q = np.asarray(Q, dtype=float)
    P = np.asarray(P, dtype=float)
    observed = ratings > 0

    # Loss Function ErrorSum, restricted to the observed cells.
    residual = np.where(observed, ratings - Q.dot(P), 0.0)
    squared_error = np.sum(residual ** 2)

    # Regularization: each observed (i, j) adds the squared norms of row Q[i]
    # and column P[:, j], so weight each norm by how many observed ratings
    # that row / column takes part in.
    q_sq = np.sum(Q[:, :K] ** 2, axis=1)
    p_sq = np.sum(P[:K, :] ** 2, axis=0)
    penalty = (lambd / 2.0) * (observed.sum(axis=1).dot(q_sq) + observed.sum(axis=0).dot(p_sq))
    return float(squared_error + penalty)

def fit_stats(funError, samples_count, products_count):
    """Print the end-of-training banner with the user and restaurant counts.

    ``funError`` is accepted for call compatibility but is not read.
    ``samples_count`` is printed as the number of users and
    ``products_count`` as the number of restaurants.
    """
    banner = (
        'Training Complete...',
        '------------------------------',
        'Stats:',
        'Users: ' + str(samples_count),
        'Restaurants: ' + str(products_count),
    )
    for line in banner:
        print(line)

In [15]:
def matrix_factorization(R, K, learn_rate, iterations, lambd, seed=None):
    """Factorize the rating matrix ``R`` with stochastic gradient descent.

    Parameters
    ----------
    R : array-like, shape (N, M)
        Ratings, one row per user and one column per business. Entries that
        are NaN or not greater than zero are treated as unobserved and skipped.
    K : int
        Number of latent factors.
    learn_rate : float
        SGD step size.
    iterations : int
        Number of full passes over the observed ratings.
    lambd : float
        Regularization weight.
    seed : int, optional
        When given, the factors are initialized from a private
        ``numpy.random.RandomState(seed)`` so a run can be reproduced. When
        omitted, the global NumPy generator is used.

    Returns
    -------
    tuple
        ``(Q, P, error)``: ``Q`` is the (N, K) user-factor matrix, ``P`` the
        (M, K) business-factor matrix, and ``error`` the list of ``funError``
        values recorded after each pass.
    """
    R = np.array(R, dtype=float)
    N = len(R)       # users (rows of R)
    M = len(R[0])    # businesses (columns of R)

    rand = np.random.rand if seed is None else np.random.RandomState(seed).rand
    # Draw order is P first, then Q. P is kept as K x M during training so
    # that P[:, j] is the factor vector of business j.
    P = rand(M, K).T
    Q = rand(N, K)

    # Observed cells in row-major order (the order a nested i/j loop visits them).
    rows, cols = np.nonzero(np.nan_to_num(R) > 0)

    print('Running Stochastic Gradient Descent for %s iterations.' % iterations)
    error = []
    for step in range(iterations):
        print('Iteration :', step, '...', end=" ")
        for i, j in zip(rows, cols):
            err_ij = R[i, j] - np.dot(Q[i, :], P[:, j])
            # Gradient step on all K factors at once. The P update uses the
            # already-updated Q[i, :], exactly as a per-factor loop that
            # updates Q[i][k] and then P[k][j] would.
            Q[i, :] = Q[i, :] + learn_rate * (2 * err_ij * P[:, j] - lambd * Q[i, :])
            P[:, j] = P[:, j] + learn_rate * (2 * err_ij * Q[i, :] - lambd * P[:, j])

        # Measure Error
        error.append(funError(R, Q, P, K, lambd))
        print('done.', end=" ")
        print('Error:[ %0.2f]' % error[step])

    P = P.T
    # fit_stats takes (users, restaurants): N is the user count, M the business count.
    fit_stats(error, N, M)
    return (Q, P, error)

## Training New Data.

In [None]:
 """
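 ----RUN THIS MODEL ONLY TO TRAIN NEW DATA OR TO CHANGE THE PARAMETERS----
 
Q = N*K matrix of user features (N = number of users)
P = M*K matrix of business features (M = number of businesses)
R = [User, Business, Real Rating] matrix. (Here: the full user x business pivot built above; no 80 % train split is applied in this notebook.)
K = Number of latent features
iterations = steps
learn_rate = Learning Rate
lambd = Regularization Parameter
nQ = Updated value of Q (first value returned by matrix_factorization)
nP = Updated value of P (second value returned by matrix_factorization)
Inside matrix_factorization the factors are initialised as:
#P = numpy.random.rand(M,K)
#Q = numpy.random.rand(N,K)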
"""
# Hyperparameters handed to matrix_factorization(): number of latent
# features, SGD learning rate, regularization weight and number of passes.
K = 70
learn_rate = 0.01
lambd = 0.001 
iterations  = 60

# Time the training call with wall-clock timestamps.
start = time.time()
# matrix_factorization returns (Q, P, error): the two factor matrices and the
# list of per-iteration error values.
nQ, nP ,er = matrix_factorization(R, K, learn_rate, iterations , lambd) 
end = time.time()
duration = end-start

# Report the error after the last iteration and the elapsed time
# (%d truncates the duration to whole seconds).
print('Error: %0.2f' % er[-1])
print('------------------------------')
print("Time taken to train the Model: %d seconds" %duration)

In [None]:
# Training curve: the recorded error after each SGD iteration.
# Uses `plt` (explicitly imported) instead of the `pylab` name that only
# exists because of the %pylab magic, and takes the x axis from len(er) so it
# always matches the recorded errors even if `iterations` has been changed
# since the model was trained.
f0 = plt.figure()
p1 = f0.add_subplot(111)
p1.plot(range(len(er)), er)
p1.set_xlabel("Iteration")
p1.set_ylabel("Error")
p1.set_title("Gradient Descent : Error vs. Iterations");

## Save The Model in a pickle file

In [None]:
# Store nP, nQ and the fitted user-text vectorizer in one pickle file.
# The three objects are written back-to-back in this order; a loader must
# read them in the same order.
import pickle

# `with` closes the file even if one of the dumps raises.
with open('yelp_dataset/yelp_recommend_model_LFM_v3_5_K70_Iter60_1500F_regF_001.pkl', 'wb') as output:
    pickle.dump(nP, output)
    pickle.dump(nQ, output)
    pickle.dump(user_vectorizer, output)

## Load The Model to Test

In [16]:
# Load nP, nQ and the fitted vectorizer from the pickle file, in the order
# they were written.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load model files that you created yourself.
import pickle

# `with` closes the file even if a load raises, and the handle is not named
# `input`, so the builtin input() is not shadowed.
with open('yelp_dataset/yelp_recommend_model_LFM_v3_5_K70_Iter60_1500F_regF_001.pkl', 'rb') as model_file:
    nP = pickle.load(model_file)
    nQ = pickle.load(model_file)
    userid_vectorizer = pickle.load(model_file)

### Testing the Model:
* Input: Preferences in Text form.  
* Output: Top Ten Recommendations. 

In [17]:
# Free-text description of what the user is looking for.
words = "This is definitely a restaurant I would drive to north Austin  for again. I dined here after a seminar on a Friday and it didn't disappoint."
test_df= pd.DataFrame([words], columns=['text'])
test_df['text'] = test_df['text'].apply(cleaning_text)
test_vectors = userid_vectorizer.transform(test_df['text'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old method otherwise.
feature_names = (getattr(userid_vectorizer, 'get_feature_names_out', None)
                 or userid_vectorizer.get_feature_names)()
test_v_df = pd.DataFrame(test_vectors.toarray(), index=test_df.index, columns=feature_names)

# Score every business as the dot product of the query's TF-IDF vector with
# that business's TF-IDF row in P. The query is first re-indexed onto P's
# columns so each term is multiplied with the same term in P; a term absent
# from the query vocabulary contributes 0. (A purely positional dot product is
# only correct when both vocabularies are identical and identically ordered.)
# NOTE(review): the scores come from the TF-IDF table P only; the learned
# factors nP / nQ are not used here. Confirm this is intended.
query_vector = test_v_df.loc[0].reindex(P.columns, fill_value=0.0)
predictItemRating=pd.DataFrame(np.dot(query_vector,P.T),index=P.index,columns=['Rating'])
topRecommendations=predictItemRating.sort_values('Rating',ascending=False)[:10]

# For each recommendation print the name, the categories, then "stars review_count".
for i in topRecommendations.index:
    # Filter the business table once per recommendation instead of once per field.
    match = business_df[business_df['business_id']==i]
    print(match['name'].iloc[0])
    print(match['categories'].iloc[0])
    print(str(match['stars'].iloc[0])+ ' '+str(match['review_count'].iloc[0]))
    print('')

North Italia
Pizza, Italian, Venues & Event Spaces, Breakfast & Brunch, Event Planning & Services, Restaurants
4.0 1292

Sala and Betty
Sandwiches, Food, Breakfast & Brunch, American (New), Diners, Cafes, Restaurants, Beer, Wine & Spirits
4.5 518

Fonda San Miguel
Tapas/Small Plates, Mexican, Breakfast & Brunch, Soul Food, Nightlife, Food, Food Delivery Services, Restaurants, Specialty Food, Bars, Ethnic Food, Cocktail Bars, Salad, Grocery
4.0 888

Austin Java
American (Traditional), Food, Event Planning & Services, Venues & Event Spaces, Restaurants, Coffee & Tea, Breakfast & Brunch, Tex-Mex, Sandwiches, American (New)
3.5 327

Andiamo Ristorante
Italian, Restaurants
4.0 419

The Capital Grille
Wine Bars, Bars, Nightlife, American (Traditional), Steakhouses, Restaurants, Seafood
4.0 338

Dai Due
Food, Grocery, Bars, Restaurants, Cocktail Bars, Nightlife, Breakfast & Brunch, Butcher, American (New), Local Flavor, Desserts
4.0 538

Hudson's On the Bend
Event Planning & Services, Venues 

In [None]:
"""
Get the user_id and the most relevant info for the user named 'Kevan'.
"""
# Rows of the users table whose `name` equals 'Kevan'.
user_X = users_df.loc[users_df['name'].eq('Kevan')]
user_X

In [None]:
# Combined review text of user '-OGWTHZng0QNhvc8dhIjyQ' (selected pick = Kevan):
# print its word count, then show its 25 most frequent words.
from collections import Counter

Kevan_text=user_text.loc['-OGWTHZng0QNhvc8dhIjyQ']['text']
words_count = len(Kevan_text.split())
print("There are " + str(words_count) + " words on this review compilation.")

# Lowercased words, split on single spaces. The list deliberately gets its own
# name: binding it to `string` would shadow the `string` module that
# cleaning_text() depends on.
kevan_words = Kevan_text.lower().split(" ")

# (count, word) pairs. Counter tallies every word in one pass instead of
# calling list.count() once per word.
set1 = {(count, word) for word, count in Counter(kevan_words).items()}

# Highest count first; ties are ordered by word, descending.
set2=sorted(set1,reverse=True)
set2[0:25]

***