In [1]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import xlrd
from scipy.sparse.linalg import spsolve
from pyxlsb import open_workbook as open_xlsb
from sklearn import metrics
import pickle
import implicit
from pandas.api.types import CategoricalDtype
import random
import matplotlib.pyplot as plt
import itertools
import copy

In [4]:
# Load the pickled inputs: `cluster_lookup` is the hotel lookup table and `cleaned_df` is the
# interaction DataFrame (columns user_id, cluster_id, ga:totalEvents).
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The files are opened in `with` blocks so the handles are closed as soon as loading finishes.
with open('cluster_lookup_upto21Feb.pickle', 'rb') as lookup_file:
    cluster_lookup = pickle.load(lookup_file)
with open('df_upto21Feb.pickle', 'rb') as events_file:
    cleaned_df = pickle.load(events_file)
cleaned_df.head()

Unnamed: 0,user_id,cluster_id,ga:totalEvents
0,35ee4ff41378badb08aac8e56cf7759fea6f3c5b42898...,9461581,85
1,8ff5015618651a16649d9340032018a9f4ee292cfc57e...,9525153,78
2,02fcb1358a130f6a37b43f7da166c8065f7ad450385b6...,8109942,68
3,ffb2cc7df4dc609f2106e55c943c44b359401fbb04d3f...,9231251,68
4,3919087a2b968b0e5064c0eceab17ca1d2bf80aaa9d16...,9491140,65


In [5]:
# Assemble the sparse user x hotel interaction matrix from the event counts.
customers = list(np.sort(cleaned_df['user_id'].unique()))  # sorted unique user ids -> matrix rows
products = list(cleaned_df['cluster_id'].unique())  # unique hotel cluster ids -> matrix columns
quantity = list(cleaned_df['ga:totalEvents'])  # event count of every (user, hotel) record

# Categorical codes map each record to the position of its id in the two label lists above.
cat_type1 = CategoricalDtype(categories=customers)
cat_type2 = CategoricalDtype(categories=products)
rows = cleaned_df['user_id'].astype(cat_type1).cat.codes
cols = cleaned_df['cluster_id'].astype(cat_type2).cat.codes

matrix_shape = (len(customers), len(products))
bookings_sparse = sparse.csr_matrix((quantity, (rows, cols)), shape=matrix_shape)

In [6]:
# Sparsity: percentage of the user x hotel grid that holds no recorded interaction.
n_matrix_rows, n_matrix_cols = bookings_sparse.shape
matrix_size = n_matrix_rows * n_matrix_cols  # number of possible interactions in the matrix
num_bookings = bookings_sparse.nonzero()[0].size  # number of non-zero (user, hotel) entries
sparsity = 100*(1 - (num_bookings/matrix_size))
print('sparsity=',sparsity)

sparsity= 99.99828106048203


In [7]:
def make_train(ratings, pct_test = 0.1, seed = 0):
    '''
    Split a user-item interaction matrix into a masked training set and a binarized test set.

    A fraction `pct_test` of the non-zero user-item interactions is chosen at random and set to
    zero in the training copy. The test set is a copy of the full matrix in which every non-zero
    interaction is replaced by 1.

    parameters:

    ratings - the original user-item matrix, in the form of a sparse csr_matrix. It is not modified.

    pct_test - fraction (between 0 and 1) of the non-zero interactions to mask in the training set.
    The number of masked pairs is ceil(pct_test * number of non-zero pairs).

    seed - value passed to random.seed() before sampling so that the split is reproducible.
    Default is 0. Pass None to leave the global random state as it is.

    returns:

    training_set - copy of `ratings` in which the sampled user-item pairs have been set back to zero.

    test_set - binary copy of `ratings`: 1 wherever the original holds a non-zero interaction.

    user_inds - list of the unique user row indices that had at least one interaction masked.
    This is needed later when evaluating the performance on the masked users.

    raises:

    ValueError - if pct_test is not between 0 and 1.
    '''
    if not 0 <= pct_test <= 1:
        raise ValueError('pct_test must be between 0 and 1, got {!r}'.format(pct_test))

    # Binary preference matrix: drop explicitly stored zeros, then overwrite what is left with 1.
    test_set = ratings.copy()
    test_set.eliminate_zeros()
    test_set.data[:] = 1

    training_set = ratings.copy() # Copy of the original data that we can alter as our training set
    nonzero_inds = training_set.nonzero() # Indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # (user, item) index pairs
    if seed is not None:
        random.seed(seed) # Seed the global generator so the sampled pairs are reproducible
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples up
    samples = random.sample(nonzero_pairs, num_samples) # Sample user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    if samples:
        training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered
#This returns our training set, a test set that has been binarized to 0/1 (interaction / no interaction),
#and a list of the users that have at least one item masked. We will test the performance of the recommender system
#on these users only. pct_test is the fraction of interactions to mask, e.g. the default pct_test = 0.1 masks 10%.

In [8]:
# Array versions of the customer and hotel id lists built earlier (same order as the matrix axes).
customers_arr = np.array(customers)
products_arr = np.array(products)

# Mask 20% of the observed interactions; the masked users are kept for evaluation.
product_train, product_test, product_users_altered = make_train(ratings=bookings_sparse, pct_test=0.2)

print('total users=', len(customers))
print('Users with items masked=', len(product_users_altered))

total users= 391786
Users with items masked= 128238


In [9]:
# Fit the ALS model that is used for the final recommendations.
alpha = 800  # multiplier applied to the raw event counts before fitting
new_model = implicit.als.AlternatingLeastSquares(factors=40, regularization=0.1, iterations=240)
trans_ptrain = product_train.T  # transpose of the users x hotels training matrix
# NOTE(review): `train` only captures whatever fit() returns -- presumably None; confirm before using it.
train = new_model.fit((alpha * trans_ptrain).astype('double'))
new_uservecs = new_model.user_factors
new_itemvecs = new_model.item_factors

100%|██████████████████████████████████████████████████████████████████████████████| 240.0/240 [01:45<00:00,  2.41it/s]


In [20]:
#function to recommend hotels
def rec_hotels(cluster_id, mf_train, user_vecs, item_vecs, customer_list, item_list, cluster_lookup, num_items = 10,
               model = None):
    '''
    Return the hotels that the fitted model considers most similar to a given hotel.

    parameters:

    cluster_id - hotel id number that you want to get recommendations for

    mf_train, user_vecs, item_vecs, customer_list - not used by this function; they are kept in
    the signature so that existing calls keep working

    item_list - an array of the hotel ids that make up the columns of your ratings matrix
                (in order of matrix)

    cluster_lookup - pandas dataframe that has (at least) the columns cluster_id, hotel_name and city_id

    num_items - The number of similar hotels to ask the model for. Default is 10.

    model - object with a similar_items(index, N=...) method that returns a sequence whose elements
            carry the item index in position 0. Defaults to the notebook-level `new_model`.

    returns:

    - A dataframe with the columns ClusterID, Hotelname and Hotelcity, one row per hotel returned
      by the model, in the order the model returned them.

    raises:

    IndexError - if cluster_id is not in item_list, or if a returned hotel has no row in cluster_lookup.
    '''
    if model is None:
        model = new_model  # fall back to the model fitted earlier in the notebook

    # Position of the requested hotel in item_list, which is also its index in the model.
    matches = np.where(item_list == cluster_id)[0]
    if len(matches) == 0:
        raise IndexError('cluster_id {!r} not found in item_list'.format(cluster_id))
    cluster_ind = matches[0]

    recommend_vector = model.similar_items(cluster_ind, N=num_items)
    product_idx = [x[0] for x in recommend_vector]  # item indices, in the order returned by the model

    codes, descriptions, cities = [], [], []
    for index in product_idx:
        code = item_list[index]
        is_match = cluster_lookup.cluster_id == code  # row mask, computed once per hotel
        codes.append(code)
        # .iloc[0] takes the first matching row and raises IndexError when there is none.
        descriptions.append(cluster_lookup['hotel_name'].loc[is_match].iloc[0])
        cities.append(cluster_lookup['city_id'].loc[is_match].iloc[0])

    final_frame = pd.DataFrame({'ClusterID': codes, 'Hotelname': descriptions, 'Hotelcity': cities})
    return final_frame[['ClusterID', 'Hotelname', 'Hotelcity']]  # fix the column order

In [21]:
# Example: the ten most similar hotels returned for cluster 9526409.
print('Recommendations for Novotel Muenchen cCity Arnulfpark:')
example_recs = rec_hotels(9526409, product_train, new_uservecs, new_itemvecs, customers_arr, products_arr,
                          cluster_lookup, num_items=10)
print(example_recs)

Recommendations for Novotel Muenchen cCity Arnulfpark:
   ClusterID                               Hotelname  Hotelcity
0    9526409        Novotel Muenchen City Arnulfpark    37463.0
1    9530059               BEST WESTERN Atrium Hotel    37463.0
2    8851448           B%26B%20Affittacamere%20Larix        1.0
3    7068516           ibis Muenchen City Arnulfpark    37463.0
4    7104200                       Germania MÃ¼nchen    37463.0
5    9533861  Mercure Hotel Muenchen Neuperlach Sued    37463.0
6    4055500          ibis Styles MÃ¼nchen Ost-Messe    37463.0
7    9657709    Holiday Inn Express Munich City West    37463.0
8    9144045                           Bristol Hotel   208561.0
9    9530088                     ErzgieÃerei Europe    37463.0


In [None]:
#saving recommendations to a file
import csv

# NOTE(review): the file name ends in .tsv but the rows are comma-delimited; both are kept as they
# were so that existing consumers of the file keep working -- confirm which one is intended.
# The file is opened with newline='' as the csv module asks for, and inside a `with` block so it is
# closed even if the model call fails part-way through.
with open('recommendationslist_upto21Feb.tsv', 'w', newline='') as outfile:
    writer = csv.writer(outfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # products_arr is built from unique ids, so the position of a hotel in it is its item index;
    # enumerate gives that position directly instead of searching the array for every hotel.
    for cluster_ind, cluster_id in enumerate(products_arr):
        recommend_vector = new_model.similar_items(cluster_ind, N=500)
        # One row per returned hotel: rank, source hotel id, similar hotel id.
        writer.writerows([rank, cluster_id, products_arr[entry[0]]]
                         for rank, entry in enumerate(recommend_vector))
print('your file with recommendations is ready')