

#**Importing Necessary Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")




#**Data Ingestion**

In [None]:
# Load the three raw CSV files into notebook-level DataFrames.
# on_bad_lines='skip' makes pandas silently drop any row it cannot parse, so the
# frames may hold fewer rows than the files — NOTE(review): confirm that is acceptable.
# NOTE(review): the absolute /content/... paths presumably target Google Colab;
# the notebook will not run elsewhere without changing them.
flight_df=pd.read_csv("/content/flights.csv",on_bad_lines='skip')
hotel_df=pd.read_csv("/content/hotels.csv",on_bad_lines='skip')
user_df=pd.read_csv("/content/users.csv",on_bad_lines='skip')

#**Recommendation Engine for Users**

Build a recommendation model to provide hotel suggestions based on user preferences and historical data. Develop a Streamlit web application to display insights and visualizations derived from the deployed travel recommendation model, offering an interactive and user-friendly interface for data exploration.

### **Hotels Dataset:**

travelCode: Identifier for the travel, similar to the Flights dataset.

userCode: User identifier(linked to the Users dataset)

name: Name of the hotel.

place: Location of the hotel.

days: Number of days of the hotel stay.

price: Price per day.

total: Total price for the stay.

date: Date of the hotel booking.


In [None]:
hotel_df.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019


In [None]:
hotel_df.shape

(40552, 8)

In [None]:
hotel_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40552 entries, 0 to 40551
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   travelCode  40552 non-null  int64  
 1   userCode    40552 non-null  int64  
 2   name        40552 non-null  object 
 3   place       40552 non-null  object 
 4   days        40552 non-null  int64  
 5   price       40552 non-null  float64
 6   total       40552 non-null  float64
 7   date        40552 non-null  object 
dtypes: float64(2), int64(3), object(3)
memory usage: 2.5+ MB


In [None]:
hotel_df.describe()

Unnamed: 0,travelCode,userCode,days,price,total
count,40552.0,40552.0,40552.0,40552.0,40552.0
mean,67911.794461,666.963726,2.499679,214.439554,536.229513
std,39408.199333,391.136794,1.119326,76.742305,319.331482
min,0.0,0.0,1.0,60.39,60.39
25%,33696.75,323.0,1.0,165.99,247.62
50%,67831.0,658.0,2.0,242.88,495.24
75%,102211.25,1013.0,4.0,263.41,742.86
max,135942.0,1339.0,4.0,313.02,1252.08


In [None]:
hotel_df.isnull().sum()

Unnamed: 0,0
travelCode,0
userCode,0
name,0
place,0
days,0
price,0
total,0
date,0


In [None]:
hotel_df.duplicated().sum()

np.int64(0)

In [None]:
hotel_df.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019


In [None]:
data = hotel_df.copy()

In [None]:
# Combine relevant columns into a single column for hotel information
data['Hotel_Info'] = data['name'].str.cat(data['place'], sep='|')
data

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date,Hotel_Info
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019,Hotel A|Florianopolis (SC)
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019,Hotel K|Salvador (BH)
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019,Hotel K|Salvador (BH)
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019,Hotel K|Salvador (BH)
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019,Hotel A|Florianopolis (SC)
...,...,...,...,...,...,...,...,...,...
40547,135938,1339,Hotel BP,Brasilia (DF),3,247.62,742.86,06/18/2020,Hotel BP|Brasilia (DF)
40548,135939,1339,Hotel BP,Brasilia (DF),1,247.62,247.62,06/25/2020,Hotel BP|Brasilia (DF)
40549,135940,1339,Hotel BW,Campo Grande (MS),3,60.39,181.17,07/02/2020,Hotel BW|Campo Grande (MS)
40550,135941,1339,Hotel BW,Campo Grande (MS),3,60.39,181.17,07/09/2020,Hotel BW|Campo Grande (MS)


In [None]:
filtered_data = data[(data['Hotel_Info'] == 'Hotel A|Florianopolis (SC)') &
                         (data['days'] == 4) &
                         (data['price'] <= 313.02)]

In [None]:
filtered_data

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date,Hotel_Info
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019,Hotel A|Florianopolis (SC)
39,122,2,Hotel A,Florianopolis (SC),4,313.02,1252.08,04/02/2020,Hotel A|Florianopolis (SC)
54,192,2,Hotel A,Florianopolis (SC),4,313.02,1252.08,08/05/2021,Hotel A|Florianopolis (SC)
104,377,3,Hotel A,Florianopolis (SC),4,313.02,1252.08,08/18/2022,Hotel A|Florianopolis (SC)
138,481,4,Hotel A,Florianopolis (SC),4,313.02,1252.08,10/22/2020,Hotel A|Florianopolis (SC)
...,...,...,...,...,...,...,...,...,...
26567,89380,882,Hotel A,Florianopolis (SC),4,313.02,1252.08,04/09/2020,Hotel A|Florianopolis (SC)
26603,89501,882,Hotel A,Florianopolis (SC),4,313.02,1252.08,08/04/2022,Hotel A|Florianopolis (SC)
26616,89542,882,Hotel A,Florianopolis (SC),4,313.02,1252.08,05/18/2023,Hotel A|Florianopolis (SC)
26630,89588,884,Hotel A,Florianopolis (SC),4,313.02,1252.08,07/09/2020,Hotel A|Florianopolis (SC)


In [None]:
hotel_indices = filtered_data.index
hotel_indices

Index([    0,    39,    54,   104,   138,   141,   176,   190,   203,   259,
       ...
       26414, 26430, 26453, 26534, 26542, 26567, 26603, 26616, 26630, 26653],
      dtype='int64', length=846)

#**Content Based Recommendation Engine**

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
#import streamlit as st


# Source file and sampling target for the content-based model
file_path = '/content/hotels.csv'
sample_size = 5000  # Target sample size; the number of rows actually read is random (see below)

# Seed the stdlib RNG so the row sample below is reproducible when this cell is
# run on its own from a fresh seed
import random
random.seed(42)

# Read a random sample of rows from the dataset.
# The skiprows callable returns True for rows to skip: row 0 (the header) is always
# kept, every other row is kept with probability sample_size / 30000.
# NOTE(review): 30000 is hard-coded, while hotel_df (read from the same path earlier
# in the notebook) reports 40552 rows, so the expected sample is larger than
# sample_size — confirm the intended denominator.
df = pd.read_csv(file_path, skiprows=lambda i: i > 0 and random.random() > (sample_size / 30000))

# Work on a copy of the sample. This rebinds the notebook-level name `data`, which an
# earlier cell assigned from the full hotel_df.
data = df.copy()

# Data Preprocessing
# Combine relevant columns into a single column for hotel information ("name|place")
data['Hotel_Info'] = data['name'].str.cat(data['place'], sep='|')

# Create a TF-IDF vectorizer to convert text data into numerical vectors
# NOTE(review): with scikit-learn's default token_pattern only tokens of two or more
# characters are kept, so single-letter hotel names such as "A" or "K" would add no
# token of their own — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the vectorizer on the Hotel_Info column (one row per sampled booking)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Hotel_Info'])

# Compute the pairwise similarity between all sampled rows.
# linear_kernel is a plain dot product; it equals cosine similarity here because
# TfidfVectorizer L2-normalises each row by default.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Function to get hotel recommendations based on hotel/place, length of stay and maximum price per day
def get_hotel_recommendations(Hotel_Info, days, price,cosine_sim=cosine_sim):
    """Return the bookings matching the given criteria, ranked by average similarity.

    Reads the notebook-level ``data`` frame (must contain ``Hotel_Info``, ``days``,
    ``price`` and ``userCode`` columns).

    Parameters
    ----------
    Hotel_Info : str
        Exact ``"name|place"`` key to match against ``data['Hotel_Info']``.
    days : int
        Exact length of stay to match against ``data['days']``.
    price : float
        Upper bound (inclusive) compared against ``data['price']``. Per the dataset
        description that column is the price per day, not the total for the stay.
    cosine_sim : array-like of shape (len(data), len(data))
        Pairwise similarity matrix whose rows are in the same order as ``data``.
        The default is bound once, when this function is defined.

    Returns
    -------
    pandas.DataFrame or str
        Frame with columns ``'Uniq Id'`` (the booking's ``userCode``),
        ``'Hotel Details'`` and ``'Avg Similarity Score'``, sorted by score in
        descending order; or the string ``"No matching hotels found."`` when no
        row satisfies the criteria.
    """
    # Filter the dataset based on the given criteria
    matches = ((data['Hotel_Info'] == Hotel_Info) &
               (data['days'] == days) &
               (data['price'] <= price))
    filtered_data = data[matches]

    if filtered_data.empty:
        return "No matching hotels found."

    # Use row *positions*, not index labels, to address the similarity matrix, so the
    # lookup stays correct even if `data` does not carry a default 0..n-1 index.
    row_positions = np.flatnonzero(matches.to_numpy())

    # Average similarity of each matching row against every row, computed in one
    # vectorised call instead of a Python-level sum per row.
    avg_similarity_scores = np.asarray(cosine_sim)[row_positions].mean(axis=1)

    # Create a DataFrame to store the filtered hotels and their average similarity scores
    recommended_hotels_df = pd.DataFrame({'Uniq Id': filtered_data['userCode'],
                                          'Hotel Details': filtered_data['Hotel_Info'],
                                          'Avg Similarity Score': avg_similarity_scores})

    # Sort the hotels by average similarity score in descending order
    recommended_hotels_df = recommended_hotels_df.sort_values(by='Avg Similarity Score', ascending=False)

    # Return the recommended hotel details
    return recommended_hotels_df[['Uniq Id', 'Hotel Details','Avg Similarity Score']]

In [None]:
get_hotel_recommendations('Hotel A|Florianopolis (SC)',4,1252)

Unnamed: 0,Uniq Id,Hotel Details,Avg Similarity Score
30,4,Hotel A|Florianopolis (SC),0.121276
33,6,Hotel A|Florianopolis (SC),0.121276
99,19,Hotel A|Florianopolis (SC),0.121276
110,21,Hotel A|Florianopolis (SC),0.121276
207,44,Hotel A|Florianopolis (SC),0.121276
...,...,...,...
4380,860,Hotel A|Florianopolis (SC),0.121276
4383,860,Hotel A|Florianopolis (SC),0.121276
4401,865,Hotel A|Florianopolis (SC),0.121276
4431,873,Hotel A|Florianopolis (SC),0.121276


#**Collaborative Based Recommendation Engine**

##New

In [None]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

Recommender systems suffer from a problem known as user cold-start: it is hard to provide personalized recommendations for users who have consumed no items, or only very few, because there is too little information to model their preferences.

For this reason, we keep in the dataset only users who have booked at least 2 distinct hotels (the threshold applied in the code below).

In [None]:
# NOTE(review): after the first groupby(...).size() there is exactly one row per
# userCode, so grouping by userCode again and taking size() yields 1 for every user
# (see the output below). The name is rebound with the real filter in the next cell,
# so this cell looks like leftover exploration — confirm it can be removed.
users_with_enough_interactions_df = hotel_df.groupby(['userCode']).size().groupby('userCode').size()
users_with_enough_interactions_df

Unnamed: 0_level_0,0
userCode,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
...,...
1335,1
1336,1
1337,1
1338,1


In [None]:
# Count, per user, how many distinct hotel names they have booked: the first groupby
# yields one row per (userCode, name) pair, the second counts those rows per user.
# Note that this is a count of distinct hotels, not of individual bookings.
users_interactions_count_df = hotel_df.groupby(['userCode','name']).size().groupby('userCode').size()
print('No of users: %d' % len(users_interactions_count_df))

# Keep only the userCode values of users with at least 2 distinct hotels; the result
# is a one-column DataFrame used as the right side of the merge in the next cell.
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 2].reset_index()[['userCode']]
print('No of users with at least 2 interactions: %d' % len(users_with_enough_interactions_df))

No of users: 1310
No of users with at least 2 interactions: 1285


In [None]:
print('No of interactions: %d' % len(hotel_df))
interactions_from_selected_users_df = hotel_df.merge(users_with_enough_interactions_df,
               how = 'right',
               left_on = 'userCode',
               right_on = 'userCode')
print('No of interactions from users with at least 2 interactions: %d' % len(interactions_from_selected_users_df))

No of interactions: 40552
No of interactions from users with at least 2 interactions: 40524


In [None]:
interactions_from_selected_users_df.shape

(40524, 8)

In [None]:
interactions_from_selected_users_df.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019


In [None]:
# Encode userCode and hotel name to numeric values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
#df_hotel['userCode'] = label_encoder.fit_transform(df_hotel['userCode'])
interactions_from_selected_users_df['name_encoded'] = label_encoder.fit_transform(interactions_from_selected_users_df['name'])

In [None]:
def smooth_user_preference(x):
    """Return the base-2 logarithm of ``1 + x``.

    The shift by one maps ``x == 0`` to ``0`` and keeps the result defined for
    any ``x > -1``.
    """
    shifted = 1 + x
    return math.log(shifted, 2)

interactions_full_df = interactions_from_selected_users_df.groupby(['name_encoded','userCode'])['price'].sum().reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head()

# of unique user/item interactions: 9699


Unnamed: 0,name_encoded,userCode,price
0,0,0,939.06
1,0,2,1878.12
2,0,3,3130.2
3,0,4,2191.14
4,0,5,313.02


In [None]:
interactions_full_df.shape

(9699, 3)

In [None]:
# NOTE(review): this 80/20 split is replaced by the 75/25 split in the next cell,
# which rebinds the same two names; when the cells are run in order only the later
# split reaches the model. Consider keeping a single split.
# The split is stratified on userCode and seeded (random_state=42) for reproducibility.
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['userCode'],
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 7759
# interactions on Test set: 1940


In [None]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                                               stratify=interactions_full_df['userCode'],
                                   test_size=0.25,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 7274
# interactions on Test set: 2425


In [None]:
x_test=set(interactions_test_df['userCode'])
x_train=set(interactions_train_df['userCode'])

In [None]:
only_in_set1 = x_train - x_test
print("Elements in train but not in test:", only_in_set1)

Elements in train but not in test: set()


In [None]:
only_in_set2 = x_test - x_train
print("Elements in test but not in train:", only_in_set2)

Elements in test but not in train: set()


In [None]:
interactions_test_df.head()

Unnamed: 0,name_encoded,userCode,price
3170,3,168,1214.4
6794,6,594,829.95
7770,7,373,263.41
1201,1,522,139.1
6554,6,312,2323.86


In [None]:
#Creating a sparse pivot table with users in rows and items in columns
items_users_pivot_matrix_df = interactions_train_df.pivot(index='userCode',
                                                          columns='name_encoded',
                                                          values='price').fillna(0)

items_users_pivot_matrix_df.head()

name_encoded,0,1,2,3,4,5,6,7,8
userCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,939.06,0.0,625.66,971.52,247.62,120.78,0.0,1843.87,0.0
1,0.0,0.0,0.0,0.0,247.62,0.0,0.0,0.0,0.0
2,0.0,278.2,938.49,485.76,495.24,422.73,497.97,1317.05,0.0
3,3130.2,834.6,0.0,2671.68,495.24,543.51,1161.93,0.0,416.08
4,2191.14,834.6,2189.81,0.0,742.86,362.34,0.0,1843.87,832.16


In [None]:
items_users_pivot_matrix = items_users_pivot_matrix_df.values
items_users_pivot_matrix[:10]

array([[ 939.06,    0.  ,  625.66,  971.52,  247.62,  120.78,    0.  ,
        1843.87,    0.  ],
       [   0.  ,    0.  ,    0.  ,    0.  ,  247.62,    0.  ,    0.  ,
           0.  ,    0.  ],
       [   0.  ,  278.2 ,  938.49,  485.76,  495.24,  422.73,  497.97,
        1317.05,    0.  ],
       [3130.2 ,  834.6 ,    0.  , 2671.68,  495.24,  543.51, 1161.93,
           0.  ,  416.08],
       [2191.14,  834.6 , 2189.81,    0.  ,  742.86,  362.34,    0.  ,
        1843.87,  832.16],
       [ 313.02,  417.3 ,    0.  ,    0.  ,    0.  ,    0.  ,  165.99,
        1317.05,  208.04],
       [1252.08,  695.5 ,  312.83,  728.64,    0.  ,   60.39,  497.97,
           0.  ,    0.  ],
       [   0.  ,  139.1 ,    0.  ,  728.64,  247.62,    0.  ,  165.99,
           0.  ,  208.04],
       [1878.12,  556.4 , 2502.64, 2914.56,    0.  ,    0.  ,  663.96,
         790.23, 1040.2 ],
       [ 939.06,  417.3 ,  938.49,  242.88,    0.  ,  483.12,  497.97,
           0.  ,  416.08]])

In [None]:
user_ids = list(items_users_pivot_matrix_df.index)
user_ids[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [None]:
items_users_pivot_matrix.shape

(1285, 9)

In [None]:
# The number of factors to factor the item-user matrix.
# NOTE(review): the pivot matrix has only 9 item columns (shape (1285, 9)). As far as
# the SciPy documentation for the default solver goes, k must be strictly smaller than
# the smaller matrix dimension, which would make 8 the largest usable value — verify.
# Keeping 8 of 9 factors leaves little room for the low-rank approximation to
# generalise; the reconstruction will be close to the input matrix.
NUMBER_OF_FACTORS_MF = 8

#Performs matrix factorization of the original user item matrix
# (rows are users, columns are encoded hotel names; values are summed prices)
U, sigma, Vt = svds(items_users_pivot_matrix, k = NUMBER_OF_FACTORS_MF)

In [None]:
sigma = np.diag(sigma)

In [None]:
U.shape,sigma.shape,Vt.shape

((1285, 8), (8, 8), (8, 9))

After the factorization, we reconstruct the original matrix by multiplying its factors back together. The resulting matrix is no longer sparse: it now contains predicted values for the user–hotel pairs that had no interaction, which we will exploit for recommendations.

In [None]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)
all_user_predicted_ratings

array([[ 9.39033853e+02, -2.18741349e-01,  6.25511254e+02, ...,
        -4.58414051e-01,  1.84356976e+03, -1.77057361e-01],
       [-2.69709826e-02, -2.25631042e-01, -1.53431347e-01, ...,
        -4.72852712e-01, -3.09696812e-01, -1.82634134e-01],
       [ 8.77283244e-01,  2.85539085e+02,  9.43480651e+02, ...,
         5.13350447e+02,  1.32712349e+03,  5.94052757e+00],
       ...,
       [ 4.37716435e-01,  1.11646180e+03,  2.19230006e+03, ...,
         1.99955400e+03,  5.02611961e+00,  2.96399888e+00],
       [-1.71349584e-01,  1.37666541e+02, -9.74766021e-01, ...,
        -3.00408467e+00,  5.24852462e+02,  2.06879705e+02],
       [ 1.40860497e-01,  1.17839611e+00,  8.01321040e-01, ...,
         1.68459553e+02,  5.28437444e+02,  4.17033838e+02]])

In [None]:
all_user_predicted_ratings.shape

(1285, 9)

In [None]:
hotel_df.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019


In [None]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings, columns = items_users_pivot_matrix_df.columns,index=user_ids).transpose()
cf_preds_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1327,1328,1330,1331,1332,1333,1334,1335,1337,1339
name_encoded,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,939.033853,-0.026971,0.877283,3131.144696,2191.70998,312.801302,1252.004137,-0.17345,1877.283855,940.30328,...,0.084619,0.321755,0.01702,0.176568,0.751868,-0.530439,-0.06489,0.437716,-0.17135,0.14086
1,-0.218741,-0.225631,285.539085,842.503043,839.368284,415.470438,694.865357,137.648972,549.405065,427.7009,...,0.707898,2.691703,139.242382,696.977116,1119.089897,273.762506,138.557152,1116.461803,137.666541,1.178396
2,625.511254,-0.153431,943.480651,5.374148,2193.05248,-1.24412,312.398436,-0.986713,2497.883375,945.562715,...,0.481378,627.490385,0.096821,1.004454,4.277192,622.64246,-0.369142,2192.300062,-0.974766,0.801321
3,971.158022,-0.373379,497.904887,2684.758137,7.890665,-3.027601,727.589777,726.238806,2902.98462,260.091649,...,486.931446,247.334292,1457.515617,731.084366,10.408666,1207.056732,-0.898317,1220.459635,483.38788,487.710037
4,247.317278,247.307743,505.396752,506.177228,749.458953,-2.531978,-0.8783,245.611885,-9.680474,14.394079,...,248.599679,3.725118,2476.397046,2230.62422,999.184753,1231.958835,494.488739,1490.787665,245.636199,496.870813


In [None]:
cf_preds_df.columns


Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
       ...
       1327, 1328, 1330, 1331, 1332, 1333, 1334, 1335, 1337, 1339],
      dtype='int64', length=1285)

In [None]:
class CFRecommender:
    """Serve top-N recommendations from a precomputed prediction matrix.

    Parameters
    ----------
    cf_predictions_df : pandas.DataFrame
        Predicted strengths with one row per item (encoded item id in the index)
        and one column per user id.
    items_df : pandas.DataFrame, optional
        Frame with at least the columns ``name_encoded`` and ``name``; only needed
        for ``verbose=True``. Defaults to ``None`` so the class no longer captures a
        notebook-level DataFrame at definition time (the previous default lacked the
        ``name_encoded`` column and could not serve verbose requests anyway).
    """

    MODEL_NAME = 'Collaborative Filtering'

    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df

    def get_model_name(self):
        """Return the human-readable model name."""
        return self.MODEL_NAME

    def recommend_items(self, user_id, items_to_ignore=None, topn=5, verbose=False):
        """Return the ``topn`` strongest predictions for ``user_id``.

        Parameters
        ----------
        user_id
            Column label of the user in ``cf_predictions_df``; an unknown id raises
            ``KeyError``.
        items_to_ignore : iterable, optional
            Encoded item ids to exclude (e.g. items already seen). ``None`` means
            nothing is excluded.
        topn : int
            Maximum number of items to return.
        verbose : bool
            When true, attach the item name from ``items_df``.

        Returns
        -------
        pandas.DataFrame
            Non-verbose: columns ``name_encoded`` and ``recStrength``, strongest first.
            Verbose: the same two columns indexed by ``name``, strongest first.

        Raises
        ------
        ValueError
            If ``verbose`` is true and no ``items_df`` was supplied.
        """
        ignored = [] if items_to_ignore is None else list(items_to_ignore)

        # Get and sort the user's predictions. rename_axis guarantees the item-id
        # column is called 'name_encoded' after reset_index, whatever the index name.
        sorted_user_predictions = (self.cf_predictions_df[user_id]
                                   .rename('recStrength')
                                   .rename_axis('name_encoded')
                                   .sort_values(ascending=False)
                                   .reset_index())

        # Recommend the highest predicted items that are not in the ignore list.
        # The frame is already sorted, so no second sort is needed.
        keep = ~sorted_user_predictions['name_encoded'].isin(ignored)
        recommendations_df = sorted_user_predictions[keep].head(topn)

        if verbose:
            if self.items_df is None:
                raise ValueError('"items_df" is required in verbose mode')

            # Reduce items_df to one row per (code, name) pair before merging, so the
            # join does not fan out to one row per booking.
            name_lookup = self.items_df[['name_encoded', 'name']].drop_duplicates()
            recommendations_df = (recommendations_df
                                  .merge(name_lookup, how='left', on='name_encoded')
                                  .groupby('name')[['name_encoded', 'recStrength']]
                                  .max()
                                  .sort_values('recStrength', ascending=False))

        return recommendations_df

# Build the recommender from the SVD predictions computed above (cf_preds_df).
# interactions_from_selected_users_df is passed as items_df because it carries both
# the 'name_encoded' and 'name' columns that verbose mode merges on.
cf_recommender_model = CFRecommender(cf_preds_df, interactions_from_selected_users_df)


In [None]:
import pickle

In [None]:
with open('cf_recommender_model.pkl', 'wb') as f:
    pickle.dump(cf_recommender_model, f)

In [None]:
interactions_from_selected_users_df.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date,name_encoded
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019,0
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10/10/2019,7
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019,7
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12/12/2019,7
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019,0


In [None]:
cf_recommender_model.recommend_items(1,verbose=True)

Unnamed: 0_level_0,name_encoded,recStrength
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Hotel BP,4,247.307743
Hotel BW,5,8.755344
Hotel A,0,-0.026971
Hotel AU,2,-0.153431
Hotel Z,8,-0.182634


###**Evaluation**

In recommender systems, there is a set of metrics commonly used for evaluation.

We chose to work with Top-N accuracy metrics, which evaluate the accuracy of the top recommendations provided to a user by comparing them with the items the user has actually interacted with in the test set.

This evaluation method works as follows:

For each user

For each item the user has interacted in test set

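Sample 100 other items the user has never interacted with. (This dataset contains only 9 hotels, so the code below uses every hotel the user has not interacted with instead of a sample of 100.)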

Ask the recommender model to produce a ranked list of recommended items, from a set composed of one interacted item and the 100 non-interacted items

Compute the Top-N accuracy metrics for this user and interacted item from the recommendations ranked list

Aggregate the global Top-N accuracy metrics

In [None]:
#Indexing by userId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('userCode')
interactions_train_indexed_df = interactions_train_df.set_index('userCode')
interactions_test_indexed_df = interactions_test_df.set_index('userCode')

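The Top-N accuracy metric chosen was Recall@N, which evaluates whether the interacted item is among the top N items (a hit) in the ranked list of candidate recommendations for a user (101 candidates in the original setup: the interacted item plus 100 sampled non-interacted items).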

In [None]:
def get_items_interacted(userCode, interactions_df):
    """Return the set of encoded item ids that ``userCode`` has interacted with.

    Parameters
    ----------
    userCode
        Index label of the user in ``interactions_df``; an unknown label raises
        ``KeyError``.
    interactions_df : pandas.DataFrame
        Interactions indexed by user, with a ``name_encoded`` column.

    Returns
    -------
    set
        The user's ``name_encoded`` values, in the column's own dtype.
    """
    # Selecting with a list always yields a Series of the 'name_encoded' column, even
    # when the user has a single row. A scalar selector would return that row as a
    # mixed-dtype Series and upcast the integer item code to float.
    interacted_items = interactions_df.loc[[userCode], 'name_encoded']
    return set(interacted_items)

In [None]:
#Top-N accuracy metrics consts
# Maximum number of non-interacted items ranked against each held-out item
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 10

class ModelEvaluator:
    """Compute Recall@2 and Recall@3 for a recommender on the held-out test set.

    Reads the notebook-level frames ``interactions_test_indexed_df`` and
    ``interactions_train_indexed_df`` (both indexed by ``userCode`` with a
    ``name_encoded`` column) and the helper ``get_items_interacted``.
    """

    # Function for getting the set of items which a user has not interacted with
    def get_not_interacted_items_sample(self, userCode, sample_size, seed=42):
        """Return items the user did not interact with in the test set.

        When more than ``sample_size`` such items exist, a reproducible random
        sample of exactly ``sample_size`` of them is returned (seeded with ``seed``);
        otherwise all of them are returned. Previously ``sample_size`` and ``seed``
        were accepted but ignored.
        """
        interacted_items = get_items_interacted(userCode, interactions_test_indexed_df)
        all_items = set(interactions_test_indexed_df['name_encoded'])
        non_interacted_items = all_items - set(interacted_items)

        if sample_size is not None and 0 <= sample_size < len(non_interacted_items):
            # Sort first so the draw depends only on the seed, not on set ordering;
            # int() because the seed may arrive as a NumPy integer.
            rng = random.Random(None if seed is None else int(seed))
            return set(rng.sample(sorted(non_interacted_items), sample_size))

        return non_interacted_items

    # Function to verify whether a particular item_id was present in the set of top N recommended items
    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        """Return ``(hit, index)`` for ``item_id`` in the ranked ``recommended_items``.

        ``index`` is the zero-based rank of the first match, or ``-1`` when the item
        is absent; ``hit`` is 1 when ``0 <= index < topn`` and 0 otherwise.
        """
        index = next((i for i, c in enumerate(recommended_items) if c == item_id), -1)
        hit = int(0 <= index < topn)
        return hit, index

    # Function to evaluate the performance of model for each user
    def evaluate_model_for_user(self, model, userCode):
        """Return hit counts and recall@2 / recall@3 for one user.

        ``model`` must provide ``recommend_items(user_id, items_to_ignore, topn)``
        returning a frame with a ``name_encoded`` column ordered best-first.
        Raises ``KeyError`` if the user is missing from the train or test frame.
        """
        # Getting the items in test set. The list selector always yields a Series,
        # so users with a single test row need no special case.
        person_interacted_items_testset = set(
            interactions_test_indexed_df.loc[[userCode], 'name_encoded'])
        interacted_items_count_testset = len(person_interacted_items_testset)

        # Getting a ranked recommendation list from the model for a given user.
        # Items seen in training are excluded; topn=10000 is meant as "no limit".
        person_recs_df = model.recommend_items(
            userCode,
            items_to_ignore=get_items_interacted(userCode, interactions_train_indexed_df),
            topn=10000)

        hits_at_2_count = 0
        hits_at_3_count = 0

        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:

            # Getting the (possibly sampled) items the user has not interacted with
            non_interacted_items_sample = self.get_not_interacted_items_sample(
                userCode,
                sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=item_id%(2**32))

            # Combining the current interacted item with the non-interacted items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            # Filtering only recommendations that are either the interacted item or
            # one of the non-interacted candidates
            valid_recs_df = person_recs_df[person_recs_df['name_encoded'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['name_encoded'].values

            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_2, index_at_2 = self._verify_hit_top_n(item_id, valid_recs, 2)
            hits_at_2_count += hit_at_2
            hit_at_3, index_at_3 = self._verify_hit_top_n(item_id, valid_recs, 3)
            hits_at_3_count += hit_at_3

        # Recall is the rate of the interacted items that are ranked among the Top-N recommended items
        recall_at_2 = hits_at_2_count / float(interacted_items_count_testset)
        recall_at_3 = hits_at_3_count / float(interacted_items_count_testset)

        person_metrics = {'hits@2_count':hits_at_2_count,
                          'hits@3_count':hits_at_3_count,
                          'interacted_count': interacted_items_count_testset,
                          'recall@2': recall_at_2,
                          'recall@3': recall_at_3}
        return person_metrics

    # Function to evaluate the performance of model at overall level
    def evaluate_model(self, model):
        """Evaluate ``model`` for every user in the test set.

        Returns ``(global_metrics, detailed_results_df)``: a dict with the model name
        and the global recall@2 / recall@3 (total hits over total held-out items), and
        a per-user frame sorted by ``interacted_count`` in descending order.
        """
        people_metrics = []

        for person_id in interactions_test_indexed_df.index.unique():
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)

        # Report the number of users, not the last loop index (which was one short
        # and undefined when the test set was empty).
        print('%d users processed' % len(people_metrics))

        detailed_results_df = pd.DataFrame(people_metrics).sort_values('interacted_count', ascending=False)

        global_recall_at_2 = detailed_results_df['hits@2_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_3 = detailed_results_df['hits@3_count'].sum() / float(detailed_results_df['interacted_count'].sum())

        global_metrics = {'modelName': model.get_model_name(),
                          'recall@2': global_recall_at_2,
                          'recall@3': global_recall_at_3}
        return global_metrics, detailed_results_df


model_evaluator = ModelEvaluator()

In [None]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)

print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...
1284 users processed

Global metrics:
{'modelName': 'Collaborative Filtering', 'recall@2': np.float64(0.894020618556701), 'recall@3': np.float64(0.9509278350515464)}


Unnamed: 0,hits@2_count,hits@3_count,interacted_count,recall@2,recall@3,_person_id
937,3,3,3,1.0,1.0,251
228,3,3,3,1.0,1.0,748
242,3,3,3,1.0,1.0,230
243,3,3,3,1.0,1.0,70
900,3,3,3,1.0,1.0,204
33,3,3,3,1.0,1.0,219
197,3,3,3,1.0,1.0,330
535,3,3,3,1.0,1.0,292
263,3,3,3,1.0,1.0,852
892,3,3,3,1.0,1.0,271


**Evaluating the Collaborative Filtering model (SVD matrix factorization), we observe that we got Recall@2 (89%) and Recall@3 (95%)**