# MultiCriteria Recommender System

This recommender system is based on the tripadvisor multicriteria dataset

In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


# Load the CSV file after replacing backslashes with forward slashes
df = pd.read_csv("C:/Users/shash/Documents/RS_NIT_SIKKIM/Datasets/TripAdvisor.csv")

print(df.shape,df.iloc[0,3])
df.columns

(21826, 6) 5


Index(['review_id', 'member_id', 'hotel_id', 'rating', 'recommend_list',
       'review_text'],
      dtype='object')

In [89]:
df.head()

Unnamed: 0,review_id,member_id,hotel_id,rating,recommend_list,review_text
0,review_100005210,BD140084F7ECD3FC293E888ECB7DE217,Traders_Hotel_Singapore,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...,Stayed here for 1 night en route to Australia....
1,review_100006218,BD140084F7ECD3FC293E888ECB7DE217,PARKROYAL_on_Kitchener_Road,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...,"<span class=""bold_review"">Stayed here coming b..."
2,review_10001396,34C7665DF1460A0AC20CEF543F537827,Royal_Adelphi_Hotel,5,5.0:Value;5.0:Location;5.0:Check in / front de...,I have stayed at this hotel on a number of occ...
3,review_100015116,0051648FC0754F9E7117225BAF68773D,Premier_Inn_London_Southwark_Borough_Market,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...,This was my third stay at Southwark and was ha...
4,review_10002709,8383CBD7FF2EEC00F1A2C64E6B6E7C4F,Falcon_Hotel,4,3:Value;5:Location;4:Check in / front desk;3:R...,We have just returned from a 3 night stay at t...


## Preprocessing

In [90]:
# There is a overall rating and 6 class(Value, Location, Sleep Quality, Rooms, Cleanliness, Service) for rating the hotel in recommend_list column
# Some users also rated for the Checkin and business service (so we are removing those ratings for consistency among datasets)
df=df[~df["recommend_list"].str.contains("check",case=False)]
df=df[~df["recommend_list"].str.contains("business",case=False)]
# we are removing duplication member,hotel pairs (removing users who have rated the same hotel many times)
df.drop_duplicates(subset=["member_id","hotel_id"],inplace=True)

# now dropping the irrelevant columns from our dataset
df.drop(["review_id","review_text"],inplace=True,axis=1)
df.to_csv("C:/Users/shash/Documents/RS_NIT_SIKKIM/Datasets/TripAdvisorClean.csv",index=False)
print(df.shape)
df.head()


(19600, 4)


Unnamed: 0,member_id,hotel_id,rating,recommend_list
0,BD140084F7ECD3FC293E888ECB7DE217,Traders_Hotel_Singapore,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...
1,BD140084F7ECD3FC293E888ECB7DE217,PARKROYAL_on_Kitchener_Road,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...
3,0051648FC0754F9E7117225BAF68773D,Premier_Inn_London_Southwark_Borough_Market,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...
5,14F9DD9816194FF75FDC26CC2B4D08D5,W_Hong_Kong,5,4:Value;4:Location;5:Sleep Quality;5:Rooms;5:C...
6,BF16241AA20DB4E9A6673F95285B1728,Hilton_London_Metropole,5,4:Value;4:Location;4:Sleep Quality;4:Rooms;5:C...


## Visualizing the dataset


In [91]:
def stats(df):
    print("users:",df["member_id"].value_counts().size)
    print("Hotels:",df["hotel_id"].value_counts().size)
    print("Reviews:",len(df),'\n')

In [92]:
stats(df)

users: 3446
Hotels: 1811
Reviews: 19600 



## Random Training, Test split

In [93]:
from sklearn.model_selection import train_test_split
# this function splits the dataset into train, test
def get_dataset(df):
    count=0
    while True:
        train,test=train_test_split(df,test_size=0.3)
        test=removeMissing(test,train)
        count+=1

        # stop when both test, train have enough candidates or stop after 100 trials
        if(len(test)>0 and len(train)>0 or count>100):
            break
    return train,test

# remove those users/hotels which are in test but on in the train set
def removeMissing(test,train):
    hotels_missing=~test["hotel_id"].isin(train["hotel_id"])
    users_missing=~test["member_id"].isin(train["member_id"])
    # remove this hotels from test
    new_test=test[~hotels_missing]
    new_test=test[~users_missing]
    return new_test





In [94]:
train, test=get_dataset(df)
print("Dataset: ",len(df))
print("Train: ",len(train))
print("Test: ",len(test),'\n')
print('***train***')
stats(train)
print('***test***')
stats(test)


Dataset:  19600
Train:  13720
Test:  5808 

***train***
users: 3418
Hotels: 1717
Reviews: 13720 

***test***
users: 2868
Hotels: 1440
Reviews: 5808 



In [95]:
train.head()

Unnamed: 0,member_id,hotel_id,rating,recommend_list
13238,09468CA7D158680AF427569074BD0049,Doubletree_by_Hilton_Hotel_London_Westminster,5,5:Value;5:Location;5:Sleep Quality;5:Rooms;5:C...
8570,A30E50C8C8AB34FA8ECFA14869E191AB,Holiday_Inn_Express_Causeway_Bay,3,4.0:Value;4.0:Location;2.0:Sleep Quality;3.0:R...
19440,7E7FAEE9B44AD589D73BD65821B2200D,Dream_New_York,4,4:Value;5:Location;5:Rooms;5:Cleanliness;3:Ser...
12667,218A47614781DB8C1FA53DC1E3C4793F,Holiday_Inn_Express_HONG_KONG_KOWLOON_EAST,4,4.0:Value;3.0:Location;4.0:Sleep Quality;4.0:R...
13664,4D02267895460D08DF4D71E34D51D7E1,Hilton_London_Docklands,2,3.0:Value;1.0:Location;3.0:Sleep Quality;4.0:R...


## Generating User X Hotel Utility Matrix

In [96]:
def gen_dataset(df):
    df1=df.copy()
    df1["rating"]=df1["rating"].astype(str)
    df1["recommend_list"]=df1["recommend_list"].astype(str)
    df1["ratings"]=df1["rating"]+":Overall;"+df1["recommend_list"]
    df1.drop(["recommend_list","rating"],inplace=True,axis=1)

    ratings_split = df1['ratings'].str.split(';', expand=True)
    df_overall=df1[['member_id','hotel_id']]
    df_overall.insert(loc=2,column="rating",value=ratings_split[0])
    df_overall["rating"]=pd.to_numeric(df_overall["rating"].str.split(":").str[0])
    df_overall=pd.pivot(df_overall,index="member_id",columns="hotel_id",values="rating")

    df_value=df1[['member_id','hotel_id']]
    df_value.insert(loc=2,column="rating",value=ratings_split[1])
    df_value["rating"]=pd.to_numeric(df_value["rating"].str.split(":").str[0])
    df_value=pd.pivot(df_value,index="member_id",columns="hotel_id",values="rating")

    df_location=df1[['member_id','hotel_id']]
    df_location.insert(loc=2,column="rating",value=ratings_split[2])
    df_location["rating"]=pd.to_numeric(df_location["rating"].str.split(":").str[0])
    df_location=pd.pivot(df_location,index="member_id",columns="hotel_id",values="rating")


    df_sleep_quality=df1[['member_id','hotel_id']]
    df_sleep_quality.insert(loc=2,column="rating",value=ratings_split[3])
    df_sleep_quality["rating"]=pd.to_numeric(df_sleep_quality["rating"].str.split(":").str[0])
    df_sleep_quality=pd.pivot(df_sleep_quality,index="member_id",columns="hotel_id",values="rating")

    df_rooms=df1[['member_id','hotel_id']]
    df_rooms.insert(loc=2,column="rating",value=ratings_split[4])
    df_rooms["rating"]=pd.to_numeric(df_rooms["rating"].str.split(":").str[0])
    df_rooms=pd.pivot(df_rooms,index="member_id",columns="hotel_id",values="rating")

    df_Cleanliness=df1[['member_id','hotel_id']]
    df_Cleanliness.insert(loc=2,column="rating",value=ratings_split[5])
    df_Cleanliness["rating"]=pd.to_numeric(df_Cleanliness["rating"].str.split(":").str[0])
    df_Cleanliness=pd.pivot(df_Cleanliness,index="member_id",columns="hotel_id",values="rating")

    df_service=df1[['member_id','hotel_id']]
    df_service.insert(loc=2,column="rating",value=ratings_split[6])
    df_service["rating"]=pd.to_numeric(df_service["rating"].str.split(":").str[0])
    df_service=pd.pivot(df_service,index="member_id",columns="hotel_id",values="rating")
    dfs=[df_overall,df_value,df_rooms,df_service,df_location,df_Cleanliness,df_sleep_quality]
    return dfs
    


In [97]:
# it is list containing dataframe for each criteria
# overall, value, rooms, service, location, cleanliness, sleep_quality 
train_dfs=gen_dataset(train)
test_dfs=gen_dataset(test)
    
train_dfs[0].head()

hotel_id,130_Queen_s_Gate_Apartments,196_Bishopsgate,1_Lexham_Gardens,3_West_Club,414_Hotel,44_Curzon_Street_Apartments,45_Park_Lane,51_Buckingham_Gate_Taj_Suites_and_Residences,54_Boutique_Hotel,60_Thompson_A_Thompson_Hotel,...,Wyndham_Garden_Chinatown,Wyndham_Garden_Hotel_Manhattan_Chelsea_West,Wyndham_Grand_Chicago_Riverfront,Wynn_s_Hotel,Xi_Hotel,YOTEL_New_York_at_Times_Square_West,YWCA_Fort_Canning_Lodge,Yi_Serviced_Apartments,York_Albany,York_Hotel
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000E04BC8F25CC3913FA914348589098,,,,,,,,,,,...,,,,,,,,,,
001A83233916A080AFEE0530D141E22E,,,,,,,,,,,...,,,,,,,,,,
002FE3426FEF527838C2641FA6744230,,,,,,,,,,,...,,,,,,,,,,
0031C3A6C8E230F8F3F06FCCC8DADCDE,,,,,,,,,,,...,,,,,,,,,,
003BEF8797D24D9C7F2A2EEE5B9A1FAB,,,,,,,,,,,...,,,,,,,,,,


## Normailization

In [10]:
# we are normalizing the ratings for each user by subtracting every user rating by average mean rating
normalized_train_dfs=[]
for i in range(len(train_dfs)):
    normalized_train_dfs.append(train_dfs[i].subtract(train_dfs[i].mean(axis=1),axis=0))

# when we use similarity criteria like pearson correlation and distance etc we need to impute the missing values 
# so we will impute the missing values with average rating of users or average rating of items but as we have already normalized
# we will impute the values with zeros( because the new average is 0)
# normalized_ratings_matrix=normalized_ratings_matrix.fillna(0)
# normalized_ratings_matrix.head()
# normalized_ratings_matrix.shape

## Similarity Functions

In [98]:
def chebyshev_similarity(df):
    users=df.index.tolist()
    num_rows=df.shape[0]
    mat=df.values
    sim=pd.DataFrame(index=df.index,columns=df.index)
    for i in range(num_rows):
        a=np.abs(mat-mat[i])
        b=np.nanmax(a,axis=1)
        col=1/(1+b)
        sim[users[i]]=col
    return sim

In [99]:
def custom_row_nansum(matrix):
    nan_mask = np.isnan(matrix)
    row_sums = np.where(np.all(nan_mask, axis=1), np.nan, np.nansum(matrix, axis=1))
    return row_sums
def manhattan_similarity(df):
    users=df.index.tolist()
    num_rows=df.shape[0]
    mat=df.values
    sim=pd.DataFrame(index=df.index,columns=df.index)
    for i in range(num_rows):
        print(i)
        col=custom_row_nansum(np.abs(mat-mat[i]))
        col=1/(1+col)
        sim[users[i]]=col
    return sim

In [13]:
from sklearn.metrics import pairwise_distances
def euclidean(df):
    sim=pairwise_distances(df,n_jobs=-1,force_all_finite=False,metric="nan_euclidean")
    sim=1/(1+sim)
    sim=pd.DataFrame(sim,index=df.index,columns=df.index)

    return sim


In [14]:
def similarity(df,metric):
    if metric=="euclidean" :
        return  euclidean(df)
    elif metric=="manhattan" :
        return manhattan_similarity(df)
    elif metric=="chebyshev":        
        return chebyshev_similarity(df)
    elif metric=="pearson":
        return df.T.corr()
        pass
    else:
        print("enter a valid metric")



In [100]:
def similarity_average(dfs,metric):
    sim=[]
    for df in dfs:
        sim.append(similarity(df,metric))
    dfs_array = np.stack([df.values for df in sim])
    average_array = np.mean(dfs_array, axis=0)
    average_sim = pd.DataFrame(average_array, columns=sim[0].index, index=sim[0].index)
    return average_sim

In [101]:
euclidean_sim_single=similarity(train_dfs[0],"euclidean")

In [17]:
pearson_sim_single=similarity(train_dfs[0],"pearson")

In [16]:
manhattan_sim_single=similarity(train_dfs[0],"manhattan")

In [29]:
euclidean_average=similarity_average(train_dfs,"euclidean")

In [30]:
chebyshev_sim_single=similarity(train_dfs[0],"chebyshev")

  col=1/(1+np.nanmax(np.abs(mat-mat[i]),axis=1))


In [32]:
chebyshev_sim_single.head(10)

member_id,000E04BC8F25CC3913FA914348589098,001A83233916A080AFEE0530D141E22E,002FE3426FEF527838C2641FA6744230,0031C3A6C8E230F8F3F06FCCC8DADCDE,003BEF8797D24D9C7F2A2EEE5B9A1FAB,003F2C5811E0919048D0C4BE99CE79E7,004095471C5136260CC39A9C819C3B1A,0051648FC0754F9E7117225BAF68773D,006A07FD14B6E7DEC274FF5FCC0E63D6,006B50D885BA657AE6CE53351D06CB28,...,FF74A2BF92C0CBFEA1266306B80107B9,FF7E4942F271CD121E06B15C2DE02765,FF9981615747FA080E127D7EDBB77807,FF9E7202742B15F825A7F9F769DDE818,FFA6462F1CF28F25EE71C572250A0206,FFC97C249C7A60F0FB5689F65308539B,FFD7FAD84974ADBC98AFB23ECE2DE686,FFDED38AAC83231EA9B8328B972B83EE,FFFAE78A74255FB995AEFABC6F94564A,FFFDAD2B0C2DA2BE9917C12D20FFD4CF
member_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000E04BC8F25CC3913FA914348589098,1.0,,1.0,,,0.333333,,,,,...,,,,,,,,,,
001A83233916A080AFEE0530D141E22E,,1.0,,,,,,,,,...,,,,,,,,,,
002FE3426FEF527838C2641FA6744230,1.0,,1.0,,,,,,,,...,,,,,,,,,,
0031C3A6C8E230F8F3F06FCCC8DADCDE,,,,1.0,,,,,,,...,,,,,,,,,,
003BEF8797D24D9C7F2A2EEE5B9A1FAB,,,,,1.0,,,,,,...,,,,,,,,,,
003F2C5811E0919048D0C4BE99CE79E7,0.333333,,,,,1.0,,,,,...,,,,,,,,,,
004095471C5136260CC39A9C819C3B1A,,,,,,,1.0,,,,...,,,,,,,,,,
0051648FC0754F9E7117225BAF68773D,,,,,,,,1.0,,,...,,,,,,,,,,
006A07FD14B6E7DEC274FF5FCC0E63D6,,,,,,,,,1.0,,...,,,,,,,,,,
006B50D885BA657AE6CE53351D06CB28,,,,,,,,,,1.0,...,,,,,,,,,,


In [39]:

# yeh hamara rating predict karne ka function hai jisme user jaayega aur ek item jaayega aur rating predict karni hai
# yaha ratings 0-5 tak hai toh avergae rating is 2.5
def calculate_score(user,item,sim_matrix):
    # if item phle se training set me nahi hai toh apun nahi jaante uske baare me kuch toh direct avergae rating 2.5 return kardo
    if item not in train_dfs[0].columns:
        return 2.5
    # yaha apun user u ki similarity sabke saath nikaalenge and user u ki similiariy user u ke saath hi drop kardenge
    # extract the similarity scores of all users with the user u and remove similarity with itself
    similarity_scores=sim_matrix[user].drop(labels=user)
    # ab hum i ko kisne kisne rate kia hai woh nikalenge and usme se user u ne joh rate kia hai usko nikaal denge
    # extract the ratings of the items i given by all users and remove the rating given by u user
    normalized_ratings=normalized_train_dfs[0][item].drop(index=user)
    if similarity_scores.isna().all():
        return 2.5
    # aur nahi toh weighted average nikaal do bachche hue users ka
    # rating_pred=summation(ratings*simililarities)/summation(similariites)
    ans=np.nansum(similarity_scores*normalized_ratings)/np.nansum(similarity_scores)
    if ans==0: return 2.5
    avg_user_rating=train_dfs[0].T.mean()[user]
    return avg_user_rating+ans



    

In [40]:
# isme hum saare test ratings ko e array me bhar rahe hai
test_ratings=np.array(test['rating'])
print(test_ratings.shape)
# abhi saare users and items ka pair bana rahe hai from test set
user_item_pairs=zip(test["member_id"],test["hotel_id"])
user_item_pairs=list(user_item_pairs)
# user_item_pairs=user_item_pairs[0:5000]
# test_ratings=test_ratings[0:5000]

(3904,)


In [41]:

from sklearn.metrics import mean_squared_error
# abhi joh function banaya tha upar prediciton ka (calculate score ) usme yeh saare pairs daalenge aur pred_rating nikaalenge
pred_ratings=np.array([calculate_score(user_id,item_id,euclidean_average) for user_id,item_id in user_item_pairs])
# abhi mrean_squared error nikaal denge actual ratings ka and predicted ratings ka
contains_nan = np.isnan(pred_ratings).any()
print(contains_nan)
print(np.sqrt(mean_squared_error(test_ratings,pred_ratings)))

False
1.4518555428479198


In [71]:
a=np.array([[1,1,1],[1,2,4],[1,np.nan,5]])
a=pd.DataFrame(a)
a=a.subtract(a.mean(axis=1),axis=0)
a=a.fillna(0)
C=a.T.corr()
# b=np.array([4,np.nan])
# print()
# print(np.nansum(a*b))
C.head()

Unnamed: 0,0,1,2
0,,,
1,,1.0,0.981981
2,,0.981981,1.0
