In [1]:
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import dump

import os
import numpy as np
import pandas as pd

In [2]:
ratings_df = pd.read_csv('./csv_output/ratingInfo.csv')

ratings_df = ratings_df[['userID', 'listing_id', 'rating']]
# ratings_df = ratings_df.groupby(['userID', 'listing_id']).agg({'estimated_rating_flair': 'mean'}).reset_index()
ratings_df.columns = ['user_id','item_id','rating']
ratings_df = ratings_df.drop(ratings_df[ratings_df.rating == 0].index)

 # convert to surprise format
reader = Reader(rating_scale=(0,5)) # assumes datafile contains: user, item, ratings (in this order)
data = Dataset.load_from_df(ratings_df, reader)

In [3]:
svd_param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30, 50],
    'lr_all': [0.002, 0.005, 0.007],
    'reg_all': [0.02, 0.04, 0.06],
    'lr_bu': [0.005, 0.01],
    'lr_bi': [0.005, 0.01],
    'reg_bu': [0.01, 0.02],
    'reg_bi': [0.01, 0.02],
    'lr_pu': [0.005, 0.01],
    'lr_qi': [0.005, 0.01],
    'reg_pu': [0.01, 0.02],
    'reg_qi': [0.01, 0.02]
}

nmf_param_grid = {
    'n_factors': [10, 20, 30, 40, 50,100],
    'n_epochs': [50, 100, 150],
    'lr_bu': [0.005, 0.01],
    'lr_bi': [0.005, 0.01],
    'reg_pu': [0.01, 0.02],
    'reg_qi': [0.01, 0.02],
    'reg_bu': [0.01, 0.02],
    'reg_bi': [0.01, 0.02]
}

models = {'SVD': SVD, 'NMF': NMF}

best_model = None
best_params = None
best_mae = float('inf')
best_model_name = ""

for model_name, model_class in models.items():
    if model_name == 'SVD':
        param_grid = svd_param_grid
    elif model_name == 'NMF':
        param_grid = nmf_param_grid
    
    gs = GridSearchCV(model_class, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)
    
    print(f"\nResults for {model_name}:")
    print("Best parameters: ", gs.best_params['mae'])
    print("Best MAE: ", gs.best_score['mae'])

    if gs.best_score['mae'] < best_mae:
        best_mae = gs.best_score['mae']
        best_model = gs.best_estimator['mae']
        best_params = gs.best_params['mae']
        best_model_name = model_name

print(f"\nOverall best model: {best_model_name} with parameters {best_params} and MAE = {best_mae}")



Results for SVD:
Best parameters:  {'n_factors': 100, 'n_epochs': 50, 'lr_all': 0.007, 'reg_all': 0.02, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.02, 'reg_bi': 0.01, 'lr_pu': 0.01, 'lr_qi': 0.01, 'reg_pu': 0.01, 'reg_qi': 0.02}
Best MAE:  0.6341113734545956

Results for NMF:
Best parameters:  {'n_factors': 100, 'n_epochs': 50, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_pu': 0.01, 'reg_qi': 0.01, 'reg_bu': 0.01, 'reg_bi': 0.01}
Best MAE:  0.6542914596567718

Overall best model: SVD with parameters {'n_factors': 100, 'n_epochs': 50, 'lr_all': 0.007, 'reg_all': 0.02, 'lr_bu': 0.005, 'lr_bi': 0.005, 'reg_bu': 0.02, 'reg_bi': 0.01, 'lr_pu': 0.01, 'lr_qi': 0.01, 'reg_pu': 0.01, 'reg_qi': 0.02} and MAE = 0.6341113734545956


In [31]:

trainset = data.build_full_trainset()
test_model=best_model
test_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fadc7dfb690>

In [32]:
from surprise import dump
model_filename = './fileOutput/best_matrix_factorization_model.surprise'
file_name = os.path.expanduser(model_filename)
dump.dump(file_name, algo=test_model)
# joblib.dump(best_model, model_file_name)
print(f"Model saved to {file_name}")

Model saved to ./fileOutput/best_matrix_factorization_model.surprise


In [11]:
# import joblib
from surprise import dump
model_filename = './fileOutput/best_matrix_factorization_model.surprise'
file_name = os.path.expanduser(model_filename)
dump.dump(file_name, algo=best_model)
# joblib.dump(best_model, model_file_name)
print(f"Model saved to {file_name}")

Model saved to ./fileOutput/best_matrix_factorization_model.surprise


In [35]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True) # sort on predicted rating
        top_n[uid] = user_ratings[:n]
    return top_n
\

# def recommend(df, model,user_id):
#     recommend_default_topn=10
#     if 'user_id' not in df.columns or 'item_id' not in df.columns:
#         raise ValueError("DataFrame must contain 'user_id' and 'item_id' columns")
    
#     # trainset = data.build_full_trainset()
#     # model.trainset=trainset
#     user_data = df[df['user_id'] == user_id]
#     testset = list(user_data.itertuples(index=False, name=None))

#     predictions = model.test(testset)
#     top_n = get_top_n(predictions, n=recommend_default_topn)
#     return top_n

def recommend(df, model, user_id, top_n=10):
    if 'user_id' not in df.columns or 'item_id' not in df.columns:
        raise ValueError("DataFrame must contain 'user_id' and 'item_id' columns")
    
    # 获取所有唯一的物品ID
    item_ids = df['item_id'].unique()
    
    # 存储预测的评分
    predictions = defaultdict(float)
    
    # 对于每个物品ID，预测用户的评分
    for item_id in item_ids:
        # uid = str(user_id)
        # iid = str(item_id)
        # model.predict() 返回的是一个Prediction对象，需要通过.est获取预测评分
        predictions[item_id] = model.predict(user_id, item_id, verbose=True).est
    
    # 根据评分对物品进行排序，取前top_n个
    recommended_items = sorted(predictions.items(), key=lambda x: x[1], reverse=True)[:top_n]
    
    # 返回推荐物品的ID和预测评分
    return recommended_items

In [37]:
# model = joblib.load(model_file_name)
model_filename = './fileOutput/best_matrix_factorization_model.surprise'
file_name = os.path.expanduser(model_filename)
trainset, model = dump.load(file_name)
# model.trainset=trainset
rating_info = pd.read_csv('./csv_output/ratingInfo.csv')
item_info=pd.read_csv('./dataSource/final_data.csv')



# 创建一个新的DataFrame，为特定用户生成推荐

rating_info = rating_info[['userID', 'listing_id', 'rating']]
# ratings_df = ratings_df.groupby(['userID', 'listing_id']).agg({'estimated_rating_flair': 'mean'}).reset_index()
rating_info.columns = ['user_id','item_id','rating']
rating_info = rating_info.drop(rating_info[rating_info.rating == 0].index)

# reader = Reader(rating_scale=(0,5)) # assumes datafile contains: user, item, ratings (in this order)
# data = Dataset.load_from_df(rating_info, reader)
# trainset = data.build_full_trainset()

recommendations = recommend(rating_info,model,5)
recommendations

user: 5          item: 71609      r_ui = None   est = 4.67   {'was_impossible': False}
user: 5          item: 3209752    r_ui = None   est = 4.48   {'was_impossible': False}
user: 5          item: 3304326    r_ui = None   est = 4.59   {'was_impossible': False}
user: 5          item: 3667894    r_ui = None   est = 4.48   {'was_impossible': False}
user: 5          item: 3717196    r_ui = None   est = 4.28   {'was_impossible': False}
user: 5          item: 71896      r_ui = None   est = 3.93   {'was_impossible': False}
user: 5          item: 3802621    r_ui = None   est = 4.29   {'was_impossible': False}
user: 5          item: 3803160    r_ui = None   est = 4.26   {'was_impossible': False}
user: 5          item: 3980202    r_ui = None   est = 4.78   {'was_impossible': False}
user: 5          item: 3981252    r_ui = None   est = 4.29   {'was_impossible': False}
user: 5          item: 71903      r_ui = None   est = 4.33   {'was_impossible': False}
user: 5          item: 3990431    r_ui = No

[(1941719, 5),
 (6287204, 5),
 (16044788, 5),
 (24979153, 5),
 (28264364, 4.996494226982046),
 (4541183, 4.98494562290707),
 (31003554, 4.982231646421834),
 (12702541, 4.980697755075249),
 (675870259202962455, 4.97545886946243),
 (17601150, 4.957347009186385)]

In [40]:
recommendations_df = pd.DataFrame(recommendations, columns=['id', 'est_rating'])


recommended_items_info = item_info[item_info['id'].isin(recommendations_df['id'])]

recommended_items_info = recommended_items_info.merge(recommendations_df, on='id').sort_values(by='est_rating', ascending=False)
recommended_items_info=recommended_items_info.drop('est_rating', axis=1)
recommended_items_info


Unnamed: 0,id,name,details,neighbourhood_cleansed,neighbourhood_group_cleansed,picture_url,latitude,longitude,room_type,price,...,stove,fan,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,listing_url
0,1941719.0,Condo in Downtown Core,1 bedroom · 1 bed · 1 shared bath,Downtown Core,Central Region,https://a0.muscache.com/pictures/ba895238-3fb4...,1.27525,103.84524,Private room,3570.0,...,1.0,0.0,4.51,4.62,4.63,4.67,4.67,4.85,4.46,https://www.airbnb.com/rooms/1941719
2,6287204.0,Condo in Woodlands,1 bedroom · 2 beds · 1 bath,Woodlands,North Region,https://a0.muscache.com/pictures/81317317/1934...,1.43012,103.78434,Entire home/apt,5400.0,...,1.0,1.0,4.8,4.8,4.6,5.0,4.9,4.5,4.3,https://www.airbnb.com/rooms/6287204
4,16044790.0,Rental unit in Newton,1 bedroom · 1 bed · 1 private bath,Newton,Central Region,https://a0.muscache.com/pictures/5e2ce849-45d1...,1.30641,103.84045,Private room,3180.0,...,1.0,0.0,4.36,4.31,3.85,4.62,4.62,4.85,4.23,https://www.airbnb.com/rooms/16044788
5,24979150.0,Rental unit in Bukit Merah,1 bedroom · 1 bed · 1.5 shared baths,Bukit Merah,Central Region,https://a0.muscache.com/pictures/802c2389-ecd4...,1.28286,103.83403,Private room,1800.0,...,0.0,0.0,4.62,4.8,4.8,4.9,4.91,4.72,4.73,https://www.airbnb.com/rooms/24979153
6,28264360.0,Rental unit in Rochor,1 bedroom · 1 bed · 1 bath,Rochor,Central Region,https://a0.muscache.com/pictures/c7489e19-2cd6...,1.30331,103.8487,Entire home/apt,4350.0,...,1.0,0.0,4.69,4.73,4.65,5.0,4.96,4.88,4.58,https://www.airbnb.com/rooms/28264364
1,4541183.0,Aparthotel in Singapore River,Studio · 1 bed · 1 bath,Singapore River,Central Region,https://a0.muscache.com/pictures/5dd4acc0-79c5...,1.28816,103.84826,Entire home/apt,5430.0,...,0.0,0.0,4.79,4.81,4.84,4.79,4.89,4.89,4.69,https://www.airbnb.com/rooms/4541183
7,31003550.0,Serviced apartment in Kallang,1 bedroom · 1 bed · 1 bath,Kallang,Central Region,https://a0.muscache.com/pictures/miso/Hosting-...,1.29966,103.88497,Entire home/apt,5460.0,...,1.0,0.0,4.83,4.92,4.91,4.95,4.83,4.59,4.75,https://www.airbnb.com/rooms/31003554
3,12702540.0,Condo in Rochor,Studio · 1 bed · 1 bath,Rochor,Central Region,https://a0.muscache.com/pictures/8a9a8d7a-4926...,1.30409,103.8465,Entire home/apt,7200.0,...,0.0,0.0,4.69,4.88,4.81,4.83,4.96,4.75,4.83,https://www.airbnb.com/rooms/12702541
8,6.758703e+17,Serviced apartment in Outram,2 bedrooms · 2 beds · 1 bath,Outram,Central Region,https://a0.muscache.com/pictures/dede8e29-6341...,1.28168,103.84673,Entire home/apt,12360.0,...,0.0,0.0,4.75,4.75,4.5,4.75,5.0,5.0,4.75,https://www.airbnb.com/rooms/675839404447270831
