In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# 用户设定

# data = pd.read_csv('processed_data_modified.csv')
data=pd.read_csv('./csv_output/processed_data_merged_final_v3.csv')
print("Number of rows:", data.shape[0])  
print("Number of columns:", data.shape[1])


In [None]:
import matplotlib.pyplot as plt

column_name = 'price'
plt.boxplot(data[column_name], vert=False)  # vert=False表示水平方向绘制箱线图
plt.xlabel(column_name)  # 设置x轴标签
plt.title(f'Box Plot of {column_name}')  # 设置标题
plt.show()  # 显示

In [None]:
Q1 = data[column_name].quantile(0.25)
Q3 = data[column_name].quantile(0.75)

# 计算IQR（四分位数间距）
IQR = Q3 - Q1

# 定义上界，超出这个范围的数据点被认为是离群值
upper_bound = Q3 + 3.0 * IQR

# 剔除单个离群值，保留非离群值的数据
outliers_index = data[data[column_name] > upper_bound].index
filtered_data = data[data[column_name] <= upper_bound]
data_cleaned = data.drop(outliers_index)
data_cleaned = data_cleaned.reset_index(drop=True)

plt.boxplot(filtered_data[column_name], vert=False)  # vert=False表示水平方向绘制箱线图
plt.xlabel(column_name)  # 设置x轴标签
plt.title(f'Box Plot of {column_name}')  # 设置标题
plt.show()  # 显示

In [None]:
print("Number of rows:", data_cleaned.shape[0])  
print("Number of columns:", data_cleaned.shape[1])

In [None]:
desired_price = 1400
desired_roomtype = "Private room"
desired_month = 12
desired_latitude = 1.36288
desired_longitude = 103.86575
user_amenities_input = [0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# 设定的权重
weights = {
    'price': 0.5,
    'room_type': 0.1,
    'distance': 0.2,
    'amenities': 0.2  
}

# 计算距离
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # 地球半径，单位为公里
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = R * c
    return d

# source = filtered_data.copy()
data_cleaned['distance'] = data_cleaned.apply(lambda row: haversine_distance(desired_latitude, desired_longitude, row['latitude'], row['longitude']), axis=1)

# 数据集中的amenities列
amenities_columns = ['conditioning', 'BBQ', 'gym', 'pool', 'dryer', 'Wifi', 'kitchen', 'Backyard', 'TV', 'refrigerator', 'Microwave', 'Oven', 'Pets', 'stove', 'fan']

# 选择对应的roomtype列
roomtype_col = "room_type_" + desired_roomtype

# 选择对应的amenities列
selected_amenities = [amenities_columns[i] for i, val in enumerate(user_amenities_input) if val == 1]
room_types=["room_type_Entire home/apt","room_type_Hotel room","room_type_Private room","room_type_Shared room"]
price=data_cleaned['price'].values
price_loss=price-desired_price


scaler = StandardScaler()
# price_loss_scaled=scaler.fit_transform()

# scaler=MinMaxScaler(feature_range=(-1,1))
#scaler=MinMaxScaler()
price_loss_scaled=scaler.fit_transform(price_loss.reshape(-1,1)).flatten()
# def custom_scaling(X):
#     min_val = X.min()
#     max_val = X.max()

#     scaled_X = -1 + 2 * (X - min_val) / (max_val - min_val)
#     return scaled_X
# price_loss_scaled=custom_scaling(price_loss)
price_loss_scaled_df=pd.DataFrame(price_loss_scaled,columns=['price_loss_scaled'])

vector_cols = selected_amenities + ['distance']+room_types
user_room_type = [1 if roomtype == roomtype_col else 0 for roomtype in room_types]

# 创建user_vector
user_vector = np.array([1] * len(selected_amenities) + [0]+user_room_type+[0])

data_vector = pd.concat([data_cleaned[vector_cols],price_loss_scaled_df],axis=1).values

user_vector=user_vector.reshape(1, -1)
# 计算余弦相似度
similarities = cosine_similarity(user_vector, data_vector)



user_vector_amenities = np.array([1] * len(selected_amenities))
data_vector_amenities = data_cleaned[selected_amenities].values

data_room_type=data_cleaned[room_types].values
user_room_type=np.array(user_room_type).reshape(1, -1)
room_type_loss=1-cosine_similarity(user_room_type, data_room_type)
room_type_loss_df=pd.DataFrame(room_type_loss.T, columns=['room_type_loss'])

amenities_loss=1-cosine_similarity(user_vector_amenities.reshape(1, -1), data_vector_amenities)
amenities_loss_df=pd.DataFrame(amenities_loss.T, columns=['amenities_loss'])
scaler=MinMaxScaler()
distance_scaled=scaler.fit_transform(data_cleaned['distance'].values.reshape(-1, 1))
distance_scaled_df=pd.DataFrame(distance_scaled,columns=['distance'])
loss=pd.concat([amenities_loss_df,distance_scaled_df],axis=1)

# loss_scaled = scaler.fit_transform(loss)

loss_scaled=pd.concat([loss,price_loss_scaled_df],axis=1)
loss_scaled=pd.concat([loss_scaled,room_type_loss_df],axis=1)
# cost=(1-similarities[0])
cost=(1-similarities[0])+ weights['amenities']*loss_scaled['amenities_loss'].values+weights['distance']*loss_scaled['distance'].values+weights['price']*loss_scaled['price_loss_scaled'].values+weights['room_type']*loss_scaled['room_type_loss'].values
data_cleaned['cost'] = cost

# 获取得分最高的Top 10房源
data_cleaned['similarity_score'] = similarities[0]
top_10 = data_cleaned.nsmallest(10, 'cost')
top_10.to_csv('content_based_result.csv')
top_10

# Our content-based

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler

# 用户设定
desired_price_min = 1400
desired_price_max= 3000
desired_roomtype = "Private room"
desired_month = 12
desired_latitude = 1.36288
desired_longitude = 103.86575
user_amenities_input = [0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

# 设定的权重
weights = {
    'price': 0.5,
    'room_type': 0.1,
    'distance': 0.25,
    'amenities': 0.15  
}

# data = pd.read_csv('processed_data_modified.csv')
data=pd.read_csv('./dataSource/final_data.csv')

filtered_data = data[(data['maximum_months'] >= desired_month) & (data['minimum_months'] <= desired_month)]



filtered_data

Unnamed: 0,id,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,room_type,price,minimum_months,maximum_months,distance_to_mrt,...,accommodates,picture_url,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,listing_url
0,71609,Tampines,East Region,1.345370,103.958870,Private room,4170,3,37,0.496792,...,3,https://a0.muscache.com/pictures/24453191/3580...,4.44,4.37,4.00,4.63,4.78,4.26,4.32,https://www.airbnb.com/rooms/71609
1,71896,Tampines,East Region,1.347540,103.959580,Private room,2100,3,37,0.678619,...,1,https://a0.muscache.com/pictures/2440674/ac4f4...,4.16,4.22,4.09,4.43,4.43,4.17,4.04,https://www.airbnb.com/rooms/71896
2,71903,Tampines,East Region,1.345310,103.961000,Private room,2430,3,37,0.400694,...,2,https://a0.muscache.com/pictures/568743/7bc623...,4.41,4.39,4.52,4.63,4.64,4.50,4.36,https://www.airbnb.com/rooms/71903
3,275343,Bukit Merah,Central Region,1.290150,103.808140,Private room,1650,3,33,0.580759,...,1,https://a0.muscache.com/pictures/miso/Hosting-...,4.40,4.16,4.26,4.47,4.42,4.53,4.63,https://www.airbnb.com/rooms/275343
4,275344,Bukit Merah,Central Region,1.288360,103.811440,Private room,2070,2,33,0.612490,...,1,https://a0.muscache.com/pictures/miso/Hosting-...,4.54,4.64,4.21,4.64,4.57,4.64,4.43,https://www.airbnb.com/rooms/275344
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3436,920638892602895042,Tanglin,Central Region,1.296851,103.829829,Private room,3000,3,12,0.836969,...,2,https://a0.muscache.com/pictures/miso/Hosting-...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,https://www.airbnb.com/rooms/920638892602895042
3437,921632579190690942,Geylang,Central Region,1.310910,103.879490,Entire home/apt,13680,0,12,0.623004,...,13,https://a0.muscache.com/pictures/miso/Hosting-...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,https://www.airbnb.com/rooms/921632579190690942
3438,921860100684819207,River Valley,Central Region,1.292341,103.837234,Private room,7500,0,12,0.788851,...,2,https://a0.muscache.com/pictures/miso/Hosting-...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,https://www.airbnb.com/rooms/921860100684819207
3439,922639973995552700,Marine Parade,Central Region,1.303162,103.900534,Private room,2880,3,12,1.441732,...,1,https://a0.muscache.com/pictures/34975b5f-4d66...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,https://www.airbnb.com/rooms/922639973995552700


In [2]:
# def convert_amenities_to_input(form):
#     amenities_columns = ['conditioning', 'BBQ', 'gym', 'pool', 'dryer', 'Wifi', 'kitchen', 'Backyard', 'TV', 'refrigerator', 'Microwave', 'Oven', 'Pets', 'stove', 'fan']
#     user_amenities_input = []

#     # 将所有表单中的多选字段合并到一个列表中
#     all_selected_amenities = form.public_facilities.data + form.cooking_facilities.data + form.interior_facilities.data + form.other_needs.data

#     for amenity in amenities_columns:
#         if amenity in all_selected_amenities:
#             user_amenities_input.append(1)
#         else:
#             user_amenities_input.append(0)

#     return user_amenities_input

In [3]:

base_distance = 2
inverse_distance=1/(base_distance + 1e-5)  # 加一个小的常数防止除以0

In [4]:

desired_price = (desired_price_min + desired_price_max) / 2

In [5]:

encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(filtered_data[['room_type']])
onehot_df = pd.DataFrame(onehot, columns=encoder.get_feature_names_out(['room_type']))
onehot_df



Unnamed: 0,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0
...,...,...,...,...
3174,0.0,0.0,1.0,0.0
3175,1.0,0.0,0.0,0.0
3176,0.0,0.0,1.0,0.0
3177,0.0,0.0,1.0,0.0


In [6]:

filtered_data = filtered_data.reset_index(drop=True)
onehot_df = onehot_df.reset_index(drop=True)
filtered_data = pd.concat([filtered_data, onehot_df], axis=1)

In [7]:

def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # 地球半径，单位为公里
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = R * c
    return d

In [8]:

filtered_data['distance'] = filtered_data.apply(lambda row: haversine_distance(desired_latitude, desired_longitude, row['latitude'], row['longitude']), axis=1)
amenities_columns = ['conditioning', 'BBQ', 'gym', 'pool', 'dryer', 'Wifi', 'kitchen', 'Backyard', 'TV', 'refrigerator', 'Microwave', 'Oven', 'Pets', 'stove', 'fan']
roomtype_col = "room_type_" + desired_roomtype

In [9]:

selected_amenities = [amenities_columns[i] for i, val in enumerate(user_amenities_input) if val == 1]
room_types=["room_type_Entire home/apt","room_type_Hotel room","room_type_Private room","room_type_Shared room"]

In [10]:
price_scaler = RobustScaler()
distance_scaler = RobustScaler()


# 使用数据集中的价格来拟合scaler
# scaler.fit(filtered_data['price'].values.reshape(-1, 1))

# # 转换数据集中的价格
# filtered_data['scaled_price'] = scaler.transform(filtered_data['price'].values.reshape(-1, 1))

# 转换desired_price
# scaled_desired_price = scaler.transform(np.array(desired_price).reshape(-1, 1))[0, 0]
filtered_data['inverse_distance'] = 1 / (filtered_data['distance'] + 1e-5)  # 加一个小的常数防止除以0
filtered_data['similarity_distance'] = distance_scaler.fit_transform(filtered_data['inverse_distance'].values.reshape(-1, 1))
similarity_distance=distance_scaler.transform(np.array(inverse_distance).reshape(-1, 1))[0, 0]

In [11]:
filtered_data['price_difference'] = abs(filtered_data['price'] - desired_price)
filtered_data['reciprocal_price_difference'] = 1 / (filtered_data['price_difference'] + 1e-5)
filtered_data['normalized_reciprocal_price_difference'] = price_scaler.fit_transform(filtered_data['reciprocal_price_difference'].values.reshape(-1, 1))
desired_pricediff=50
inverse_desired_pricediff=1/desired_pricediff
normalized_reciprocal_pricediff=price_scaler.transform(np.array(inverse_desired_pricediff).reshape(-1, 1))[0, 0]

In [12]:
room_types=[roomtype_col]
onehot_cols=selected_amenities + room_types
filtered_data[onehot_cols] = filtered_data[onehot_cols].replace(0, -1)
vector_cols = selected_amenities + ['similarity_distance']+room_types+['normalized_reciprocal_price_difference']
user_room_type = [1 if roomtype == roomtype_col else -1 for roomtype in room_types]

user_vector = np.array([1] * len(selected_amenities) + [similarity_distance]+user_room_type+[normalized_reciprocal_pricediff])
data_vector = filtered_data[vector_cols].values


In [13]:

user_vector=user_vector.reshape(1, -1)
similarities = cosine_similarity(user_vector, data_vector)

In [14]:

filtered_data['similarity'] = similarities.transpose()
filtered_data['similarity']

0       0.134565
1       0.921655
2       0.761986
3       0.437772
4       0.878812
          ...   
3174    0.306542
3175    0.033772
3176    0.008733
3177    0.352404
3178   -0.027843
Name: similarity, Length: 3179, dtype: float64

In [15]:
top_10=filtered_data.nlargest(10, 'similarity')
columns_to_display = ['id', 'price', 'distance', roomtype_col] + selected_amenities + ['similarity']
top_10[columns_to_display]

Unnamed: 0,id,price,distance,room_type_Private room,BBQ,dryer,Wifi,kitchen,Backyard,TV,refrigerator,similarity
2419,678746686572598499,2100,2.919098,1.0,1,1,1,1,-1,1,-1,0.971289
1415,39679521,2100,6.335719,1.0,1,1,1,1,1,1,1,0.95591
2801,823988366264665533,2250,5.944039,1.0,-1,1,1,1,1,1,1,0.955596
2429,681528708664601432,2250,5.454478,1.0,-1,1,1,1,-1,1,1,0.954192
646,22133845,2220,3.661861,1.0,-1,1,1,1,-1,-1,-1,0.952665
2578,746289351820967930,2100,3.449073,1.0,-1,-1,1,1,-1,1,-1,0.951592
78,4541072,2280,3.633151,1.0,-1,-1,1,1,-1,-1,-1,0.951147
378,15364942,2160,7.319424,1.0,-1,1,1,1,1,1,1,0.947407
2448,691733988252942677,2100,6.472738,1.0,1,-1,1,1,1,1,1,0.944498
2118,53483449,2100,6.490358,1.0,1,-1,1,1,1,1,1,0.944305


In [16]:
top_10['similarity']

2419    0.971289
1415    0.955910
2801    0.955596
2429    0.954192
646     0.952665
2578    0.951592
78      0.951147
378     0.947407
2448    0.944498
2118    0.944305
Name: similarity, dtype: float64

In [17]:
max_base_distance = 10  # 假设20公里是用户能接受的最大距离
# weight_distance = max_base_distance * (1 - weights['distance'])
weight_distance = 2
inverse_weight_distance=1/(weight_distance + 1e-5)  # 加一个小的常数防止除以0
similarity_weight_distance=distance_scaler.transform(np.array(inverse_weight_distance).reshape(-1, 1))[0, 0]

In [18]:
max_base_pricediff = desired_price_max - desired_price_min  # 假设20公里是用户能接受的最大距离

weight_pricediff = max_base_pricediff * (1 - weights['price'])
inverse_weight_pricediff=1/weight_pricediff  # 加一个小的常数防止除以0
similarity_weight_pricediff=price_scaler.transform(np.array(inverse_weight_pricediff).reshape(-1, 1))[0, 0]

In [19]:
amenities_weight_per_feature = weights['amenities'] / len(selected_amenities)
room_type_weight_per_feature = weights['room_type'] / len(room_types)
weight_vector = [amenities_weight_per_feature] * len(selected_amenities) + [weights['distance']]+[room_type_weight_per_feature] * len(room_types)+ [weights['price']]

In [20]:
user_vector_w = np.array([1] * len(selected_amenities) + [similarity_weight_distance]+user_room_type+[similarity_weight_pricediff])

In [21]:
weighted_data_vector = data_vector * weight_vector
weighted_user_vector =  user_vector_w * weight_vector
weighted_user_vector = weighted_user_vector.reshape(1, -1)
# total_scaler = RobustScaler()
# data_vector_scaled = total_scaler.fit_transform(weighted_data_vector)
# user_vector_scaled = total_scaler.transform(weighted_user_vector)
weighted_similarities = cosine_similarity(weighted_user_vector, weighted_data_vector)
# weighted_similarities = cosine_similarity(user_vector_scaled, data_vector_scaled)

In [22]:
filtered_data['weighted_similarity'] = weighted_similarities.transpose()
filtered_data['weighted_similarity']

0      -0.482440
1       0.241597
2       0.202365
3       0.115852
4       0.236917
          ...   
3174    0.228208
3175    0.558237
3176   -0.276111
3177    0.311968
3178   -0.304002
Name: weighted_similarity, Length: 3179, dtype: float64

In [23]:
top_10=filtered_data.nlargest(10, 'weighted_similarity')
columns_to_display = ['id', 'price', 'distance', roomtype_col] + selected_amenities + ['weighted_similarity']
top_10[columns_to_display]

Unnamed: 0,id,price,distance,room_type_Private room,BBQ,dryer,Wifi,kitchen,Backyard,TV,refrigerator,weighted_similarity
1155,34489654,1260,2.269759,1.0,-1,1,1,1,-1,-1,1,0.998646
134,6718219,3000,2.145181,1.0,-1,1,1,1,-1,1,-1,0.998593
1630,42829911,1800,0.839181,1.0,-1,1,1,1,-1,1,1,0.997864
1637,42910492,1800,0.807462,1.0,-1,1,1,1,-1,1,1,0.99739
176,8522750,1800,1.167626,1.0,-1,-1,1,1,-1,1,-1,0.997272
867,28369982,1650,1.11351,1.0,-1,-1,1,-1,-1,-1,-1,0.996901
56,3034137,2910,2.087209,1.0,-1,1,1,-1,-1,1,-1,0.996769
1580,42084241,3090,2.616272,1.0,-1,-1,1,1,-1,1,-1,0.995659
799,25366424,1050,2.326341,1.0,-1,1,1,1,-1,-1,-1,0.995485
1210,35744548,1800,0.655827,1.0,-1,1,1,1,-1,-1,1,0.993912


In [24]:
top_10['weighted_similarity']

1155    0.998646
134     0.998593
1630    0.997864
1637    0.997390
176     0.997272
867     0.996901
56      0.996769
1580    0.995659
799     0.995485
1210    0.993912
Name: weighted_similarity, dtype: float64

In [40]:

from itertools import combinations
from deap import base, creator, tools, algorithms
import random
from functools import partial
import multiprocessing
import dill

In [41]:

def checkIndividual(individual, item_features, N):
    items = [item for item, selected in zip(item_features['id'], individual) if selected]
    return len(items) == N

In [42]:

def fix_individual(individual, item_features, N, id_to_index):
    print("Original Individual:", individual)
    
    # 确保个体中有 N 个 1
    while sum(individual) > N:
        idx = random.choice([i for i, val in enumerate(individual) if val == 1])
        individual[idx] = 0
        print("Removed 1 at position:", idx)
        
    while sum(individual) < N:
        idx = random.choice([i for i, val in enumerate(individual) if val == 0])
        individual[idx] = 1
        print("Added 1 at position:", idx)
    
    # 确保所有选中的物品都在 id_to_index 字典中
    selected_items = [item for item, selected in zip(item_features['id'], individual) if selected]
    for item in selected_items:
        if item not in id_to_index:
            print("Item not in id_to_index:", item)
            individual[item_features[item_features['id'] == item].index[0]] = 0
            replacement = random.choice([i for i, val in enumerate(individual) if val == 0 and item_features['id'].iloc[i] in id_to_index])
            individual[replacement] = 1
            print("Replaced with item at position:", replacement)
    
    print("Fixed Individual:", individual)
    return individual


In [43]:

def fix_and_evaluate(individual, item_features, n, id_to_index, toolbox):
    # 确保个体满足约束条件
    fixed_individual = fix_individual(individual, item_features, n, id_to_index)
    
    # 计算适应度
    fitness = toolbox.evaluate(fixed_individual)
    
    # 设置适应度
    fixed_individual.fitness.values = fitness
    
    return fixed_individual


In [44]:

def cxSet(ind1, ind2, n, item_features, toolbox, id_to_index):
    set1 = set([i for i, val in enumerate(ind1) if val == 1])
    set2 = set([i for i, val in enumerate(ind2) if val == 1])
    
    temp = set1.copy()
    set1 &= set2  # 交集
    set2 ^= temp  # 对称差集
    
    new_ind1 = [1 if i in set1 else 0 for i in range(len(ind1))]
    new_ind2 = [1 if i in set2 else 0 for i in range(len(ind2))]
    
    new_ind1 = creator.Individual(new_ind1)
    new_ind2 = creator.Individual(new_ind2)
    
    # 修复并计算适应度
    new_ind1 = fix_and_evaluate(new_ind1, item_features, n, id_to_index, toolbox)
    new_ind2 = fix_and_evaluate(new_ind2, item_features, n, id_to_index, toolbox)
    
    return new_ind1, new_ind2

def mutSet(individual,n,item_features,id_to_index, toolbox):
    """Mutation that ensures the number of 1's remains constant."""
    # 随机选择一个1的位置，将其变为0
    one_positions = [i for i, val in enumerate(individual) if val == 1]
    if one_positions:
        pos_to_zero = random.choice(one_positions)
        individual[pos_to_zero] = 0
    
    # 随机选择一个0的位置，将其变为1
    zero_positions = [i for i, val in enumerate(individual) if val == 0]
    if zero_positions:
        pos_to_one = random.choice(zero_positions)
        individual[pos_to_one] = 1
    fixed_individual = fix_and_evaluate(individual, item_features, n, id_to_index, toolbox)
    
    return fixed_individual,

In [45]:

def diversity(items, diversity_matrix,id_to_index):
    indices = [id_to_index[item] for item in items]
    selected_diversity = diversity_matrix[np.ix_(indices, indices)]
    diversity_score = selected_diversity[np.triu_indices(len(items), k=1)].mean()
    return diversity_score

In [46]:

def objective(items, item_features, epsilon,diversity_matrix,id_to_index):
    similarity_score = item_features.set_index('id').loc[items]['weighted_similarity'].mean()
    diversity_score = diversity(items, diversity_matrix,id_to_index)
    weighted_score = (1 - epsilon) * similarity_score + epsilon * diversity_score
    return weighted_score  # 加权用户权重的平均值

In [47]:

# 定义遗传算法的目标函数
def evalSolution(individual,item_features,N, epsilon,diversity_matrix,id_to_index):
    individual = np.array(individual)  # 将列表转换为NumPy数组
    selected_items = item_features['id'][individual == 1].tolist()
    if len(selected_items) != N:
        return -np.inf,  # 如果选中的物品数量不等于N，返回负无穷
    missing_items = [item for item in selected_items if item not in id_to_index]
    if missing_items:
        print("Missing items:", missing_items)
    if any(item not in id_to_index for item in selected_items):
        return -np.inf,
    return objective(selected_items, item_features, epsilon,diversity_matrix,id_to_index),

In [48]:
def initIndividual(item_features, N,individual_creator):
    items = random.sample(item_features['id'].tolist(), N)
    individual = [1 if item in items else 0 for item in item_features['id']]
    return individual_creator(individual)

In [49]:

def genetic_algorithm(item_features, epsilon, N,diversity_matrix,id_to_index):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("individual", initIndividual, item_features=item_features, N=N,individual_creator=creator.Individual)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    # 使用单个进程池
    with multiprocessing.Pool(processes=24) as pool:
        toolbox.register("map", pool.map)
        toolbox.register("evaluate", partial(evalSolution, item_features=item_features, N=N, epsilon=epsilon,diversity_matrix=diversity_matrix,id_to_index=id_to_index))
        toolbox.register("mate", cxSet, n=N,item_features=item_features,toolbox=toolbox,id_to_index=id_to_index)
        toolbox.register("mutate", mutSet, n=N,item_features=item_features,id_to_index=id_to_index,toolbox=toolbox)
        toolbox.register("select", tools.selTournament, tournsize=3)
        
        population = toolbox.population(n=5)
        algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=40, verbose=False)
    
    top1 = tools.selBest(population, 1)[0]
    print("Top 1 Individual:", top1)
    print("Number of 1s in Top 1:", sum(top1))
    selected_indices = [i for i, val in enumerate(top1) if val == 1]
    selected_items = item_features.loc[selected_indices, 'id'].tolist()

    print("Selected Items:", selected_items)
    return selected_items, top1.fitness.values[0]

In [50]:
# def genetic_algorithm(item_features, epsilon, user_weights, N):
#     creator.create("FitnessMax", base.Fitness, weights=(1.0,))
#     creator.create("Individual", list, fitness=creator.FitnessMax)

#     toolbox = base.Toolbox()
#     toolbox.register("attr_bool", random.randint, 0, 1)
#     toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(item_features))
#     toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    
#     toolbox.register("evaluate", partial(evalSolution, item_features=item_features, N=N, epsilon=epsilon, user_weights=user_weights))
#     toolbox.register("mate", cxSet, n=N)
#     toolbox.register("mutate", mutSet, n=N)
#     toolbox.register("select", tools.selTournament, tournsize=3)
    
#     population = toolbox.population(n=300)
#     algorithms.eaSimple(population, toolbox, cxpb=0.7, mutpb=0.2, ngen=40, verbose=False)
    
#     top1 = tools.selBest(population, 1)[0]
#     selected_items = item_features.loc[item_features['id'].isin(np.array(top1)[np.array(top1) == 1]), 'id'].tolist()
#     return selected_items, top1.fitness.values[0]

In [51]:

N = 10
best_epsilon = 0
best_score = -np.inf
best_recommendation = None
item_features=filtered_data[vector_cols+['similarity','weighted_similarity','id']]


item_features = item_features[item_features['weighted_similarity'] >= 0]
item_features_without_similarity = item_features.drop(['similarity', 'weighted_similarity', 'id'], axis=1)
item_features = item_features.reset_index(drop=True)
# item_features_without_similarity=total_scaler.transform(item_features_without_similarity)

item_features

Unnamed: 0,BBQ,dryer,Wifi,kitchen,Backyard,TV,refrigerator,similarity_distance,room_type_Private room,normalized_reciprocal_price_difference,similarity,weighted_similarity,id
0,1,1,1,1,1,1,1,-0.511664,1.0,9.448397,0.921655,0.241597,71896
1,1,1,1,1,-1,1,-1,-0.541530,1.0,3.921272,0.761986,0.202365,71903
2,-1,1,1,1,-1,1,1,-0.471810,1.0,1.447593,0.437772,0.115852,275343
3,-1,1,1,1,-1,1,1,-0.461513,1.0,7.191760,0.878812,0.236917,275344
4,-1,1,1,1,-1,-1,1,0.530931,1.0,2.463568,0.680583,0.371046,294281
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1839,-1,1,1,1,-1,1,1,-0.289196,-1.0,0.698980,0.226455,0.048932,919083092811192983
1840,-1,1,1,-1,-1,-1,1,-0.148792,-1.0,6.654465,0.844061,0.253330,919330824746585701
1841,1,-1,1,1,-1,1,-1,-0.082038,1.0,0.891982,0.306542,0.228208,920638892602895042
1842,-1,1,1,1,-1,1,1,0.735744,-1.0,-0.245183,0.033772,0.558237,921632579190690942


In [52]:

similarity_matrix = cosine_similarity(item_features_without_similarity)

# 计算多样性矩阵
diversity_matrix = 1 - similarity_matrix
id_to_index = {row['id']: index for index, row in item_features.iterrows()}


# for epsilon in np.linspace(0, 0.5, 1):
epsilon=0.2
recommendation, score = genetic_algorithm(item_features, epsilon, N,diversity_matrix,id_to_index)
if score > best_score:
    best_epsilon = epsilon
    best_score = score
    best_recommendation = recommendation

print("Best Epsilon:", best_epsilon)
print("Best Recommendation:", best_recommendation)
print("Best Score:", best_score)



Missing items: [563596809177444958, 725927989859971677]
Missing items: [669791782739891346]
Missing items: [542953504353473979, 840153205449250813, 886561621182331272]
Missing items: [915681736655786529]
Missing items: [700230313582080038, 809553938268692523]


Original Individual: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [53]:
recomm = filtered_data[filtered_data['id'].isin(best_recommendation)]

# 按'weighted_similarity'列从大到小排序

sorted_recomm = recomm.sort_values(by='weighted_similarity', ascending=False)

In [54]:
columns_to_display = ['id', 'price', 'distance', roomtype_col] + selected_amenities + ['weighted_similarity']
sorted_recomm[columns_to_display]

Unnamed: 0,id,price,distance,room_type_Private room,BBQ,dryer,Wifi,kitchen,Backyard,TV,refrigerator,weighted_similarity
75,4486217,3180,2.782149,-1.0,1,1,1,1,-1,1,1,0.986328
1050,32534020,3600,3.575251,-1.0,-1,1,1,1,-1,1,1,0.976782
553,19774868,4500,2.326537,1.0,-1,-1,1,1,-1,1,-1,0.971105
1783,46057791,12690,1.994856,-1.0,1,1,1,1,1,1,1,0.934613
1305,37924392,4080,5.526946,-1.0,-1,1,1,1,-1,1,-1,0.88778
1882,47927795,4650,5.48454,-1.0,-1,1,1,1,-1,1,1,0.880315
1851,47295577,6900,5.40496,-1.0,-1,1,1,-1,-1,1,-1,0.782228
1927,48816051,2970,5.674996,1.0,-1,1,1,-1,1,-1,1,0.647966
869,28522862,1440,6.010339,1.0,-1,1,1,1,-1,-1,1,0.585361
831,26583889,2430,8.745655,1.0,-1,1,1,1,-1,1,-1,0.247596
