一个无聊的测试：用随机森林看一下什么特征对price的贡献最大

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Load the merged dataset and keep only the float64/int64 columns, so every
# remaining column can be passed to the regressor as-is.
data = pd.read_csv('processed_data_merged.csv')
data = data.select_dtypes(include=['float64', 'int64'])

# The target is the listing price; 'id', 'latitude' and 'longitude' are
# excluded from the feature matrix.
X = data.drop(['price', 'id', 'latitude', 'longitude'], axis=1)
y = data['price']

# Fit the random forest. A fixed random_state makes the bootstrap sampling and
# feature sub-sampling repeatable, so the importance ranking printed below is
# the same from one run to the next.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Rank the features by the forest's feature_importances_, highest first.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance)

                  Feature  Importance
3   closest_mall_distance    0.450548
2         distance_to_mrt    0.176417
1          maximum_months    0.088375
17                  stove    0.048059
6                     gym    0.029090
12                     TV    0.027015
7                    pool    0.026324
10                kitchen    0.026132
5                     BBQ    0.026105
11               Backyard    0.019612
0          minimum_months    0.019260
13           refrigerator    0.012250
15                   Oven    0.011824
14              Microwave    0.010230
16                   Pets    0.010024
4            conditioning    0.009412
9                    Wifi    0.003522
8                   dryer    0.003393
18                    fan    0.002409


这是把全部原始数据（未做缩放）直接塞进去计算余弦相似度的结果：price 的数值远大于其他特征，向量方向几乎完全由 price 决定，所以相似度全都是 0.9999，服了。

In [87]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# User preferences.
desired_price = 1400
desired_roomtype = "Private room"
desired_month = 12
desired_latitude, desired_longitude = 1.36288, 103.86575

# Listings table that the rest of this cell scores against the preferences.
data = pd.read_csv('processed_data_modified.csv')

# One 0/1 flag per amenity; position i is matched against amenities_columns[i]
# further down in this cell.
user_amenities_input = [
    0, 1, 0, 0, 1,
    1, 1, 1, 1, 1,
    0, 0, 0, 0, 0,
]

# Relative weight of each preference group.
weights = dict(price=0.5, room_type=0.1, distance=0.2, amenities=0.2)

# 计算距离
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula with an Earth radius of 6371 km.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance in kilometres. Only elementwise numpy operations are
        used, so numpy arrays may be passed instead of scalars and the result
        then has the broadcast shape of the inputs.
    """
    R = 6371  # Earth radius in kilometres
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2
    a = (
        np.sin(half_dlat) ** 2
        + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(half_dlon) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Distance in km from each listing to the user's desired location.
data['distance'] = data.apply(
    lambda listing: haversine_distance(
        desired_latitude, desired_longitude,
        listing['latitude'], listing['longitude'],
    ),
    axis=1,
)

# Amenity columns of the dataset, in the same order as user_amenities_input.
amenities_columns = [
    'conditioning', 'BBQ', 'gym', 'pool', 'dryer',
    'Wifi', 'kitchen', 'Backyard', 'TV', 'refrigerator',
    'Microwave', 'Oven', 'Pets', 'stove', 'fan',
]

# One-hot column that corresponds to the requested room type.
roomtype_col = "room_type_" + desired_roomtype

# Keep only the amenities the user flagged with 1.
selected_amenities = [
    amenity
    for amenity, wanted in zip(amenities_columns, user_amenities_input)
    if wanted == 1
]
vector_cols = selected_amenities + ['distance', 'price', roomtype_col]

# Ideal listing: every selected amenity present, zero distance, the desired
# price, and the requested room type.
amenity_targets = [1] * len(selected_amenities)
user_vector = np.array(amenity_targets + [0, desired_price, 1])
data_vector = data[vector_cols].values

# Cosine similarity on the raw, unscaled features. Price is orders of
# magnitude larger than the 0/1 flags and the km distance, so it dominates the
# direction of every vector.
similarities = cosine_similarity(user_vector[np.newaxis, :], data_vector)

# Top 10 listings by score.
data['similarity_score'] = similarities[0]
top_10 = data.nlargest(10, 'similarity_score')
top_10

Unnamed: 0,id,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,price,minimum_months,maximum_months,distance_to_mrt,closest_mrt_name,...,Oven,Pets,stove,fan,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,distance,similarity_score
1767,42910492,Serangoon,North-East Region,1.35582,103.86405,1800,3,37,0.468027,LORONG CHUAN MRT STATION,...,1,1,1,0,0,0,1,0,0.807462,0.999999
1760,42829911,Serangoon,North-East Region,1.3556,103.86376,1800,3,37,0.445562,LORONG CHUAN MRT STATION,...,1,1,1,0,0,0,1,0,0.839181,0.999999
1311,35744548,Serangoon,North-East Region,1.35731,103.86381,1800,3,37,0.63471,LORONG CHUAN MRT STATION,...,1,1,1,0,0,0,1,0,0.655827,0.999999
3366,899732642408799379,Serangoon,North-East Region,1.383397,103.861367,2250,3,5,1.83432,YIO CHU KANG MRT STATION,...,1,0,1,1,0,0,1,0,2.332882,0.999999
607,19490734,Ang Mo Kio,North-East Region,1.368,103.85688,1470,6,37,0.786649,ANG MO KIO MRT STATION,...,0,1,0,0,0,0,1,0,1.138576,0.999999
2256,53280269,Serangoon,North-East Region,1.356,103.86437,3000,3,12,0.488507,LORONG CHUAN MRT STATION,...,1,1,1,1,0,0,1,0,0.78025,0.999999
876,25282863,Serangoon,North-East Region,1.38143,103.86984,2400,3,37,1.367039,FERNVALE LRT STATION,...,0,0,0,0,0,0,1,0,2.11218,0.999999
339,12738896,Ang Mo Kio,North-East Region,1.36741,103.85285,2370,6,37,0.389297,ANG MO KIO MRT STATION,...,0,0,0,0,0,0,1,0,1.519903,0.999999
381,14131949,Serangoon,North-East Region,1.36464,103.86352,4440,3,37,1.450333,LORONG CHUAN MRT STATION,...,1,0,1,0,0,0,1,0,0.315834,0.999999
12,468782,Serangoon,North-East Region,1.36288,103.86575,1410,3,6,1.265459,LORONG CHUAN MRT STATION,...,0,0,0,0,0,0,1,0,0.0,0.999999


这块不要管了，我脑洞大开乱想的

In [None]:
# Scratch experiment (the note above this cell says to ignore it): multiply
# every amenity column and the distance column by 1000.
# NOTE(review): presumably meant to bring these features closer to the
# magnitude of price — confirm, or delete the cell.
data[amenities_columns] = data[amenities_columns] * 1000
data['distance'] = data['distance'].mul(1000)

以下是先对向量里的特征加权、再做特征缩放的做法，结果看起来是最合理的。
如果说还有待改进的话：
1. 价格惩罚项：目前对价格低于期望价格 1/2 或高于期望价格 2 倍的房源进行惩罚（惩罚系数 0.5）。由于我们的输入是 minprice 和 maxprice，可以把这个区间以外的房源都设为惩罚对象；缺点是符合条件的数据可能太少，导致推荐结果不够。
2.距离惩罚项随手设置的5km
3.……不知道了要崩溃了这个结果还行了555

In [107]:
from sklearn.preprocessing import StandardScaler

# Weight setup: the amenities get whatever is left after price, room type and
# distance, split evenly across the amenities the user selected.
amenity_weight = 1 - (weights['price'] + weights['room_type'] + weights['distance'])
# Guard against an empty selection, which would otherwise raise
# ZeroDivisionError.
average_amenity_weight = (
    amenity_weight / len(selected_amenities) if selected_amenities else 0.0
)

# Same column order as the feature vectors: amenities, distance, price, room type.
weight_vector = [average_amenity_weight] * len(selected_amenities) + [weights['distance'], weights['price'], weights['room_type']]
weight_vector = np.array(weight_vector)

# Ideal listing: every selected amenity present, zero distance, the desired
# price, and the requested room type.
user_vector = np.array([1] * len(selected_amenities) + [0, desired_price, 1])

# Apply the weights to the user vector and to every listing vector.
user_vector_weighted = user_vector * weight_vector
data_vector = data[selected_amenities + ['distance', 'price', roomtype_col]].values
data_vector_weighted = data_vector * weight_vector

# Standardise the weighted features; the scaler is fitted on the listings and
# then applied to the user vector.
# NOTE(review): StandardScaler rescales every column to unit variance, so a
# constant per-column weight applied beforehand is undone by the scaler (for
# any column with non-zero variance). The weights therefore have little or no
# effect on the similarity computed from these vectors — confirm whether
# scaling first and weighting afterwards was the intent.
scaler = StandardScaler()
data_vector_scaled = scaler.fit_transform(data_vector_weighted)
user_vector_scaled = scaler.transform([user_vector_weighted])

# 计算价格偏差
def price_penalty(row_price, desired_price, *, lower_ratio=0.5, upper_ratio=2, penalty=0.5):
    """Return the score multiplier for a listing based on its price.

    Args:
        row_price: Price of the listing.
        desired_price: Price the user asked for.
        lower_ratio: Prices below ``desired_price * lower_ratio`` are penalised.
        upper_ratio: Prices above ``desired_price * upper_ratio`` are penalised.
        penalty: Multiplier returned for a penalised listing.

    Returns:
        ``penalty`` when the price is outside the accepted band, otherwise 1.0.
        Prices exactly on either bound are not penalised.
    """
    if row_price < desired_price * lower_ratio or row_price > desired_price * upper_ratio:
        return penalty
    return 1.0

def distance_penalty(row_distance, *, max_distance=5, penalty=0.5):
    """Return the score multiplier for a listing based on its distance.

    Args:
        row_distance: Distance of the listing from the desired location, in
            the same unit as ``max_distance`` (the caller passes the
            'distance' column, which holds kilometres).
        max_distance: Distances strictly greater than this are penalised.
        penalty: Multiplier returned for a penalised listing.

    Returns:
        ``penalty`` when ``row_distance > max_distance``, otherwise 1.0.
    """
    if row_distance > max_distance:
        return penalty
    return 1.0

# Per-listing penalty multipliers for price and distance.
data['price_penalty'] = data['price'].apply(lambda x: price_penalty(x, desired_price))
data['distance_penalty'] = data['distance'].apply(distance_penalty)

# Cosine similarity between the user vector and every listing vector.
similarities = cosine_similarity(user_vector_scaled, data_vector_scaled)

# Combine similarity and penalties. Cosine similarity on standardised features
# can be negative, and multiplying a negative value by a penalty below 1 would
# raise the score. For negative similarities the value is therefore divided by
# the penalty, so a penalty always lowers the score. Scores of listings with a
# non-negative similarity are exactly what plain multiplication gives.
combined_penalty = data['price_penalty'] * data['distance_penalty']
raw_similarity = pd.Series(similarities[0], index=data.index)
data['similarity_score'] = (raw_similarity * combined_penalty).where(
    raw_similarity >= 0, raw_similarity / combined_penalty
)

# Top 10 listings by final score.
top_10 = data.nlargest(10, 'similarity_score')

columns_to_display = ['id', 'price', 'distance', roomtype_col] + selected_amenities + ['similarity_score']
top_10[columns_to_display]

Unnamed: 0,id,price,distance,room_type_Private room,BBQ,dryer,Wifi,kitchen,Backyard,TV,refrigerator,similarity_score
3366,899732642408799379,2250,2.332882,1,1,1,1,1,1,0,1,0.863189
95,4712676,1650,3.055456,1,1,1,1,0,1,1,1,0.84964
1797,43177325,1350,4.633893,1,1,1,1,1,1,1,0,0.841723
1117,32205289,1200,3.559773,1,1,1,1,0,1,1,1,0.839036
58,2156372,1800,3.578798,1,0,1,1,1,1,1,1,0.772226
1364,36989732,1500,3.798745,1,0,1,1,1,1,1,1,0.766886
1698,41926070,2460,4.634836,1,0,1,1,1,1,1,1,0.742751
33,982909,2550,4.73584,1,1,1,1,1,0,1,1,0.716797
2598,678746686572598499,2100,2.919098,1,1,1,1,1,0,1,0,0.641719
147,6620261,2730,4.546571,0,1,1,1,1,0,1,1,0.58922


以下为对比实验！！！先特征缩放再加权的做法，结果很差

In [106]:
from sklearn.preprocessing import StandardScaler

# Weight setup: the amenities get whatever is left after price, room type and
# distance, split evenly across the amenities the user selected.
amenity_weight = 1 - (weights['price'] + weights['room_type'] + weights['distance'])
# Guard against an empty selection, which would otherwise raise
# ZeroDivisionError.
average_amenity_weight = (
    amenity_weight / len(selected_amenities) if selected_amenities else 0.0
)

# Same column order as the feature vectors: amenities, distance, price, room type.
weight_vector = [average_amenity_weight] * len(selected_amenities) + [weights['distance'], weights['price'], weights['room_type']]
weight_vector = np.array(weight_vector)

# Ideal listing: every selected amenity present, zero distance, the desired
# price, and the requested room type.
user_vector = np.array([1] * len(selected_amenities) + [0, desired_price, 1])

# Build the raw feature matrix before it is scaled. It used to be assigned
# only after scaler.fit_transform(data_vector) had already read it, so the
# cell depended on a data_vector left behind by a previously executed cell.
data_vector = data[selected_amenities + ['distance', 'price', roomtype_col]].values

# Standardise first; the scaler is fitted on the listings and then applied to
# the user vector.
scaler = StandardScaler()
data_vector_scaled = scaler.fit_transform(data_vector)
user_vector_scaled = scaler.transform([user_vector])

# Then apply the weights to the standardised vectors.
user_vector_weighted = user_vector_scaled * weight_vector
data_vector_weighted = data_vector_scaled * weight_vector

# 计算价格偏差
def price_penalty(row_price, desired_price):
    """Return 0.5 for a listing priced outside the accepted band, else 1.0.

    The band runs from half of ``desired_price`` to twice ``desired_price``;
    prices exactly on either bound are not penalised.
    """
    too_cheap = row_price < desired_price * 0.5
    too_expensive = row_price > desired_price * 2
    return 0.5 if (too_cheap or too_expensive) else 1.0

def distance_penalty(row_distance):
    """Return 0.5 when ``row_distance`` is greater than 5, otherwise 1.0."""
    return 0.5 if row_distance > 5 else 1.0

# Cosine similarity between the weighted user vector and every weighted
# listing vector.
similarities = cosine_similarity(user_vector_weighted, data_vector_weighted)

# Per-listing penalty multipliers (0.5 or 1.0) for price and distance.
data['price_penalty'] = data['price'].apply(
    lambda listing_price: price_penalty(listing_price, desired_price)
)
data['distance_penalty'] = data['distance'].apply(distance_penalty)

# Final score = similarity x price penalty x distance penalty.
# NOTE(review): cosine similarity can be negative here; multiplying a negative
# value by 0.5 raises it, so a penalised listing with negative similarity is
# ranked higher, not lower — confirm this is acceptable.
price_adjusted = similarities[0] * data['price_penalty']
data['similarity_score'] = price_adjusted * data['distance_penalty']

# Top 10 listings by final score.
top_10 = data.nlargest(n=10, columns='similarity_score')

columns_to_display = ['id', 'price', 'distance', roomtype_col, *selected_amenities, 'similarity_score']
top_10[columns_to_display]


Unnamed: 0,id,price,distance,room_type_Private room,BBQ,dryer,Wifi,kitchen,Backyard,TV,refrigerator,similarity_score
3366,899732642408799379,2250,2.332882,1,1,1,1,1,1,0,1,0.985478
1767,42910492,1800,0.807462,1,0,1,1,1,0,1,1,0.980151
1760,42829911,1800,0.839181,1,0,1,1,1,0,1,1,0.980105
95,4712676,1650,3.055456,1,1,1,1,0,1,1,1,0.977541
876,25282863,2400,2.11218,1,0,1,1,1,0,1,1,0.976963
58,2156372,1800,3.578798,1,0,1,1,1,1,1,1,0.974216
607,19490734,1470,1.138576,1,0,1,1,1,0,1,0,0.973864
2598,678746686572598499,2100,2.919098,1,1,1,1,1,0,1,0,0.973835
1311,35744548,1800,0.655827,1,0,1,1,1,0,0,1,0.973178
2831,775240664166599463,1530,2.82087,1,0,1,1,1,0,1,1,0.972415
