In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
data = pd.read_csv('processed_data_merged.csv')
data = data.select_dtypes(include=['float64', 'int64'])

X = data.drop(['price','id','latitude','longitude'], axis=1)
y = data['price']

# 创建随机森林模型
model = RandomForestRegressor(n_estimators=100)
model.fit(X, y)

# 输出特征重要性
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance)

                  Feature  Importance
3   closest_mall_distance    0.450548
2         distance_to_mrt    0.176417
1          maximum_months    0.088375
17                  stove    0.048059
6                     gym    0.029090
12                     TV    0.027015
7                    pool    0.026324
10                kitchen    0.026132
5                     BBQ    0.026105
11               Backyard    0.019612
0          minimum_months    0.019260
13           refrigerator    0.012250
15                   Oven    0.011824
14              Microwave    0.010230
16                   Pets    0.010024
4            conditioning    0.009412
9                    Wifi    0.003522
8                   dryer    0.003393
18                    fan    0.002409


In [44]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


# 用户设定
desired_price = 1400
desired_roomtype = "Private room"
desired_month = 12
desired_latitude = 1.36288
desired_longitude = 103.86575

# 设定的权重
weights = {
    'price': 0.5,
    'room_type': 0.3,
    'distance': 0.2
}

# 计算每个列表与期望位置的相似度
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # 地球半径，单位为公里
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = R * c
    return d

# 计算与目标经纬度之间的距离
data['distance'] = data.apply(lambda row: haversine_distance(desired_latitude, desired_longitude, row['latitude'], row['longitude']), axis=1)

# 将距离转化为相似度得分
max_distance = data['distance'].max()
data['location_similarity'] = 1 - (data['distance'] / max_distance)

# 根据价格计算得分


# 根据房间类型计算得分
room_type_column = f'room_type_{desired_roomtype}'
data['room_type_score'] = data[room_type_column]

# 使用余弦相似度计算总得分
data['total_score'] = weights['price'] * data['price_similarity'] + \
                      weights['room_type'] * data['room_type_score'] + \
                      weights['distance'] * data['location_similarity']

# 过滤month要求
filtered_data = data[(data['minimum_months'] <= desired_month) & (data['maximum_months'] >= desired_month)]

# 获取得分最高的Top 10房源
selected_columns = ['id', 'latitude', 'longitude', 'price', 'price_similarity', 'location_similarity','room_type_score','total_score']
top_10 = filtered_data.nlargest(10, 'total_score')[selected_columns]

top_10


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['price_similarity'] = price_similarity


Unnamed: 0,id,latitude,longitude,price,price_similarity,location_similarity,room_type_score,total_score
607,19490734,1.368,103.85688,1470,0.998811,0.955884,1,0.990583
954,28369982,1.3692,103.85798,1650,0.986697,0.956856,1,0.98472
1255,34489654,1.38296,103.86942,1260,0.994483,0.912055,1,0.979652
1311,35744548,1.35731,103.86381,1800,0.969467,0.974589,1,0.979652
1767,42910492,1.35582,103.86405,1800,0.969467,0.968714,1,0.978476
1760,42829911,1.3556,103.86376,1800,0.969467,0.967485,1,0.978231
2831,775240664166599463,1.360688,103.891031,1530,0.996074,0.890702,1,0.976178
202,8522750,1.35476,103.87241,1800,0.969467,0.954759,1,0.975685
891,25768720,1.35699,103.89315,1230,0.991696,0.879286,1,0.971705
700,21716537,1.35573,103.83402,1500,0.997626,0.859904,1,0.970794


  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(lambda x: 1 if amenity in x else 0)
  df[amenity] = df['amenities'].apply(

In [7]:
import pandas as pd

processed_data = pd.read_csv('./processed_data_modified.csv')
merged_data=pd.read_csv('./data_merged(final+version).csv')

# def replace_value(row):
#     if 'Singapore' in row['name']:
#         return row['name'].replace('Singapore', row['neighbourhood_cleansed'])
#     else:
#         return row['name']
    

# merged_data['name'] = merged_data['name'].str.split('Singapore').str[0]
column1_to_add = merged_data['name'].str.split(' · ',n=2).str[2]

merged_data['name'] = merged_data['name'].str.split(' in ').str[0]
merged_data['name']= merged_data['name']+' in '+merged_data['neighbourhood_cleansed']

column2_to_add=merged_data['name']

processed_data.insert(1, 'name', column1_to_add)
processed_data.insert(1, 'details', column2_to_add)

processed_data.to_csv('processed_data_merged_addname.csv', index=False)

new=pd.read_csv('./processed_data_merged_addname.csv')
new.head()

Unnamed: 0,id,details,name,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,price,minimum_months,maximum_months,...,refrigerator,Microwave,Oven,Pets,stove,fan,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room
0,71609,Villa in Tampines,2 bedrooms · 3 beds · 1 private bath,Tampines,East Region,1.34537,103.95887,4170,3,37,...,1,0,0,0,0,0,0,0,1,0
1,71896,Home in Tampines,1 bedroom · 1 bed · Shared half-bath,Tampines,East Region,1.34754,103.95958,2100,3,37,...,1,0,0,0,0,0,0,0,1,0
2,71903,Home in Tampines,1 bedroom · 2 beds · Shared half-bath,Tampines,East Region,1.34531,103.961,2430,3,37,...,0,0,0,0,0,0,0,0,1,0
3,275343,Rental unit in Bukit Merah,1 bedroom · 1 bed · 2 shared baths,Bukit Merah,Central Region,1.29015,103.80814,1650,3,33,...,1,0,0,0,1,0,0,0,1,0
4,275344,Rental unit in Bukit Merah,1 bedroom · 1 bed · 2.5 shared baths,Bukit Merah,Central Region,1.28836,103.81144,2070,2,33,...,1,1,0,0,1,0,0,0,1,0
