In [1]:
import pandas as pd
import overpy
from tqdm.notebook import tqdm
from distance_calculator import *
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

api = overpy.Overpass()
api = overpy.Overpass(url="https://overpass.osm.jp/api/interpreter")

# 创建线程锁，用于安全地更新共享数据
lock = threading.Lock()

# POI数据爬取

In [2]:
def get_pois(lat, lon, radius_m=1000, timeout_s=40):
    # 典型“主要 POI”键：覆盖公共设施、商业、交通、教育、医疗等
    keys = (
        "amenity|shop|tourism|leisure|healthcare|public_transport|"
        "railway|aeroway|sport|education|office"
    )
    q = f"""
    [out:json][timeout:{timeout_s}];
    (
      nwr(around:{radius_m},{lat},{lon})[~"^({keys})$"~"."];
      /* 如需补充仅有 name 的地标并排除住宅，可取消下面的注释：
      nwr(around:{radius_m},{lat},{lon})["name"]["building"!~"^(apartments|residential|house|detached|terrace|dormitory|semidetached_house)$"][landuse!="residential"];
      */
    );
    out tags center;
    """
    api = overpy.Overpass(url="https://overpass.osm.jp/api/interpreter")
    result = api.query(q)

    pois = []
    seen = set()

    # nodes
    for n in result.nodes:
        oid = f"node/{n.id}"
        if oid in seen: continue
        seen.add(oid)
        pois.append({"id": oid, "lat": float(n.lat), "lon": float(n.lon), "tags": dict(n.tags)})

    # ways (用 center)
    for w in result.ways:
        if w.center_lat is None or w.center_lon is None:
            continue
        oid = f"way/{w.id}"
        if oid in seen: continue
        seen.add(oid)
        pois.append({"id": oid, "lat": float(w.center_lat), "lon": float(w.center_lon), "tags": dict(w.tags)})

    # relations (用 center)
    for r in result.relations:
        if r.center_lat is None or r.center_lon is None:
            continue
        oid = f"relation/{r.id}"
        if oid in seen: continue
        seen.add(oid)
        pois.append({"id": oid, "lat": float(r.center_lat), "lon": float(r.center_lon), "tags": dict(r.tags)})

    return pois

In [3]:
# 示例
get_pois(31.2304, 121.4737)

[{'id': 'node/267195089',
  'lat': 31.2346453,
  'lon': 121.4716535,
  'tags': {'addr:city': '上海',
   'addr:housenumber': '290',
   'addr:street': '西藏中路',
   'amenity': 'cinema',
   'description': 'Very crowded\nnear downtown\nIMAX\n3D',
   'dsscreated': '#cinema',
   'name': '和平影都',
   'name:en': 'Peace Cinema',
   'name:zh': '和平影都',
   'name:zh-Hans': '和平影都',
   'name:zh-Hant': '和平影都',
   'note': 'Show This to your Taxi Driver: 西藏中路290号， 近汉口路',
   'opening_hours': 'Daily 8am-10pm',
   'phone': '+86 6361 2898',
   'wheelchair': 'no'}},
 {'id': 'node/301638556',
  'lat': 31.2341554,
  'lon': 121.4711371,
  'tags': {'name': '人民广场',
   'name:en': "People's Square",
   'name:zh': '人民广场',
   'name:zh-Hans': '人民广场',
   'name:zh-Hant': '人民广场',
   'public_transport': 'stop_position',
   'railway': 'stop',
   'subway': 'yes'}},
 {'id': 'node/301638561',
  'lat': 31.2291572,
  'lon': 121.474956,
  'tags': {'name': '大世界',
   'name:en': 'Dashijie',
   'name:zh': '大世界',
   'name:zh-Hans': '大世界',
 

In [2]:
house_df = pd.read_csv("data/house_property.csv")
house_df.head()

Unnamed: 0,CommunityName,TradeDate,TotalPrice,Price,ListedPrice,ListedTime,PriceChange,RoomShowing,Followers,Views,...,ParkingLot,lng,lat,Floor_level,Floor_total,Size_num,StairsUnitRatio_num,Only_South,Only_North,South_North
0,磨房北里,2011.01.14,108.0,26641,115.0,暂无数据,3,0,0,暂无数据,...,0,116.480977,39.886406,底层,6,40.54,0.333333,0,True,1
1,团结大院,2011.01.18,177.0,30891,180.0,暂无数据,1,0,0,暂无数据,...,0,116.356735,39.942857,中楼层,6,57.3,0.5,0,True,1
2,独墅逸致,2011.05.15,680.0,26559,680.0,暂无数据,0,0,0,暂无数据,...,0,116.338468,39.615828,地下室,6,256.04,1.0,0,True,1
3,西木小区,2011.06.10,150.0,24229,180.0,暂无数据,0,0,0,1,...,0,116.278744,39.939056,顶层,6,61.91,0.333333,0,True,1
4,新华街六里,2011.06.10,120.0,19802,125.0,暂无数据,1,0,0,暂无数据,...,0,116.295536,39.848391,中楼层,6,60.6,0.333333,0,False,0


In [5]:
def process_single_house(house_id, price, coordinate, max_retries=10):
    """处理单个房屋的POI数据"""
    lon, lat = coordinate.split(",")
    house = {
        "id": house_id,
        "price_per_meter": price,
        "lon": lon,
        "lat": lat,
        "pois": []
    }
    
    error_count = 0
    while error_count < max_retries:
        try:
            pois = get_pois(float(lat), float(lon))
            house["pois"].extend(pois)
            break
        except Exception as e:
            error_count += 1
            if error_count >= max_retries:
                print(f"\nFailed to get POIs for house {house_id} after {max_retries} attempts: {str(e)}")
                house["pois"] = []
            else:
                continue
    
    return house
# 多线程版本
house_dict = {"houses": [], "total_houses": len(house_df)}

# 设置线程数（建议根据CPU核心数和网络情况调整，一般4-8个线程）
max_workers = 10

# 创建任务列表
tasks = []
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # 提交所有任务
    for house_id, (price, coordinate) in enumerate(zip(house_df['Price'], house_df['Coordinates']), start=1):
        future = executor.submit(process_single_house, house_id, price, coordinate)
        tasks.append(future)
    
    # 使用tqdm显示进度
    for future in tqdm(as_completed(tasks), total=len(tasks), desc="Processing houses"):
        try:
            house = future.result()
            with lock:  # 线程安全地添加结果
                house_dict["houses"].append(house)
        except Exception as e:
            print(f"\nError processing house: {str(e)}")

# 按house_id排序结果（因为多线程执行顺序不确定）
house_dict["houses"].sort(key=lambda x: x["id"])

print(f"\n完成！共处理 {len(house_dict['houses'])} 个房屋")

Processing houses:   0%|          | 0/9487 [00:00<?, ?it/s]


完成！共处理 9487 个房屋


## 房产-poi数据存储

In [6]:
import pickle
with open('data/house_poi_data.pkl', 'wb') as f:
    pickle.dump(house_dict, f)

In [9]:
import json
with open('data/house_poi_data.json', 'w', encoding="utf-8") as f:
    json.dump(house_dict, f, ensure_ascii=False, indent=4)

## poi数据单独存储

In [22]:
pois = []
for house in house_dict['houses']:
    pois.extend(house['pois'])

In [23]:
# 根据id去重
seen_ids = set()
unique_pois = []
for poi in pois:
    if poi['id'] not in seen_ids:
        seen_ids.add(poi['id'])
        unique_pois.append(poi)

print(f"去重前: {len(pois)} 个POI")
print(f"去重后: {len(unique_pois)} 个POI")

去重前: 2752477 个POI
去重后: 40887 个POI


In [26]:
with open('data/poi_data.json', 'w') as f:
    json.dump(unique_pois, f, ensure_ascii=False, indent=4)

## 房产数据单独存储

In [5]:
house_infos = []
for house in house_dict['houses']:
    # 补齐一些信息
    house_info = {
        "id": house['id'],
        "price_per_meter": house['price_per_meter'],
        "lon": house['lon'],
        "lat": house['lat'],
    }
    coordinate = house['lon'] + ',' + house['lat']
    house_mes = house_df[house_df['Coordinates'] == coordinate]
    house_info['county'] = house_mes['County'].astype(str).tolist()[0]
    house_info['town'] = house_mes['Town'].astype(str).tolist()[0]
    house_info['community_name'] = house_mes['CommunityName'].astype(str).tolist()[0]
    house_info['floor'] = house_mes['Floor'].astype(str).tolist()[0]
    house_infos.append(house_info)
print(len(house_infos))

9487


In [6]:
with open('data/house_data.json', 'w') as f:
    json.dump(house_infos, f, ensure_ascii=False, indent = 4)

In [7]:
"""house_id = 1
house_dict = {"houses":[], "total_houses":len(house_df)}
for price, coordinate in tqdm(zip(house_df['Price'], house_df['Coordinates']), 
                               total=len(house_df), 
                               desc="Processing houses"):
    lon, lat = coordinate.split(",")
    house = {
        "id": house_id,
        "price_per_meter": price,
        "lon": lon,
        "lat": lat,
        "pois":[]
    }
    error_count = 0
    while error_count < 10:
        try:
            pois = get_pois(float(lat), float(lon), 800)
            house["pois"].extend(pois)
            break
        except Exception as e:
            error_count += 1
            if error_count >= 10:
                print(f"Failed to get POIs for house {house_id} after 10 attempts")
                house["pois"] = []
            else:
                continue

    house_dict["houses"].append(house)
    
    house_id += 1
    """

'house_id = 1\nhouse_dict = {"houses":[], "total_houses":len(house_df)}\nfor price, coordinate in tqdm(zip(house_df[\'Price\'], house_df[\'Coordinates\']), \n                               total=len(house_df), \n                               desc="Processing houses"):\n    lon, lat = coordinate.split(",")\n    house = {\n        "id": house_id,\n        "price_per_meter": price,\n        "lon": lon,\n        "lat": lat,\n        "pois":[]\n    }\n    error_count = 0\n    while error_count < 10:\n        try:\n            pois = get_pois(float(lat), float(lon), 800)\n            house["pois"].extend(pois)\n            break\n        except Exception as e:\n            error_count += 1\n            if error_count >= 10:\n                print(f"Failed to get POIs for house {house_id} after 10 attempts")\n                house["pois"] = []\n            else:\n                continue\n\n    house_dict["houses"].append(house)\n\n    house_id += 1\n    '