In [220]:
import pickle
import pandas as pd
import re
import numpy as np
import requests
import json
import math
from tqdm import tqdm, trange

In [2]:
# 读取
shanghai_ershou = pickle.load(open( './shanghai_ershou_v2.pkl', 'rb'))
# hangzhou_new = pickle.load(open( './hangzhou_new.pkl', 'rb'))

In [3]:
l = []
for key in shanghai_ershou.keys():
    l.append(len(shanghai_ershou[key].keys()))
    
print('爬取房屋的总数：',sum(l))

爬取房屋的总数： 14780


In [4]:
shanghai_ershou_df = pd.DataFrame(columns = pd.DataFrame(shanghai_ershou['locationbeicaipg1']).index)
for i in shanghai_ershou.keys():
    temp = pd.DataFrame(shanghai_ershou[i]).T
    shanghai_ershou_df = pd.concat([shanghai_ershou_df, temp], ignore_index = True)

In [5]:
# 发现dataframe中每个元素都是一个列表（有可能为空）
def extract_0(x):
    """
    提取列表中第一个元素，若列表为空，则返回none
    """
    try:
        return x[0]
    except:
        return None   

In [6]:
df_shanghai = shanghai_ershou_df.copy()
# dataframe中每一列数据都从列表中提取出来
for col in df_shanghai.columns:
    df_shanghai[col] = df_shanghai[col].apply(extract_0)

In [7]:
df_shanghai.head()

Unnamed: 0,house_name,total_price,total_price_desc,unit_price,info,location1
0,芳芯苑,461,万,"66,456元/平",2室1厅 | 69.37平米 | 南 | 精装 | 高楼层(共6层) | 1994年建 | 板楼,北蔡
1,环龙公寓,821,万,"70,070元/平",3室2厅 | 117.17平米 | 南 | 精装 | 高楼层(共8层) | 2002年建 | 板楼,北蔡
2,虹南小区(浦东),360,万,"55,633元/平",2室1厅 | 64.71平米 | 南 | 简装 | 高楼层(共6层) | 1993年建 | 板楼,北蔡
3,金旋小区,365,万,"62,997元/平",2室1厅 | 57.94平米 | 南 | 精装 | 高楼层(共6层) | 1995年建 | 板楼,北蔡
4,绿川小区,360,万,"53,620元/平",2室1厅 | 67.14平米 | 南 北 | 精装 | 高楼层(共6层) | 1996年建 ...,北蔡


对总价进行处理

In [8]:
df_shanghai.total_price = df_shanghai.total_price.astype('float')

In [9]:
# total_price 只有一个单位：万
# df_shanghai.total_price_desc.unique()

对单价进行处理

In [10]:
df_shanghai.unit_price = df_shanghai.unit_price.str.extract(r'([\d,]+)').replace(',','', regex = True).astype('float')[0]

对面积进行处理

In [11]:
df_shanghai['area'] = df_shanghai.loc[:,'info'].str.extract('([\d.]+)平米')

批量调用高德接口

In [70]:
res_dict = {}
for i in trange(df_shanghai.shape[0]):
# for i in tqdm(range(5)):
    url = 'https://restapi.amap.com/v3/geocode/geo?key=c00a9fc63a97c64fe63bf1ff051a285e&address=上海市{}&city=上海市'
    location = df_shanghai.iloc[i, 5]+df_shanghai.iloc[i,0]
    try:
        res = requests.get(url.format(location.rstrip())).text
    except:
        res = None
    finally:
        res_dict[i] = res

In [84]:
df_shanghai['api'] = pd.Series(res_dict)

In [108]:
def parse_location(res):
    if res != None:
        geocodes = json.loads(res).get('geocodes')[0]
        location = geocodes.get('location')
        return location
    else:
        return None

In [207]:
# 经纬度
df_shanghai['location'] = df_shanghai.api.map(parse_location)

# 经度，纬度
df_shanghai['longitude'] = df_shanghai.location.str.extract('([\d.]+),')
df_shanghai['latitude'] = df_shanghai.location.str.extract(',([\d.]+)')

In [208]:
df_shanghai['longitude'] = df_shanghai['longitude'].astype('float')
df_shanghai['latitude'] = df_shanghai['latitude'].astype('float')

In [209]:
# 获取人民广场的经纬度
url = 'https://restapi.amap.com/v3/geocode/geo?key=c00a9fc63a97c64fe63bf1ff051a285e&address=上海市{}&city=上海市'
location = '人民广场'
res = requests.get(url.format(location.rstrip())).text
rg_location = json.loads(res).get('geocodes')[0].get('location')

matchObj = re.search('([\d.]+),', rg_location)
rg_longitude = float(matchObj.group(1)) # 经度
matchObj = re.search(',([\d.]+)', rg_location)
rg_latitude = float(matchObj.group(1)) # 纬度

In [210]:
def angle2radian(x):
    "角度转换为弧度"
    return x*math.pi/180

def rec2sphere(lng1, lat1):
    "球坐标系->直角坐标系"
    R = 6371
    x1 = R*math.cos(lng1)*math.cos(lat1)
    y1 = R*math.cos(lng1)*math.sin(lat1)
    z1 = R*math.sin(lat1)
    return x1, y1, z1

def get_chord_length(x1, y1, z1, x2, y2, z2):
    "获取直角坐标系中的直线距离"
    dx = x1 - x2
    dy = y1 - y2
    dz = z1 - z2
    lenth = np.sqrt(dx**2+dy**2+dz**2)
    return lenth

def get_distance(lng1, lat1, lng2 , lat2):
    "输入经纬度，得到两地距离(km)"
    R = 6371
    # 角度转化为弧度
    lng1 = angle2radian(lng1)
    lat1 = angle2radian(lat1)
    lng2 = angle2radian(lng2)
    lat2 = angle2radian(lat2)
    
    # 球坐标->直角坐标
    x1, y1, z1 = rec2sphere(lng1, lat1)
    x2, y2, z2 = rec2sphere(lng2, lat2)
    
    # 三维空间中的距离（大圆中的弦长）
    lenth = get_chord_length(x1, y1, z1, x2, y2, z2)
    
    # 大圆中的弧长
    alpha = math.asin(lenth/2/R)*2
    r = alpha*R
    return r

In [221]:
df_shanghai.shape[0]

14780

In [225]:
distance_dict = {}
for i in trange(df_shanghai.shape[0]):
    lng1 = df_shanghai.iloc[i].longitude
    lat1 = df_shanghai.iloc[i].latitude
    distance = get_distance(lng1, lat1, rg_longitude, rg_latitude)
    distance_dict[i] = distance
    

100%|███████████████████████████████████████████████████████████████████████████| 14780/14780 [00:15<00:00, 937.94it/s]


In [228]:
df_shanghai['distance_rg'] = pd.Series(distance_dict)

In [230]:
df_shanghai.drop(columns = ['api', 'location'], inplace=True)

数据存储

In [233]:
# df_shanghai.to_csv('sh_ershou_clean.csv')
df_shanghai.to_csv('sh_ershou_clean_v2.csv') # v2添加了房子到人民广场的距离