# 직방 데이터 크롤링 

In [1]:
import requests
import pandas as pd
import urllib
from urllib.parse import urlencode
from pandas.io.json import json_normalize
from scipy.spatial import distance

In [2]:
# 상세주소로 검색하므로 넣을 필요 없는 함수
# def get_addr_info(keyword):
#     url = "https://apis.zigbang.com/search/?q={}".format(keyword)
#     response = requests.get(url)
#     json_obj = response.json()
#     return json_obj["items"][0]["lat"], json_obj["items"][0]["lng"]

In [3]:
def make_target(target, api_key="AIzaSyD7jDQUoMG1bS8SvukFrySE7aKhzSxSDts", timeout=10):
    """Geocode an address with the Google Geocoding API.

    Args:
        target: Free-form address or place name to look up.
        api_key: Google Maps API key sent with the request. The default is
            the key that used to be embedded in the URL, kept only so that
            existing calls keep working.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(lat, lng)`` tuple read from the first geocoding result.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
        IndexError: If the response contains no results. The exception type
            matches what indexing the empty result list raised before; the
            message now carries the status reported by the API.
    """
    # NOTE(security): a credential committed to source control should be
    # rotated and supplied from configuration via ``api_key`` instead.
    url = (
        "https://maps.googleapis.com/maps/api/geocode/json?address="
        + urllib.parse.quote(target)
        + "&key="
        + api_key
    )
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    geo_info = response.json()

    results = geo_info.get("results")
    if not results:
        raise IndexError(
            "no geocoding result for {!r} (status: {}, message: {})".format(
                target, geo_info.get("status"), geo_info.get("error_message")
            )
        )

    location = results[0]["geometry"]["location"]
    return location["lat"], location["lng"]

In [4]:
def get_ids(lat, lng, delta=0.01, timeout=10):
    """Return the ids of Zigbang items inside a box centred on a point.

    Args:
        lat: Latitude of the box centre.
        lng: Longitude of the box centre.
        delta: Half-width of the search box, in the same units as ``lat`` and
            ``lng``. The box spans ``lat ± delta`` by ``lng ± delta``. The
            default reproduces the previously hard-coded value.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with the ``item_id`` of every entry in the response's
        ``list_items``.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
    """
    params = {
        "lat_south": lat - delta,
        "lat_north": lat + delta,
        "lng_west": lng - delta,
        "lng_east": lng + delta,
        # NOTE(review): presumably a list of room-type codes - confirm
        # against the Zigbang API before changing.
        "room": "[01,02,03,04,05]",
    }
    url = "https://api.zigbang.com/v3/items2?" + urlencode(params)
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    items = response.json()["list_items"]
    return [item["simple_item"]["item_id"] for item in items]

In [55]:
# Build an image URL for one item; the query string asks for w=150, h=112.
# NOTE(review): `id` is not assigned anywhere in the visible source, so as
# written this formats the built-in `id` function into the URL. Presumably an
# item id left over from an earlier notebook session was intended - confirm.
image = "https://ic.zigbang.com/ic/items/{}/1.jpg?w=150&h=112".format(id)

In [5]:
def get_items(ids, timeout=10):
    """Fetch item details from the Zigbang API and return them as a DataFrame.

    Args:
        ids: List of item ids, as produced by ``get_ids``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame restricted to the listing, location and agent columns
        named in ``filter_columns``. ``lat`` and ``lng`` are floats parsed
        from the ``random_location`` field, which is expected to hold a
        ``"<lat>,<lng>"`` string. When ``ids`` is empty, or the service
        returns no items, the frame is empty but still has those columns.

    Raises:
        requests.HTTPError: If the service answers with an HTTP error status.
    """
    filter_columns = [
        "id", "rent", "deposit", "floor", "size", "address1", "address2",
        "lat", "lng", "options", "manage_cost", "parking", "elevator",
        "movein_date", "agent_name", "agent_phone", "agent_email",
    ]

    # Nothing to look up: skip the HTTP round-trip and keep the schema stable.
    if not ids:
        return pd.DataFrame(columns=filter_columns)

    # str(list) yields "[1, 2, 3]"; the spaces are stripped before the list is
    # placed in the query string.
    url = "https://api.zigbang.com/v3/items?detail=true&item_ids={}".format(
        str(ids).replace(" ", "")
    )
    # Without a timeout, requests may block forever on a stalled connection.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    datas = [entry["item"] for entry in response.json()["items"]]
    if not datas:
        return pd.DataFrame(columns=filter_columns)

    # pandas.io.json.json_normalize is deprecated in favour of
    # pandas.json_normalize; use the new name when this pandas provides it.
    normalize = getattr(pd, "json_normalize", None) or json_normalize
    result_df = normalize(datas)

    # Split "<lat>,<lng>" once and convert both halves to float.
    coords = result_df["random_location"].str.split(",", expand=True)
    result_df["lat"] = coords[0].astype(float)
    result_df["lng"] = coords[1].astype(float)

    # Return an independent copy so callers can assign columns safely.
    return result_df[filter_columns].copy()

In [9]:
def main(target, rent=None, deposit=None, parking=None):
    """Find listings near an address, filter them and sort by distance.

    Every filter that is not ``None`` is applied; filters combine with AND.

    Args:
        target: Address of the target location.
        rent: Maximum monthly rent including the maintenance fee
            (``rent + manage_cost``).
        deposit: Maximum deposit.
        parking: Required parking availability, compared for equality with
            the ``parking`` column ("가능" = available / "불가능" = unavailable).

    Returns:
        A DataFrame of the matching listings with an extra ``distance``
        column, sorted by ascending distance with a fresh index.
    """
    target_lat, target_lng = make_target(target)
    ids = get_ids(target_lat, target_lng)
    # Copy so the column assignments below never write into a slice of
    # another frame.
    result_df = get_items(ids).copy()

    # Euclidean distance between the target and each listing in lat/lng
    # space, scaled by 10000. It is used only to rank listings by proximity.
    origin = (target_lat, target_lng)
    result_df["distance"] = [
        distance.euclidean(origin, position) * 10000
        for position in zip(result_df["lat"], result_df["lng"])
    ]

    # manage_cost is a string: "없음" ("none") means no fee, otherwise the
    # "만원" unit suffix is dropped so the rest can be parsed as a number.
    result_df["manage_cost"] = (
        result_df["manage_cost"]
        .apply(lambda x: 0 if x == "없음" else x.replace("만원", ""))
        .astype("float")
    )
    result_df["rent"] = result_df["rent"].astype("float")

    # Independent checks: an if/elif chain here would stop at the first
    # supplied filter and silently ignore the others.
    if rent is not None:
        result_df = result_df[result_df["rent"] + result_df["manage_cost"] <= rent]

    if deposit is not None:
        result_df = result_df[result_df["deposit"] <= deposit]

    if parking is not None:
        result_df = result_df[result_df["parking"] == parking]

    return result_df.sort_values(by=["distance"]).reset_index(drop=True)

In [10]:
# Example run: search around the given place name with no rent, deposit or
# parking filter applied.
df = main("홍대 상상마당")

In [11]:
# Notebook display expression: shows the (rows, columns) shape of the result.
df.shape

(650, 17)

In [143]:
# Write the result to a CSV file in the working directory; index=False leaves
# the DataFrame index out of the file.
df.to_csv("zigbang_crawling.csv", index=False)