In [None]:
# summary

# web : server-client : url
# 동적페이지 : URL변화 없이 페이지의 데이터 수정 : json(str) > response.json() > DataFrame
# 정적페이지 : URL변화 있이 페이지의 데이터 수정 : html(str) > BeautifulSoup > css-selector > DataFrame
# selenium : 웹브라우저를 python 코드로 컨트롤해서 데이터 수집
# requests(동적페이지,API) > requests(정적페이지) > selenium

# 웹크롤링 절차
#1. 웹서비스분석(개발자도구), API문서 : URL
#2. request(URL) > response(data) : data(json(str)), html(str)
#3. data(json(str), html(str)) > response.json(), BeautifulSoup(css-selector) > DataFrame

# request 시 401, 403, 5XX error가 발생하는 경우 > headers 수정해서 데이터 요청(user-agent, referer)
# API 수집 : request token 발행 후 크롤링

### Zigbang 원룸 매물 데이터 수집

In [1]:
import pandas as pd
import requests
import geohash2

#### Process
    - 동이름으로 위도 경도 구하기
    - 위도 경도로 geohash 알아내기
    - geohash로 매물 아이디 가져오기
    - 매물 아이디로 매물 정보 가져오기

> 목표 : 동 이름 입력 > 매물 데이터 출력

In [None]:
# Reference copy of a search-API response for the query "첨단2동".
# It is kept only to document the shape of the payload: a top-level envelope
# ("success", "code", "next", "limit") wrapping a list of result "items",
# each of which carries the coordinates ("lat"/"lng") used later on.
data_interface = {
    "success": True,
    "code": "200",
    "items": [
        {
            "id": 2654,
            "type": "address",
            "name": "첨단2동",
            "hint": "",
            "description": "광주시 광산구 첨단2동",
            "lat": 35.21628952026367,
            "lng": 126.84334564208984,
            "zoom": 5,
            "polygon": [],
            "_score": "",
            "_source": {
                "name_length": 4,
                "local1": "광주시",
                "local2": "광산구",
                "local3": "첨단2동",
                "web_level": 15,
                "web_lat": 35.21628952026367,
                "web_lng": 126.84334564208984,
                "app_level": 15,
                "app_lat": 35.21628952026367,
                "app_lng": 126.84334564208984,
                "법정동코드": "",
            },
            "zoom_level": {"google": 15, "daum": 4},
            "zoom_level_v2": {"app": 5, "web": 4},
        },
    ],
    "next": "",
    "limit": 0,
}

In [4]:
# A second sample search response (a "school" hit) used to show that the
# "items" list converts directly into a DataFrame with one row per item;
# nested dicts such as "_source" stay as dict-valued cells.
a = {
    "success": True,
    "code": "200",
    "items": [
        {
            "id": 11707,
            "type": "school",
            "name": "첨단고등학교",
            "hint": "",
            "description": "광주광역시 광산구 월계동",
            "lat": 35.206729146557365,
            "lng": 126.83726443553984,
            "zoom": 3,
            "polygon": [],
            "_score": "",
            "_source": {
                "name_length": 6,
                "school_code": "F100001155",
                "gender": "both",
                "stage": "high",
                "sullib": "public",
                "foundation_date": "2004-12-29",
                "address": "광주광역시 광산구 월계동 864-5",
                "local1": "광주광역시",
                "local2": "광산구",
                "local3": "월계동",
                "suggestions_insensitive": ["첨단고등학교"],
                "distance": 1000,
            },
            "zoom_level": {"google": 15, "daum": 4},
            "zoom_level_v2": {"app": 4, "web": 4},
        },
    ],
    "next": "",
    "limit": 0,
}
pd.DataFrame(a["items"])

Unnamed: 0,id,type,name,hint,description,lat,lng,zoom,polygon,_score,_source,zoom_level,zoom_level_v2
0,11707,school,첨단고등학교,,광주광역시 광산구 월계동,35.206729,126.837264,3,[],,"{'name_length': 6, 'school_code': 'F100001155'...","{'google': 15, 'daum': 4}","{'app': 4, 'web': 4}"


#### 1. 동이름으로 위도 경도 구하기

In [13]:
# Step 1: resolve a neighbourhood (dong) name to latitude/longitude
# through the Zigbang search endpoint.
address = "첨단2동"
url = "https://apis.zigbang.com/v2/search"
response = requests.get(
    url,
    # Pass the query as params so requests percent-encodes the Korean text and
    # any reserved characters in `address` instead of splicing them into the URL.
    params={"leaseYn": "N", "q": address, "serviceType": "원룸"},
    timeout=10,  # a stalled connection would otherwise block the kernel indefinitely
)
# Fail here on 4xx/5xx rather than later with an opaque JSON or KeyError.
response.raise_for_status()

search_hits = response.json()['items']
if not search_hits:
    # Same exception type the bare `[0]` lookup raised, but with a usable message.
    raise IndexError(f"no search results for address {address!r}")
data = search_hits[0]  # the first hit is taken as the match
lat, lng = data['lat'], data['lng']
lat, lng

(35.21628952026367, 126.84334564208984)

#### 2. 위도 경도로 geohash 알아내기

In [24]:
# Step 2: encode the coordinates as a 5-character geohash, the cell id
# that the listing query below is keyed on.
geohash = geohash2.encode(lat, lng, precision=5)
geohash

'wy60q'

In [25]:
# Sanity check: decode the geohash back to a (lat, lng) pair. The values come
# back as strings and are coarser than the original coordinates.
geohash2.decode(geohash)

('35.2', '126.8')

#### 3. geohash로 매물 아이디 가져오기

In [27]:
# Step 3 (URL): listing-search endpoint with the filters given as query
# parameters; only the geohash is variable. Adjacent string literals are
# concatenated at compile time, so the result is one continuous URL.
url = (
    "https://apis.zigbang.com/v2/items"
    "?deposit_gteq=0&deposit_lteq=5000&domain=zigbang"
    f"&geohash={geohash}&needHasNoFiltered=true"
    "&rent_gteq=0&sales_type_in=전세|월세&service_type_eq=원룸"
)
url

'https://apis.zigbang.com/v2/items?deposit_gteq=0&deposit_lteq=5000&domain=zigbang&geohash=wy60q&needHasNoFiltered=true&rent_gteq=0&sales_type_in=전세|월세&service_type_eq=원룸'

In [38]:
# Step 3 (request): fetch the listings for the geohash cell and collect their ids.
response = requests.get(url, timeout=10)  # never wait forever on a stalled connection
# Stop on 4xx/5xx instead of failing later on a missing 'items' key.
response.raise_for_status()
data = response.json()['items']
ids = [item['item_id'] for item in data]
# Show how many ids were found, plus a small sample.
len(ids), ids[:2]

(90, [35146489, 35437152])

#### 4. 매물 아이디로 매물 정보 가져오기

In [39]:
# Step 4: request the full details for the collected listing ids.
url = 'https://apis.zigbang.com/v2/items/list'
# NOTE: per the original author's note, Zigbang accepts at most 999 item ids
# per request; truncate or batch `ids` before this cell if the list is longer.
params = {
    "domain": "zigbang",
    "withCoalition": 'true',
    "item_ids": ids,
}
# The second positional argument of requests.post is `data` (a form-encoded
# body), not URL params. Passing it by keyword makes that explicit while
# sending exactly the same request as before.
response = requests.post(url, data=params, timeout=10)
# Surface 4xx/5xx immediately instead of when the body is parsed.
response.raise_for_status()
response

<Response [200]>

In [55]:
# Turn the "items" array of the detail response into a table, one row per listing.
listing_records = response.json()["items"]
df = pd.DataFrame(listing_records)
df.tail(2)

Unnamed: 0,section_type,item_id,images_thumbnail,sales_type,sales_title,deposit,rent,size_m2,공급면적,전용면적,계약면적,room_type_title,floor,floor_string,building_floor,title,is_first_movein,room_type,address,random_location,is_zzim,status,service_type,tags,address1,address2,address3,manage_cost,reg_date,is_new,contract
88,,35215746,https://ic.zigbang.com/ic/items/35215746/1.jpg,월세,월세,200,30,33.06,"{'m2': 33.06, 'p': '10'}","{'m2': 33.06, 'p': '10'}",,,2,2,4,🔵첨단3지구🔵인기많은🔵오피형🔵신축원룸🔵,,2,장성군 남면 삼태리,"{'lat': 35.23625398068905, 'lng': 126.83339638...",False,True,원룸,[],전라남도 장성군 남면 삼태리,,,5,2023-02-01T14:25:01+09:00,False,
89,,34939568,https://ic.zigbang.com/ic/items/34939568/1.jpg,월세,월세,200,33,36.36,"{'m2': 36.36, 'p': '11'}","{'m2': 36.36, 'p': '11'}",,,3,3,4,♣신축첫입주(인기)오피스텔형♣내부깔끔♣화이트톤♣대박원룸♣,,2,장성군 남면 삼태리,"{'lat': 35.23641843071505, 'lng': 126.83318128...",False,True,원룸,[추천],전라남도 장성군 남면 삼태리,,,5,2023-02-13T16:59:36+09:00,False,


In [56]:
# Let pandas render up to 50 columns so wide frames are not elided.
pd.set_option("display.max_columns", 50)

In [57]:
# Show the last two rows again, now that up to 50 columns may be displayed.
df.tail(2)

Unnamed: 0,section_type,item_id,images_thumbnail,sales_type,sales_title,deposit,rent,size_m2,공급면적,전용면적,계약면적,room_type_title,floor,floor_string,building_floor,title,is_first_movein,room_type,address,random_location,is_zzim,status,service_type,tags,address1,address2,address3,manage_cost,reg_date,is_new,contract
88,,35215746,https://ic.zigbang.com/ic/items/35215746/1.jpg,월세,월세,200,30,33.06,"{'m2': 33.06, 'p': '10'}","{'m2': 33.06, 'p': '10'}",,,2,2,4,🔵첨단3지구🔵인기많은🔵오피형🔵신축원룸🔵,,2,장성군 남면 삼태리,"{'lat': 35.23625398068905, 'lng': 126.83339638...",False,True,원룸,[],전라남도 장성군 남면 삼태리,,,5,2023-02-01T14:25:01+09:00,False,
89,,34939568,https://ic.zigbang.com/ic/items/34939568/1.jpg,월세,월세,200,33,36.36,"{'m2': 36.36, 'p': '11'}","{'m2': 36.36, 'p': '11'}",,,3,3,4,♣신축첫입주(인기)오피스텔형♣내부깔끔♣화이트톤♣대박원룸♣,,2,장성군 남면 삼태리,"{'lat': 35.23641843071505, 'lng': 126.83318128...",False,True,원룸,[추천],전라남도 장성군 남면 삼태리,,,5,2023-02-13T16:59:36+09:00,False,


In [58]:
# List every column name to decide which ones to keep.
df.columns

Index(['section_type', 'item_id', 'images_thumbnail', 'sales_type',
       'sales_title', 'deposit', 'rent', 'size_m2', '공급면적', '전용면적', '계약면적',
       'room_type_title', 'floor', 'floor_string', 'building_floor', 'title',
       'is_first_movein', 'room_type', 'address', 'random_location', 'is_zzim',
       'status', 'service_type', 'tags', 'address1', 'address2', 'address3',
       'manage_cost', 'reg_date', 'is_new', 'contract'],
      dtype='object')

In [59]:
# Columns retained from the detail response; all other fields are dropped.
columns = [
    'item_id',
    'sales_type',
    'deposit',
    'rent',
    'size_m2',
    'floor',
    'building_floor',
    'title',
    'address',
    'status',
    'service_type',
    'tags',
    'address1',
    'manage_cost',
]
df = df[columns]
df.tail(2)

Unnamed: 0,item_id,sales_type,deposit,rent,size_m2,floor,building_floor,title,address,status,service_type,tags,address1,manage_cost
88,35215746,월세,200,30,33.06,2,4,🔵첨단3지구🔵인기많은🔵오피형🔵신축원룸🔵,장성군 남면 삼태리,True,원룸,[],전라남도 장성군 남면 삼태리,5
89,34939568,월세,200,33,36.36,3,4,♣신축첫입주(인기)오피스텔형♣내부깔끔♣화이트톤♣대박원룸♣,장성군 남면 삼태리,True,원룸,[추천],전라남도 장성군 남면 삼태리,5


In [60]:
# Keep only listings whose full address mentions 월계동, then renumber the rows.
# reset_index has to run on the filtered frame: it was previously called on the
# boolean mask, which left the original row labels (e.g. 75, 76) in the result.
# na=False treats a missing address as "no match" rather than putting NaN in the mask.
df = df[df['address1'].str.contains("월계동", na=False)].reset_index(drop=True)
df.tail(2)

Unnamed: 0,item_id,sales_type,deposit,rent,size_m2,floor,building_floor,title,address,status,service_type,tags,address1,manage_cost
75,35410308,월세,300,37,39.67,2,4,♣인기(깔끔)투룸크기♣엘리베이터♣방크기大♣대박원룸♣,광산구 월계동,True,원룸,[추천],광주시 광산구 월계동,5
76,35406439,월세,300,38,49.59,2,4,❤월계동 끝판왕❤ 가성비 좋은방 ❤깔끔한 방❤,광산구 월계동,True,빌라,[],광주시 광산구 월계동,5


In [None]:
# python pep documents
# pep20, pep8 
# flake8 - 코드 스타일(PEP 8) 및 잠재적 오류 체크(린터)

In [61]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [None]:
# pep8 : 문법(에러O,코드실행X),컨벤션(에러X,코드실행O)