In [1]:
import requests
import pandas as pd
from pprint import pprint

```graphql
[out:json][timeout:180];

// 서울지역 설정
// area["ISO3166-2"="KR-11"][admin_level=4][boundary="administrative"]->.searchArea;

// 전국 설정
area["ISO3166-1"="KR"][admin_level=2]->.searchArea;

// 검색 영역(searchArea, 현재는 전국) 안에 있는 도서관 선택
(
  node["amenity"="library"](area.searchArea);
  // way["amenity"="library"](area.searchArea);
  // relation["amenity"="library"](area.searchArea);
);

// JSON으로 자료 (좌표, 이름 등) 포함해서 출력
out center;
```

In [2]:
url = "https://overpass-api.de/api/interpreter"

# Overpass QL query over the whole country area (ISO3166-1 = KR):
# selects every node tagged amenity=library inside that area.
query = """
[out:json][timeout:180];
area["ISO3166-1"="KR"][admin_level=2]->.searchArea;
(
  node["amenity"="library"](area.searchArea);
);
out center;
"""

# The query itself allows the server up to 180 s, so the client read timeout is
# set slightly above that; without a timeout, requests may block indefinitely.
r = requests.post(url, data={"data": query}, timeout=(10, 200))
# Surface HTTP errors (e.g. rate limiting or gateway timeouts) with their status
# code instead of failing later with an opaque JSON decode error.
r.raise_for_status()
data: dict = r.json()

In [3]:
# Show each top-level key of the response together with the type of its value.
for key, value in data.items():
    print(key, type(value))
print()
# Pretty-print the first raw element to see the per-record structure.
pprint(data['elements'][0])

version <class 'float'>
generator <class 'str'>
osm3s <class 'dict'>
elements <class 'list'>

{'id': 368652466,
 'lat': 37.497318,
 'lon': 127.003258,
 'tags': {'amenity': 'library',
          'check_date': '2024-01-26',
          'name': '국립중앙도서관',
          'name:en': 'National Jungang Library',
          'name:ko': '국립중앙도서관',
          'name:ko-Latn': 'Gungnipjungangdoseogwan',
          'ncat': '도서관',
          'source': 'http://kr.open.gugi.yahoo.com'},
 'type': 'node'}


In [4]:
# Raw element records as a table; `tags` is still one nested-dict column here.
pd.DataFrame(data=data['elements'])

Unnamed: 0,type,id,lat,lon,tags
0,node,368652466,37.497318,127.003258,"{'amenity': 'library', 'check_date': '2024-01-..."
1,node,368652476,37.554120,127.024842,"{'amenity': 'library', 'name': '성동구립금호도서관', 'n..."
2,node,368652481,37.566794,127.051319,"{'amenity': 'library', 'name': '성동구립용답도서관', 'n..."
3,node,368652535,37.565982,126.806533,"{'amenity': 'library', 'name': '원이영어도서관', 'nam..."
4,node,368652537,37.495475,127.033180,"{'amenity': 'library', 'name': '역삼도서관', 'name:..."
...,...,...,...,...,...
1491,node,12797156099,36.594207,129.090320,"{'amenity': 'library', 'name': '입암면작은도서관'}"
1492,node,12811661729,37.593318,126.904741,"{'access': 'permit', 'amenity': 'library', 'na..."
1493,node,12811661730,37.593300,126.905438,"{'access': 'permit', 'amenity': 'library', 'na..."
1494,node,12821508806,37.490632,127.004958,"{'addr:city': '서울특별시', 'addr:district': '서초구',..."


In [5]:
# Flatten the nested `tags` dict into `tags.*` columns.
# json_normalize already returns a DataFrame, so no extra wrapper is needed.
pd.json_normalize(data['elements'])

Unnamed: 0,type,id,lat,lon,tags.amenity,tags.check_date,tags.name,tags.name:en,tags.name:ko,tags.name:ko-Latn,...,tags.opening_date,tags.description,tags.operator:wikidata,tags.ref,tags.name:es,tags.access,tags.addr:unit,tags.baby_feeding,tags.image,tags.not:name
0,node,368652466,37.497318,127.003258,library,2024-01-26,국립중앙도서관,National Jungang Library,국립중앙도서관,Gungnipjungangdoseogwan,...,,,,,,,,,,
1,node,368652476,37.554120,127.024842,library,,성동구립금호도서관,Seongdongguripgeumho Library,성동구립금호도서관,Seongdongguripgeumhodoseogwan,...,,,,,,,,,,
2,node,368652481,37.566794,127.051319,library,,성동구립용답도서관,Seongdongguripyongdap Library,성동구립용답도서관,Seongdongguripyongdapdoseogwan,...,,,,,,,,,,
3,node,368652535,37.565982,126.806533,library,,원이영어도서관,Woniyeongeo Library,원이영어도서관,Woniyeongeodoseogwan,...,,,,,,,,,,
4,node,368652537,37.495475,127.033180,library,,역삼도서관,Yeoksam Library,역삼도서관,Yeoksamdoseogwan,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1491,node,12797156099,36.594207,129.090320,library,,입암면작은도서관,,,,...,,,,,,,,,,
1492,node,12811661729,37.593318,126.904741,library,,숭실고등학교도서관,,,,...,,,,,,permit,,,,
1493,node,12811661730,37.593300,126.905438,library,,숭실중학교도서관,,,,...,,,,,,permit,,,,
1494,node,12821508806,37.490632,127.004958,library,2025-05-09,서초 그림책 도서관,Seocho Illustrated Book Library,서초 그림책 도서관,Seocho Grimchak Doseogwan,...,,,,,,,,yes,https://picturebook.seocholib.or.kr/img/%EB%8F...,


`json_normalize`써서 묶여있는 `tags` 펼치기

In [6]:
# Keep only the coordinates and the library name from the flattened records.
# The column is renamed by label (not by overwriting `df.columns` positionally),
# so the mapping cannot silently drift if the selected columns change.
df = (
    pd.json_normalize(data['elements'])[['lat', 'lon', 'tags.name']]
    .rename(columns={'tags.name': 'name'})
)
df

Unnamed: 0,lat,lon,name
0,37.497318,127.003258,국립중앙도서관
1,37.554120,127.024842,성동구립금호도서관
2,37.566794,127.051319,성동구립용답도서관
3,37.565982,126.806533,원이영어도서관
4,37.495475,127.033180,역삼도서관
...,...,...,...
1491,36.594207,129.090320,입암면작은도서관
1492,37.593318,126.904741,숭실고등학교도서관
1493,37.593300,126.905438,숭실중학교도서관
1494,37.490632,127.004958,서초 그림책 도서관


NaN이 있는지 확인하기 (이름이 없는 경우가 있음)

In [7]:
# For each column, print whether it contains at least one missing value.
for column in ('lat', 'lon', 'name'):
    print(df[column].isna().any())

False
False
True


빈 값들 있는 행들은 제거

In [8]:
# Drop every row that has a missing value.
# Rebinding (instead of `inplace=True`) avoids mutating the frame object that
# earlier cells displayed and keeps the step a plain expression of its input.
df = df.dropna()
len(df)

1466

이름 안에 '도서관'이 안 들어간 경우가 있음 - 서점, 유사 시설 또는 오표기인 것으로 확인

(`~`는 Series의 불리언 마스크를 반전시키는 연산자)

In [10]:
# Rows whose name does not contain the literal substring '도서관'.
has_library_in_name = df['name'].str.contains('도서관', case=False, regex=False)
df[~has_library_in_name]

Unnamed: 0,lat,lon,name
14,37.450003,126.906022,금천은행나무어린이문고
19,37.557305,127.151679,고덕1동문고
20,37.628858,127.069827,공릉3동문고
21,37.664508,127.063828,상계9동문고
22,37.514502,126.859455,신정7동문고
...,...,...,...
1474,34.740504,127.732387,2019물벼락만화카페
1477,37.623550,127.068815,한내지혜의 숲
1483,36.005855,129.335460,캔두 스터디카페
1489,37.330421,127.090194,Dongcheon Library


이 중 '문고'가 들어간 이름이 많은데, 대부분 주민센터 도서관의 이름이 잘못 기입된 경우임

(한글 -> 영어 -> 한글 번역 도중 오역으로 판단, OSM에서 자동 수집할 때 일어난 문제인 듯 함)

소규모 도서관이고 모든 시설의 진위 여부를 확인하기 어려우므로 제외하기로 결정함.

In [11]:
# Rows whose name contains '문고' but not '도서관' (both matched as literal substrings).
name_col = df['name']
contains_mungo = name_col.str.contains('문고', case=False, regex=False)
contains_library = name_col.str.contains('도서관', case=False, regex=False)
df[contains_mungo & ~contains_library]

Unnamed: 0,lat,lon,name
14,37.450003,126.906022,금천은행나무어린이문고
19,37.557305,127.151679,고덕1동문고
20,37.628858,127.069827,공릉3동문고
21,37.664508,127.063828,상계9동문고
22,37.514502,126.859455,신정7동문고
...,...,...,...
1006,34.609471,127.289045,고흥학림문고
1007,34.744359,127.263374,신기구심문고
1008,34.676561,127.266517,성두구심문고
1151,37.489409,126.948201,파랑새문고


최종적으로 이름에 '도서관'이 포함된 장소만 남기기로 결정

In [12]:
# Final selection: keep only rows whose name contains the literal substring '도서관'.
keep_mask = df['name'].str.contains('도서관', case=False, regex=False)
df_final = df[keep_mask]
df_final

Unnamed: 0,lat,lon,name
0,37.497318,127.003258,국립중앙도서관
1,37.554120,127.024842,성동구립금호도서관
2,37.566794,127.051319,성동구립용답도서관
3,37.565982,126.806533,원이영어도서관
4,37.495475,127.033180,역삼도서관
...,...,...,...
1490,35.168822,129.155652,부산광역시립해운대도서관 우동분관
1491,36.594207,129.090320,입암면작은도서관
1492,37.593318,126.904741,숭실고등학교도서관
1493,37.593300,126.905438,숭실중학교도서관


---

https://programmablesearchengine.google.com

역삼도서관 샘플 하나로 돌려보기

In [17]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# credentials never have to be written into the notebook itself.
load_dotenv()

CX = os.getenv("CX")    # sent as the `cx` query parameter of the search request
KEY = os.getenv("KEY")  # sent as the `key` query parameter of the search request

# os.getenv returns None for unset variables; fail here with a clear message
# rather than sending a request without credentials and failing downstream.
_missing = [name for name, value in (("CX", CX), ("KEY", KEY)) if not value]
if _missing:
    raise RuntimeError(
        f"Missing environment variable(s): {', '.join(_missing)} "
        "(define them in .env or the environment)"
    )

In [18]:
# Query the Custom Search JSON API for pages containing the exact library name.
params = {
    "cx": CX,
    "key": KEY,
    "exactTerms": '역삼도서관',
}

r = requests.get(
    "https://customsearch.googleapis.com/customsearch/v1",
    params=params,
    timeout=30,  # without a timeout, requests may block indefinitely
)
# Invalid credentials or an exhausted quota come back as HTTP errors; raise them
# here instead of hitting a KeyError later when reading the result fields.
r.raise_for_status()

results = r.json()

In [19]:
# Top-level fields present in the search response.
results.keys()

dict_keys(['kind', 'url', 'queries', 'context', 'searchInformation', 'items'])

In [None]:
# Number of hits on this result page; the 'items' key may be missing when the
# search returns nothing, so default to an empty list instead of raising KeyError.
len(results.get('items', []))

10

In [None]:
import os
import pickle as pkl

# Collect the URL of every hit; 'items' may be missing when there are no results.
links = [item['link'] for item in results.get('items', [])]
for link in links:
    print(link)

# Make sure the output directory exists before writing into it.
os.makedirs('./data', exist_ok=True)
with open('./data/links_.pkl', 'wb') as f:
    pkl.dump(links, f)

https://blog.naver.com/cco0531/223368440851
https://blog.naver.com/nsomdary/220603902769?viewType=pc
https://blog.naver.com/cityinorange/223412420184
https://blog.naver.com/vividashley/222079124615?viewType=pc
https://blog.naver.com/PostView.naver?blogId=swimyans&logNo=222409327145&parentCategoryNo=&categoryNo=48&viewDate=&isShowPopularPosts=false&from=postView
https://blog.naver.com/alicegangnam/221499626658
https://blog.naver.com/nosung/221384969610?viewType=pc
https://blog.naver.com/yaloo77/223055150611?viewType=pc
https://blog.naver.com/mist6721/222048991731?viewType=pc
https://blog.naver.com/khykhyhy24/221225567053?viewType=pc


**note:**

`parser.py`에서 구현한 "꼼수":
1. 프린트 링크 사용해서 필요없는 내용 정리
2. BeautifulSoup써서 parsing 진행
3. `span`안에 든 모든 텍스트 추출

In [None]:
import pickle as pkl
from parser import get_span, to_postprint  # explicit names instead of `import *`
from tqdm import tqdm

# NOTE: pickle.load can execute arbitrary code; this is acceptable only because
# the file is the one written by this notebook. Never load an untrusted pickle.
with open('./data/links_.pkl', 'rb') as f:
    links = pkl.load(f)

# Convert each blog URL to its print-view URL; entries that cannot be converted
# come back as a falsy value (None in the recorded output).
postprint_links = [to_postprint(url) for url in links]
print('\n' + '='*25 + '\n')
for p in postprint_links:
    print(p)

# Write, for every convertible link, the print-view URL followed by the text
# extracted by get_span, separated by a blank line.
with open('./data/lib1.txt', mode='w', encoding='utf-8') as f:
    for pl in tqdm(postprint_links):
        if not pl:
            # Skip links that could not be converted.
            continue
        f.write(pl + '\n')
        f.write(get_span(pl) + '\n\n')
        

skipping: https://blog.naver.com/PostView.naver?blogId=swimyans&logNo=222409327145&parentCategoryNo=&categoryNo=48&viewDate=&isShowPopularPosts=false&from=postView


https://blog.naver.com/PostPrint.naver?blogId=cco0531&logNo=223368440851
https://blog.naver.com/PostPrint.naver?blogId=nsomdary&logNo=220603902769
https://blog.naver.com/PostPrint.naver?blogId=cityinorange&logNo=223412420184
https://blog.naver.com/PostPrint.naver?blogId=vividashley&logNo=222079124615
None
https://blog.naver.com/PostPrint.naver?blogId=alicegangnam&logNo=221499626658
https://blog.naver.com/PostPrint.naver?blogId=nosung&logNo=221384969610
https://blog.naver.com/PostPrint.naver?blogId=yaloo77&logNo=223055150611
https://blog.naver.com/PostPrint.naver?blogId=mist6721&logNo=222048991731
https://blog.naver.com/PostPrint.naver?blogId=khykhyhy24&logNo=221225567053


100%|██████████| 10/10 [00:01<00:00,  5.96it/s]
