### os_pdf_info 크롤링
Global X Thematic ETF의 1) 구성종목 수, 2) 구성종목 티커, 3) 구성종목명을 크롤링합니다.

In [1]:
from fetch_data import fetch_theme_info


# Load the theme/ETF metadata; later cells read each entry's 'etf_tkr' to build the fund page URLs.
theme_info = fetch_theme_info()

- 웹 크롤링은 asyncio와 aiohttp를 사용해 여러 페이지를 동시에(비동기로) 요청합니다.

In [2]:
import aiohttp
import asyncio
import nest_asyncio


# Patch asyncio so an event loop can be started from inside an already-running one
# (presumably required because this runs in a notebook kernel — confirm).
nest_asyncio.apply()

# Common URL prefix of the fund pages; an ETF ticker is appended to form each page URL.
BASE_URL="https://www.globalxetfs.com/funds/"

async def fetch_from_web(session, url):
    """Download one page and label it with the ticker taken from its URL.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` method (the
            caller in this file passes an ``aiohttp.ClientSession``).
        url: Page address whose last path segment is the ETF ticker.

    Returns:
        A ``(ticker, html)`` tuple, where ``ticker`` is the last path segment
        of ``url`` and ``html`` is the response body as text.
    """
    async with session.get(url) as response:
        html = await response.text()

    # Strip trailing slashes first so ".../funds/AGNG/" still yields "AGNG"
    # instead of an empty ticker.
    ticker = url.rstrip("/").split("/")[-1]
    return ticker, html
        
async def parallel_fetch_from_web():
    """Fetch the page of every ETF listed in ``theme_info`` concurrently.

    One URL is built per ``theme_info`` entry (``BASE_URL`` followed by the
    entry's ``'etf_tkr'``), and all ``fetch_from_web`` calls are awaited
    together on a single shared ``aiohttp.ClientSession``.

    Returns:
        The list of ``fetch_from_web`` results, in ``theme_info`` order.
    """
    tickers = [theme['etf_tkr'] for theme in theme_info]
    targets = [BASE_URL + tkr for tkr in tickers]

    async with aiohttp.ClientSession() as session:
        # Create one coroutine per page, then wait for all of them at once.
        jobs = [fetch_from_web(session, target) for target in targets]
        pages = await asyncio.gather(*jobs)

    return pages

# Run the crawl to completion; `result` holds one (ticker, html) pair per fetched page.
result = asyncio.run(parallel_fetch_from_web())

- BeautifulSoup을 사용해 HTML에서 구성종목 수, 구성종목 티커, 구성종목명만 추출합니다.

In [3]:
from typing import Tuple
import bs4


# CSS selector matching the rows of a table inside the `#fund` section; the parsing
# loop below reads the number of holdings from one of the matched rows.
pdf_cnt_html_selector = "#fund > div.fd-3 > div > div > div:nth-child(2) > table:nth-child(2) > tr"
    
def child_stk_tkr_html_selector(idx: int) -> str:
    """Return the CSS selector for the second cell of row ``idx`` in the ``#top-ten`` table.

    ``idx`` is used as the ``:nth-child`` position, which CSS counts from 1.
    """
    row = f"tr:nth-child({idx})"
    return " > ".join(("#top-ten", "tbody", row, "td:nth-child(2)"))

def child_stk_en_name_html_selector(idx: int) -> str:
    """Return the CSS selector for the ``td.label`` cell of row ``idx`` in the ``#top-ten`` table.

    ``idx`` is used as the ``:nth-child`` position, which CSS counts from 1.
    """
    row = f"tr:nth-child({idx})"
    return " > ".join(("#top-ten", "tbody", row, "td.label"))

def get_stk_tkr_and_en_name(*args, **kwargs) -> Tuple[str, str]:
    """Read the ticker and English name of one holding row from a parsed page.

    Keyword Args:
        soup: Parsed page exposing ``select(css_selector)``.
        idx: Row position passed to the selector helpers.

    Returns:
        ``(ticker, english_name)`` — the ``.text`` of the first element matched
        by each selector.

    Raises:
        KeyError: If ``soup`` or ``idx`` is not supplied as a keyword argument.
        IndexError: If either selector matches nothing.
    """
    page = kwargs.pop("soup")
    row_no = kwargs.pop("idx")

    tkr_cells = page.select(child_stk_tkr_html_selector(row_no))
    ticker = tkr_cells[0].text

    name_cells = page.select(child_stk_en_name_html_selector(row_no))
    en_name = name_cells[0].text

    return ticker, en_name

# Maps each ETF ticker to its holdings count plus the parallel lists of
# holding tickers and holding English names scraped from its page.
pdf_dict = {}

for etf_tkr, html in result:
    soup = bs4.BeautifulSoup(html, 'html.parser')

    # The count is read from the second cell of the matched row at index 6;
    # that position is tied to the page layout.
    summary_rows = soup.select(pdf_cnt_html_selector)
    pdf_cnt = summary_rows[6].find_all("td")[1].text
    holding_total = int(pdf_cnt)

    # Selector rows are numbered from 1, hence the shifted range.
    holdings = [
        get_stk_tkr_and_en_name(soup=soup, idx=row_no)
        for row_no in range(1, holding_total + 1)
    ]
    child_stk_tkr_list = [tkr for tkr, _ in holdings]
    child_stk_en_name_list = [en_name for _, en_name in holdings]

    pdf_dict[etf_tkr] = {
        "pdf_cnt": holding_total,
        "child_stk_tkr_list": child_stk_tkr_list,
        "child_stk_en_name_list": child_stk_en_name_list,
    }

- DB에 적재하기 편하도록 CSV 파일로 저장합니다.

In [4]:
import pandas as pd

# One frame per ETF: holding ticker, holding name, and the parent ETF ticker
# repeated on every row.
df_list = [
    pd.DataFrame(
        {
            "child_stk_tkr": info["child_stk_tkr_list"],
            "child_stk_name": info["child_stk_en_name_list"],
        }
    ).assign(etf_tkr=parent_tkr)
    for parent_tkr, info in pdf_dict.items()
]

# Stack all ETFs, drop rows whose ticker is empty, and renumber the rows from 0.
stacked = pd.concat(df_list)
has_ticker = stacked["child_stk_tkr"] != ""
pdf_info_df = stacked[has_ticker].reset_index(drop=True)

pdf_info_df.head()

Unnamed: 0,child_stk_tkr,child_stk_name,etf_tkr
0,LLY,ELI LILLY & CO,AGNG
1,4519 JP,CHUGAI PHARMA CO,AGNG
2,ALC SW,ALCON INC,AGNG
3,WELL,WELLTOWER INC,AGNG
4,NOVOB DC,NOVO NORDISK A/S-B,AGNG


In [6]:
# Export the holdings table for the DB load.
# NOTE(review): with pandas' default index=True the row index is written as an
# extra unnamed first column — confirm the loader expects it.
pdf_info_df.to_csv("./data_before_db_insert/os_pdf_info.csv")