In [23]:
import re
from pathlib import Path
from random import random
from time import sleep
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from furl import furl
from selenium import webdriver

transcript_path = Path('transcripts')

In [24]:
def parse_html(html):
    """Main html parser function"""
    date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
    quarter_pattern = re.compile(r'(\bQ\d\b)')
    soup = BeautifulSoup(html, 'lxml')

    meta, participants, content = {}, [], []
#     h1 = soup.find('h1', itemprop='headline')
#     if h1 is None:
#         return
#     h1 = h1.text
#     meta['company'] = h1[:h1.find('(')].strip()
#     meta['symbol'] = h1[h1.find('(') + 1:h1.find(')')]

#     title = soup.find('div', class_='title')
#     if title is None:
#         return
#     title = title.text
#     match = date_pattern.search(title)
#     if match:
#         m, d, y = match.groups()
#         meta['month'] = int(m)
#         meta['day'] = int(d)
#         meta['year'] = int(y)

#     match = quarter_pattern.search(title)
#     if match:
#         meta['quarter'] = match.group(0)

    qa = 0
    speaker_types = ['Executives', 'Analysts']
    for header in [p.parent for p in soup.find_all('strong')]:
        text = header.text.strip()
        if text.lower().startswith('copyright'):
            continue
        elif text.lower().startswith('question-and'):
            qa = 1
            continue
        elif any([type in text for type in speaker_types]):
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    participants.append([text, participant.text])
        else:
            p = []
            for participant in header.find_next_siblings('p'):
                if participant.find('strong'):
                    break
                else:
                    p.append(participant.text)
            content.append([header.text, qa, '\n'.join(p)])
    #print(content)
    return meta, participants, content

In [25]:
def store_result(meta, participants, content):
    """Save parse content to csv"""
    path = transcript_path / 'parsed' / meta['symbol']
    if not path.exists():
        path.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(content, columns=['speaker', 'q&a', 'content']).to_csv(path / 'content.csv', index=False)
    pd.DataFrame(participants, columns=['type', 'name']).to_csv(path / 'participants.csv', index=False)
    pd.Series(meta).to_csv(path / 'earnings.csv')

In [26]:
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])

driver = webdriver.Chrome(options = options, executable_path='..\chromedriver.exe')

In [27]:
link = 'https://seekingalpha.com/article/4460156-siyata-mobile-inc-syta-ceo-marc-seelenfreund-on-q2-2021-results-earnings-call-transcript'
#link = 'https://www.fedsearch.org/board_public/search?source=board_pub&text=+The+Fed+FOMC+Projections+materials%2C+accessible+version&submit=Search'
driver.get(link)
html = driver.page_source

In [111]:
soup = BeautifulSoup(html, 'lxml')
soup

<html class="js flexbox flexboxlegacy canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers no-applicationcache svg inlinesvg smil svgclippaths grunticon" lang="en" style=""><head><style>@charset "UTF-8";[ng\:cloak],[ng-cloak],[data-ng-cloak],[x-ng-cloak],.ng-cloak,.x-ng-cloak,.ng-hide:not(.ng-hide-animate){display:none !important;}ng\:form{display:block;}.ng-animate-shim{visibility:hidden;}.ng-anchor{position:absolute;}</style>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1.0, minimum-scale=1.0 maximum-scale=1.6, user-scalable=1" name="viewport"/>
<meta content="Board of Go

In [107]:
meta, participants, content = parse_html(html)

In [108]:
content

[['\nFederal Open Market Committee\n', 0, ''],
 ['\nMonetary Policy Principles and Practice\n', 0, ''],
 ['\nPolicy Implementation\n', 0, ''],
 ['\nReports\n', 0, ''],
 ['\nReview of Monetary Policy Strategy, Tools, and Communications\n', 0, ''],
 ['\nInstitution Supervision\n', 0, ''],
 ['\nReports\n', 0, ''],
 ['\nReporting Forms\n', 0, ''],
 ['\nSupervision & Regulation Letters\n', 0, ''],
 ['\nBanking Applications & Legal Developments\n', 0, ''],
 ['\nRegulatory Resources\n', 0, ''],
 ['\nBanking & Data Structure\n', 0, ''],
 ['\nRegulations & Statutes\n', 0, ''],
 ['\nPayment Policies\n', 0, ''],
 ['\nReserve Bank Payment Services & Data\n', 0, ''],
 ['\nFinancial Market Utilities & Infrastructures\n', 0, ''],
 ['\nResearch, Committees, and Forums\n', 0, ''],
 ['\nWorking Papers and Notes\n', 0, ''],
 ['\nModels and Tools\n', 0, ''],
 ['\nBank Assets and Liabilities\n', 0, ''],
 ['\nBank Structure Data\n', 0, ''],
 ['\nBusiness Finance\n', 0, ''],
 ['\nDealer Financing Terms\n', 0

## FED 점도표 데이터로 뽑아보자

In [15]:
# FED 점도표 뽑아보자
link = 'https://www.fedsearch.org/board_public/search?source=board_pub&text=+The+Fed+FOMC+Projections+materials%2C+accessible+version&submit=Search'

driver.get(link)
html = driver.page_source
soup = BeautifulSoup(html, 'lxml')
urls = soup.find_all('a', href=re.compile("https://www.federalreserve.gov/monetarypolicy/fomcprojtabl"))

scrap_url = []

for url in urls:
    scrap_url.append(url['href'])
    
scrap_url = list(set(scrap_url))
scrap_url

['https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20200610.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20130619.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20130918.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20210922.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20210616.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20200916.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20131218.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20130320.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20210317.htm',
 'https://www.federalreserve.gov/monetarypolicy/fomcprojtabl20201216.htm']

In [17]:
url_df = pd.DataFrame(columns=['date', 'url'])

for i, r_url in enumerate(scrap_url):
    date = re.findall("\d+", r_url)
    url = scrap_url[i]
    url_df = url_df.append({'date':int(date[0]), 'url':r_url}, ignore_index=True)
    
url_df = url_df.sort_values('date', ascending=False)
url_df = url_df.reset_index(drop=True)
url_df

Unnamed: 0,date,url
0,20210922,https://www.federalreserve.gov/monetarypolicy/...
1,20210616,https://www.federalreserve.gov/monetarypolicy/...
2,20210317,https://www.federalreserve.gov/monetarypolicy/...
3,20201216,https://www.federalreserve.gov/monetarypolicy/...
4,20200916,https://www.federalreserve.gov/monetarypolicy/...
5,20200610,https://www.federalreserve.gov/monetarypolicy/...
6,20131218,https://www.federalreserve.gov/monetarypolicy/...
7,20130918,https://www.federalreserve.gov/monetarypolicy/...
8,20130619,https://www.federalreserve.gov/monetarypolicy/...
9,20130320,https://www.federalreserve.gov/monetarypolicy/...


In [28]:
fred_url = url_df.url[0]
fred_df = pd.read_html(fred_url)[0]

# 숫자만 추출
fred_df = fred_df[~fred_df.iloc[:,0].str.contains("Memo")]

# 결측열 삭제
fred_df = fred_df.dropna(axis=1)
fred_df

Unnamed: 0_level_0,Variable,Median 1,Median 1,Median 1,Central Tendency 2,Central Tendency 2,Central Tendency 2,Range 3,Range 3,Range 3
Unnamed: 0_level_1,Variable,2021,2022,2023,2021,2022,2023,2021,2022,2023
0,Change in real GDP,5.9,3.8,2.5,5.8-6.0,3.4-4.5,2.2-2.5,5.5-6.3,3.1-4.9,1.8-3.0
1,June projection,7.0,3.3,2.4,6.8-7.3,2.8-3.8,2.0-2.5,6.3-7.8,2.6-4.2,1.7-2.7
2,Unemployment rate,4.8,3.8,3.5,4.6-4.8,3.6-4.0,3.3-3.7,4.5-5.1,3.0-4.0,2.8-4.0
3,June projection,4.5,3.8,3.5,4.4-4.8,3.5-4.0,3.2-3.8,4.2-5.0,3.2-4.2,3.0-3.9
4,PCE inflation,4.2,2.2,2.2,4.0-4.3,2.0-2.5,2.0-2.3,3.4-4.4,1.7-3.0,1.9-2.4
5,June projection,3.4,2.1,2.2,3.1-3.5,1.9-2.3,2.0-2.2,3.0-3.9,1.6-2.5,1.9-2.3
6,Core PCE inflation 4,3.7,2.3,2.2,3.6-3.8,2.0-2.5,2.0-2.3,3.5-4.2,1.9-2.8,2.0-2.3
7,June projection,3.0,2.1,2.1,2.9-3.1,1.9-2.3,2.0-2.2,2.7-3.3,1.7-2.5,2.0-2.3
9,Federal funds rate,0.1,0.3,1.0,0.1,0.1-0.4,0.4-1.1,0.1,0.1-0.6,0.1-1.6
10,June projection,0.1,0.1,0.6,0.1,0.1-0.4,0.1-1.1,0.1,0.1-0.6,0.1-1.6
