In [12]:
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import regex as re
from bs4 import BeautifulSoup
from furl import furl
from selenium import webdriver


In [55]:
# Scrape the Seeking Alpha earnings-call-transcript listing page by page and
# parse every transcript article that is linked from it.
transcript_path = Path('transcripts')
SA_URL = 'https://seekingalpha.com/'
TRANSCRIPT = re.compile('Earnings Call Transcript')
next_page = True
page = 1
# Article URLs already visited. If a listing page yields nothing new (empty
# page, or the site ignored the page number and served the same listing again)
# the loop stops instead of re-scraping the same articles forever.
seen_urls = set()
driver = webdriver.Chrome()
try:
    while next_page:
        # urljoin avoids the doubled slash that plain string concatenation
        # produced, because SA_URL already ends in '/'.
        listing = furl(urljoin(SA_URL, 'earnings/earnings-call-transcripts'))
        if page > 1:
            # NOTE(review): assumes the listing is paginated via `?page=N` -
            # confirm against the live site.
            listing.add({'page': page})
        url = listing.url
        print(f'Accessing: {url}')
        driver.get(url)
        page += 1
        response = driver.page_source
        soup = BeautifulSoup(response, 'lxml')
        links = soup.find_all(name='a', string=TRANSCRIPT)

        # Resolve each link to an absolute single-page article URL and keep
        # only the ones not visited yet.
        new_articles = []
        for link in links:
            transcript_url = link.attrs.get('href')
            if not transcript_url:
                # An anchor without href would resolve to the site root.
                continue
            article_url = furl(urljoin(SA_URL,
                                       transcript_url)).add({'part': 'single'})
            if article_url.url in seen_urls:
                continue
            seen_urls.add(article_url.url)
            new_articles.append(article_url)

        if not new_articles:
            next_page = False
            continue

        for article_url in new_articles:
            print(f'-   transcript: {article_url}')
            driver.get(article_url.url)
            html = driver.page_source
            # The parsed pieces are only held in these variables; this cell
            # does not persist them.
            meta, participants, content = parse_html(html)
            # Store the article URL as a plain string rather than the
            # BeautifulSoup Tag object of the listing anchor.
            meta['link'] = article_url.url
finally:
    # Always shut the browser session down, even on errors or a manual
    # interrupt of the cell.
    driver.quit()

Accessing: https://seekingalpha.com//earnings/earnings-call-transcripts
-   transcript: https://seekingalpha.com/article/4479944-hdfc-bank-limited-hdb-management-on-q3-2022-results-earnings-call-transcript?part=single
HDFC Bank Limited (HDB) Management on Q3 2022 Results - Earnings Call Transcript
{'company': 'HDFC Bank Limited', 'symbol': 'HDB'}
Jan. 17, 2022 4:40 AM ET
-   transcript: https://seekingalpha.com/article/4479741-chr-hansen-holding-s-chyhy-ceo-mauricio-graber-on-q1-2022-results-earnings-call-transcript?part=single
Chr. Hansen Holding A/S' (CHYHY) CEO Mauricio Graber on Q1 2022 Results - Earnings Call Transcript
{'company': "Chr. Hansen Holding A/S'", 'symbol': 'CHYHY'}
Jan. 14, 2022 5:56 PM ET
-   transcript: https://seekingalpha.com/article/4479735-citigroup-inc-s-c-ceo-jane-fraser-on-q4-2021-results-earnings-call-transcript?part=single
Citigroup Inc.'s (C) CEO Jane Fraser on Q4 2021 Results - Earnings Call Transcript
{'company': "Citigroup Inc.'s", 'symbol': 'C'}
Jan. 1

KeyboardInterrupt: 

In [54]:
def _following_paragraphs(header):
    """Return the texts of the <p> siblings after `header`, stopping at the
    first sibling that contains a <strong> tag (i.e. the next section header)."""
    texts = []
    for sibling in header.find_next_siblings('p'):
        if sibling.find('strong'):
            break
        texts.append(sibling.text)
    return texts


def parse_html(html):
    """Parse one transcript article page.

    Parameters
    ----------
    html : str
        Page source of the article.

    Returns
    -------
    meta : dict
        Always has 'company' and 'symbol' (both None when the title carries no
        parenthesised ticker). Has 'month', 'day', 'year' (ints) when the post
        date could be parsed, and 'quarter' (e.g. 'Q3') when one is found in
        the title or the post date.
    participants : list of [section_header, paragraph_text]
        One entry per paragraph that follows a header mentioning 'Executives'
        or 'Analysts'.
    content : list of [header_text, qa_flag, text]
        One entry per remaining <strong> header; `qa_flag` is 1 for headers
        that come after a 'question-and...' header, else 0; `text` joins the
        paragraphs that follow the header with newlines.

    A missing title or post-date element is treated as empty text instead of
    raising AttributeError.
    """
    # Legacy numeric form (MM-DD-YY), kept for backward compatibility.
    date_pattern = re.compile(r'(\d{2})-(\d{2})-(\d{2})')
    # Form seen on the page, e.g. "Jan. 17, 2022 4:40 AM ET".
    named_date_pattern = re.compile(
        r'\b([A-Za-z]{3})[A-Za-z]*\.?\s+(\d{1,2}),\s*(\d{4})\b')
    quarter_pattern = re.compile(r'(\bQ\d\b)')
    symbol_pattern = re.compile(r'\(([^()]+)\)')
    months = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
              'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

    soup = BeautifulSoup(html, 'lxml')
    meta, participants, content = {}, [], []

    # <h1 class="kzA baGI bdQ bd9 bdD bdCB bdDW bdEE" data-test-id="post-title">HDFC Bank Limited (HDB) Management on Q3 2022 Results - Earnings Call Transcript</h1>
    title_node = soup.find(attrs={'data-test-id': 'post-title'})
    h1 = title_node.text if title_node is not None else ''
    print(h1)
    # The first parenthesised group is the ticker; everything before it is the
    # company name. Slicing with str.find() produced garbage when the title
    # had no parentheses, because find() returns -1.
    match = symbol_pattern.search(h1)
    if match:
        meta['company'] = h1[:match.start()].strip()
        meta['symbol'] = match.group(1).strip()
    else:
        meta['company'] = None
        meta['symbol'] = None
    print(meta)

    #<span class="kzD kzF bhB" data-test-id="post-date">Jan. 17, 2022 4:40 AM ET</span>
    date_node = soup.find(attrs={'data-test-id': 'post-date'})
    post_date = date_node.text if date_node is not None else ''
    print(post_date)
    match = date_pattern.search(post_date)
    if match:
        m, d, y = match.groups()
        meta['month'] = int(m)
        meta['day'] = int(d)
        meta['year'] = int(y)
    else:
        match = named_date_pattern.search(post_date)
        if match and match.group(1).lower() in months:
            meta['month'] = months[match.group(1).lower()]
            meta['day'] = int(match.group(2))
            meta['year'] = int(match.group(3))

    # The quarter appears in the title ("... on Q3 2022 Results ..."); the
    # post date is still checked as a fallback.
    match = quarter_pattern.search(h1) or quarter_pattern.search(post_date)
    if match:
        meta['quarter'] = match.group(0)

    qa = 0
    speaker_types = ['Executives', 'Analysts']
    # Every element that directly wraps a <strong> tag is treated as a header.
    for header in [strong.parent for strong in soup.find_all('strong')]:
        text = header.text.strip()
        lowered = text.lower()
        if lowered.startswith('copyright'):
            continue
        if lowered.startswith('question-and'):
            # Everything after this header is flagged as Q&A.
            qa = 1
            continue
        if any(kind in text for kind in speaker_types):
            participants.extend(
                [text, name] for name in _following_paragraphs(header))
        else:
            content.append(
                [header.text, qa, '\n'.join(_following_paragraphs(header))])
    return meta, participants, content

In [14]:
def store_result(meta, participants, content):
    """Write one parsed transcript to CSV files under
    ``transcript_path / 'parsed' / meta['symbol']``.

    Parameters
    ----------
    meta : dict
        Transcript metadata; must contain 'symbol'. Written as a Series to
        ``earnings.csv``.
    participants : list of [type, name]
        Written to ``participants.csv``.
    content : list of [speaker, q&a, content]
        Written to ``content.csv``.

    Raises
    ------
    KeyError
        If `meta` has no 'symbol' entry.
    """
    path = transcript_path / 'parsed' / meta['symbol']
    print(path)
    # to_csv does not create missing parent directories, so make sure the
    # per-symbol folder exists before writing.
    path.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(content, columns=['speaker', 'q&a',
              'content']).to_csv(path / 'content.csv', index=False)
    pd.DataFrame(participants, columns=['type', 'name']).to_csv(path / 'participants.csv', index=False)
    pd.Series(meta).to_csv(path / 'earnings.csv')