# In [0]:
import requests
from bs4 import BeautifulSoup, NavigableString, Tag

site = "https://www.fool.com"
list_url_prefix = "https://www.fool.com/earnings-call-transcripts/?page="

import re
import pickle
import os
import pandas as pd

#%% returns tags between two tags
def between(cur, end):
    """Yield the text-only tags met while walking the parse tree from ``cur`` to ``end``.

    The walk follows ``next_element`` starting at ``cur`` and stops when it
    reaches ``end`` (which is not yielded) or runs out of elements.  ``end`` may
    be ``None``, in which case the walk continues to the end of the document.

    A node is yielded only when it is a ``Tag`` other than ``<script>`` that has
    at least one child and whose children are all strings (no nested tags).
    """
    # Identity, not equality: BeautifulSoup's ``==`` compares markup, so a
    # different tag with identical content would end the walk too early.
    # ``is not None`` (rather than truthiness) keeps a falsy node such as an
    # empty string from being mistaken for the end of the document.
    while cur is not None and cur is not end:
        if (
            isinstance(cur, Tag)
            and cur.name != "script"
            and cur.contents
            and all(child.name is None for child in cur.contents)
        ):
            yield cur

        cur = cur.next_element

#%% build list of call transcript links
# Collect the transcript URLs from the first five listing pages.
links = []

for page_number in range(5):
    listing = requests.get(list_url_prefix + str(page_number))
    listing_soup = BeautifulSoup(listing.text, "html.parser")
    # Every "card-image" element wraps an anchor whose href is site-relative.
    links.extend(
        site + card.a["href"]
        for card in listing_soup.find_all(class_="card-image")
    )

#%%
# Persist the collected links, one URL per line.
with open('C:/data/calllinks.txt', 'w') as links_file:
    links_file.writelines("%s\n" % url for url in links)

#%% read the saved links back (plain text file, one URL per line)
# The context manager closes the file even if the read raises.
with open('C:/data/calllinks.txt', 'r') as f:
    links = f.read().splitlines()

#%%
#import random
#linkstest = random.sample(links,3000)

#%% get the pickle of previously processed calls
# used to pick up where we left off
# ``calls`` maps transcript URL -> {"metadata": ..., "text": ...}.  Start from an
# empty dict when no pickle exists yet, so the very first run does not fail with
# a NameError in the processing loop.
# NOTE: pickle.load executes arbitrary code from the file; only load a pickle
# that this script wrote itself.
try:
    with open("C:/Users/MLabadie/Downloads/NLP Project Research/ProcessedCallsPickle", "rb") as pfile:
        calls = pickle.load(pfile)
except FileNotFoundError:
    calls = {}

#%%
#calls = {}

#%% processes calls
# Regular expressions that locate the sections of a transcript page.
# Character classes list single characters, so "[M|m]" would also accept a
# literal "|"; "[Mm]" matches only the two intended letters.
qa_pattern = r"Question.*Answer"
duration_pattern = r"Duration:.*[Mm]inutes"
participants_pattern = r"Call [Pp]articipants"
prepared_remarks_pattern = r"Prepared Remarks:"

# Exchange prefix followed by a 1-5 letter ticker, e.g. "NYSE: CRI".  The exchanges
# have to be an alternation group: written as a character class ("[OTC|NYSE|...]")
# the pattern accepts any single one of those letters before the colon and
# therefore matches unrelated text.
ticker_re = re.compile(r"\b(OTC|NYSE|NASDAQ|TSX|NYSEMKT|NASDAQOTH|AMEX): ?([A-Z]{1,5})")

# Download and parse every transcript that is not already stored in ``calls``.
# Each entry is stored as calls[link] = {"metadata": dict, "text": DataFrame}.
for link in links:  # swap in linkstest or links[0:1000] for a trial run
    # Skip transcripts that were parsed in a previous run.
    if link in calls:
        continue

    page = requests.get(link)
    soup = BeautifulSoup(page.text, "html.parser")
    article_header = soup.find(class_="usmf-new article-header")
    header1 = article_header.find("h1").text
    header2 = article_header.find("h2").text
    body = soup.find(class_="article-content")
    print(header1)
    print(header2)

    #############################################################################################################################################
    # metadata
    #############################################################################################################################################
    metadata = {}

    # storing important pieces of text to allow for any parsing after pickling
    metadata["header1"] = header1
    metadata["header2"] = header2
    contents_heading = body.find("h2", string=re.compile("Contents"))
    if contents_heading is not None:
        metadata["first paragraph"] = str(contents_heading.find_previous_sibling("p"))
    else:
        metadata["first paragraph"] = None

    # company: text of the first <strong> in the article body
    metadata["company"] = body.find_all("strong")[0].text

    # ticker and exchange: default to the text inside the last "(...)" group
    # before the first ")" of the headline, then refine from the article body
    metadata["ticker"] = header1.split(")")[0].split("(")[-1].strip()
    metadata["exchange"] = None
    ticker_tag = body.find(class_="ticker")
    if ticker_tag is not None:
        exchange_and_ticker = ticker_tag.text.replace("(", "").replace(")", "")
        metadata["exchange"], metadata["ticker"] = exchange_and_ticker.split(":")
    else:
        ticker_text = body.find(string=ticker_re)
        if ticker_text is not None:
            # Take both values from the match itself, so surrounding text in
            # the same string cannot leak into them.
            ticker_match = ticker_re.search(ticker_text)
            if ticker_match:
                metadata["exchange"] = ticker_match.group(1)
                metadata["ticker"] = ticker_match.group(2)

    # earnings period
    metadata["period"] = None
    match = re.search("(Q[1-4]|FY) ?20[0-9][0-9]", header1)
    if match:
        metadata["period"] = match.group(0).strip()

    # earnings period end date
    metadata["period_end_date"] = header2.split("ending")[-1].replace(".", "").strip()

    # call date
    metadata["call date"] = None
    date_tag = body.find(id="date")
    if date_tag is not None:
        metadata["call date"] = date_tag.text
    elif metadata["first paragraph"] is not None:
        match = re.search(r'("date"|/)> ?.*[0-9]?[0-9],? ?20[0-9][0-9]', metadata["first paragraph"])
        if match:
            metadata["call date"] = match.group(0).split(">")[-1].strip()

    # call time
    metadata["call time"] = None
    time_tag = body.find(id="time")
    if time_tag is not None:
        metadata["call time"] = time_tag.text
    elif metadata["first paragraph"] is not None:
        # raw string: "\." is not a valid escape in a normal string literal
        match = re.search(r" ?[0-9]?[0-9]:[0-9][0-9] ?(a\.?m\.?|p\.?m\.?)( ?ET)?", metadata["first paragraph"])
        if match:
            metadata["call time"] = match.group(0).strip()

    # call participants (siblings that follow the "Call participants" h2)
    participants_heading = body.find("h2", string=re.compile(participants_pattern))
    if participants_heading is not None:
        # A sibling with exactly this text ends the participant list.
        more_analysis = "More " + metadata["ticker"] + " analysis"
        participants = []
        for res in participants_heading.find_next_siblings():
            if res.text == more_analysis:
                break
            name_tag = res.find("strong")
            role_tag = res.find("em")
            if name_tag is None or role_tag is None:
                continue
            # <em> holds either "company -- title" or just "title"; in the
            # latter case the participant is attributed to the calling company.
            company_and_title = role_tag.text.split("--")
            if len(company_and_title) == 2:
                participant_company = company_and_title[0].strip()
                participant_title = company_and_title[1].strip()
            else:
                participant_company = metadata["company"]
                participant_title = company_and_title[0].strip()
            participants.append((name_tag.text, participant_company, participant_title))
        metadata["call participants"] = pd.DataFrame(participants, columns=["Speaker", "Company", "Title"])
    else:
        metadata["call participants"] = None

    # call duration: only a three-word match ("Duration: <n> minutes") is accepted,
    # and the last matching paragraph wins
    metadata["duration"] = None
    for paragraph in body.find_all("p"):
        match = re.search(duration_pattern, paragraph.text)
        if match and len(match.group(0).split()) == 3:
            metadata["duration"] = match.group(0).split()[-2]

    #############################################################################################################################################
    # text
    #############################################################################################################################################
    # prepared remarks: the section headings are either <h2> or <strong> tags
    prepared_remarks = []
    for heading_tag in ("h2", "strong"):
        remarks_heading = body.find(heading_tag, string=re.compile(prepared_remarks_pattern))
        if remarks_heading is not None:
            section_end = body.find(heading_tag, string=re.compile(qa_pattern))
            prepared_remarks = [(tag.name, tag.text)
                                for tag in between(remarks_heading.next_element, section_end)]
            break

    # Reset for every transcript: otherwise a paragraph that precedes the first
    # <strong> would raise NameError on the first transcript, or be attributed
    # to the last speaker of the previous transcript.
    speaker = None
    text_prepared_remarks = []
    for tag_type, text in prepared_remarks:
        if tag_type == "strong":
            speaker = text
        elif tag_type == "p":
            text_prepared_remarks.append((speaker, "Prepared Remarks", text))

    # call questions and answers (runs until the <strong> holding "Duration")
    text_qa = []
    qa_heading = body.find('h2', string=re.compile(qa_pattern))
    if qa_heading is not None:
        qa = [(tag.name, tag.text)
              for tag in between(qa_heading, body.find('strong', string=re.compile("Duration")))]
        for tag_type, text in qa:
            if tag_type == "strong":
                speaker = text
            elif tag_type == "p":
                text_qa.append((speaker, "Q&A", text))

    text_all = pd.DataFrame(text_prepared_remarks + text_qa, columns=["Speaker", "Call Section", "Text"])

    calls[link] = {"metadata": metadata, "text": text_all}
    print()

# Save all processed calls.  The pickle is written to a temporary file and then
# swapped into place, so an interrupted save cannot destroy the results of
# earlier runs (deleting the old file first left such a window).
processed_calls_path = "C:/Users/MLabadie/Downloads/NLP Project Research/ProcessedCallsPickle"
with open(processed_calls_path + ".tmp", "wb") as pfile:
    pickle.dump(calls, pfile)
os.replace(processed_calls_path + ".tmp", processed_calls_path)

#%% in case you stop in between a run
# Same save as after the processing loop, kept as its own cell so partial
# results can be stored by hand.  Writing to a temporary file and swapping it
# into place keeps the previous pickle intact if the save is interrupted.
processed_calls_path = "C:/Users/MLabadie/Downloads/NLP Project Research/ProcessedCallsPickle"
with open(processed_calls_path + ".tmp", "wb") as pfile:
    pickle.dump(calls, pfile)
os.replace(processed_calls_path + ".tmp", processed_calls_path)







#%% testing
# Scratch cell: fetch a single transcript to try selectors and patterns by hand.
link="https://www.fool.com/earnings/call-transcripts/2019/02/25/carters-inc-cri-q4-2018-earnings-conference-call-t.aspx"
page = requests.get(link)
soup = BeautifulSoup(page.text, "html.parser")
article_header = soup.find(class_="usmf-new article-header")
header1 = article_header.find("h1").text
header2 = article_header.find("h2").text
body = soup.find(class_="article-content")

#%%
# The exchanges must be an alternation group; as a character class
# ("[OTC|NYSE|...]") the pattern matches any single listed letter before ":".
body.find(string=re.compile(r"(?:OTC|NYSE|NASDAQ|TSX|NYSEMKT|NASDAQOTH|AMEX):.{1,5}"))