This script will obtain all the earnings dates from Dec. 2016 to Dec. 2020.

NOTE: Not all of the stocks are reported by Yahoo Finance. Some stocks are reported by NASDAQ and Yahoo.

In [None]:
### Goal:
#### To extract all the earnings dates for every company in all quarters
##### Method is similar to the Google_News_Web_Parsing.py method of web parsing but with Yahoo Finance
###### Example URL: https://finance.yahoo.com/calendar/earnings?from=2020-03-01&to=2020-03-07&day=2020-03-05&offset=0&size=100

import pandas, json, numpy, requests, os, datetime, pytz, tweepy, sqlite3, time, re, random
from bs4 import BeautifulSoup

## Need to gather the earnings dates for at least 3 years to get an average date/range of dates of when earnings might occur
## The "Reported EPS" and "Estimated EPS" and all other information will also be extracted (Earnings Per Share)

# Build two parallel lists describing every Sunday-to-Saturday week to query:
#   begin_week_list -> the Sunday that opens each week (2016-12-04 ... 2020-12-27)
#   end_week_list   -> the Saturday that closes it     (2016-12-10 ... 2021-01-02)
# Yahoo Finance does not provide many dates for 2016 and older, which is why
# the range starts in December 2016.
## The individual days inside each week are derived later, inside the
## extraction loop, as a temporary per-week list.
start_week = datetime.date(2016, 12, 4)
first_sunday_2021 = datetime.date(2021, 1, 3)  # exclusive upper bound

# Number of whole weeks between the first Sunday and the first Sunday of 2021
total_weeks = (first_sunday_2021 - start_week).days // 7

begin_week_list = [
    start_week + datetime.timedelta(days=7 * week_number)
    for week_number in range(total_weeks)
]
end_week_list = [
    week_begin + datetime.timedelta(days=6)
    for week_begin in begin_week_list
]

##### Both lists must have the same length because they are consumed pairwise
print(len(begin_week_list), len(end_week_list))

#####____________________________________Defining a Function to Loop__________(See Bottom of Script for Explanation on Single URL Sample)____________
## Creating a function to do all the above to allow looping over all the dates:

# Only weekdays whose calendar year falls in this window are stored; pages for
# other dates are still requested but their rows are not written to any CSV.
_FIRST_STORED_YEAR = 2016
_LAST_STORED_YEAR = 2020

# Call-time labels for which the last row of a duplicated symbol is kept.
_KEEPABLE_CALL_TIMES = ('After Market Close', 'Time Not Supplied', 'TAS')

# Seconds to wait for Yahoo before giving up on a single page request.
_REQUEST_TIMEOUT_SECONDS = 30


def _cell_strings(soup, label):
    """Return the HTML source of every <td> whose aria-label equals ``label``."""
    return [str(cell) for cell in soup.find_all('td', attrs={'aria-label': label})]


def _parse_call_time(cell):
    """Extract the "Earnings Call Time" text from one table cell's HTML."""
    if 'TAS' in cell:
        found = re.search('-->(.*)<!--', cell)
    else:
        # The text sits in a <span> whose data-reactid is the cell's id plus one.
        reactid = '"' + str(int(re.search('data-reactid="(.*)"><span data-reactid=', cell).group(1)) + 1) + '"'
        found = re.search('"><span data-reactid=' + reactid + '>(.*)</span>', cell)
    return found.group(1)


def _parse_eps(cell):
    """Extract an EPS value (estimate or reported) as a float, or NaN for 'N/A'."""
    if 'N/A' in cell:
        return numpy.nan  # numpy.NaN was removed in NumPy 2.0; numpy.nan works everywhere
    # The number sits in a react-text comment pair whose id is the cell's id plus one.
    reactid = str(int(re.search('data-reactid="(.*)"><!-- react-text: ', cell).group(1)) + 1)
    found = re.search('"><!-- react-text: ' + reactid + ' -->(.*)<!-- /react-text --></td>', cell)
    return float(found.group(1))


def _parse_surprise(cell):
    """Extract the "Surprise(%)" value as a float, or NaN for 'N/A'."""
    if 'N/A' in cell:
        return numpy.nan
    reactid = '"' + str(int(re.search('data-reactid="(.*)"><span class=', cell).group(1)) + 1) + '"'
    # Raw string: the pattern needs a literal ')' and '\)' is not a valid str escape.
    found = re.search(r'\)" data-reactid=' + reactid + '>(.*)</span></td>', cell)
    return float(found.group(1))


def _page_to_frame(soup, day):
    """Turn one parsed earnings-calendar page into a DataFrame stamped with ``day``."""
    temp_df = pandas.DataFrame({
        'Name': [re.search('-->(.*)<!--', cell).group(1)
                 for cell in _cell_strings(soup, 'Company')],
        'Symbol': [re.search('title="">(.*)</a></td>', cell).group(1)
                   for cell in _cell_strings(soup, 'Symbol')],
        'Earnings_Call_Time': [_parse_call_time(cell)
                               for cell in _cell_strings(soup, 'Earnings Call Time')],
        'EPS_Estimate': [_parse_eps(cell)
                         for cell in _cell_strings(soup, 'EPS Estimate')],
        'EPS_Reported': [_parse_eps(cell)
                         for cell in _cell_strings(soup, 'Reported EPS')],
        'Surprise_Percentage': [_parse_surprise(cell)
                                for cell in _cell_strings(soup, 'Surprise(%)')],
    })
    temp_df['Date'] = str(day)
    return temp_df


def _drop_duplicate_symbols(temp_df, day):
    """Collapse repeated symbols on one page and return a re-indexed frame.

    For each symbol that appears more than once, only its last row is kept,
    provided that row's call time is one of ``_KEEPABLE_CALL_TIMES``.
    Otherwise the pass stops and any remaining duplicates are left in place.
    """
    # Iterate over a snapshot of the symbols: temp_df is rebound while looping.
    for symbol in list(temp_df.Symbol):
        selected_df = temp_df[temp_df.Symbol == symbol]
        if len(selected_df) <= 1:
            continue

        index_list = list(selected_df.index)
        keeper = index_list[-1]
        if selected_df.loc[keeper, 'Earnings_Call_Time'] not in _KEEPABLE_CALL_TIMES:
            # NOTE(review): this abandons de-duplication for the rest of the
            # page rather than just this symbol -- confirm that is intended.
            print(day, 'double')
            break

        new_index_list = index_list[:-1]
        print(new_index_list)
        temp_df = temp_df.drop(new_index_list)

    return temp_df.reset_index(drop=True)


def _quarter_of(day):
    """Return the calendar quarter (1-4) of ``day``, or None outside the stored years."""
    if not _FIRST_STORED_YEAR <= day.year <= _LAST_STORED_YEAR:
        return None
    return (day.month - 1) // 3 + 1


def yahoo_earnings_extraction(begin_week_list, end_week_list):
    """Scrape Yahoo Finance's earnings calendar and save one CSV per quarter.

    For every (begin, end) week pair, the five days following the begin date
    (Monday-Friday when the week starts on a Sunday) are requested page by
    page. Each page is parsed into name, symbol, call time, EPS estimate,
    reported EPS and surprise %, de-duplicated by symbol, and bucketed by
    calendar quarter. Only dates in 2016-2020 are stored.

    The four CSV files are written to an "Earnings Dates" directory under the
    current working directory. Existing files are overwritten; this is meant
    for gathering historical data, not for incremental updates.

    Parameters:
        begin_week_list: sequence of ``datetime.date``, first day of each week.
        end_week_list: sequence of ``datetime.date``, last day of each week,
            paired by position with ``begin_week_list``.

    Returns:
        None.

    Raises:
        AttributeError: if a table cell does not match the expected markup
            (the regular expression finds nothing).
        ValueError: if a numeric cell cannot be converted to ``float``.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # Frames are collected per quarter and concatenated once at the end,
    # instead of re-copying the growing DataFrame after every page.
    quarter_frames = {1: [], 2: [], 3: [], 4: []}

    # This controls the page numbers. There are up to 100 stocks listed in
    # each page. 400 max earnings in a day is a guess.
    offset_list = [0, 100, 200, 300]

    for begin_week_date, end_week_date in zip(begin_week_list, end_week_list):
        weekdays_list = [begin_week_date + datetime.timedelta(days=day_offset)
                         for day_offset in range(1, 6)]

        for t in weekdays_list:
            print(t)
            for p in offset_list:
                # Standard URL format used by Yahoo Finance
                # from = start of the week
                # to = end of the week
                # day = specific weekday to examine
                # offset = page number
                # size = number of companies to display in one page
                ### If size is changed, offset must increment by the same amount:
                ### size = 50, then offset = [0, 50, 100, 150]
                ### The default size is 100 and the max seems to be 100.
                print(str(begin_week_date), str(end_week_date))
                get_URL = "https://finance.yahoo.com/calendar/earnings?from=" + str(begin_week_date) + "&to=" + str(end_week_date) + "&day=" + str(t) + "&offset=" + str(p) + "&size=100"
                # A timeout keeps one stalled connection from hanging the whole run.
                response = requests.get(get_URL, timeout=_REQUEST_TIMEOUT_SECONDS)
                soup = BeautifulSoup(response.text, 'html.parser')

                if not soup.find_all('td'):
                    # This offset does not populate the page with earnings,
                    # so there are no further pages for this day.
                    print(p, 'no offset')
                    break

                print(p, 'yes offset')

                temp_df = _drop_duplicate_symbols(_page_to_frame(soup, t), t)
                print(len(temp_df))

                quarter = _quarter_of(t)
                if quarter is not None:
                    quarter_frames[quarter].append(temp_df)
                    print('q' + str(quarter))

            print('date end')

    # Saving DataFrames
    ### Note: THIS IS NOT SET TO UPDATE EXISTING FILES AND IS MEANT TO GATHER HISTORICAL DATA----OVERWRITE WILL OCCUR
    # os.path.join keeps the path valid on both Windows and POSIX systems.
    output_dir = os.path.join(os.getcwd(), 'Earnings Dates')
    os.makedirs(output_dir, exist_ok=True)

    for quarter in (1, 2, 3, 4):
        frames = quarter_frames[quarter]
        # pandas.concat rejects an empty list, so fall back to an empty frame.
        quarter_df = pandas.concat(frames) if frames else pandas.DataFrame()
        quarter_df.to_csv(os.path.join(output_dir, 'Earnings - Quarter ' + str(quarter) + ' - 2016-2020.csv'))


# Run the full historical extraction for every week in the two lists.
# This issues HTTP requests to Yahoo Finance and overwrites any existing CSV output.
yahoo_earnings_extraction(begin_week_list, end_week_list)