# required libraries #

In [142]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pandas_datareader import data
import os
import re

# s&p 500 companies list
objective: scrape list of public companies from Wikipedia and store them in dataframe

In [143]:
# scrape html data from wikipedia

resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
soup = BeautifulSoup(resp.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable'})

In [144]:
# extract desired data points from the html data and store them in lists

company_tickers = []
companies = []
sectors = []
subsectors = []

for row in table.findAll('tr')[1:]:
    
    company_ticker = row.findAll('td')[0].text
    company_ticker = company_ticker.strip('\n')
    company_tickers.append(company_ticker)

    company = row.findAll('td')[1].text
    companies.append(company)

    sector = row.findAll('td')[3].text
    sectors.append(sector)

    subsector = row.findAll('td')[4].text
    subsectors.append(subsector)

In [145]:
# convert these lists to a datafarme

df = pd.DataFrame({
                "company_ticker":company_tickers,
                "company":companies,
                "sector":sectors,
                "subsector":subsectors
                    })

In [146]:
# filter for just healthcare companies

health_care_df = df[df['sector'] == 'Health Care'].reset_index(drop=True)
health_care_df.head()

Unnamed: 0,company_ticker,company,sector,subsector
0,ABT,Abbott Laboratories,Health Care,Health Care Equipment
1,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals
2,ABMD,ABIOMED Inc,Health Care,Health Care Equipment
3,A,Agilent Technologies Inc,Health Care,Health Care Equipment
4,ALXN,Alexion Pharmaceuticals,Health Care,Pharmaceuticals


In [147]:
# group healthcare companies by subsector
# there are 63 total healthcare companies in the s&p500

health_care_df_summary = health_care_df[['subsector']].groupby('subsector').size() \
    .to_frame('company_count') \
    .sort_values('company_count', ascending = False) \
    .reset_index()

health_care_df_summary = health_care_df_summary.append(health_care_df_summary.sum(numeric_only=True), ignore_index=True)
health_care_df_summary.loc[(len(health_care_df_summary)-1),'subsector'] = 'Total # of healthcare companies'
health_care_df_summary

Unnamed: 0,subsector,company_count
0,Health Care Equipment,20.0
1,Pharmaceuticals,10.0
2,Biotechnology,6.0
3,Health Care Distributors,6.0
4,Life Sciences Tools & Services,5.0
5,Managed Health Care,5.0
6,Health Care Supplies,4.0
7,Health Care Facilities,3.0
8,Health Care Services,3.0
9,Health Care Technology,1.0


In [148]:
## resources used for the above section: ##
# https://pythonprogramming.net/sp500-company-price-data-python-programming-for-finance/

# earnings call transcripts

objective: pull earnings call transcripts from financialmodelingprep api and store them in dataframe

In [149]:
# read in locally stored financialmodelingprep api key and assign it to variable 'apikey'

creds_dir = os.path.expanduser("~/creds/financialmodelingprep.txt")

with open(creds_dir, 'r') as text:
    apikey = text.read().strip('\n')

In [150]:
# create lists that will be looped through to pull desired transcripts

# every healthcare company in the s&p 500
company_ticker_list = health_care_df['company_ticker']

# from years 2019 and 2020
year_list = ['2019','2020']

# for q2 (in the future, i may want to run this over multiple quarters, i.e. q2 and q3)
quarter_list = [2] #[2,3]

In [151]:
# pull desired transcripts using lists defined above
# there will be 1 row in the dataframe for every company + year + quarter combination

df = pd.DataFrame()

for company in company_ticker_list:
    for year in year_list:
        for quarter in quarter_list:
            # pull down transcript
            transcript = requests.get(f'https://financialmodelingprep.com/api/v3/earning_call_transcript/{company}?quarter={quarter}&year={year}&apikey={apikey}').json()
            try:
                transcript = transcript[0]['content'].split(' ')
            except:
                transcript = np.nan
            finally:
                # store the 4 objects in a list
                lst = [[company, year, quarter, transcript]]
                # append list to dataframe
                df = df.append(lst)

In [152]:
# dataframe formatting

df.columns = ['company_ticker', 'year', 'quarter','transcript_text']
df.reset_index(drop=True, inplace = True)
df.head()

Unnamed: 0,company_ticker,year,quarter,transcript_text
0,ABT,2019,2,"[Operator:, Good, morning, and, thank, you, fo..."
1,ABT,2020,2,"[Operator:, Good, morning, and, thank, you, fo..."
2,ABBV,2019,2,"[Operator:, Good, morning, and, thank, you, fo..."
3,ABBV,2020,2,"[Operator:, Good, morning., And, thank, you, f..."
4,ABMD,2019,2,"[Operator:, Good, day,, ladies, and, gentlemen..."


In [153]:
# identify which companies we were not able to pull a transcript for in at least 1 of the years

comanies_missing_data = df[df['transcript_text'].isnull()]['company_ticker'].unique()
comanies_missing_data

array(['BIO', 'VTRS'], dtype=object)

In [154]:
# remove all records of instances of companies that have a missing transcript
# even if a company were to only be missing 1 transcript, the company needs to be removed at large, as we can no longer 
# perform the comparison from 2019 to 2020

df = df[df['company_ticker'].isin(comanies_missing_data) == False].reset_index(drop=True)

In [155]:
# after removal of the companies missing transcripts, we will be able to compare the transcripts
# every company will have 2 records in the dataframe, 1 for each of 2019 and 2020

companies_unique = len(df['company_ticker'].unique())
df_count = len(df)
print('there are ' + str(df_count) + ' rows in the dataframe, with 2 records from ' + str(companies_unique) + ' unique companies')

there are 122 rows in the dataframe, with 2 records from 61 unique companies


In [156]:
## resources used for the above section: ##
#  https://codingandfun.com/analysing-company-earning-calls-with-python/

# analysis

objective: check for a change in frequency of select words used in earnings calls from 2019 to 2020

In [157]:
# define list of words to check earnings calls for

words_to_analyze = ['tele',
                    'digital',
                    'virtual',
                    'remote',
                    'home',
                    'medical',
                    'health']

In [158]:
# create columns in existing dataframe df for every word we're checking for from the list above

for word in words_to_analyze:
    df[word] = np.nan
    
df.head()

Unnamed: 0,company_ticker,year,quarter,transcript_text,tele,digital,virtual,remote,home,medical,health
0,ABT,2019,2,"[Operator:, Good, morning, and, thank, you, fo...",,,,,,,
1,ABT,2020,2,"[Operator:, Good, morning, and, thank, you, fo...",,,,,,,
2,ABBV,2019,2,"[Operator:, Good, morning, and, thank, you, fo...",,,,,,,
3,ABBV,2020,2,"[Operator:, Good, morning., And, thank, you, f...",,,,,,,
4,ABMD,2019,2,"[Operator:, Good, day,, ladies, and, gentlemen...",,,,,,,


In [159]:
# populate the columns that were created above with the frequency of the word appearing in each transcript

for i in range(len(df)):
    for word in words_to_analyze:
        try:
            earnings_call = pd.DataFrame(df.loc[i,'transcript_text'],columns=['content'])
        except:
            earnings_call = pd.DataFrame(columns=['content']) # empty
        finally:
            analysis = earnings_call[earnings_call['content'].str.contains(word)]
            word_count = len(analysis)
            df.loc[i,word] = word_count
            

df.head()

Unnamed: 0,company_ticker,year,quarter,transcript_text,tele,digital,virtual,remote,home,medical,health
0,ABT,2019,2,"[Operator:, Good, morning, and, thank, you, fo...",0.0,0.0,0.0,0.0,0.0,11.0,11.0
1,ABT,2020,2,"[Operator:, Good, morning, and, thank, you, fo...",0.0,0.0,1.0,1.0,0.0,4.0,6.0
2,ABBV,2019,2,"[Operator:, Good, morning, and, thank, you, fo...",0.0,0.0,0.0,0.0,1.0,1.0,2.0
3,ABBV,2020,2,"[Operator:, Good, morning., And, thank, you, f...",1.0,0.0,1.0,1.0,0.0,3.0,3.0
4,ABMD,2019,2,"[Operator:, Good, day,, ladies, and, gentlemen...",0.0,0.0,0.0,0.0,4.0,0.0,1.0


In [160]:
# drop all columns except the company ticker and the word counts
# then split the dataframe into 2, 1 for each 2019 and 2020

cols = ['company_ticker'] + words_to_analyze

df_2019 = df[df['year'] == '2019'][cols]
df_2020 = df[df['year'] == '2020'][cols]

In [161]:
# add '_[YEAR]' to end of each word count column in the 2 dataframes that were just created

for i in df_2019.columns[1:]:    
    df_2019.rename(columns = {i: (i + '_2019')}, inplace = True)
    
for i in df_2020.columns[1:]:
    df_2020.rename(columns = {i: (i + '_2020')}, inplace = True)


df_2020.head()

Unnamed: 0,company_ticker,tele_2020,digital_2020,virtual_2020,remote_2020,home_2020,medical_2020,health_2020
1,ABT,0.0,0.0,1.0,1.0,0.0,4.0,6.0
3,ABBV,1.0,0.0,1.0,1.0,0.0,3.0,3.0
5,ABMD,0.0,0.0,0.0,0.0,3.0,0.0,3.0
7,A,0.0,5.0,1.0,0.0,5.0,3.0,4.0
9,ALXN,1.0,2.0,1.0,0.0,3.0,1.0,5.0


In [162]:
# merge the health_care_df dataframe from earlier in the file with the 2019 and 2020 word count dataframe

health_care_df = health_care_df.merge(df_2019, how = 'inner', on = ['company_ticker'])
health_care_df = health_care_df.merge(df_2020, how = 'inner', on = ['company_ticker'])

cols_word_count = [col for col in health_care_df.columns if '_20' in col]
cols_word_count = sorted(cols_word_count)
cols = ['company_ticker','company','sector','subsector'] + cols_word_count
health_care_df = health_care_df[cols]

health_care_df.head()

Unnamed: 0,company_ticker,company,sector,subsector,digital_2019,digital_2020,health_2019,health_2020,home_2019,home_2020,medical_2019,medical_2020,remote_2019,remote_2020,tele_2019,tele_2020,virtual_2019,virtual_2020
0,ABT,Abbott Laboratories,Health Care,Health Care Equipment,0.0,0.0,11.0,6.0,0.0,0.0,11.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0
1,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,0.0,0.0,2.0,3.0,1.0,0.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,1.0
2,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,0.0,0.0,1.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A,Agilent Technologies Inc,Health Care,Health Care Equipment,0.0,5.0,1.0,4.0,0.0,5.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ALXN,Alexion Pharmaceuticals,Health Care,Pharmaceuticals,0.0,2.0,3.0,5.0,0.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0


In [163]:
# create summary table to view the number for companies and total mentions for each word

summary_df = pd.DataFrame(data = np.nan, index = ['num_companies','total_mentions'], columns = cols_word_count) \
#     .rename_axis('metric').reset_index()

for i in summary_df.columns:
    summary_df.loc['num_companies',i] = len(health_care_df[health_care_df[i] > 0])
    summary_df.loc['total_mentions',i] = sum(health_care_df[i])
    
summary_df

Unnamed: 0,digital_2019,digital_2020,health_2019,health_2020,home_2019,home_2020,medical_2019,medical_2020,remote_2019,remote_2020,tele_2019,tele_2020,virtual_2019,virtual_2020
num_companies,21.0,36.0,60.0,60.0,16.0,42.0,42.0,41.0,2.0,28.0,11.0,31.0,8.0,45.0
total_mentions,126.0,186.0,582.0,762.0,79.0,200.0,193.0,213.0,3.0,76.0,16.0,87.0,11.0,176.0


In [165]:
# manipulate data in format required for visualization

viz = summary_df.loc['num_companies'].to_frame().reset_index().rename(columns={'index':'word_year'})
viz['year'] = viz['word_year'].apply(lambda x: re.sub('[a-z_]', '', x))
viz['word'] = viz['word_year'].apply(lambda x: re.sub('_.*', '', x))
viz.drop(columns = 'word_year', inplace = True)
viz = viz.pivot(index = 'word', columns = 'year', values = 'num_companies').sort_values('2019')
viz['pct_change'] = (viz['2020'] - viz['2019']) / viz['2019']
viz

year,2019,2020,pct_change
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
remote,2.0,28.0,13.0
virtual,8.0,45.0,4.625
tele,11.0,31.0,1.818182
home,16.0,42.0,1.625
digital,21.0,36.0,0.714286
medical,42.0,41.0,-0.02381
health,60.0,60.0,0.0


# company market cap #

objective: pull every healthcare companies' market cap

In [166]:
market_caps = data.get_quote_yahoo(company_ticker_list)['marketCap'].to_frame()
market_caps['company_ticker'] = market_caps.index
market_caps.reset_index(drop = True, inplace= True)
market_caps.rename(columns = {'marketCap':'market_cap'}, inplace = True)
market_caps.head()

Unnamed: 0,market_cap,company_ticker
0,195314057216,ABT
1,186330742784,ABBV
2,14533523456,ABMD
3,37708795904,A
4,34505420800,ALXN


In [167]:
# merge the health_care_df dataframe from earlier with the market cap dataframe
# sort this merged dataframe by market cap in desc order - shows most valuable companies first

health_care_df = health_care_df.merge(market_caps, how = 'inner', on = ['company_ticker']) \
    .sort_values('market_cap', ascending = False)
health_care_df.head()

Unnamed: 0,company_ticker,company,sector,subsector,digital_2019,digital_2020,health_2019,health_2020,home_2019,home_2020,medical_2019,medical_2020,remote_2019,remote_2020,tele_2019,tele_2020,virtual_2019,virtual_2020,market_cap
36,JNJ,Johnson & Johnson,Health Care,Pharmaceuticals,6.0,5.0,7.0,23.0,0.0,3.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,421048451072
53,UNH,UnitedHealth Group Inc.,Health Care,Managed Health Care,6.0,13.0,70.0,53.0,4.0,13.0,9.0,9.0,0.0,1.0,0.0,2.0,2.0,4.0,340882915328
41,MRK,Merck & Co.,Health Care,Pharmaceuticals,0.0,2.0,2.0,24.0,0.0,2.0,1.0,3.0,0.0,2.0,0.0,0.0,0.0,0.0,208423878656
45,PFE,Pfizer Inc.,Health Care,Pharmaceuticals,0.0,3.0,6.0,15.0,0.0,1.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,5.0,204938199040
0,ABT,Abbott Laboratories,Health Care,Health Care Equipment,0.0,0.0,11.0,6.0,0.0,0.0,11.0,4.0,0.0,1.0,0.0,0.0,0.0,1.0,195314057216


In [168]:
# write the output file with the final data to csv locally

health_care_df.to_csv('~/Downloads/health_care_df.csv',index = False)

# check specific earnings calls

objective: create function that takes in the following inputs:
* company
* year
* quarter
* word to analyze

which returns all instances of the word in the given earnings call, in context of the sentence it was said

In [169]:
def getTranscript(company_ticker,year,quarter,word_to_analyze):
    
    # apikey is defined earlier in this file
    
    transcript = requests.get(f'https://financialmodelingprep.com/api/v3/earning_call_transcript/{company_ticker}?quarter={quarter}&year={year}&apikey={apikey}').json()
    
    transcript_word_break = transcript[0]['content'].split(' ')
    earnings_call = pd.DataFrame(transcript_word_break,columns=['content'])
    analysis = earnings_call[earnings_call['content'].str.contains(word_to_analyze)]
    len_analysis = len(analysis)
    print("instances of the word '" + word_to_analyze + "': " + str(len_analysis))
    print('')
    
    # show the examples of the word in the full sentence
    transcript_line_break = transcript[0]['content'].split('\n')
    earnings_call = pd.DataFrame(transcript_line_break,columns=['content'])
    analysis = earnings_call[earnings_call['content'].str.contains(word_to_analyze)]
    text_earnings = analysis['content'].values
    
    for text in text_earnings:
        for phrase in text.split('. '):
            if word_to_analyze in phrase:
                  print(phrase)
                  print('')

In [170]:
getTranscript('ANTM','2020','2','tele')

instances of the word 'tele': 3

To date, we've facilitated more than 475,000 telehealth visits in 82,000 COVID-19 assessments

Live health online, our on demand telehealth solution surpassed 1 million visits in early April and demand continues

Demand for telemedicine in the behavioral health space is also increasing with the usage of 56 times pre-COVID levels

