# required libraries #

In [42]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pandas_datareader import data
import os
import re

# s&p 500 companies list
objective: scrape the list of S&P 500 companies from Wikipedia and store it in a dataframe

In [43]:
# scrape html data from wikipedia
# NOTE(review): the first table matching 'wikitable sortable' is assumed to be the
# s&p 500 constituents table - confirm against the current page markup

resp = requests.get(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
    timeout=30,
)
# fail loudly on an http error instead of parsing an error page as if it were the article
resp.raise_for_status()

soup = BeautifulSoup(resp.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable sortable'})

# soup.find returns None when nothing matches; stop here rather than failing in a later cell
if table is None:
    raise ValueError("could not find a 'wikitable sortable' table on the wikipedia page")

In [44]:
# extract desired data points from the html data and store them in lists
# cell positions read from each row: 0 = ticker, 1 = company, 3 = sector, 4 = subsector
# NOTE(review): these positions depend on the wikipedia table layout - confirm if the page changes

company_tickers = []
companies = []
sectors = []
subsectors = []

# the first <tr> is skipped (presumably the header row)
for row in table.findAll('tr')[1:]:

    # look the cells up once per row instead of once per field
    cells = row.findAll('td')

    # a row with fewer than 5 <td> cells cannot be indexed below, so skip it
    if len(cells) < 5:
        continue

    # strip surrounding whitespace/newlines from every field, not only the ticker
    company_ticker = cells[0].text.strip()
    company_tickers.append(company_ticker)

    company = cells[1].text.strip()
    companies.append(company)

    sector = cells[3].text.strip()
    sectors.append(sector)

    subsector = cells[4].text.strip()
    subsectors.append(subsector)

In [45]:
# assemble the four scraped lists into one dataframe, one column per list

df = pd.DataFrame(
    dict(
        company_ticker=company_tickers,
        company=companies,
        sector=sectors,
        subsector=subsectors,
    )
)

In [46]:
# keep only the rows whose sector is exactly 'Health Care', renumbering the index from 0

health_care_df = df.loc[df['sector'].eq('Health Care')].reset_index(drop=True)
health_care_df.head()

Unnamed: 0,company_ticker,company,sector,subsector
0,ABT,Abbott Laboratories,Health Care,Health Care Equipment
1,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals
2,ABMD,ABIOMED Inc,Health Care,Health Care Equipment
3,A,Agilent Technologies Inc,Health Care,Health Care Equipment
4,ALXN,Alexion Pharmaceuticals,Health Care,Pharmaceuticals


In [47]:
# group healthcare companies by subsector
# there were 63 total healthcare companies in the s&p500 when this was run

health_care_df_summary = (
    health_care_df[['subsector']]
    .groupby('subsector')
    .size()
    .to_frame('company_count')
    .sort_values('company_count', ascending=False)
    .reset_index()
)

# add a grand-total row at the bottom
# DataFrame.append is deprecated (removed in pandas 2.0), so the total is built as its own
# one-row frame and concatenated; this also keeps company_count as whole numbers
total_row = pd.DataFrame({
    'subsector': ['Total # of healthcare companies'],
    'company_count': [health_care_df_summary['company_count'].sum()],
})
health_care_df_summary = pd.concat([health_care_df_summary, total_row], ignore_index=True)
health_care_df_summary

Unnamed: 0,subsector,company_count
0,Health Care Equipment,20.0
1,Pharmaceuticals,10.0
2,Biotechnology,6.0
3,Health Care Distributors,6.0
4,Life Sciences Tools & Services,5.0
5,Managed Health Care,5.0
6,Health Care Supplies,4.0
7,Health Care Facilities,3.0
8,Health Care Services,3.0
9,Health Care Technology,1.0


In [48]:
## resources used for the above section: ##
# https://pythonprogramming.net/sp500-company-price-data-python-programming-for-finance/

# earnings call transcripts

objective: pull earnings call transcripts from financialmodelingprep api and store them in dataframe

In [49]:
# read in locally stored financialmodelingprep api key and assign it to variable 'apikey'
# the key lives outside the notebook so that it is never saved in a cell or an output

creds_dir = os.path.expanduser("~/creds/financialmodelingprep.txt")

with open(creds_dir, 'r', encoding='utf-8') as text:
    # strip all surrounding whitespace (spaces, '\r\n', trailing newline) so that none of it
    # ends up inside the request url
    apikey = text.read().strip()

# an empty key would only surface later as failed api calls, so stop here instead
if not apikey:
    raise ValueError(f'no api key found in {creds_dir}')

In [50]:
# inputs for the transcript download: one request per ticker + year + quarter combination

# tickers of every healthcare company in the s&p 500 (a pandas series, iterated like a list)
company_ticker_list = health_care_df.loc[:, 'company_ticker']

# years to request, kept as strings
year_list = ['2019', '2020']

# quarters to request; only q3 for now (switch to [2, 3] to include q2 as well)
quarter_list = [3]

In [51]:
# pull desired transcripts using lists defined above
# there will be 1 row in the dataframe for every company + year + quarter combination
# transcript_text holds the transcript split on single spaces, or NaN when none was returned

transcript_columns = ['company_ticker', 'year', 'quarter', 'transcript_text']
transcript_records = []

for company in company_ticker_list:
    for year in year_list:
        for quarter in quarter_list:
            # pull down transcript; the query string goes through `params` so values are url-encoded
            response = requests.get(
                f'https://financialmodelingprep.com/api/v3/earning_call_transcript/{company}',
                params={'quarter': quarter, 'year': year, 'apikey': apikey},
                timeout=30,
            )
            try:
                transcript = response.json()[0]['content'].split(' ')
            except (ValueError, LookupError, TypeError, AttributeError):
                # body is not json, is an empty list, is not a list, or has no text content:
                # record the transcript as missing instead of aborting the whole download
                transcript = np.nan

            transcript_records.append({
                'company_ticker': company,
                'year': year,
                'quarter': quarter,
                'transcript_text': transcript,
            })

# build the dataframe once from all records (appending row by row copies the frame every time)
df = pd.DataFrame(transcript_records, columns=transcript_columns)

In [52]:
# renumber the rows from 0 and give the four columns their names

df = df.reset_index(drop=True)
df.columns = ['company_ticker', 'year', 'quarter', 'transcript_text']
df.head()

Unnamed: 0,company_ticker,year,quarter,transcript_text
0,ABT,2019,3,"[, Operator:, Good, morning, and, thank, you, ..."
1,ABT,2020,3,"[, Operator:, Good, morning, and, thank, you, ..."
2,ABBV,2019,3,"[Operator:, Good, morning, and, thank, you, fo..."
3,ABBV,2020,3,"[, Operator:, Good, morning, and, thank, you, ..."
4,ABMD,2019,3,"[Operator:, Good, day,, ladies, and, gentlemen..."


In [53]:
# tickers that have a null transcript_text in at least one of their rows

comanies_missing_data = df.loc[df['transcript_text'].isna(), 'company_ticker'].unique()
comanies_missing_data

array(['BIO', 'CTLT', 'DVA', 'XRAY', 'HSIC', 'LLY', 'PRGO', 'REGN', 'UHS',
       'VAR', 'VTRS'], dtype=object)

In [54]:
# drop every row of any company listed in comanies_missing_data
# a company missing even one transcript is removed entirely, because the 2019 vs 2020
# comparison needs both of its transcripts

df = df.loc[~df['company_ticker'].isin(comanies_missing_data)].reset_index(drop=True)

In [55]:
# sanity check after the removal above: each remaining company is expected to contribute
# 2 rows (one for 2019, one for 2020), so the row count should be twice the company count

companies_unique = df['company_ticker'].unique().size
df_count = df.shape[0]
print(f'there are {df_count} rows in the dataframe, with 2 records from {companies_unique} unique companies')

there are 104 rows in the dataframe, with 2 records from 52 unique companies


In [56]:
## resources used for the above section: ##
#  https://codingandfun.com/analysing-company-earning-calls-with-python/

# analysis

objective: check for a change in frequency of select words used in earnings calls from 2019 to 2020

In [57]:
# substrings to look for in each earnings call transcript
# each entry also becomes a column name in df further down

words_to_analyze = [
    'tele', 'digital', 'virtual', 'remote', 'home',
    'medical', 'health',
]

In [58]:
# add one NaN-filled column to df per entry of words_to_analyze (filled in by the next cell)

df = df.assign(**dict.fromkeys(words_to_analyze, np.nan))

df.head()

Unnamed: 0,company_ticker,year,quarter,transcript_text,tele,digital,virtual,remote,home,medical,health
0,ABT,2019,3,"[, Operator:, Good, morning, and, thank, you, ...",,,,,,,
1,ABT,2020,3,"[, Operator:, Good, morning, and, thank, you, ...",,,,,,,
2,ABBV,2019,3,"[Operator:, Good, morning, and, thank, you, fo...",,,,,,,
3,ABBV,2020,3,"[, Operator:, Good, morning, and, thank, you, ...",,,,,,,
4,ABMD,2019,3,"[Operator:, Good, day,, ladies, and, gentlemen...",,,,,,,


In [59]:
# populate the columns that were created above with the frequency of the word appearing in each transcript
# a token is counted when it contains the word (case-sensitive pattern match), so 'tele' also
# counts longer tokens that merely contain it

for i in range(len(df)):
    tokens = df.loc[i, 'transcript_text']

    # anything that is not a list of tokens (e.g. a missing transcript) is treated as empty,
    # which yields a count of 0 for every word
    if not isinstance(tokens, list):
        tokens = []

    # build the token series once per transcript rather than once per word
    token_series = pd.Series(tokens, dtype=object)

    for word in words_to_analyze:
        # na=False keeps non-string tokens from producing missing values in the match mask
        df.loc[i, word] = int(token_series.str.contains(word, na=False).sum())


df.head()

Unnamed: 0,company_ticker,year,quarter,transcript_text,tele,digital,virtual,remote,home,medical,health
0,ABT,2019,3,"[, Operator:, Good, morning, and, thank, you, ...",0.0,1.0,0.0,0.0,0.0,3.0,5.0
1,ABT,2020,3,"[, Operator:, Good, morning, and, thank, you, ...",0.0,1.0,0.0,0.0,0.0,8.0,4.0
2,ABBV,2019,3,"[Operator:, Good, morning, and, thank, you, fo...",1.0,0.0,1.0,0.0,0.0,4.0,0.0
3,ABBV,2020,3,"[, Operator:, Good, morning, and, thank, you, ...",0.0,0.0,2.0,0.0,0.0,1.0,2.0
4,ABMD,2019,3,"[Operator:, Good, day,, ladies, and, gentlemen...",0.0,0.0,0.0,0.0,4.0,0.0,1.0


In [60]:
# keep only the ticker and the word-count columns,
# and split the rows into one dataframe per year (year is stored as a string)

cols = ['company_ticker', *words_to_analyze]

df_2019 = df.loc[df['year'] == '2019', cols]
df_2020 = df.loc[df['year'] == '2020', cols]

In [61]:
# add '_[YEAR]' to end of each word count column in the 2 dataframes that were just created
# only the original word columns are renamed and a new frame is returned each time, so
# re-running this cell does not append the suffix a second time

df_2019 = df_2019.rename(columns={word: word + '_2019' for word in words_to_analyze})
df_2020 = df_2020.rename(columns={word: word + '_2020' for word in words_to_analyze})


df_2020.head()

Unnamed: 0,company_ticker,tele_2020,digital_2020,virtual_2020,remote_2020,home_2020,medical_2020,health_2020
1,ABT,0.0,1.0,0.0,0.0,0.0,8.0,4.0
3,ABBV,0.0,0.0,2.0,0.0,0.0,1.0,2.0
5,ABMD,0.0,0.0,1.0,0.0,2.0,3.0,0.0
7,A,0.0,7.0,0.0,0.0,0.0,1.0,2.0
9,ALXN,0.0,0.0,4.0,2.0,6.0,3.0,8.0


In [62]:
# merge the health_care_df dataframe from earlier in the file with the 2019 and 2020 word count dataframe
# inner joins: only companies present in both the 2019 and the 2020 word counts are kept

company_cols = ['company_ticker', 'company', 'sector', 'subsector']

# start from the company columns only, so that re-running this cell does not merge onto
# word-count columns left by a previous run (which would create '_x' / '_y' duplicates)
health_care_df = (
    health_care_df[company_cols]
    .merge(df_2019, how = 'inner', on = ['company_ticker'])
    .merge(df_2020, how = 'inner', on = ['company_ticker'])
)

# word-count columns are the ones carrying a year suffix; sorting puts each word's
# 2019 and 2020 columns next to each other
cols_word_count = sorted(col for col in health_care_df.columns if '_20' in col)
cols = company_cols + cols_word_count
health_care_df = health_care_df[cols]

health_care_df.head()

Unnamed: 0,company_ticker,company,sector,subsector,digital_2019,digital_2020,health_2019,health_2020,home_2019,home_2020,medical_2019,medical_2020,remote_2019,remote_2020,tele_2019,tele_2020,virtual_2019,virtual_2020
0,ABT,Abbott Laboratories,Health Care,Health Care Equipment,1.0,1.0,5.0,4.0,0.0,0.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,0.0,0.0,0.0,2.0,0.0,0.0,4.0,1.0,0.0,0.0,1.0,0.0,1.0,2.0
2,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,0.0,0.0,1.0,0.0,4.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0
3,A,Agilent Technologies Inc,Health Care,Health Care Equipment,4.0,7.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ALXN,Alexion Pharmaceuticals,Health Care,Pharmaceuticals,0.0,0.0,1.0,8.0,0.0,6.0,0.0,3.0,0.0,2.0,0.0,0.0,0.0,4.0


In [63]:
# summary table with one column per word/year and two rows:
#   num_companies  - how many companies have a count above 0 for that word/year
#   total_mentions - the sum of that word/year count over all companies

word_counts = health_care_df[cols_word_count]

summary_df = pd.DataFrame(
    {
        'num_companies': word_counts.gt(0).sum(),
        'total_mentions': word_counts.sum(skipna=False),
    }
).T.astype(float)

summary_df

Unnamed: 0,digital_2019,digital_2020,health_2019,health_2020,home_2019,home_2020,medical_2019,medical_2020,remote_2019,remote_2020,tele_2019,tele_2020,virtual_2019,virtual_2020
num_companies,18.0,25.0,46.0,47.0,16.0,28.0,35.0,37.0,3.0,14.0,11.0,26.0,6.0,32.0
total_mentions,97.0,199.0,525.0,632.0,56.0,109.0,157.0,183.0,4.0,51.0,19.0,79.0,9.0,98.0


In [64]:
# reshape the num_companies row of summary_df for visualization:
# one row per word, one column per year, sorted by the 2019 value, plus the relative change

viz1 = (
    summary_df.loc['num_companies']
    .rename_axis('word_year')
    .reset_index()
    .assign(
        # year = the column name with lowercase letters and underscores removed
        year=lambda frame: frame['word_year'].str.replace('[a-z_]', '', regex=True),
        # word = the column name with everything from the first underscore on removed
        word=lambda frame: frame['word_year'].str.replace('_.*', '', regex=True),
    )
    .drop(columns='word_year')
    .pivot(index='word', columns='year', values='num_companies')
    .sort_values('2019')
    .assign(pct_change=lambda frame: (frame['2020'] - frame['2019']) / frame['2019'])
)
viz1

year,2019,2020,pct_change
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
remote,3.0,14.0,3.666667
virtual,6.0,32.0,4.333333
tele,11.0,26.0,1.363636
home,16.0,28.0,0.75
digital,18.0,25.0,0.388889
medical,35.0,37.0,0.057143
health,46.0,47.0,0.021739


In [65]:
# reshape the total_mentions row of summary_df for visualization:
# one row per word, one column per year, sorted by the 2019 value, plus the relative change
# this data is not used in the blog post

viz2 = (
    summary_df.loc['total_mentions']
    .rename_axis('word_year')
    .reset_index()
    .assign(
        # year = the column name with lowercase letters and underscores removed
        year=lambda frame: frame['word_year'].str.replace('[a-z_]', '', regex=True),
        # word = the column name with everything from the first underscore on removed
        word=lambda frame: frame['word_year'].str.replace('_.*', '', regex=True),
    )
    .drop(columns='word_year')
    .pivot(index='word', columns='year', values='total_mentions')
    .sort_values('2019')
    .assign(pct_change=lambda frame: (frame['2020'] - frame['2019']) / frame['2019'])
)
viz2

year,2019,2020,pct_change
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
remote,4.0,51.0,11.75
virtual,9.0,98.0,9.888889
tele,19.0,79.0,3.157895
home,56.0,109.0,0.946429
digital,97.0,199.0,1.051546
medical,157.0,183.0,0.165605
health,525.0,632.0,0.20381


In [66]:
# create summary table showing, by subsector, the net change from 2019 to 2020 in the number of
# companies that used each telehealth-related word at least once (2020 count minus 2019 count)
# a negative value means fewer companies in that subsector used the word in 2020 than in 2019

subsector_names = health_care_df['subsector'].unique()

# viz3_prep: per subsector and word/year column, the number of companies with a count above 0
viz3_prep = pd.DataFrame(data = np.nan, index = subsector_names, columns = cols_word_count)

for i in subsector_names:
    in_subsector = health_care_df[health_care_df['subsector'] == i]
    viz3_prep.loc[i] = (in_subsector[cols_word_count] > 0).sum().astype(float)

words = ['digital','home','remote','tele','virtual']

viz3 = pd.DataFrame(index = viz3_prep.index)

# number of companies per subsector, computed once per subsector rather than once per word
viz3['company_count'] = [float((health_care_df['subsector'] == i).sum()) for i in viz3_prep.index]

for w in words:
    viz3[w] = viz3_prep[w + '_2020'] - viz3_prep[w + '_2019']

viz3 = viz3.sort_values('company_count', ascending=False)
viz3

Unnamed: 0,company_count,digital,home,remote,tele,virtual
Health Care Equipment,19.0,2.0,5.0,2.0,3.0,9.0
Pharmaceuticals,6.0,1.0,3.0,2.0,1.0,4.0
Health Care Distributors,5.0,-1.0,1.0,1.0,2.0,3.0
Biotechnology,5.0,0.0,-1.0,2.0,1.0,4.0
Managed Health Care,5.0,1.0,0.0,1.0,3.0,2.0
Life Sciences Tools & Services,4.0,1.0,1.0,2.0,2.0,1.0
Health Care Supplies,3.0,2.0,2.0,1.0,0.0,3.0
Health Care Services,3.0,0.0,1.0,0.0,2.0,0.0
Health Care Technology,1.0,1.0,0.0,0.0,0.0,1.0
Health Care Facilities,1.0,0.0,0.0,0.0,1.0,-1.0


# company market cap #

objective: pull the market cap of every healthcare company
note: this ended up not being used in the analysis

In [67]:
# fetch yahoo quotes for every healthcare ticker and keep only the market cap,
# with the ticker (the quote frame's index) moved into its own column
market_caps = (
    data.get_quote_yahoo(company_ticker_list)['marketCap']
    .rename('market_cap')
    .to_frame()
    .assign(company_ticker=lambda quotes: quotes.index)
    .reset_index(drop=True)
)
market_caps.head()

Unnamed: 0,market_cap,company_ticker
0,213374418944,ABT
1,183912923136,ABBV
2,16048289792,ABMD
3,37300682752,A
4,34402623488,ALXN


In [68]:
# merge the health_care_df dataframe from earlier with the market cap dataframe
# sort this merged dataframe by market cap in desc order - shows most valuable companies first

health_care_df = (
    health_care_df
    # drop a market_cap column left by a previous run of this cell; otherwise the merge would
    # produce market_cap_x / market_cap_y and the sort below would fail
    .drop(columns = 'market_cap', errors = 'ignore')
    .merge(market_caps, how = 'inner', on = ['company_ticker'])
    .sort_values('market_cap', ascending = False)
)
health_care_df.head()

Unnamed: 0,company_ticker,company,sector,subsector,digital_2019,digital_2020,health_2019,health_2020,home_2019,home_2020,medical_2019,medical_2020,remote_2019,remote_2020,tele_2019,tele_2020,virtual_2019,virtual_2020,market_cap
32,JNJ,Johnson & Johnson,Health Care,Pharmaceuticals,14.0,5.0,13.0,20.0,0.0,0.0,8.0,10.0,0.0,1.0,0.0,1.0,0.0,5.0,445320462336
46,UNH,UnitedHealth Group Inc.,Health Care,Managed Health Care,10.0,6.0,49.0,32.0,1.0,6.0,11.0,5.0,0.0,1.0,0.0,2.0,0.0,2.0,320843808768
0,ABT,Abbott Laboratories,Health Care,Health Care Equipment,1.0,1.0,5.0,4.0,0.0,0.0,3.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,213374418944
39,PFE,Pfizer Inc.,Health Care,Pharmaceuticals,0.0,0.0,13.0,10.0,0.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,199324221440
36,MRK,Merck & Co.,Health Care,Pharmaceuticals,0.0,1.0,5.0,20.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,197139939328


In [69]:
# save the final dataframe as a local csv file, leaving the row index out of the file

health_care_df.to_csv(
    '~/Downloads/health_care_df.csv',
    index=False,
)

# check specific earnings calls

objective: create function that takes in the following inputs:
* company
* year
* quarter
* word to analyze

which returns all instances of the word in the given earnings call, in context of the sentence it was said

In [70]:
def getTranscript(company_ticker,year,quarter,word_to_analyze):
    """Print how often a word occurs in one earnings call, then every sentence containing it.

    The transcript is fetched from the financialmodelingprep api using the
    module-level ``apikey`` defined earlier in this file.

    Parameters
    ----------
    company_ticker : str
        Ticker of the company whose call is fetched.
    year, quarter : str or int
        Year and quarter of the call; passed to the api as query parameters.
    word_to_analyze : str
        Text to look for. It is matched as a literal, case-sensitive substring
        both for the count and for the printed sentences.

    Returns
    -------
    None
        Everything is printed. The count is the number of space-delimited
        tokens containing the word; the sentences are the '. '-delimited
        pieces of each transcript line that contain the word.

    Raises
    ------
    requests.HTTPError
        If the api answers with an http error status.
    """

    # apikey is defined earlier in this file
    response = requests.get(
        f'https://financialmodelingprep.com/api/v3/earning_call_transcript/{company_ticker}',
        params={'quarter': quarter, 'year': year, 'apikey': apikey},
        timeout=30,
    )
    response.raise_for_status()
    transcript = response.json()

    # the transcript text is read from the first element of the response; anything else
    # (empty list, error object, missing text) means there is nothing to analyze
    try:
        content = transcript[0]['content']
    except (LookupError, TypeError):
        content = None

    if not isinstance(content, str):
        print(f'no transcript found for {company_ticker}, quarter {quarter}, year {year}')
        return

    # count the space-delimited tokens that contain the word
    len_analysis = sum(word_to_analyze in token for token in content.split(' '))
    print("instances of the word '" + word_to_analyze + "': " + str(len_analysis))
    print('')

    # show the examples of the word in the full sentence
    for text in content.split('\n'):
        for phrase in text.split('. '):
            if word_to_analyze in phrase:
                print(phrase)
                print('')

In [71]:
# example: every mention of 'virtual' in the JNJ call for quarter 3 of 2020
getTranscript(
    company_ticker='JNJ',
    year='2020',
    quarter='3',
    word_to_analyze='virtual',
)

instances of the word 'virtual': 5

However, given the ongoing limitations for in-person events, we decided to provide a shorter, limited scope virtual update on our Medical Device business featuring all of our digital surgery solutions

We enabled patients to both start and stay on therapy using virtual tools and we deployed a number of technologies to help keep our clinical trials on track

So as Jennifer mentioned, COVID has really been an accelerant for shifts within the med tech industry, really creating a lot of new creative solutions related to digital patient engagement, virtual surgeon training, a shift in sites of care and continued evolution in our business models

To name a couple of examples, we partnered with an education partner, Advances in Surgery, where we virtually trained over 1 million surgeons in 150 different countries on COVID-19 protocols as well as ways to safely stand back up medical procedures

Don't forget to mark your calendars to join us for our virtual m