In [1]:
# Import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

def table_df(html_table_bs, index_on=None):
    
    df_columns = [column.get_text(strip=True, separator=" ") for column in html_table_bs.findAll('tr')[0].findAll('th')]
        
    # Loop through the data rows and store into Python array
    df_data = []
    for row in html_table_bs.tbody.findAll('tr')[1:]:
        df_data.append([td.get_text(strip=True, separator=" ") for td in row.findAll('td')])

    dataframe = pd.DataFrame(data=df_data, columns=df_columns)
    if index_on is None:
        return dataframe
    
    dataframe.set_index(index_on, inplace=True)
    return dataframe

# Get HTML data
html_data = requests.get('https://en.wikipedia.org/wiki/Amazon_(company)#Finances').text
wikitable = BeautifulSoup(html_data, 'html.parser').find('table', {'class': 'wikitable float-left'})
table_df(wikitable, ['Year'])

Unnamed: 0_level_0,Revenue [148] in mil. USD$,Net income in mil. USD$,Total Assets in mil. USD$,Employees
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1998,610,,,
1999,1639,,,
2000,2761,,,
2001,3122,,,
2002,3932,,,
2003 [149],5263,35,2162.0,
2004 [149],6921,588,3248.0,
2005 [149],8490,359,3696.0,
2006 [149],10711,190,4363.0,
2007 [149],14835,476,6485.0,17000.0


In [6]:
import re, requests

def getCIKs(TICKERS):
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile(r'.*CIK=(\d{10}).*')    
    cik_dict = {}
    for ticker in TICKERS:
        res = requests.get(URL.format(ticker), stream = True)
        results = CIK_RE.findall(res.text)
        if len(results):
            results[0] = int(re.sub('\.[0]*', '.', results[0]))
            cik_dict[str(ticker).upper()] = str(results[0])
    f = open('cik_dict', 'w')   
    print(cik_dict)
    f.close()

getCIKs(['wmt','amzn','nflx', 'FGPHF'])
# returns:
# {'WMT': '104169', 'AMZN': '1018724', 'NFLX': '1065280'}

{'WMT': '104169'}


In [10]:
html_data = requests.get('https://en.wikipedia.org/wiki/Tesla,_Inc.').text
wikitable = BeautifulSoup(html_data, 'html.parser').find('table', {'class': 'wikitable'})
table_df(wikitable, ['Joined'])

Unnamed: 0_level_0,Name,Titles,Independent,Notes
Joined,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014 [76],Robyn Denholm,"Full-time Chairwoman of Tesla, Inc.; former CF...",Yes,"As of March 2020, Denholm is the only Board me..."
2004 [7],Elon Musk,"Co-founder, CEO and Product Architect of Tesla...",No,
2004 [79],Kimbal Musk,"Board member, SpaceX [80]",No,
2007 [81],Ira Ehrenpreis,General Partner at Technology Partners [73],Disputed [68],
2007 [79],Antonio J. Gracias,CEO and Chairman of the Investment Committee a...,Disputed [68],Has agreed not to stand for re-election when h...
2017 [73],James Murdoch,Former CEO of 21st Century Fox [73],Yes,
2018 [70],Larry Ellison,"Co-founder, Chairman and CTO of Oracle Corpora...",Yes,
2018 [70],Kathleen Wilson-Thompson,Global head of Human Resources of Walgreens Bo...,Yes,
2020 [84],Hiromichi Mizuno,United Nations Special Envoy on Innovative Fin...,Yes,


In [24]:
vbk_url = 'https://finance.yahoo.com/quote/VBK/holdings'
headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36' }
method = 'GET'
payload = {
            'p':'VBK'
}

# build url
holdings_url = requests.Request(method, vbk_url, params=payload).prepare().url
holdings_url

'https://finance.yahoo.com/quote/VBK/holdings?p=VBK'

In [31]:
html_data = requests.get(holdings_url, headers=headers).text
wikitable = BeautifulSoup(html_data, 'html.parser').find('table', {'class': 'W(100%) M(0) BdB Bdc($seperatorColor)'})
print(wikitable)
table_df(wikitable, ['Name'])

<table class="W(100%) M(0) BdB Bdc($seperatorColor)"><thead><tr class="Ta(end) C($tertiaryColor) Fz(xs)"><th class="Ta(start) Fw(400) Py(6px)"><span>Name</span></th><th class="Ta(start) Fw(400) Py(6px)"><span>Symbol</span></th><th class="Fw(400) Py(6px)"><span>% Assets</span></th></tr></thead><tbody><tr class="Ta(end) BdT Bdc($seperatorColor) H(36px)"><td class="Ta(start)">Charles River Laboratories International Inc</td><td class="Ta(start)"><a class="Fz(s) Ell Fw(b) C($linkColor)" href="/quote/CRL?p=CRL">CRL</a></td><td class="">0.78%</td></tr><tr class="Ta(end) BdT Bdc($seperatorColor) H(36px)"><td class="Ta(start)">Avantor Inc</td><td class="Ta(start)"><a class="Fz(s) Ell Fw(b) C($linkColor)" href="/quote/AVTR?p=AVTR">AVTR</a></td><td class="">0.73%</td></tr><tr class="Ta(end) BdT Bdc($seperatorColor) H(36px)"><td class="Ta(start)">Bio-Techne Corp</td><td class="Ta(start)"><a class="Fz(s) Ell Fw(b) C($linkColor)" href="/quote/TECH?p=TECH">TECH</a></td><td class="">0.73%</td></tr><t

Unnamed: 0_level_0,Symbol,% Assets
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Avantor Inc,AVTR,0.73%
Bio-Techne Corp,TECH,0.73%
Pool Corp,POOL,0.73%
PerkinElmer Inc,PKI,0.72%
Entegris Inc,ENTG,0.70%
PTC Inc,PTC,0.62%
Fair Isaac Corp,FICO,0.57%
Bill.com Holdings Inc Ordinary Shares,BILL,0.56%
Avalara Inc,AVLR,0.55%


In [35]:
vbk_url = 'https://finance.yahoo.com/quote/VBK/holdings?p=VBK'
html_data = requests.get(vbk_url, headers=headers).text
print(vbk_url)    
pd.read_html(html_data, 
                        header=0, 
                        attrs={'class': 'W(100%) M(0) BdB Bdc($seperatorColor)'}, 
                        index_col='Name')[0]

https://finance.yahoo.com/quote/VBK/holdings?p=VBK


Unnamed: 0_level_0,Symbol,% Assets
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Charles River Laboratories International Inc,CRL,0.78%
Avantor Inc,AVTR,0.73%
Bio-Techne Corp,TECH,0.73%
Pool Corp,POOL,0.73%
PerkinElmer Inc,PKI,0.72%
Entegris Inc,ENTG,0.70%
PTC Inc,PTC,0.62%
Fair Isaac Corp,FICO,0.57%
Bill.com Holdings Inc Ordinary Shares,BILL,0.56%
Avalara Inc,AVLR,0.55%


In [73]:
def convert_pct(value):
    return float(value.strip('%')) / 100.0

def get_etf_holdings(name):
    yahoo_finance_url = f'https://finance.yahoo.com/quote/{name}/holdings?p={name}'
    html_data = requests.get(vbk_url, headers=headers).text
    etf_data =  pd.read_html(html_data, 
                        header=0, 
                        attrs={'class': 'W(100%) M(0) BdB Bdc($seperatorColor)'}, 
                        index_col='Symbol',
                        converters={'% Assets':convert_pct})[0]

    return etf_data

In [74]:
etf_data = get_etf_holdings('SCHA')
etf_data

Unnamed: 0_level_0,Name,% Assets
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
CRL,Charles River Laboratories International Inc,0.0078
AVTR,Avantor Inc,0.0073
TECH,Bio-Techne Corp,0.0073
POOL,Pool Corp,0.0073
PKI,PerkinElmer Inc,0.0072
ENTG,Entegris Inc,0.007
PTC,PTC Inc,0.0062
FICO,Fair Isaac Corp,0.0057
BILL,Bill.com Holdings Inc Ordinary Shares,0.0056
AVLR,Avalara Inc,0.0055
