In [87]:
import pandas as pd
import html5lib

In [88]:
# Prep dataframe for JSON conversion
# read_html returns one DataFrame per <table> found on the page; the first one is used here.
ticker_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
# Build `df` as a new frame instead of mutating ticker_list[0] in place, so re-running
# this cell always starts from the freshly downloaded table.
df = (
    ticker_list[0]
    # errors='ignore' keeps the cell working if one of these columns is absent from the page
    .drop(columns=['SEC filings', 'Date first added', 'Founded'], errors='ignore')
    .rename(columns={'Symbol': 'TICKER', 'Security': 'COMPANY', 'GICS Sector': 'GICS_SECTOR',
                     'GICS Sub Industry': 'GICS_INDUSTRY', 'Headquarters Location': 'HQ'})
    .assign(
        # Store CIK as a string left-padded with zeros to 10 characters
        CIK=lambda frame: frame['CIK'].astype(str).str.zfill(10),
        # Placeholder column; filled with the article URLs in a later cell
        WIKI_URL='',
    )
)
df.head()

Unnamed: 0,TICKER,COMPANY,GICS_SECTOR,GICS_INDUSTRY,HQ,CIK,WIKI_URL
0,MMM,3M Company,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",66740,
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1800,
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,"North Chicago, Illinois",1551152,
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,"Danvers, Massachusetts",815094,
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",1467373,


In [89]:
from bs4 import BeautifulSoup
import requests, re

In [90]:
# Download the raw HTML of the constituents page.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error page
# be parsed as if it were the table.
request = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=30)
request.raise_for_status()

In [91]:
# Use BeautifulSoup to extract wiki urls for each ticker and add to dataframe
# The parser is named explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "guessed parser" warning).
soup = BeautifulSoup(request.content, 'html.parser')
main_table = soup.find(id='constituents')
# Skip the first row of the table body (the header row)
table = main_table.find('tbody').find_all('tr')[1:]

base_url = 'https://en.wikipedia.org'
# The second link in each row is taken as the company's article link;
# its href is site-relative, so prefix it with base_url.
url_list = [base_url + row.find_all('a')[1]['href'] for row in table]

# Assignment raises if the number of scraped rows differs from the number of rows in df
df['WIKI_URL'] = url_list
df.head()

Unnamed: 0,TICKER,COMPANY,GICS_SECTOR,GICS_INDUSTRY,HQ,CIK,WIKI_URL
0,MMM,3M Company,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",66740,https://en.wikipedia.org/wiki/3M
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1800,https://en.wikipedia.org/wiki/Abbott_Laboratories
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,"North Chicago, Illinois",1551152,https://en.wikipedia.org/wiki/AbbVie_Inc.
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,"Danvers, Massachusetts",815094,https://en.wikipedia.org/wiki/Abiomed
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",1467373,https://en.wikipedia.org/wiki/Accenture_plc


In [141]:
def people_extractor(url, field='Key people'):
    """Return the 'Key people' entry from the first table found at ``url``.

    The first table returned by ``pd.read_html`` is treated as a label/value
    listing (presumably the article's infobox): column 0 holds the labels and
    column 1 the values; any further columns are ignored.

    Parameters
    ----------
    url : str
        Location passed straight to ``pd.read_html``. The page is downloaded
        and parsed on every call.
    field : str, default 'Key people'
        Label to look for in the first column of the table.

    Returns
    -------
    object
        The value next to the first row labelled ``field``, or the string
        ``'none'`` when the page has no tables, the first table has fewer
        than two columns, or no row carries that label.
    """
    try:
        tables = pd.read_html(url)
    except ValueError:
        # read_html raises ValueError when no table is found on the page
        return 'none'

    infobox = tables[0]
    if infobox.shape[1] < 2:
        # No value column to read from
        return 'none'

    # Positional access works regardless of how many extra columns were parsed
    labels = infobox.iloc[:, 0]
    values = infobox.iloc[:, 1]
    matches = values[labels == field]

    if matches.empty:
        return 'none'  # return none if field doesn't exist
    # Take the first match by position; the label may appear more than once
    return matches.iloc[0]

In [143]:
def products_extractor(url, field='Products'):
    """Return the 'Products' entry from the first table found at ``url``.

    The first table returned by ``pd.read_html`` is treated as a label/value
    listing (presumably the article's infobox): column 0 holds the labels and
    column 1 the values; any further columns are ignored.

    Parameters
    ----------
    url : str
        Location passed straight to ``pd.read_html``. The page is downloaded
        and parsed on every call.
    field : str, default 'Products'
        Label to look for in the first column of the table.

    Returns
    -------
    object
        The value next to the first row labelled ``field``, or the string
        ``'none'`` when the page has no tables, the first table has fewer
        than two columns, or no row carries that label.
    """
    try:
        tables = pd.read_html(url)
    except ValueError:
        # read_html raises ValueError when no table is found on the page
        return 'none'

    infobox = tables[0]
    if infobox.shape[1] < 2:
        # No value column to read from
        return 'none'

    # Positional access works regardless of how many extra columns were parsed
    labels = infobox.iloc[:, 0]
    values = infobox.iloc[:, 1]
    matches = values[labels == field]

    if matches.empty:
        return 'none'  # return none if field doesn't exist
    # Take the first match by position; the label may appear more than once
    return matches.iloc[0]

In [144]:
def services_extractor(url, field='Services'):
    """Return the 'Services' entry from the first table found at ``url``.

    The first table returned by ``pd.read_html`` is treated as a label/value
    listing (presumably the article's infobox): column 0 holds the labels and
    column 1 the values; any further columns are ignored.

    Parameters
    ----------
    url : str
        Location passed straight to ``pd.read_html``. The page is downloaded
        and parsed on every call.
    field : str, default 'Services'
        Label to look for in the first column of the table.

    Returns
    -------
    object
        The value next to the first row labelled ``field``, or the string
        ``'none'`` when the page has no tables, the first table has fewer
        than two columns, or no row carries that label.
    """
    try:
        tables = pd.read_html(url)
    except ValueError:
        # read_html raises ValueError when no table is found on the page
        return 'none'

    infobox = tables[0]
    if infobox.shape[1] < 2:
        # No value column to read from
        return 'none'

    # Positional access works regardless of how many extra columns were parsed
    labels = infobox.iloc[:, 0]
    values = infobox.iloc[:, 1]
    matches = values[labels == field]

    if matches.empty:
        return 'none'  # return none if field doesn't exist
    # Take the first match by position; the label may appear more than once
    return matches.iloc[0]

In [145]:
# Scrape each company's article once per extractor and store the result in its own column.
# Every extractor call downloads the page again, so this cell is slow.
column_extractors = {
    'KEY_PEOPLE': people_extractor,
    'PRODUCTS': products_extractor,
    'SERVICES': services_extractor,
}
for column_name, extractor in column_extractors.items():
    df[column_name] = df['WIKI_URL'].apply(extractor)
df.head()

Unnamed: 0,TICKER,COMPANY,GICS_SECTOR,GICS_INDUSTRY,HQ,CIK,WIKI_URL,KEY_PEOPLE,PRODUCTS,SERVICES
0,MMM,3M Company,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",66740,https://en.wikipedia.org/wiki/3M,"Mike Roman(Chairman, President, & CEO)",none,none
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1800,https://en.wikipedia.org/wiki/Abbott_Laboratories,Miles D. White(Executive Chairman)Robert B. Fo...,Branded generic medicinesMedical devicesDiagno...,none
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,"North Chicago, Illinois",1551152,https://en.wikipedia.org/wiki/AbbVie_Inc.,Richard A. Gonzalez(Chairman and CEO)Michael S...,Pharmaceutical drugs Humira (adalimumab) Imbru...,none
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,"Danvers, Massachusetts",815094,https://en.wikipedia.org/wiki/Abiomed,"Michael R. Minogue, CEO",Cardiovascular medical implant devices,none
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",1467373,https://en.wikipedia.org/wiki/Accenture_plc,David RowlandJulie Sweet (CEO),none,"Strategy & Consulting, Interactive, Technology..."


In [146]:
# Persist the enriched table to disk in pandas' pickle format
pickle_path = 'wiki_table.pkl'
df.to_pickle(pickle_path)