# Various sources of S&P 500 company names

In [None]:
from pathlib import Path
import re

import requests
import lxml.etree as et
import pandas as pd


def pp(el):
    """Print the lxml element *el* as pretty-printed, UTF-8 decoded markup."""
    serialized = et.tostring(el, pretty_print=True)
    markup = serialized.decode('utf-8')
    print(markup)
    

def normalize_whitespace(s):
    """Collapse every run of whitespace in *s* into a single space.

    Leading and trailing whitespace is collapsed to one space as well,
    not removed.

    Args:
        s: The string to normalize.

    Returns:
        A copy of *s* with each whitespace run replaced by ``' '``.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on modern Python versions.
    return re.sub(r'\s+', ' ', s)


def download(url, timeout=30):
    """Fetch *url* and return the response body parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The root element produced by parsing the response content with
        ``lxml.etree.HTMLParser``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within *timeout*.
    """
    # Without a timeout requests can block forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing the error page as data.
    response.raise_for_status()
    return et.fromstring(response.content, parser=et.HTMLParser())

def save(df, fn):
    """Write a symbol/name table to *fn* as CSV without the index column.

    The ``name`` column is passed through ``normalize_whitespace`` before
    writing. The caller's DataFrame is left untouched.

    Args:
        df: DataFrame whose columns are exactly ``['symbol', 'name']``.
        fn: Destination path (or anything ``DataFrame.to_csv`` accepts).

    Raises:
        AssertionError: If the columns are not exactly ``['symbol', 'name']``.
    """
    assert list(df.columns) == ['symbol', 'name'], (
        f"expected columns ['symbol', 'name'], got {list(df.columns)}"
    )
    # assign() returns a new frame, so the normalization does not leak back
    # into the DataFrame the caller passed in.
    cleaned = df.assign(name=df['name'].map(normalize_whitespace))
    cleaned.to_csv(fn, index=False)

    
# Output directory for the generated CSV files, relative to the current
# working directory; created (with parents) if it does not exist yet.
datadir = Path('..', 'data').absolute()
datadir.mkdir(parents=True, exist_ok=True)

# Wikipedia

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'


# Fetch the page and take the first <table> found in the document.
# NOTE(review): assumes the first table is the constituents list - confirm
# if the page layout changes.
html = download(url)
tables = html.findall('.//table')
table = tables[0]

# The first <tr> carries the <th> header cells; the remaining rows are data.
rows = table.findall('./tbody/tr')
header = rows.pop(0)
columns = [list(el.itertext())[0].strip() for el in header.findall('th')]
# Each text fragment is stripped and the fragments are joined with newlines.
# Whitespace-only fragments become empty strings, which would leave stray
# leading/trailing newlines in the cell value, so the joined text is
# stripped as well (the symbol column is written without further cleanup).
data = [
    ['\n'.join(s.strip() for s in el.itertext()).strip() for el in r.findall('td')]
    for r in rows
]
df = pd.DataFrame(data=data, columns=columns)
df = df[['Symbol', 'Security']].rename(columns={'Symbol': 'symbol', 'Security': 'name'})
save(df, datadir.joinpath('wikipedia.csv'))

# Slickcharts

In [None]:
url = 'https://www.slickcharts.com/sp500'
html = download(url)

# Work on the first <table> element found in the document.
table = html.findall('.//table')[0]

# Column names come from the <thead> cells, data from the <tbody> rows.
columns = [th.text for th in table.findall('./thead/tr/th')]
rows = table.findall('./tbody/tr')
data = [
    [' '.join(td.itertext()).strip() for td in tr.findall('td')]
    for tr in rows
]
df = pd.DataFrame(data=data, columns=columns)

# Keep only the ticker and company name, renamed to the layout save() expects.
df = df[['Symbol', 'Company']]
df = df.rename(columns={'Symbol': 'symbol', 'Company': 'name'})

save(df, datadir.joinpath('slickcharts.csv'))