In [2]:
import pandas as pd
import html5lib

In [3]:
# Prep dataframe for JSON conversion
ticker_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
df = ticker_list[0]
df.drop(columns = ['SEC filings', 'Founded'], inplace = True)
df.CIK = df.CIK.astype(str)
df['CIK'] = df['CIK'].str.zfill(10)
df.rename(columns = {'Symbol':'TICKER', 'Security':'COMPANY', 'GICS Sector':'GICS_SECTOR', 'Date first added':'DATE',
                     'GICS Sub Industry':'GICS_INDUSTRY', 'Headquarters Location':'HQ'}, inplace = True)
df['WIKI_URL'] = ''
df.head()

Unnamed: 0,TICKER,COMPANY,GICS_SECTOR,GICS_INDUSTRY,HQ,DATE,CIK
0,MMM,3M Company,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",1976-08-09,66740
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373


In [2]:
from bs4 import BeautifulSoup
import requests, re

In [90]:
request = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

In [91]:
# Use BeautifulSoup to extract wiki urls for each ticker and add to dataframe
soup = BeautifulSoup(request.content)
main_table = soup.find(id='constituents')
table = main_table.find('tbody').findAll('tr')
table = table[1:]

base_url = 'https://en.wikipedia.org'
url_list = []
for item in table:
    url = base_url + str(item.findAll('a')[1]['href'])
    url_list.append(url)
    
df['WIKI_URL'] = url_list
df.head()

Unnamed: 0,TICKER,COMPANY,GICS_SECTOR,GICS_INDUSTRY,HQ,CIK,WIKI_URL
0,MMM,3M Company,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",66740,https://en.wikipedia.org/wiki/3M
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1800,https://en.wikipedia.org/wiki/Abbott_Laboratories
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,"North Chicago, Illinois",1551152,https://en.wikipedia.org/wiki/AbbVie_Inc.
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,"Danvers, Massachusetts",815094,https://en.wikipedia.org/wiki/Abiomed
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",1467373,https://en.wikipedia.org/wiki/Accenture_plc


In [141]:
def people_extractor(url):

    output = 'none' # return none if field doesn't exist
    vcard_list = pd.read_html(url)
    df = vcard_list[0]
    
    if len(df.columns) == 2:
        df.columns = ['columns', 'data']
        # most output tables have 2 elements
    else:
        df.columns = ['columns', 'data', 'trash']
        df.drop(columns = 'trash', inplace = True)
        # sometimes the table output has 3 elements
        
    df.set_index(df['columns'], inplace = True)
    df.drop(columns = 'columns', inplace = True)
    df = df.transpose()
    
    if 'Key people' in df.columns:
        output = df['Key people'][0]
    
    return output

In [143]:
def products_extractor(url):

    output = 'none' # return none if field doesn't exist
    vcard_list = pd.read_html(url)
    df = vcard_list[0]
    
    if len(df.columns) == 2:
        df.columns = ['columns', 'data']
        # most output tables have 2 elements
    else:
        df.columns = ['columns', 'data', 'trash']
        df.drop(columns = 'trash', inplace = True)
        # sometimes the table output has 3 elements
        
    df.set_index(df['columns'], inplace = True)
    df.drop(columns = 'columns', inplace = True)
    df = df.transpose()
    
    if 'Products' in df.columns:
        output = df['Products'][0]
    
    return output

In [144]:
def services_extractor(url):

    output = 'none' # return none if field doesn't exist
    vcard_list = pd.read_html(url)
    df = vcard_list[0]
    
    if len(df.columns) == 2:
        df.columns = ['columns', 'data']
        # most output tables have 2 elements
    else:
        df.columns = ['columns', 'data', 'trash']
        df.drop(columns = 'trash', inplace = True)
        # sometimes the table output has 3 elements
        
    df.set_index(df['columns'], inplace = True)
    df.drop(columns = 'columns', inplace = True)
    df = df.transpose()
    
    if 'Services' in df.columns:
        output = df['Services'][0]
    
    return output

In [145]:
df['KEY_PEOPLE'] = df['WIKI_URL'].apply(people_extractor)
df['PRODUCTS'] = df['WIKI_URL'].apply(products_extractor)
df['SERVICES'] = df['WIKI_URL'].apply(services_extractor)
df.head()

Unnamed: 0,TICKER,COMPANY,GICS_SECTOR,GICS_INDUSTRY,HQ,CIK,WIKI_URL,KEY_PEOPLE,PRODUCTS,SERVICES
0,MMM,3M Company,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",66740,https://en.wikipedia.org/wiki/3M,"Mike Roman(Chairman, President, & CEO)",none,none
1,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1800,https://en.wikipedia.org/wiki/Abbott_Laboratories,Miles D. White(Executive Chairman)Robert B. Fo...,Branded generic medicinesMedical devicesDiagno...,none
2,ABBV,AbbVie Inc.,Health Care,Pharmaceuticals,"North Chicago, Illinois",1551152,https://en.wikipedia.org/wiki/AbbVie_Inc.,Richard A. Gonzalez(Chairman and CEO)Michael S...,Pharmaceutical drugs Humira (adalimumab) Imbru...,none
3,ABMD,ABIOMED Inc,Health Care,Health Care Equipment,"Danvers, Massachusetts",815094,https://en.wikipedia.org/wiki/Abiomed,"Michael R. Minogue, CEO",Cardiovascular medical implant devices,none
4,ACN,Accenture plc,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",1467373,https://en.wikipedia.org/wiki/Accenture_plc,David RowlandJulie Sweet (CEO),none,"Strategy & Consulting, Interactive, Technology..."


In [146]:
df.to_pickle('wiki_table.pkl')

In [4]:
df = pd.read_pickle('wiki_table.pkl')

In [5]:
df['PEOPLE_LIST'] = ''
bad_list = []

for i in df.index:

    inp = df['KEY_PEOPLE'][i]
    
    if inp != 'none':
        
        # Remove title words found outside parens or brackets
        text = inp.replace('and', '').replace('CFO', '').replace('CEO', '').replace('&', '').replace('.', '').replace(',', '')\
                  .replace('President', '').replace('/', '').replace('Pres', '').replace('Chairman', '').replace('General', '')\
                  .replace('counsel', '').replace('Chair', '').replace('Managing', '').replace('Director', '')\
                  .replace('Chief', '').replace('Executive', '').replace('Officer', '').replace('Jr', '').replace('of', '')\
                  .replace('the', '').replace('Board', '').replace('Governed', '').replace('14-member', '').replace('Trustees', '')\
                  .replace('Sanford', '').replace('Cloud', '').replace('Lead', '').replace('Trustee', '').replace('chief', '')\
                  .replace('executive', '').replace('officer', '').replace('ficer', '').replace('vice', '').replace('president', '')\
                  .replace('financial', '').replace('legal', '').replace('legal', '').replace('Dr', '').replace('chairman', '')\
                  .replace('SO', '')
        
        # Regex to remove all words inside parenthesis and brackets, special characters, numbers, etc;
        text1 = re.sub("[\(\[].*?[\)\]]", " ", text)
        text2 = re.sub(r"(\w)([A-Z])", r"\1 \2", text1)
        text3 = re.sub(r'"[^)]*"', '', text2)
        text4 = re.sub(' [A-Z]* ', ' ', text3)
        text5 = re.sub('[A-Z]* ', ' ', text4)
        text6 = re.sub(r'[^a-zA-Z0-9\s]', '', text5)
        text7 = re.sub('[0-9]', '', text6)
        out = text7.strip()
        word_list = out.split()
        
        df['PEOPLE_LIST'][i] = word_list
        
        # Attempt at identifying lists that cannot be easily merged first name + last name by find lists with odd number of elements;
        if len(df['PEOPLE_LIST'][i]) % 2 != 0:
            bad_list.append(i)

In [6]:
len(bad_list)

48

In [7]:
# Looking at output from above; specifically the 48 entries that have odd numbers
df['PEOPLE_LIST'].iloc[bad_list]

7      [Lisa, Su, John, Edward, Caldwell, Mark, Paper...
11                                    [Mike, Mc, Mullen]
42        [Elizabeth, Tallett, Gail, Koziara, Boudreaux]
51     [Jayshree, Ullal, Andy, Bechtolsheim, Kenneth,...
74     [Owen, Thomas, Douglas, Linde, Michael, La, Be...
77                      [Henry, Samueli, Tan, Hock, Eng]
79              [Lawson, Whiting, George, Garvin, Brown]
110                                   [Bruce, Van, Saun]
123                                     [John, Mc, Avoy]
127                  [Wendell, Weeks, Lawrence, Mc, Rae]
138    [Javier, Rodriguez, Peter, Grauer, lead, indep...
193                [James, Herbert, Hafize, Gaye, Erkan]
199                  [William, Clay, Ford, Jim, Hackett]
205    [Jennifer, Johnson, Gregory, Johnson, Rupert, ...
219                          [Macpherson, Thomas, Okray]
245    [Lawrence, Kingsley, Jay, Mazelsky, Brian, Mc,...
248    [Francis, de, Souza, Jay, Flatley, Sam, Samad,...
259    [Rex, Adams, Martin, Fla

In [8]:
# Looking at entire output
df['PEOPLE_LIST'].head(50)

0                                         [Mike, Roman]
1           [Miles, White, Robert, Ford, Robert, Funck]
2                [Richard, Gonzalez, Michael, Severino]
3                                    [Michael, Minogue]
4                           [David, Rowl, Julie, Sweet]
5     [Brian, Kelly, Bobby, Kotick, Daniel, Alegre, ...
6     [Shantanu, Narayen, John, Murphy, Abhay, Paras...
7     [Lisa, Su, John, Edward, Caldwell, Mark, Paper...
8                                                      
9                          [Jay, Morse, Andres, Gluski]
10                     [Dan, Amos, Frederick, Crawford]
11                                   [Mike, Mc, Mullen]
12                                     [Seifi, Ghasemi]
13               [Thomson, Leighton, Frederic, Salerno]
14           [Brad, Tilden, Ben, Minicucci, Gary, Beck]
15                                                     
16                      [Joel, Marcus, Dean, Shigenaga]
17                     [Leonard, Bell, Ludwig, H

In [9]:
# Looking at original scraped text
df['KEY_PEOPLE'].head(50)

0                Mike Roman(Chairman, President, & CEO)
1     Miles D. White(Executive Chairman)Robert B. Fo...
2     Richard A. Gonzalez(Chairman and CEO)Michael S...
3                               Michael R. Minogue, CEO
4                        David RowlandJulie Sweet (CEO)
5     Brian Kelly(chairman)Bobby Kotick(CEO)Daniel A...
6     Shantanu Narayen (Chairman and CEO)John F. Mur...
7     Lisa Su (President and CEO)John Edward Caldwel...
8                                                  none
9     Jay Morse (Chairman)Andres Gluski (President, ...
10    Dan Amos (Chairman, CEO)Frederick J. Crawford ...
11                                  Mike McMullen (CEO)
12       Seifi Ghasemi (CEO),(chairman) and (president)
13    F. Thomson Leighton(CEO and Co-Founder)Frederi...
14    Brad Tilden (Chairman and CEO) Ben Minicucci (...
15                                                 none
16       Joel S. Marcus, ChairmanDean A. Shigenaga, CFO
17       Leonard Bell (Chairman)Ludwig N. Hantso

In [10]:
# Repair last names that start with 'Mc', 'De', 'Le', etc, due to regex seperating
for ind in df['PEOPLE_LIST'].iloc[bad_list].index:
    for i in range(len(df['PEOPLE_LIST'][ind])):
        if df['PEOPLE_LIST'][ind][i] in ['Mc', 'De', 'Le', 'La', 'de']:
            df['PEOPLE_LIST'][ind][i] = df['PEOPLE_LIST'][ind][i]+df['PEOPLE_LIST'][ind][i+1]
            df['PEOPLE_LIST'][ind].pop(i+1)
            break

In [11]:
new_bad_list = []
for i in df.index:
    if len(df['PEOPLE_LIST'][i]) % 2 != 0:
        new_bad_list.append(i)

In [12]:
df['PEOPLE_LIST'].iloc[new_bad_list]

7      [Lisa, Su, John, Edward, Caldwell, Mark, Paper...
42        [Elizabeth, Tallett, Gail, Koziara, Boudreaux]
77                      [Henry, Samueli, Tan, Hock, Eng]
79              [Lawson, Whiting, George, Garvin, Brown]
110                                   [Bruce, Van, Saun]
138    [Javier, Rodriguez, Peter, Grauer, lead, indep...
193                [James, Herbert, Hafize, Gaye, Erkan]
199                  [William, Clay, Ford, Jim, Hackett]
205    [Jennifer, Johnson, Gregory, Johnson, Rupert, ...
219                          [Macpherson, Thomas, Okray]
259    [Rex, Adams, Martin, Flanagan, Phil, Taylor, S...
261            [Dennis, Gillings, Founder, Ari, Bousbib]
308    [Ajaypal, Singh, Banga, Richard, Haythornthwaite]
319    [Sanjay, Mehrotra, Gurtej, Singh, Shu, Robert,...
328              [Robert, Lumpkins, James, Joc, ORourke]
338    [Rupert, Murdoch, Lachlan, Murdoch, Robert, Ja...
339    [Rupert, Murdoch, Lachlan, Murdoch, Robert, Ja...
350                            

In [13]:
# Second pass over removing numbers and special characters
for ind in df['PEOPLE_LIST'].index:
    if ind in bad_list:
        for i in range(len(df['PEOPLE_LIST'][ind])):
            pattern = '[0-9]'
            df['PEOPLE_LIST'][ind][i] = re.sub(r'[^a-zA-Z0-9\s]', '', df['PEOPLE_LIST'][ind][i])
            df['PEOPLE_LIST'][ind][i] = re.sub(pattern, '', df['PEOPLE_LIST'][ind][i])

In [14]:
df['PEOPLE_LIST'].iloc[bad_list]

7      [Lisa, Su, John, Edward, Caldwell, Mark, Paper...
11                                      [Mike, McMullen]
42        [Elizabeth, Tallett, Gail, Koziara, Boudreaux]
51     [Jayshree, Ullal, Andy, Bechtolsheim, Kenneth,...
74     [Owen, Thomas, Douglas, Linde, Michael, LaBell...
77                      [Henry, Samueli, Tan, Hock, Eng]
79              [Lawson, Whiting, George, Garvin, Brown]
110                                   [Bruce, Van, Saun]
123                                       [John, McAvoy]
127                    [Wendell, Weeks, Lawrence, McRae]
138    [Javier, Rodriguez, Peter, Grauer, lead, indep...
193                [James, Herbert, Hafize, Gaye, Erkan]
199                  [William, Clay, Ford, Jim, Hackett]
205    [Jennifer, Johnson, Gregory, Johnson, Rupert, ...
219                          [Macpherson, Thomas, Okray]
245    [Lawrence, Kingsley, Jay, Mazelsky, Brian, McK...
248    [Francis, deSouza, Jay, Flatley, Sam, Samad, F...
259    [Rex, Adams, Martin, Fla

In [None]:
# This is where code to clean Products text will go

In [None]:
# This is where code to clean Serices text will go

In [None]:
# This is where the code to convert to JSON will go

## Code in cells below are attempts at scraping names directly from websites using BeautifulSoup (abandoned)

In [310]:
#request1 = requests.get('https://en.wikipedia.org/wiki/Apple_Inc.')
#soup1 = BeautifulSoup(request1.content)

#vcard = soup.find(class_="infobox vcard")
#table_rows = vcard.find('tbody').findAll('tr')

#for item in table_rows:
#    for div in item.findAll('div'):
#        if div.text == 'Key people':
#            output = []
#            people = item
#            for person in people.find('td').findAll('li'):
#                output.append(person.find('a').text)
#            print(output)

['Arthur D. Levinson', 'Tim Cook', 'Jeff Williams']


In [317]:
#len(str(list(people.find('td').children)))

85

In [316]:
#request2 = requests.get('https://en.wikipedia.org/wiki/Abiomed')
#soup2 = BeautifulSoup(request2.content)

#vcard2 = soup2.find(class_="infobox vcard")
#table_rows2 = vcard2.find('tbody').findAll('tr')

#for item in table_rows2:
#    for div in item.findAll('div'):
#        if div.text == 'Key people':
#            output = []
#            people = item
#            for person in people.find('td').findAll('li'):
#                output.append(person.find('a').text)
#            print(output)

[]


In [318]:
#request3 = requests.get('https://en.wikipedia.org/wiki/Accenture')
#soup3 = BeautifulSoup(request3.content)

#vcard3 = soup3.find(class_="infobox vcard")
#table_rows3 = vcard3.find('tbody').findAll('tr')

#for item in table_rows3:
#    for div in item.findAll('div'):
#        if div.text == 'Key people':
#            output = []
#            people = item
#            for person in people.find('td').findAll('li'):
#                output.append(person.find('a').text)
#            print(output)

['Julie Sweet']
