In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

Most of the data was sourced from Wikipedia.

The exception is the social media links, which I found by scraping the front page of each company's website.

In [4]:

#Aggregating the wikipedia pages of 5 of the companies, Bouygues only has a French wikipedia page, so is added later
pages=['https://en.wikipedia.org/wiki/Microsoft',
       'https://en.wikipedia.org/wiki/HSBC',
       'https://en.wikipedia.org/wiki/BNP_Paribas',
       'https://en.wikipedia.org/wiki/Dataiku',
       'https://en.wikipedia.org/wiki/Salesforce']


#Empty dictionary to gradually fill with data
#Each key becomes a DataFrame column, so every list must end up with exactly one entry per company
data={'Name':[],'Headquarters':[],'Revenue':[],'Number of employees':[],'Website':[],'Social Media Links':[]}



#Scraping each English wikipedia page
for url in pages:
    #A timeout keeps one unresponsive server from hanging the whole cell
    page = requests.get(url, timeout=30)
    #Stop on HTTP errors instead of trying to parse an error page
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    data['Name'].append(soup.find('h1').get_text())  #Names are found in page headers

    #Most data is extracted from the infobox on the side of the page.
    #A CSS selector is used because a {'class': 'infobox vcard'} filter only matches when the
    #class attribute is exactly that string; the selector also matches when extra classes are present
    table = soup.select_one('table.infobox.vcard')
    if table is None:
        raise ValueError('No infobox table found on ' + url)
    result = {}

    #Parsing through the infobox data and cleaning it to make more readable
    for tr in table.find_all('tr'):
        th = tr.find('th')
        td = tr.find('td')
        #Rows that only hold a header cell (or only a value cell) carry no label/value pair
        if th is None or td is None:
            continue
        result[th.text] = \
        td.get_text(separator=" ").replace('  ',' ').replace(' ,',',').replace('\xa0','').replace(' .','.')

    #A missing field raises KeyError naming the field, which is preferable to a silently misaligned row
    data['Headquarters'].append(result['Headquarters'])
    data['Revenue'].append(result['Revenue'])
    data['Number of employees'].append(result['Number of employees'])
    data['Website'].append(result['Website'])
    

    
#Same process as above for Bouygues, except changed to fit the specifics of French Wikipedia
#A timeout keeps an unresponsive server from hanging the cell
page = requests.get('https://fr.wikipedia.org/wiki/Bouygues_Construction', timeout=30)
#Stop on HTTP errors instead of trying to parse an error page
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
data['Name'].append(soup.find('h1').get_text())

#CSS selector matches the infobox even if it carries classes besides 'infobox_v2' and 'noarchive'
table = soup.select_one('table.infobox_v2.noarchive')
if table is None:
    raise ValueError('No infobox table found on the French Wikipedia page for Bouygues Construction')
result = {}

for tr in table.find_all('tr'):
    th = tr.find('th')
    td = tr.find('td')
    #Rows that only hold a header cell (or only a value cell) carry no label/value pair
    if th is None or td is None:
        continue
    #Header text is stripped so the lookups below do not depend on surrounding newlines in the cell
    result[th.text.strip()] = \
    td.get_text(separator=" ").replace('  ',' ').replace(' ,',',').replace('\xa0','').replace('\n','')

#French labels for headquarters, revenue, headcount and website
data['Headquarters'].append(result['Siège social'])
data['Revenue'].append(result["Chiffre d'affaires"])
data['Number of employees'].append(result['Effectif'])
data['Website'].append(result['Site web'])




#Further cleaning Revenue data, in order to have a float at the end
def _revenue_to_float(text):
    """Convert a scraped revenue string into a plain float amount.

    Handles the three formats produced by the scraping above:
    '<currency> <number> billion ...', '<currency> <number> million ...' and the
    French '<number with decimal comma> Mds ...'.

    Returns np.nan when no known unit is present, so that the Revenue column
    keeps exactly one entry per company instead of silently losing a row.
    Raises ValueError if a unit is present but the number cannot be parsed.
    """
    for unit, factor in (('billion', 1000000000), ('million', 1000000)):
        if unit in text:
            #The amount is the token directly in front of the unit word
            token = text.split(unit)[0].split()[-1]
            #Drop currency symbols glued to the number and thousands separators
            number = ''.join(c for c in token if c.isdigit() or c == '.')
            return float(number) * factor
    if 'Mds' in text:
        #French notation: the number comes first and uses a decimal comma
        return float(text.split()[0].replace(',', '.')) * 1000000000
    return np.nan


data['Revenue'] = [_revenue_to_float(x) for x in data['Revenue']]


#Further cleaning Employee data, in order to have an int at the end
def _employees_to_int(text):
    """Convert a scraped headcount string into an int.

    French format ('52 800 en 2021'): everything before 'en' is the number,
    with spaces used as thousands separators.
    English format ('221,000 (2022)', '1,000+'): the first token is the number,
    with commas as thousands separators and an optional '+'.

    Raises ValueError if the leading token is not a number.
    """
    french_head = text.split('en')[0].replace(' ', '').strip()
    #Only treat 'en' as the French year marker when what precedes it is purely a number;
    #otherwise an English value that merely contains the letters 'en' would be mis-parsed
    if 'en' in text and french_head.isdecimal():
        return int(french_head)
    return int(text.split()[0].replace('+', '').replace(',', ''))


data['Number of employees'] = [_employees_to_int(x) for x in data['Number of employees']]



#Individually scraping through the front webpages of each company, in order to find social media links
#usually located at the bottom of the page, with small changes to accommodate each page
#Links are then added as a list to their respective dictionary entries
def _scrape_social_links(url, item_tag, item_class=None, container_tag=None, container_class=None):
    """Return the list of link targets found in a page's social-media section.

    url             -- page to download
    item_tag        -- tag of the repeated elements holding one link each ('li' or 'a')
    item_class      -- exact class attribute the items must have (None = any)
    container_tag   -- optional tag of an enclosing element to search inside
    container_class -- exact class attribute of that enclosing element

    Returns an empty list if the container is not on the page; items without a
    link are skipped. Raises requests.HTTPError on a failed download.
    """
    #A timeout keeps one unresponsive site from hanging the whole cell
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    scope = BeautifulSoup(page.content, 'html.parser')

    if container_tag is not None:
        #The class filter must be a dict; a {'class', '...'} set literal only works by accident
        scope = scope.find(container_tag, {'class': container_class})
        if scope is None:
            #Section not found on the page: still report one (empty) entry for the company
            return []

    item_attrs = {'class': item_class} if item_class is not None else {}
    links = []
    for item in scope.find_all(item_tag, item_attrs):
        #Items are either the anchors themselves or wrappers around one anchor
        anchor = item if item.name == 'a' else item.find('a')
        if anchor is not None and anchor.has_attr('href'):
            links.append(anchor['href'])
    return links


#One entry per company, in the same order as the rows already stored in `data`
social_sources = [
    {'url': 'https://www.microsoft.com/en-us/',
     'item_tag': 'li', 'item_class': "list-inline-item mr-g"},
    {'url': 'https://www.hsbc.com/',
     'item_tag': 'li', 'item_class': "footer-social__item"},
    {'url': 'https://group.bnpparibas/en/',
     'item_tag': 'li',
     'container_tag': 'div', 'container_class': 'col-md-2 col-sm-3 col-xs-6 footer-social links'},
    {'url': 'https://www.dataiku.com/',
     'item_tag': 'li',
     'container_tag': 'ul', 'container_class': 'socials'},
    {'url': 'https://www.salesforce.com/fr/?ir=1',
     'item_tag': 'a',
     'container_tag': 'div', 'container_class': 'footer-social-links'},
    {'url': 'https://www.bouygues-construction.com/',
     'item_tag': 'li',
     'container_tag': 'div',
     'container_class': 'block block-bouygues-main no-title even last block-count-10 block-region-footer block-top-social-links'},
]

for source in social_sources:
    data['Social Media Links'].append(_scrape_social_links(**source))


#Final dataframe
#Each key of `data` becomes a column; the bare `df` on the last line renders the table as the cell output
df = pd.DataFrame.from_dict(data, orient='columns')
df

Unnamed: 0,Name,Headquarters,Revenue,Number of employees,Website,Social Media Links
0,Microsoft,"One Microsoft Way Redmond, Washington, U.S.",198300000000.0,221000,microsoft.com,"[https://www.facebook.com/Microsoft, https://t..."
1,HSBC,"8 Canada Square London, England, UK",49552000000.0,219697,www.hsbc.com,"[https://twitter.com/HSBC, https://www.linkedi..."
2,BNP Paribas,"Boulevard des Italiens, Paris, France",46200000000.0,190000,group.bnpparibas,"[https://twitter.com/BNPParibas, https://www.l..."
3,Dataiku,"New York City, United States",150000000.0,1000,dataiku.com,"[https://www.facebook.com/dataiku/, https://ww..."
4,Salesforce,"Salesforce Tower San Francisco, California, U.S.",26490000000.0,73542,salesforce.com,"[http://www.facebook.com/salesforceFrance, htt..."
5,Bouygues Construction,Challenger à Guyancourt ( Saint-Quentin-en-Yve...,12800000000.0,52800,bouygues-construction.com,[https://www.bouygues-construction.com/blog/fr...


# Data visualization done in Tableau can be found here:

https://public.tableau.com/app/profile/philippe.gaudin/viz/DelphaDataScrapingTest/CompanyDashboard?publish=yes