# Web Scrape


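##### Description: Scrapes the profile page of every billionaire listed in the Forbes profile-URL file (name, title, profile stats, image, net worth history, quote) and exports the collected data to CSV.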
##### Website: www.forbes.com/billionaires/
##### Author: Pedro Sanhueza

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime


In [None]:
# CSV produced by the companion web-driver code; it holds the list of profile links to scrape
path = '../../../Web Driver/forbes billionaires (py)/forbes - historical data/forbes_profiles 31-05-2022 181655.csv'

# Only the 'URL' column is read; it is flattened into a plain Python list of links
forbes_profiles = pd.read_csv(path, usecols=['URL']).loc[:, 'URL'].tolist()


In [None]:
# Accumulator for the scraped profiles: the scraping loop appends one dictionary per profile
billionaires = []

In [None]:
# RUNNING TIME: about 57 minutes for the full list (one HTTP request per profile)

# Open each profile url and save selected information into a dictionary.
# The dictionaries are collected in `billionaires`, which later becomes a
# data frame (table) with one row per profile.
#
# Pages that cannot be downloaded, or that do not contain a profile name, are
# skipped and recorded in `failed_urls` instead of aborting the whole run.

REQUEST_TIMEOUT = 30  # seconds; requests waits indefinitely unless a timeout is given

billionaires = []  # reset here so that re-running this cell cannot append duplicate rows
failed_urls = []   # urls that were skipped (download error or unexpected page layout)

# A single session reuses the underlying connection for all requests to the same host
with requests.Session() as session:
    for url in forbes_profiles:
        try:
            page = session.get(url, timeout=REQUEST_TIMEOUT)
            page.raise_for_status()  # an HTTP error page is a failure, not a profile
        except requests.RequestException as error:
            failed_urls.append(url)
            print('skipped', url, '-', error)
            continue

        response = page.text  # store url's html data into a variable
        soup = BeautifulSoup(response, 'html.parser')  # parse the data for 'Beautiful Soup' to navigate it

        # soup.find() returns None when nothing matches, so every lookup is checked explicitly
        name_tag = soup.find('div', {"class": "listuser-header__name"})
        if name_tag is None:
            failed_urls.append(url)
            print('skipped', url, '- no profile name found')
            continue
        billionary = {'Name': name_tag.get_text()}  # create dictionary with name

        # not all profiles have their title
        title_tag = soup.find('div', {"class": "listuser-header__headline--title"})
        if title_tag is not None:
            billionary['Title'] = title_tag.get_text()

        # profile stats: titles are paired with every second 'profile-stats__text' span
        keys = [x.get_text() for x in soup.find_all('span', {"class": "profile-stats__title"})]
        values = [x.get_text() for x in soup.find_all('span', {"class": "profile-stats__text"})[::2]]
        billionary.update(zip(keys, values))  # add profile stats into dictionary

        # image url is embedded in the inline style of the image container
        # NOTE(review): [21:-2] presumably strips a 'background-image:url(' prefix (21 chars)
        # and a 2-character suffix -- confirm against the current page markup
        image_tag = soup.find('div', {"class": "listuser-image"})
        if image_tag is not None and image_tag.has_attr('style'):
            billionary['Image'] = image_tag['style'][21:-2]

        net_worth_value = [x.get_text() for x in soup.find_all('div', {"class": "profile-info__item-value"})]
        net_worth_key = [x.get_text() for x in soup.find_all('h3', {"class": "profile-info__item-title"})]
        billionary.update(zip(net_worth_key, net_worth_value))  # merge net worth into dictionary

        # not all profiles have this information
        chart_tag = soup.find('canvas', {"class": "person-networth-chart"})
        if chart_tag is not None and chart_tag.has_attr('data-chart'):
            # the attribute is sliced as text: drop the outer wrapper, then one entry per '},'
            chart_entries = chart_tag['data-chart'][1:-2].split('},')
            billionary.update({
                # key: the 4 characters before the entry's last character (used as the year column)
                # value: text from the first '$' up to the '","name"' marker
                entry[len(entry) - 5:len(entry) - 1]: entry[entry.find('$'):entry.find('","name"')]
                for entry in chart_entries
            })  # merge historical net worth into dictionary

        # not all profiles have this information; the first <p> of the page is taken as the quote
        quote_tag = soup.find('p')
        if quote_tag is not None:
            billionary['Quote'] = quote_tag.get_text()

        billionary['Source Page'] = url  # add scraped url into dictionary

        billionaires.append(billionary)  # append dictionary (row in dataFrame) into list

        print(len(billionaires), 'out of', len(forbes_profiles))

if failed_urls:
    print(len(failed_urls), 'profiles skipped:', failed_urls)


In [None]:
# optional: table as csv
# The file name carries the export timestamp (day-month-year hourminutesecond)
file_path = '../Billionaires - Historical Data/Billionaires ' + datetime.now().strftime("%d-%m-%Y %H%M%S") + ".csv" # folder location with file name

# index=False: the default integer index carries no information and would come
# back as an extra 'Unnamed: 0' column when the file is read again
pd.DataFrame(billionaires).to_csv(file_path, index=False) # save data frame as csv in file location


In [None]:
# Optional: export with a fixed column order.
# Writes to `file_path` (defined in the CSV export cell), so it overwrites that file.
(
    pd.DataFrame(billionaires)
    # errors='ignore': these columns only exist when some scraped profile had them,
    # so a missing one must not raise KeyError
    .drop(
        columns=['Lifetime Giving', 'Giving as a percentage of net worth', 'Agent', 'Agency', 'Clients', 'Salary/Winnings'],
        errors='ignore',
    )
    # reindex keeps exactly the listed columns in this order; listed columns that
    # were never scraped are created empty
    .reindex(columns=[
        'Name', 'Title', 'Age', 'Source of Wealth', 'Self-Made Score', 'Philanthropy Score', 'Residence', 'Citizenship', 'Marital Status',
        'Children', 'Education', 'Image', 'Real Time Net Worth', '2022 Billionaires Net Worth',
        *[str(year) for year in range(1999, 2023)],  # yearly net worth columns '1999' .. '2022'
        'Quote', 'Source Page',
    ])
    # index=False avoids writing the integer index as an extra 'Unnamed: 0' column
    .to_csv(file_path, index=False)
)

In [2]:
# Preview the first rows of a saved CSV (presumably from an earlier export of this notebook -- confirm).
# The 'Unnamed: 0' columns in the output are row indexes that were written into the file.
pd.read_csv('../Billionaires - Output/Billionaires 01-06-2022 205250.csv').head()

Unnamed: 0.1,Unnamed: 0,Name,Title,Age,Source of Wealth,Self-Made Score,Philanthropy Score,Residence,Citizenship,Marital Status,...,2015,2016,2017,2018,2019,2020,2021,2022,Quote,Source Page
0,0,Elon Musk,"CEO, Tesla",50.0,"Tesla, SpaceX, Self Made",8.0,1.0,"Austin, Texas",United States,Single,...,$12B,$10.7B,$13.9B,$19.9B,$22.3B,$24.6B,$151B,$219B,I operate on the physics approach to analysis....,https://www.forbes.com/profile/elon-musk/?list...
1,1,Jeff Bezos,"Chairman and Founder, Amazon",58.0,"Amazon, Self Made",8.0,1.0,"Seattle, Washington",United States,In Relationship,...,$34.8B,$45.2B,$72.8B,$112B,$131B,$113B,$177B,$171B,I didn't think I'd regret trying and failing. ...,https://www.forbes.com/profile/jeff-bezos/?lis...
2,2,Bernard Arnault & family,"Chairman and CEO, LVMH Moët Hennessy Louis Vui...",73.0,LVMH,,,"Paris, France",France,Married,...,$37.2B,$34B,$41.5B,$72B,$76B,$76B,$150B,$158B,I see myself as an ambassador of French herita...,https://www.forbes.com/profile/bernard-arnault...
3,3,Bill Gates,"Cofounder, Bill & Melinda Gates Foundation",66.0,"Microsoft, Self Made",8.0,4.0,"Medina, Washington",United States,Divorced,...,$79.2B,$75B,$86B,$90B,$96.5B,$98B,$124B,$129B,Money has no utility to me beyond a certain po...,https://www.forbes.com/profile/bill-gates/?lis...
4,4,Warren Buffett,"CEO, Berkshire Hathaway",91.0,"Berkshire Hathaway, Self Made",8.0,5.0,"Omaha, Nebraska",United States,"Widowed, Remarried",...,$72.7B,$60.8B,$75.6B,$84B,$82.5B,$67.5B,$96B,$118B,Rational people don't risk what they have and ...,https://www.forbes.com/profile/warren-buffett/...


# ------ TEST AREA ------