# THE 22ND ANNUAL GLOBAL 2000 RANKING OF THE WORLD’S LARGEST PUBLIC COMPANIES

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
# Page that is downloaded and scraped for the Global 2000 ranking table.
url = 'https://www.forbes.com/lists/global2000/'

In [None]:
# Download the ranking page. requests applies no timeout by default, so an
# unresponsive server would hang this cell forever; fail after 30 s instead.
response = requests.get(url, timeout=30)

In [None]:
# Stop here with an HTTPError if the server answered with a 4xx/5xx status,
# rather than parsing an error page as if it were the ranking table.
response.raise_for_status()

In [None]:
# Notebook display of the response object (shows the HTTP status code).
response

In [None]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(response.text, 'html.parser')

In [None]:
# Column titles are the text of every <span class="header-content-text">.
# A bare strip() removes all surrounding whitespace (newlines, tabs,
# non-breaking spaces), not only ASCII spaces, so the titles can be used
# reliably as DataFrame column keys further down.
column_headers = [
    header.get_text().strip()
    for header in soup.find_all('span', class_='header-content-text')
]

In [None]:
# Notebook display of the scraped column titles for a quick sanity check.
column_headers

In [None]:
# Collect every <a class="table-row"> element; each one is treated as a
# single table row by the extraction loop that follows.
column_data_rows = soup.find_all('a', class_='table-row')

In [None]:
# One inner list per table row, holding the text of each
# <div class="table-cell"> with leading/trailing spaces removed.
data = [
    [cell.get_text().strip(' ') for cell in entry.find_all('div', class_='table-cell')]
    for entry in column_data_rows
]

In [None]:
# Dump the scraped rows, one per line, under a "Data:" heading.
print("Data:", *data, sep="\n")

In [None]:
# Build the table: one DataFrame row per scraped row, labelled with the
# scraped column titles. Every value is still a string at this point.
df = pd.DataFrame(data, columns=column_headers)

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Normalise the identifying columns.
# RANK: drop thousands separators and surrounding whitespace, then cast to int.
df['RANK'] = (
    df['RANK']
    .astype('str')
    .str.replace(',', '')
    .str.strip()
)
df['RANK'] = df['RANK'].astype('int')
# The two descriptive columns are explicitly cast to str.
df['NAME'] = df['NAME'].astype('str')
df['COUNTRY/TERRITORY'] = df['COUNTRY/TERRITORY'].astype('str')

In [None]:
def clean_numeric_column(column, frame=None):
    """Convert a column of money strings to floats expressed in billions.

    Each cell has '$' and ',' removed and is then scaled by its magnitude
    suffix: 'B' (billions) is kept as is, 'M' (millions) is divided by
    1,000 and 'T' (trillions) is multiplied by 1,000. A cell without a
    suffix is taken to be in billions already.

    Args:
        column: Name of the column to convert.
        frame: DataFrame to read the column from. Defaults to the
            notebook-level ``df`` so existing one-argument calls keep
            working.

    Returns:
        A new Series of floats; the source frame is not modified. Missing
        cells (None/NaN, empty string or a lone dash) become NaN.

    Raises:
        ValueError: If a non-missing cell cannot be parsed as a number.
    """
    # Placeholders treated as "no value" rather than as a parse error.
    missing_markers = {'', '-', '–', '—'}

    def convert_value(value):
        if not isinstance(value, str):
            # None/NaN stay missing; cells that are already numeric pass through.
            return float('nan') if pd.isna(value) else float(value)

        text = value.replace('$', '').replace(',', '').strip()
        if text in missing_markers:
            return float('nan')
        if 'B' in text:
            return float(text.replace('B', ''))
        if 'M' in text:
            return float(text.replace('M', '')) / 1_000  # millions -> billions
        if 'T' in text:
            return float(text.replace('T', '')) * 1_000  # trillions -> billions
        return float(text)

    # The global is only looked up when no frame is passed explicitly.
    source = df if frame is None else frame
    return source[column].apply(convert_value)

In [None]:
# Add a numeric "<name> (in billions $)" column next to each raw money column.
for source_column in ('SALES', 'PROFIT', 'ASSETS', 'MARKET VALUE'):
    df[f'{source_column} (in billions $)'] = clean_numeric_column(source_column)

In [None]:
# Remove the original string money columns in place.
df.drop(columns=['SALES', 'PROFIT', 'ASSETS', 'MARKET VALUE'], inplace=True)

In [None]:
# Reminder for readers of the notebook output about the unit of the money columns.
print("Note: All financial values are converted to billions of dollars for consistency.")

In [None]:
# Notebook display of the cleaned table.
df

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Pairwise correlation matrix; numeric_only=True leaves the text columns out.
df.corr(numeric_only=True)