# CompaniesMarketCap
Scraping data from [CompaniesMarketCap](https://companiesmarketcap.com/), which contains information on company industries, sub-industries, market capitalization, and employee headcount.

In [58]:
import requests
from bs4 import BeautifulSoup
import re
from cleantext import clean
import os
import pandas as pd

In [59]:
# Remote source: site root, plus the query string appended to a page URL
# when requesting its data as a CSV download
base_url = 'https://companiesmarketcap.com'
download = '?download=csv'

# Local folders: raw scraped files go to cmc_dir, the combined dataset is
# written to final_file inside parameters_dir
cmc_dir = '../../data/raw/companiesmarketcap/'
parameters_dir = '../../data/parameters/'
final_file = os.path.join(parameters_dir, 'companiesmarketcap.csv')

In [60]:
# Create output filename with custom extension
def output_filename(ext):
  """Return the path of the raw CSV file for one scraped page.

  Args:
    ext: Short identifier of the page (e.g. 'all', 'tech'); it becomes the
      part of the file name between the 'cmc_' prefix and '.csv'.

  Returns:
    The path 'cmc_<ext>.csv' inside `cmc_dir`.
  """
  # os.path.join adds a separator only when one is missing, so the result no
  # longer depends on cmc_dir ending with a trailing slash
  return os.path.join(cmc_dir, 'cmc_' + ext + '.csv')

In [4]:
# Create output folder if it doesn't exist.
# exist_ok=True makes this a no-op when the folder is already there and
# avoids the gap between a separate existence check and the creation.
os.makedirs(cmc_dir, exist_ok=True)

## Pages to scrape
Collect a list of pages we want to scrape.  

In [5]:
# Each entry appended later is a dict with 'category', 'filename' and 'url' keys
pages = list()

Start with the pages that contain data for all companies

In [6]:
# The two site-wide listings that cover all companies
pages.extend([
  # Home page: contains the market capitalization data
  {
    'category': 'All',
    'filename': output_filename('all'),
    'url': base_url
  },
  # Listing that carries the employee headcount field
  {
    'category': 'All - Employees',
    'filename': output_filename('all_employees'),
    'url': base_url + '/largest-companies-by-number-of-employees/'
  },
])

Next, scrape the page that contains the list of all available company categories/industries

In [7]:
# Fetch the category index page. The timeout keeps a stalled connection from
# hanging the notebook, and raise_for_status() stops here with a clear HTTP
# error instead of failing later when the expected table is missing.
response = requests.get(base_url + '/all-categories', timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

In [8]:
# Every link inside the body of the first table on the page is one category
category_links = soup.find('table').find('tbody').find_all('a')

# Turn each link into a page entry
for link in category_links:
  href = link['href']
  # Keep only letters, digits, spaces, '&' and '/' in the displayed name
  label = re.sub(r'[^A-Za-z0-9 &/]+', '', link.text).strip()
  # First path segment of the link, with dashes turned into underscores
  slug = href.split('/')[1].replace('-', '_')

  pages.append({
    'category': label,
    'filename': output_filename(slug),
    'url': base_url + href
  })

In [9]:
# Report how many pages were collected
print(f'Total pages to scrape: {len(pages)}')

Total pages to scrape: 161


## Scrape data and save files

Loop through the list of pages, scrape data and save to disk

In [14]:
# Keep track of any pages that could not be downloaded, i.e. whose request
# came back with a non-success (4xx/5xx) status such as 404
pages_not_exist = []

# Reuse one HTTP session for all requests instead of opening a new connection
# per page; the timeout keeps a stalled download from hanging the notebook
with requests.Session() as session:
  for page in pages:
    response = session.get(page['url'] + download, timeout=30)

    # If the page exists, save the data. Otherwise skip.
    # Note which pages exist and which don't
    if response.ok:
      print('Saving', page['category'], 'to:', page['filename'])
      with open(page['filename'], 'wb') as file:
        file.write(response.content)
    else:
      print('SKIPPING', page['category'], 'at', page['url'])
      pages_not_exist.append(page)

Saving All to: ../../data/raw/companiesmarketcap/cmc_all.csv
Saving All - Employees to: ../../data/raw/companiesmarketcap/cmc_all_employees.csv
Saving Tech to: ../../data/raw/companiesmarketcap/cmc_tech.csv
Saving Dow jones to: ../../data/raw/companiesmarketcap/cmc_dow_jones.csv
Saving AI to: ../../data/raw/companiesmarketcap/cmc_artificial_intelligence.csv
Saving Financial services to: ../../data/raw/companiesmarketcap/cmc_financial_services.csv
Saving Software to: ../../data/raw/companiesmarketcap/cmc_software.csv
Saving Tech Hardware to: ../../data/raw/companiesmarketcap/cmc_tech_hardware.csv
Saving Energy to: ../../data/raw/companiesmarketcap/cmc_energy.csv
Saving Banks to: ../../data/raw/companiesmarketcap/cmc_banks.csv
Saving Electronics to: ../../data/raw/companiesmarketcap/cmc_electronics.csv
Saving Internet to: ../../data/raw/companiesmarketcap/cmc_internet.csv
Saving Semiconductors to: ../../data/raw/companiesmarketcap/cmc_semiconductors.csv
Saving Oil&Gas to: ../../data/raw/

In [57]:
# Keep only the pages that were not recorded as missing during the download
pages = list(filter(lambda candidate: candidate not in pages_not_exist, pages))

In [59]:
print('Scraped', len(pages), 'pages')
print(len(pages_not_exist), 'pages did not exist')
# List the skipped pages, if any, so they can be checked by hand.
# Page entries are keyed by 'category' (there is no 'segment' key).
for page in pages_not_exist:
  print(page['category'], ': ', page['url'], sep='')

Scraped 161 pages
0 pages did not exist


## Final dataset

### Combine company data across different categories

Create the base dataset of all companies

In [175]:
# The two site-wide files that form the base dataset
all_filename = 'cmc_all.csv'
all_employees_filename = 'cmc_all_employees.csv'

# Every other CSV in the raw folder is treated as a category file.
# os.listdir returns names in arbitrary order, so sort them to keep the
# order of the category columns in the final dataset stable between runs.
base_files = {all_filename, all_employees_filename}
cmc_files = sorted(
  file for file in os.listdir(cmc_dir)
  if file.endswith('.csv') and file not in base_files
)

In [176]:
# Columns that together identify a company in every downloaded file
index_cols = ['Name', 'country', 'Symbol']

# Base table: the list of all companies
companies = pd.read_csv(os.path.join(cmc_dir, all_filename), index_col=index_cols)

# Employee headcounts, with the count column renamed to 'Employees'
companies_employees = pd.read_csv(
  os.path.join(cmc_dir, all_employees_filename),
  usecols=index_cols + ['employees_count'],
  index_col=index_cols
).rename(columns={'employees_count': 'Employees'})

# Attach the headcount to the base table; validate='1:1' makes pandas raise
# if either side has repeated index keys. The index is kept for the category
# join and is reset in a later cell.
companies = companies.join(companies_employees, validate='1:1')

Now import the category datasets and join each with the base dataset.  
If a company appears in a category dataset, set the category field to 1, otherwise 0

In [177]:
category_cols = []
category_frames = []

for file in cmc_files:
  # Read the file, which contains the companies in each category
  members = pd.read_csv(os.path.join(cmc_dir, file), usecols=['Name', 'country', 'Symbol'])

  # Create the column name from the filename. The pattern is anchored so only
  # the leading 'cmc_' and the trailing '.csv' are stripped, never the same
  # text occurring in the middle of a name.
  category = re.sub(r'^cmc_|\.csv$', '', file)
  category_cols.append(category)

  # Flag every company listed in this file with a 1
  members[category] = 1
  category_frames.append(members.set_index(['Name', 'country', 'Symbol']))

# Concatenate all datasets side by side: one flag column per category,
# aligned on the (Name, country, Symbol) index
category_dfs = pd.concat(category_frames, axis=1)

Join the categories with the full company dataset and fill in any blank fields

In [178]:
# Left-join the category flags onto the company table, then replace the
# missing values in the category columns (and only those columns) with 0
companies = (
  companies
  .join(category_dfs, how='left', validate='1:1')
  .fillna(dict.fromkeys(category_cols, 0))
)

Reset the index, first creating a copy to defragment the dataset

In [179]:
# Copy first, then turn the index levels back into ordinary columns
companies = companies.copy().reset_index()

Rename columns

In [180]:
# Mapping from the downloaded column names to the names used in the final dataset
column_names = {
  'Name': 'Company',
  'country': 'Country',
  'Rank': 'Market Cap Rank',
  'price (USD)': 'Price (USD)'
}
companies = companies.rename(columns=column_names)

### Clean up data

A handful of companies have the same name. To make names distinct, identify any duplicate company names and add the company symbol to the name.

In [181]:
# Identify duplicate company names: count each name and keep those seen more than once
company_counts = companies['Company'].value_counts()
company_dupes = company_counts[company_counts > 1].index.tolist()
# The f-string is double-quoted because reusing the outer quote character
# inside a replacement field is a syntax error before Python 3.12
print(f"Fixing {len(company_dupes)} company names: {', '.join(company_dupes)}")

Fixing 3 company names: First Bancorp, Phoenix Group, Castellum


In [182]:
# Boolean mask selecting every row whose name is shared with another company
is_duplicate = companies['Company'].isin(company_dupes)

# Append the company symbol in parentheses to those names to tell them apart
duplicates = companies.loc[is_duplicate, ['Company', 'Symbol']]
companies.loc[is_duplicate, 'Company'] = duplicates['Company'] + ' (' + duplicates['Symbol'] + ')'

Replace instances of `&amp;` in company names with `&`

In [183]:
# Literal (non-regex) replacement of the HTML entity with a plain ampersand
company_names = companies['Company']
companies['Company'] = company_names.str.replace('&amp;', '&', regex=False)

### Manual adjustments

A handful of manual adjustments to the dataset

Flag Tech Mahindra as IT Services. CompaniesMarketCap does not provide a category but [Yahoo Finance](https://finance.yahoo.com/quote/TECHM.NS/) describes the business as IT services.

In [184]:
# Set the it_services flag on the row(s) named exactly 'Tech Mahindra'
companies.loc[companies['Company'].eq('Tech Mahindra'), 'it_services'] = 1

Flag Larsen & Toubro as Professional Services. CompaniesMarketCap does not provide a category, but [Yahoo Finance](https://finance.yahoo.com/quote/LT.NS/) and the company's website describe the business as providing engineering services.

In [185]:
# Set the professional_services flag on the row(s) named exactly 'Larsen & Toubro'
companies.loc[companies['Company'].eq('Larsen & Toubro'), 'professional_services'] = 1

Flag L&T Technology Services as Professional Services. CompaniesMarketCap does not provide a category, but [Yahoo Finance](https://finance.yahoo.com/quote/LTTS.NS/) and the company's website describe the business as providing engineering services.

In [186]:
# Set the professional_services flag on the row(s) named exactly 'L&T Technology Services'
companies.loc[companies['Company'].eq('L&T Technology Services'), 'professional_services'] = 1

Change the all-caps company name "QUALCOMM" to "Qualcomm" (capitalized, not upper case) to match other datasets.

In [187]:
# Replace the all-caps spelling with the capitalised one
companies.loc[companies['Company'].eq('QUALCOMM'), 'Company'] = 'Qualcomm'

Change the all-caps company name "NVIDIA" to "Nvidia" (capitalized, not upper case) to match other datasets.

In [188]:
# Replace the all-caps spelling with the capitalised one
companies.loc[companies['Company'].eq('NVIDIA'), 'Company'] = 'Nvidia'

## Save the final dataset

In [189]:
# Only the raw-data folder is created earlier in the notebook, so make sure
# the output folder exists before writing
os.makedirs(parameters_dir, exist_ok=True)

# The row index is written as the first, unnamed column of the CSV
companies.to_csv(final_file)