# Scraping Webpage Data - Wikipedia

In [1]:
# Import Libraries
from bs4 import BeautifulSoup
import requests

import pandas as pd
import numpy as np

In [2]:
# Browser User-Agent string - look yours up at: whatismyuseragent.com
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
    "Version/16.5.1 Safari/605.1.15"
)

In [3]:
# Request headers: a browser-like User-Agent and the preferred response language
HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept-Language': "en-US, en;q=0.5",
}

In [4]:
# Wikipedia article to scrape
URL = (
    "https://en.wikipedia.org/wiki/"
    "List_of_largest_companies_in_the_United_States_by_revenue"
)

In [5]:
# HTTP request - send the custom HEADERS (browser-like User-Agent and language
# preference) and set a timeout so the cell cannot hang forever on a stalled connection
page = requests.get(URL, headers=HEADERS, timeout=30)

In [6]:
# Show the HTTP status code of the response (200 = OK)
page.status_code

200

In [7]:
# Assemble the soup from the page's HTML. Name the parser explicitly: the generic
# "html" feature lets BeautifulSoup pick whichever parser happens to be installed,
# so the resulting tree could differ between environments.
soup = BeautifulSoup(page.text, "html.parser")

In [8]:
# Uncomment next line to see results
# print(soup)

#### The target `table` element can be identified by its class attributes:
- `<table class="wikitable sortable jquery-tablesorter">`

In [9]:
# Look at the first `table` element on the page - as the output shows, it is the
# "More citations needed" notice box, not the data table
soup.find('table')

<table class="box-More_citations_needed plainlinks metadata ambox ambox-content ambox-Refimprove" role="presentation"><tbody><tr><td class="mbox-image"><div class="mbox-image-div"><span typeof="mw:File"><a class="mw-file-description" href="/wiki/File:Question_book-new.svg"><img alt="" class="mw-file-element" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></span></div></td><td class="mbox-text"><div class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>.<span class="hide-when-compact"> Please help <a href="/wiki/Special

In [10]:
# From all tables select the second one (index 1) - the first is the
# "More citations needed" notice box. There are three tables on this page.
table = soup.find_all('table')[1]

In [11]:
# uncomment next line to see results
# table

In [12]:
# Find all `th` elements in the table - these are the table's column headers
th_tags = table.find_all('th')

In [13]:
# Collect table headers to then add them as df columns
th_text = []
for th_tag in th_tags:
    th_text.append(th_tag.text.strip())

th_text

['Rank',
 'Name',
 'Industry',
 'Revenue (USD millions)',
 'Revenue growth',
 'Employees',
 'Headquarters']

In [14]:
# Find all table rows, skipping the first one because it holds the column headers
rows = table.find_all('tr')[1:]

In [15]:
# Column name -> list of cell values; keys are listed in the table's column order
d = {
    'Rank': [],
    'Name': [],
    'Industry': [],
    'Revenue (USD millions)': [],
    'Revenue growth': [],
    'Employees': [],
    'Headquarters': [],
}

# For every data row, append the trimmed text of its n-th `td` cell to the n-th column
for tr in rows:
    cells = tr.find_all('td')
    for position, column in enumerate(d):
        d[column].append(cells[position].text.strip())

In [16]:
# Build the `DataFrame` straight from the column -> values dictionary
companies_df = pd.DataFrame(d)

In [17]:
# Display the scraped DataFrame
companies_df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,611289,6.7%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and Cloud Computing,513983,9.4%,1540000,"Seattle, Washington"
2,3,Exxon Mobil,Petroleum industry,413680,44.8%,62000,"Spring, Texas"
3,4,Apple,Electronics industry,394328,7.8%,164000,"Cupertino, California"
4,5,UnitedHealth Group,Healthcare,324162,12.7%,400000,"Minnetonka, Minnesota"
...,...,...,...,...,...,...,...
95,96,Best Buy,Retail,46298,10.6%,71100,"Richfield, Minnesota"
96,97,Bristol-Myers Squibb,Pharmaceutical industry,46159,0.5%,34300,"New York City, New York"
97,98,United Airlines,Airline,44955,82.5%,92795,"Chicago, Illinois"
98,99,Thermo Fisher Scientific,Laboratory instruments,44915,14.5%,130000,"Waltham, Massachusetts"


In [18]:
# Check if there are missing values: `info()` lists the non-null count and dtype per column.
# NOTE: every column is still `object` (text), including the numeric-looking ones.
companies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Rank                    100 non-null    object
 1   Name                    100 non-null    object
 2   Industry                100 non-null    object
 3   Revenue (USD millions)  100 non-null    object
 4   Revenue growth          100 non-null    object
 5   Employees               100 non-null    object
 6   Headquarters            100 non-null    object
dtypes: object(7)
memory usage: 5.6+ KB


In [19]:
# Write the DataFrame to a `csv` file, keeping the header row and leaving out the index
companies_df.to_csv(
    'largest_companies_in_the_us_by_revenue.csv',
    header=True,
    index=False,
)