In [1]:
import requests

In [2]:
# Import packages

# import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# Download and parse the HTML

start_url = 'https://en.wikipedia.org/wiki/Tesla,_Inc.'

# Download the HTML from start_url.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error response (4xx/5xx) so that an
# error page is never parsed as if it were the article.
downloaded_html = requests.get(start_url, timeout=30)
downloaded_html.raise_for_status()

# Parse the HTML with BeautifulSoup and create a soup object.
# The parser is named explicitly: without it bs4 picks whichever parser happens
# to be installed and emits a GuessedAtParserWarning, so the resulting tree can
# differ from one machine to another.
soup = BeautifulSoup(downloaded_html.text, 'html.parser')



In [7]:
# Keep an on-disk snapshot of the parsed page for offline inspection
with open('downloaded.html', 'w', encoding='utf-8') as html_file:
    # The file is opened in text mode with UTF-8 encoding, so the string
    # returned by prettify() is written as-is.
    pretty_html = soup.prettify()
    html_file.write(pretty_html)

In [8]:
# Select table.wikitable

# Collect every <tbody> that sits inside a <table class="wikitable">
wikitable_bodies = soup.select('table.wikitable tbody')

# Fail with an explanatory message instead of a bare "list index out of range"
# when the selector matches nothing (for example if the page layout changed).
if not wikitable_bodies:
    raise IndexError("no element matches 'table.wikitable tbody' in the downloaded page")

# Work with the first match from here on
full_table = wikitable_bodies[0]
print(full_table)


<tbody><tr>
<th>Opened
</th>
<th>Name
</th>
<th>City
</th>
<th>Country
</th>
<th>Employees
</th>
<th>Products
</th>
<th class="unsortable"><abbr title="Reference(s)">Ref.</abbr>
</th></tr>
<tr>
<td>2010
</td>
<td><a href="/wiki/Tesla_Fremont_Factory" title="Tesla Fremont Factory">Tesla Fremont Factory</a>
</td>
<td><a href="/wiki/Fremont,_California" title="Fremont, California">Fremont, California</a>
</td>
<td>United States
</td>
<td>22,000
</td>
<td><a href="/wiki/Tesla_Model_S" title="Tesla Model S">Model S</a>, <a href="/wiki/Tesla_Model_X" title="Tesla Model X">Model X</a>, <a href="/wiki/Tesla_Model_3" title="Tesla Model 3">Model 3</a>, <a href="/wiki/Tesla_Model_Y" title="Tesla Model Y">Model Y</a>
</td>
<td><sup class="reference" id="cite_ref-Future_32-1"><a href="#cite_note-Future-32">[31]</a></sup><sup class="reference" id="cite_ref-TC_staff_2020_222-0"><a href="#cite_note-TC_staff_2020-222">[221]</a></sup><sup class="reference" id="cite_ref-223"><a href="#cite_note-223">[222

In [14]:
# Extract the table column headings
# Goal: a collection holding every column heading cell

# Gather each <th> nested under a <tr> of the selected table body
table_head = full_table.select('tr th')

# Show the raw result, its type and a divider, one item per line
print(table_head, type(table_head), '-----------', sep='\n')

# Then show the text content of each heading cell
for heading_cell in table_head:
    print(heading_cell.text)

[<th>Opened
</th>, <th>Name
</th>, <th>City
</th>, <th>Country
</th>, <th>Employees
</th>, <th>Products
</th>, <th class="unsortable"><abbr title="Reference(s)">Ref.</abbr>
</th>]
<class 'bs4.element.ResultSet'>
-----------
Opened

Name

City

Country

Employees

Products

Ref.



In [29]:
# Extract the table column headings
# End result: A list with all the column headings

import re

# Raw string literal: '\[' and '\w' are regular-expression escapes, not Python
# string escapes. Without the r-prefix Python reports an invalid escape
# sequence warning for this literal.
# The pattern matches an underscore followed by a single word character in
# square brackets, e.g. '_[a]'.
regex = re.compile(r'_\[\w\]')

table_columns = []
for element in table_head:
    # Join the heading's text fragments with single spaces and trim the ends
    column_label = element.get_text(separator=" ", strip=True)
    # Use underscores instead of spaces in the label
    column_label = column_label.replace(' ', '_')
    # Remove any '_[x]' fragment matched by the pattern above
    column_label = regex.sub('', column_label)
    table_columns.append(column_label)
print(table_columns)

['Opened', 'Name', 'City', 'Country', 'Employees', 'Products', 'Ref.']


In [30]:
# Extract the table data (rows)
# First step: collect every <tr> of the selected table body and inspect it
# (the result still includes the heading row)

row_selector = 'tr'
table_rows = full_table.select(row_selector)
print(table_rows)


[<tr>
<th>Opened
</th>
<th>Name
</th>
<th>City
</th>
<th>Country
</th>
<th>Employees
</th>
<th>Products
</th>
<th class="unsortable"><abbr title="Reference(s)">Ref.</abbr>
</th></tr>, <tr>
<td>2010
</td>
<td><a href="/wiki/Tesla_Fremont_Factory" title="Tesla Fremont Factory">Tesla Fremont Factory</a>
</td>
<td><a href="/wiki/Fremont,_California" title="Fremont, California">Fremont, California</a>
</td>
<td>United States
</td>
<td>22,000
</td>
<td><a href="/wiki/Tesla_Model_S" title="Tesla Model S">Model S</a>, <a href="/wiki/Tesla_Model_X" title="Tesla Model X">Model X</a>, <a href="/wiki/Tesla_Model_3" title="Tesla Model 3">Model 3</a>, <a href="/wiki/Tesla_Model_Y" title="Tesla Model Y">Model Y</a>
</td>
<td><sup class="reference" id="cite_ref-Future_32-1"><a href="#cite_note-Future-32">[31]</a></sup><sup class="reference" id="cite_ref-TC_staff_2020_222-0"><a href="#cite_note-TC_staff_2020-222">[221]</a></sup><sup class="reference" id="cite_ref-223"><a href="#cite_note-223">[222]</a>

In [32]:
# Extract the table data (rows)
# Goal: a list of lists, one inner list per table row

# Drop the first <tr> (the heading row) and keep the raw text of each <td>;
# whitespace is not trimmed at this stage.
table_data = [
    [cell.text for cell in row.select('td')]
    for row in table_rows[1:]
]

print(table_data)


[['2010\n', 'Tesla Fremont Factory\n', 'Fremont, California\n', 'United States\n', '22,000\n', 'Model S, Model X, Model 3, Model Y\n', '[31][221][222]\n'], ['2016\n', 'Gigafactory Nevada\n', 'Storey County, Nevada\n', 'United States\n', '7,000\n', 'Batteries, Powerwall, Semi\n', '[223][224][225]\n'], ['2017\n', 'Gigafactory New York\n', 'Buffalo, New York\n', 'United States\n', '1,500\n', 'Solar Roof, Supercharger\n', '[226][227]\n'], ['2019\n', 'Gigafactory Shanghai\n', 'Shanghai\n', 'China\n', '20,000\n', 'Model 3, Model Y, Supercharger\n', '[228][229]\n'], ['2022\n', 'Gigafactory Berlin-Brandenburg\n', 'Grünheide\n', 'Germany\n', '10,000\n', 'Model Y (planned: batteries, Model 3)\n', '[230][231][232]\n'], ['2022\n', 'Gigafactory Texas\n', 'Austin, Texas\n', 'United States\n', '12,000\n', 'Cybertruck, Model Y (planned: batteries, Next-gen vehicle)\n', '[233][234][235]\n']]


In [33]:
# Extract the table data (rows)
# Goal: a list of lists, one inner list per table row

# Final version: cell text with surrounding whitespace removed

table_rows = full_table.select('tr')

table_data = []
# The first <tr> is the heading row, so start from the second one
for row in table_rows[1:]:
    stripped_cells = [cell.text.strip() for cell in row.select('td')]
    table_data.append(stripped_cells)

print(table_data)


[['2010', 'Tesla Fremont Factory', 'Fremont, California', 'United States', '22,000', 'Model S, Model X, Model 3, Model Y', '[31][221][222]'], ['2016', 'Gigafactory Nevada', 'Storey County, Nevada', 'United States', '7,000', 'Batteries, Powerwall, Semi', '[223][224][225]'], ['2017', 'Gigafactory New York', 'Buffalo, New York', 'United States', '1,500', 'Solar Roof, Supercharger', '[226][227]'], ['2019', 'Gigafactory Shanghai', 'Shanghai', 'China', '20,000', 'Model 3, Model Y, Supercharger', '[228][229]'], ['2022', 'Gigafactory Berlin-Brandenburg', 'Grünheide', 'Germany', '10,000', 'Model Y (planned: batteries, Model 3)', '[230][231][232]'], ['2022', 'Gigafactory Texas', 'Austin, Texas', 'United States', '12,000', 'Cybertruck, Model Y (planned: batteries, Next-gen vehicle)', '[233][234][235]']]


In [34]:
# Build a pandas DataFrame from the extracted rows, using the cleaned
# column headings as column labels

df = pd.DataFrame(data=table_data, columns=table_columns)
df

Unnamed: 0,Opened,Name,City,Country,Employees,Products,Ref.
0,2010,Tesla Fremont Factory,"Fremont, California",United States,22000,"Model S, Model X, Model 3, Model Y",[31][221][222]
1,2016,Gigafactory Nevada,"Storey County, Nevada",United States,7000,"Batteries, Powerwall, Semi",[223][224][225]
2,2017,Gigafactory New York,"Buffalo, New York",United States,1500,"Solar Roof, Supercharger",[226][227]
3,2019,Gigafactory Shanghai,Shanghai,China,20000,"Model 3, Model Y, Supercharger",[228][229]
4,2022,Gigafactory Berlin-Brandenburg,Grünheide,Germany,10000,"Model Y (planned: batteries, Model 3)",[230][231][232]
5,2022,Gigafactory Texas,"Austin, Texas",United States,12000,"Cybertruck, Model Y (planned: batteries, Next-...",[233][234][235]


In [35]:
# Extract the table data (rows)
# Starting point: every <tr> element found in the selected table body

tr_selector = 'tr'
table_rows = full_table.select(tr_selector)
print(table_rows)


[<tr>
<th>Opened
</th>
<th>Name
</th>
<th>City
</th>
<th>Country
</th>
<th>Employees
</th>
<th>Products
</th>
<th class="unsortable"><abbr title="Reference(s)">Ref.</abbr>
</th></tr>, <tr>
<td>2010
</td>
<td><a href="/wiki/Tesla_Fremont_Factory" title="Tesla Fremont Factory">Tesla Fremont Factory</a>
</td>
<td><a href="/wiki/Fremont,_California" title="Fremont, California">Fremont, California</a>
</td>
<td>United States
</td>
<td>22,000
</td>
<td><a href="/wiki/Tesla_Model_S" title="Tesla Model S">Model S</a>, <a href="/wiki/Tesla_Model_X" title="Tesla Model X">Model X</a>, <a href="/wiki/Tesla_Model_3" title="Tesla Model 3">Model 3</a>, <a href="/wiki/Tesla_Model_Y" title="Tesla Model Y">Model Y</a>
</td>
<td><sup class="reference" id="cite_ref-Future_32-1"><a href="#cite_note-Future-32">[31]</a></sup><sup class="reference" id="cite_ref-TC_staff_2020_222-0"><a href="#cite_note-TC_staff_2020-222">[221]</a></sup><sup class="reference" id="cite_ref-223"><a href="#cite_note-223">[222]</a>

In [36]:
# Extract the table data (rows)
# Goal: a list of lists, one inner list per table row

# Iteration 2: raw cell text, whitespace not yet trimmed

table_rows = full_table.select('tr')

table_data = []
for position, row in enumerate(table_rows):
    if position == 0:
        # The first <tr> is skipped (it is the heading row)
        continue
    table_data.append([cell.text for cell in row.select('td')])

print(table_data)


[['2010\n', 'Tesla Fremont Factory\n', 'Fremont, California\n', 'United States\n', '22,000\n', 'Model S, Model X, Model 3, Model Y\n', '[31][221][222]\n'], ['2016\n', 'Gigafactory Nevada\n', 'Storey County, Nevada\n', 'United States\n', '7,000\n', 'Batteries, Powerwall, Semi\n', '[223][224][225]\n'], ['2017\n', 'Gigafactory New York\n', 'Buffalo, New York\n', 'United States\n', '1,500\n', 'Solar Roof, Supercharger\n', '[226][227]\n'], ['2019\n', 'Gigafactory Shanghai\n', 'Shanghai\n', 'China\n', '20,000\n', 'Model 3, Model Y, Supercharger\n', '[228][229]\n'], ['2022\n', 'Gigafactory Berlin-Brandenburg\n', 'Grünheide\n', 'Germany\n', '10,000\n', 'Model Y (planned: batteries, Model 3)\n', '[230][231][232]\n'], ['2022\n', 'Gigafactory Texas\n', 'Austin, Texas\n', 'United States\n', '12,000\n', 'Cybertruck, Model Y (planned: batteries, Next-gen vehicle)\n', '[233][234][235]\n']]


In [37]:
# Extract the table data (rows)
# Goal: a list of lists, one inner list per table row

# Final version: each cell's text is stripped of surrounding whitespace

table_rows = full_table.select('tr')

# Everything after the first <tr> (the heading row) becomes one inner list
table_data = [
    [cell.text.strip() for cell in row.select('td')]
    for row in table_rows[1:]
]

print(table_data)


[['2010', 'Tesla Fremont Factory', 'Fremont, California', 'United States', '22,000', 'Model S, Model X, Model 3, Model Y', '[31][221][222]'], ['2016', 'Gigafactory Nevada', 'Storey County, Nevada', 'United States', '7,000', 'Batteries, Powerwall, Semi', '[223][224][225]'], ['2017', 'Gigafactory New York', 'Buffalo, New York', 'United States', '1,500', 'Solar Roof, Supercharger', '[226][227]'], ['2019', 'Gigafactory Shanghai', 'Shanghai', 'China', '20,000', 'Model 3, Model Y, Supercharger', '[228][229]'], ['2022', 'Gigafactory Berlin-Brandenburg', 'Grünheide', 'Germany', '10,000', 'Model Y (planned: batteries, Model 3)', '[230][231][232]'], ['2022', 'Gigafactory Texas', 'Austin, Texas', 'United States', '12,000', 'Cybertruck, Model Y (planned: batteries, Next-gen vehicle)', '[233][234][235]']]


In [38]:
# Assemble the extracted rows into a pandas DataFrame whose column labels
# are the cleaned table headings

df = pd.DataFrame(data=table_data, columns=table_columns)
df

Unnamed: 0,Opened,Name,City,Country,Employees,Products,Ref.
0,2010,Tesla Fremont Factory,"Fremont, California",United States,22000,"Model S, Model X, Model 3, Model Y",[31][221][222]
1,2016,Gigafactory Nevada,"Storey County, Nevada",United States,7000,"Batteries, Powerwall, Semi",[223][224][225]
2,2017,Gigafactory New York,"Buffalo, New York",United States,1500,"Solar Roof, Supercharger",[226][227]
3,2019,Gigafactory Shanghai,Shanghai,China,20000,"Model 3, Model Y, Supercharger",[228][229]
4,2022,Gigafactory Berlin-Brandenburg,Grünheide,Germany,10000,"Model Y (planned: batteries, Model 3)",[230][231][232]
5,2022,Gigafactory Texas,"Austin, Texas",United States,12000,"Cybertruck, Model Y (planned: batteries, Next-...",[233][234][235]
