In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# download and parse HTML

start_url = "https://en.wikipedia.org/wiki/Tesla,_Inc."

# download the HTML from start_url; the timeout (seconds) keeps the cell from
# hanging indefinitely if the server never answers
downloaded_html = requests.get(start_url, timeout=30)

# fail loudly on an HTTP error status instead of parsing an error page
downloaded_html.raise_for_status()

# parse HTML with BeautifulSoup and create a soup object; the parser is named
# explicitly so the result does not depend on which optional parsers happen
# to be installed
soup = BeautifulSoup(downloaded_html.text, 'html.parser')

# save a local copy
with open('downloaded.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [5]:
# select table.wikitable

# Run the CSS selector against the whole document. select() returns every
# match, so at this point full_table is a collection of tables, not one table.
wikitable_selector = 'table.wikitable'
full_table = soup.select(wikitable_selector)

print(full_table)

[<table class="wikitable">
<tbody><tr style="text-align:center;">
<th>Quarter</th>
<th>Cumulative<br/>production</th>
<th>Total<br/>production</th>
<th>Model S<br/>sales</th>
<th>Model X<br/>sales</th>
<th>Model 3<br/>sales
</th>
<th>Model Y<br/>sales<sup class="reference" id="cite_ref-140"><a href="#cite_note-140">[a]</a></sup></th>
<th>Total<br/>sales<sup class="reference" id="cite_ref-141"><a href="#cite_note-141">[b]</a></sup></th>
<th>In transit<sup class="reference" id="cite_ref-142"><a href="#cite_note-142">[c]</a></sup></th>
<th>Source
</th></tr>
<tr style="text-align:center;">
<td>Q1 2013</td>
<td>N/A</td>
<td>5,000+</td>
<td>4,900</td>
<td style="background:#f1f5fa;"></td>
<td style="background:#f1f5fa;"></td>
<td style="background:#f1f5fa;"></td>
<td>4,900</td>
<td></td>
<td><sup class="reference" id="cite_ref-143"><a href="#cite_note-143">[140]</a></sup>
</td></tr>
<tr style="text-align:center;">
<td>Q2 2013</td>
<td>N/A</td>
<td>N/A</td>
<td>5,150</td>
<td style="backgroun

In [6]:
# Narrow the selection to <tbody> elements nested inside a wikitable and keep
# only the first match.
tbody_matches = soup.select('table.wikitable tbody')
full_table = tbody_matches[0]

print(full_table)

<tbody><tr style="text-align:center;">
<th>Quarter</th>
<th>Cumulative<br/>production</th>
<th>Total<br/>production</th>
<th>Model S<br/>sales</th>
<th>Model X<br/>sales</th>
<th>Model 3<br/>sales
</th>
<th>Model Y<br/>sales<sup class="reference" id="cite_ref-140"><a href="#cite_note-140">[a]</a></sup></th>
<th>Total<br/>sales<sup class="reference" id="cite_ref-141"><a href="#cite_note-141">[b]</a></sup></th>
<th>In transit<sup class="reference" id="cite_ref-142"><a href="#cite_note-142">[c]</a></sup></th>
<th>Source
</th></tr>
<tr style="text-align:center;">
<td>Q1 2013</td>
<td>N/A</td>
<td>5,000+</td>
<td>4,900</td>
<td style="background:#f1f5fa;"></td>
<td style="background:#f1f5fa;"></td>
<td style="background:#f1f5fa;"></td>
<td>4,900</td>
<td></td>
<td><sup class="reference" id="cite_ref-143"><a href="#cite_note-143">[140]</a></sup>
</td></tr>
<tr style="text-align:center;">
<td>Q2 2013</td>
<td>N/A</td>
<td>N/A</td>
<td>5,150</td>
<td style="background:#f1f5fa;"></td>
<td style

In [7]:
# extract table column headings

# 'tr th' matches every <th> cell nested inside a <tr> of the selected body
header_selector = 'tr th'
table_head = full_table.select(header_selector)
print(table_head)

[<th>Quarter</th>, <th>Cumulative<br/>production</th>, <th>Total<br/>production</th>, <th>Model S<br/>sales</th>, <th>Model X<br/>sales</th>, <th>Model 3<br/>sales
</th>, <th>Model Y<br/>sales<sup class="reference" id="cite_ref-140"><a href="#cite_note-140">[a]</a></sup></th>, <th>Total<br/>sales<sup class="reference" id="cite_ref-141"><a href="#cite_note-141">[b]</a></sup></th>, <th>In transit<sup class="reference" id="cite_ref-142"><a href="#cite_note-142">[c]</a></sup></th>, <th>Source
</th>]


In [8]:
# Show the raw text of each heading cell; .text concatenates the nested
# strings as they are, so <br/> splits and footnote markers are not cleaned up.
for header_cell in table_head:
    print(header_cell.text)

Quarter
Cumulativeproduction
Totalproduction
Model Ssales
Model Xsales
Model 3sales

Model Ysales[a]
Totalsales[b]
In transit[c]
Source



In [9]:
# extract table column headings

table_head = full_table.select('tr th')

# get_text(separator=' ', strip=True) joins the text fragments of a heading
# with a single space and trims the whitespace around each fragment
table_columns = [
    header_cell.get_text(separator=' ', strip=True)
    for header_cell in table_head
]

# list the labels one per line, then the collected list itself
for column_label in table_columns:
    print(column_label)

print('---------')
print(table_columns)

Quarter
Cumulative production
Total production
Model S sales
Model X sales
Model 3 sales
Model Y sales [a]
Total sales [b]
In transit [c]
Source
---------
['Quarter', 'Cumulative production', 'Total production', 'Model S sales', 'Model X sales', 'Model 3 sales', 'Model Y sales [a]', 'Total sales [b]', 'In transit [c]', 'Source']


In [11]:
# extract table column headings

table_head = full_table.select('tr th')

# join each heading's text fragments with a space, then swap every space for
# an underscore
table_columns = [
    header_cell.get_text(separator=' ', strip=True).replace(' ', '_')
    for header_cell in table_head
]

print('---------')
print(table_columns)

---------
['Quarter', 'Cumulative_production', 'Total_production', 'Model_S_sales', 'Model_X_sales', 'Model_3_sales', 'Model_Y_sales_[a]', 'Total_sales_[b]', 'In_transit_[c]', 'Source']


In [12]:
# extract table column headings

import re

# Raw string literal: the compiled pattern is the same, but the backslashes
# are no longer read as (invalid) string escape sequences, which Python warns
# about. The pattern matches an underscore followed by one word character in
# square brackets, i.e. a footnote marker such as "_[a]".
regex = re.compile(r'_\[\w\]')

# re-select the heading cells so this cell does not rely on a variable left
# over from an earlier cell
table_head = full_table.select('tr th')

table_columns = []
for element in table_head:
    column_label = element.get_text(separator=' ', strip=True)
    column_label = column_label.replace(' ', '_')
    # remove the footnote marker from the label
    column_label = regex.sub('', column_label)
    table_columns.append(column_label)

print(table_columns)

['Quarter', 'Cumulative_production', 'Total_production', 'Model_S_sales', 'Model_X_sales', 'Model_3_sales', 'Model_Y_sales', 'Total_sales', 'In_transit', 'Source']


In [14]:
# extract the table data (rows)

table_rows = full_table.select('tr')

# Skip the first <tr> (the heading row); every remaining row becomes a list
# holding the unmodified text of each of its <td> cells.
table_data = [
    [cell.text for cell in row.select('td')]
    for row in table_rows[1:]
]

print(table_data)

[['Q1 2013', 'N/A', '5,000+', '4,900', '', '', '', '4,900', '', '[140]\n'], ['Q2 2013', 'N/A', 'N/A', '5,150', '', '', '', '5,150', '', '[141]\n'], ['Q3 2013', 'N/A', 'N/A', '5,500+', '', '', '', '5,500+', '', '[142]\n'], ['Q4 2013', '~34,851', '6,587', '6,892', '', '', '', '6,892', '', '[143]\n'], ['Q1 2014', '~41,438', '7,535', '6,457', '', '', '', '6,457', '', '[144]\n'], ['Q2 2014', '~48,973', '8,763', '7,579', '', '', '', '7,579', '', '[145]\n'], ['Q3 2014', '~57,736', '~7,075', '7,785', '', '', '', '7,785', '', '[146]\n'], ['Q4 2014', '64,811', '11,627', '9,834', '', '', '', '9,834', '', '[147]\n'], ['Q1 2015', '76,438', '11,160', '10,045', '', '', '', '10,045', '', '[148]\n'], ['Q2 2015', '89,245', '12,807', '11,532', '', '', '', '11,532', '', '[149]\n'], ['Q3 2015', '102,336', '13,091', '11,597', '6', '', '', '11,603', '', '[150]\n'], ['Q4 2015', '116,373', '14,037', '17,272', '206', '', '', '17,478', '', '[151]\n'], ['Q1 2016', '131,883', '15,510', '12,420', '2,400', '', '', '

In [15]:
# extract the table data (rows)

table_rows = full_table.select('tr')

# Skip the first <tr> (the heading row); every remaining row becomes a list
# of its <td> texts with leading/trailing whitespace removed.
table_data = [
    [cell.text.strip() for cell in row.select('td')]
    for row in table_rows[1:]
]

print(table_data)

[['Q1 2013', 'N/A', '5,000+', '4,900', '', '', '', '4,900', '', '[140]'], ['Q2 2013', 'N/A', 'N/A', '5,150', '', '', '', '5,150', '', '[141]'], ['Q3 2013', 'N/A', 'N/A', '5,500+', '', '', '', '5,500+', '', '[142]'], ['Q4 2013', '~34,851', '6,587', '6,892', '', '', '', '6,892', '', '[143]'], ['Q1 2014', '~41,438', '7,535', '6,457', '', '', '', '6,457', '', '[144]'], ['Q2 2014', '~48,973', '8,763', '7,579', '', '', '', '7,579', '', '[145]'], ['Q3 2014', '~57,736', '~7,075', '7,785', '', '', '', '7,785', '', '[146]'], ['Q4 2014', '64,811', '11,627', '9,834', '', '', '', '9,834', '', '[147]'], ['Q1 2015', '76,438', '11,160', '10,045', '', '', '', '10,045', '', '[148]'], ['Q2 2015', '89,245', '12,807', '11,532', '', '', '', '11,532', '', '[149]'], ['Q3 2015', '102,336', '13,091', '11,597', '6', '', '', '11,603', '', '[150]'], ['Q4 2015', '116,373', '14,037', '17,272', '206', '', '', '17,478', '', '[151]'], ['Q1 2016', '131,883', '15,510', '12,420', '2,400', '', '', '14,820', '2,615', '[152]

In [16]:
# Assemble the scraped rows into a DataFrame, using the cleaned heading
# labels as column names.
df = pd.DataFrame(data=table_data, columns=table_columns)

df

Unnamed: 0,Quarter,Cumulative_production,Total_production,Model_S_sales,Model_X_sales,Model_3_sales,Model_Y_sales,Total_sales,In_transit,Source
0,Q1 2013,,"5,000+",4900,,,,4900,,[140]
1,Q2 2013,,,5150,,,,5150,,[141]
2,Q3 2013,,,"5,500+",,,,"5,500+",,[142]
3,Q4 2013,"~34,851",6587,6892,,,,6892,,[143]
4,Q1 2014,"~41,438",7535,6457,,,,6457,,[144]
5,Q2 2014,"~48,973",8763,7579,,,,7579,,[145]
6,Q3 2014,"~57,736","~7,075",7785,,,,7785,,[146]
7,Q4 2014,64811,11627,9834,,,,9834,,[147]
8,Q1 2015,76438,11160,10045,,,,10045,,[148]
9,Q2 2015,89245,12807,11532,,,,11532,,[149]
