## Exploring Web Scraping with Python: Tesla Wikipedia Page

In [1]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the Tesla, Inc. Wikipedia article and parse it into a BeautifulSoup tree.
start_url = "https://en.wikipedia.org/wiki/Tesla,_Inc."

# A timeout keeps the cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
downloaded_html = requests.get(start_url, timeout=30)
downloaded_html.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup uses whichever parser
# happens to be installed (and warns), so the resulting tree can differ between
# machines. "html.parser" ships with Python and needs no extra dependency.
soup = BeautifulSoup(downloaded_html.text, "html.parser")

# Keep a pretty-printed local copy of the page for offline inspection.
# The encoding is fixed to UTF-8 so the write does not depend on the platform
# default, which may be unable to encode non-ASCII characters in the page.
with open('downloaded.html', 'w', encoding='utf-8') as file:
    file.write(soup.prettify())

In [5]:
# Select the <tbody> of the first table carrying the "wikitable" class.
# In the captured page this <tbody> holds both the heading row and the data rows.
full_table = soup.select_one('table.wikitable tbody')
if full_table is None:
    # Same exception type that indexing an empty result list would raise,
    # but with a message that says what is actually missing.
    raise IndexError("no element matching 'table.wikitable tbody' was found on the page")
print(full_table)

<tbody><tr style="text-align:center;">
<th>Quarter</th>
<th>Cumulative<br/>production</th>
<th>Total<br/>production</th>
<th>Model S<br/>sales
</th>
<th>Model X<br/>sales
</th>
<th>Model 3 + Model Y<br/>sales<sup class="reference" id="cite_ref-99"><a href="#cite_note-99">[b]</a></sup></th>
<th>Total<br/>sales<sup class="reference" id="cite_ref-100"><a href="#cite_note-100">[c]</a></sup></th>
<th>In transit<sup class="reference" id="cite_ref-101"><a href="#cite_note-101">[d]</a></sup></th>
<th>Source
</th></tr>
<tr style="text-align:center;">
<td>Q1 2013</td>
<td>?</td>
<td>5,000+</td>
<td>4,900</td>
<td style="background:#f1f5fa;">
</td>
<td style="background:#f1f5fa;"></td>
<td>4,900</td>
<td></td>
<td><sup class="reference" id="cite_ref-102"><a href="#cite_note-102">[98]</a></sup>
</td></tr>
<tr style="text-align:center;">
<td>Q2 2013</td>
<td>?</td>
<td>?</td>
<td>5,150</td>
<td style="background:#f1f5fa;">
</td>
<td style="background:#f1f5fa;"></td>
<td>5,150</td>
<td></td>
<td><su

In [9]:
# Extract the table column headings.
# End result: a list with one cleaned label per <th> cell.

# Matches a footnote marker such as "_[b]" left over from the <sup> reference
# links inside some headings. Written as a raw string so the backslashes reach
# the regex engine untouched (a plain string treats "\[" and "\w" as invalid
# escape sequences). "\w+" also covers labels longer than one character.
regex = re.compile(r'_\[\w+\]')

table_head = full_table.select('tr th')
table_columns = []
for header_cell in table_head:
    # separator=" " keeps words on either side of a <br/> apart;
    # strip=True drops the newlines surrounding the cell text.
    column_label = header_cell.get_text(separator=" ", strip=True)
    column_label = column_label.replace(" ", "_")
    table_columns.append(regex.sub('', column_label))
print(table_columns)

['Quarter', 'Cumulative_production', 'Total_production', 'Model_S_sales', 'Model_X_sales', 'Model_3_+_Model_Y_sales', 'Total_sales', 'In_transit', 'Source']


In [11]:
# Extract the table body.
# End result: a list of rows, where each row is a list of stripped cell strings.

table_rows = full_table.select('tr')
# The first <tr> is the heading row, so the data starts at the second one.
table_data = [
    [cell.text.strip() for cell in row.select('td')]
    for row in table_rows[1:]
]

print(table_data)

[['Q1 2013', '?', '5,000+', '4,900', '', '', '4,900', '', '[98]'], ['Q2 2013', '?', '?', '5,150', '', '', '5,150', '', '[99]'], ['Q3 2013', '?', '?', '5,500+', '', '', '5,500+', '', '[100]'], ['Q4 2013', '~34,851', '6,587', '6,892', '', '', '6,892', '', '[101]'], ['Q1 2014', '~41,438', '7,535', '6,457', '', '', '6,457', '', '[102]'], ['Q2 2014', '~48,973', '8,763', '7,579', '', '', '7,579', '', '[103]'], ['Q3 2014', '~57,736', '~7,075', '7,785', '', '', '7,785', '', '[104]'], ['Q4 2014', '64,811', '11,627', '9,834', '', '', '9,834', '', '[105]'], ['Q1 2015', '76,438', '11,160', '10,045', '', '', '10,045', '', '[106]'], ['Q2 2015', '89,245', '12,807', '11,532', '', '', '11,532', '', '[107]'], ['Q3 2015', '102,336', '13,091', '11,597', '6', '', '11,603', '', '[108]'], ['Q4 2015', '116,373', '14,037', '17,272', '206', '', '17,478', '', '[109]'], ['Q1 2016', '131,883', '15,510', '12,420', '2,400', '', '14,820', '2,615', '[110]'], ['Q2 2016', '150,228', '18,345', '9,764', '4,638', '', '14,4

In [12]:
# Build a pandas DataFrame: one row per scraped table row, with the cleaned
# headings as column labels. The bare name on the last line displays the frame.
df = pd.DataFrame(data=table_data, columns=table_columns)
df

Unnamed: 0,Quarter,Cumulative_production,Total_production,Model_S_sales,Model_X_sales,Model_3_+_Model_Y_sales,Total_sales,In_transit,Source
0,Q1 2013,?,"5,000+",4900,,,4900,,[98]
1,Q2 2013,?,?,5150,,,5150,,[99]
2,Q3 2013,?,?,"5,500+",,,"5,500+",,[100]
3,Q4 2013,"~34,851",6587,6892,,,6892,,[101]
4,Q1 2014,"~41,438",7535,6457,,,6457,,[102]
5,Q2 2014,"~48,973",8763,7579,,,7579,,[103]
6,Q3 2014,"~57,736","~7,075",7785,,,7785,,[104]
7,Q4 2014,64811,11627,9834,,,9834,,[105]
8,Q1 2015,76438,11160,10045,,,10045,,[106]
9,Q2 2015,89245,12807,11532,,,11532,,[107]
