In [1]:
import requests #html requestor
from bs4 import BeautifulSoup #html parser
import pandas as pd #dataframe manipulator

In [2]:
# Browser-like User-Agent header that is sent along with the page request
headers = {'User-Agent': 'Mozilla/5.0'}
# Address of the example page whose table rows are extracted in this notebook
url = 'https://datatables.net/examples/basic_init/zero_configuration.html'

Now we request the HTML page. A status code of 200 means that the request was successful.

In [3]:
# We send the headers so that requests imitates the request of a browser.
# The timeout (in seconds) keeps this cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=30)
print(response.status_code)
# Fail right here with an HTTPError on a 4xx/5xx answer instead of parsing an error page later
response.raise_for_status()

200


Now we parse the result with `BeautifulSoup`.

In [4]:
# Turn the downloaded HTML text into a searchable tree, using the 'html.parser' backend
soup = BeautifulSoup(response.text, 'html.parser')

Printing the result:

In [5]:
soup  # a bare last expression makes the notebook display the whole parsed document

<!DOCTYPE html>

<html>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width,initial-scale=1,user-scalable=no" name="viewport"/>
<title>DataTables example - Zero configuration</title>
<link href="/media/images/favicon.png" rel="shortcut icon" type="image/png"/>
<link href="http://www.datatables.net/rss.xml" rel="alternate" title="RSS 2.0" type="application/rss+xml"/>
<link href="/media/css/site-examples.css?_=0db1cd38700c0cfcdc140c39a2ebc306" rel="stylesheet" type="text/css"/>
<link href="https://cdn.datatables.net/1.10.21/css/jquery.dataTables.min.css" rel="stylesheet" type="text/css"/>
<style class="init" type="text/css">
</style>
<script src="/media/js/site.js?_=99e3a7f81210e4bf610bf9f4bede2312" type="text/javascript"></script>
<script async="" src="/media/js/dynamic.php?comments-page=examples%2Fbasic_init%2Fzero_configuration.html" type="text/javascript"></script>
<script language="javascript" src="https://code.jquery.com/jqu

We see that the information we want to extract is enclosed in tags named `tr`. Now we find every one of these tags.

In [6]:
# Collect every <tr> tag of the parsed page into one list-like result
info = soup.find_all('tr')
# Show how many <tr> tags were found
print(len(info))

59


In [7]:
print(info)  # plain-text dump of every collected <tr> tag

[<tr>
<th>Name</th>
<th>Position</th>
<th>Office</th>
<th>Age</th>
<th>Start date</th>
<th>Salary</th>
</tr>, <tr>
<td>Tiger Nixon</td>
<td>System Architect</td>
<td>Edinburgh</td>
<td>61</td>
<td>2011/04/25</td>
<td>$320,800</td>
</tr>, <tr>
<td>Garrett Winters</td>
<td>Accountant</td>
<td>Tokyo</td>
<td>63</td>
<td>2011/07/25</td>
<td>$170,750</td>
</tr>, <tr>
<td>Ashton Cox</td>
<td>Junior Technical Author</td>
<td>San Francisco</td>
<td>66</td>
<td>2009/01/12</td>
<td>$86,000</td>
</tr>, <tr>
<td>Cedric Kelly</td>
<td>Senior Javascript Developer</td>
<td>Edinburgh</td>
<td>22</td>
<td>2012/03/29</td>
<td>$433,060</td>
</tr>, <tr>
<td>Airi Satou</td>
<td>Accountant</td>
<td>Tokyo</td>
<td>33</td>
<td>2008/11/28</td>
<td>$162,700</td>
</tr>, <tr>
<td>Brielle Williamson</td>
<td>Integration Specialist</td>
<td>New York</td>
<td>61</td>
<td>2012/12/02</td>
<td>$372,000</td>
</tr>, <tr>
<td>Herrod Chandler</td>
<td>Sales Assistant</td>
<td>San Francisco</td>
<td>59</td>
<td>2012/08/06</

We note that every observation except the first and the last contains the info we need. We also note that the name, position, office, age, start date, and salary always appear in the same order. We can make use of these patterns to extract the information into a dataframe.

In [8]:
# Start from an empty dataframe that already carries the six column names,
# listed in the same order in which the values appear inside each table row.
df = pd.DataFrame(
    columns=[
        'name',
        'position',
        'office',
        'age',
        'start date',
        'salary',
    ]
)
# Display the (still empty) dataframe
df

Unnamed: 0,name,position,office,age,start date,salary


In [9]:
len(info)  # number of <tr> tags collected in `info`

59

In [10]:
# Look at the last collected row. A negative index counts from the end,
# so this keeps working even if the number of rows on the page changes.
info[-1]

<tr>
<th>Name</th>
<th>Position</th>
<th>Office</th>
<th>Age</th>
<th>Start date</th>
<th>Salary</th>
</tr>

In [11]:
# Start from an empty copy of the frame (same columns, no rows) so that
# re-running this cell can never leave rows from a previous run behind.
df = df.iloc[0:0].copy()

for i, item in enumerate(info): #enumerate gives us the order of the element and the element
    #We previously saw that every data point was inside a "tr" tag
    #Now we note that every variable (column) inside the "tr" tags
    #is enclosed by tags named "td". This is what we want to extract
    data_point = item.find_all('td')
    #The header and footer rows only hold "th" cells, so they give us
    #no "td" tags at all. We skip rows by that property rather than by
    #their position, so a page without a footer (or with extra header
    #rows) is still handled correctly
    if not data_point:
        continue
    #collect the text of every cell of this row, in page order
    row = [value.text for value in data_point]
    print('\nObservation:', i)
    print(row)
    #len(df) is the next free row label (0, 1, 2, ...), so this adds
    #the new row at the end of the dataframe
    df.loc[len(df)] = row
    print("Info added to the dataframe")


Observation: 1
['Tiger Nixon', 'System Architect', 'Edinburgh', '61', '2011/04/25', '$320,800']
Info added to the dataframe

Observation: 2
['Garrett Winters', 'Accountant', 'Tokyo', '63', '2011/07/25', '$170,750']
Info added to the dataframe

Observation: 3
['Ashton Cox', 'Junior Technical Author', 'San Francisco', '66', '2009/01/12', '$86,000']
Info added to the dataframe

Observation: 4
['Cedric Kelly', 'Senior Javascript Developer', 'Edinburgh', '22', '2012/03/29', '$433,060']
Info added to the dataframe

Observation: 5
['Airi Satou', 'Accountant', 'Tokyo', '33', '2008/11/28', '$162,700']
Info added to the dataframe

Observation: 6
['Brielle Williamson', 'Integration Specialist', 'New York', '61', '2012/12/02', '$372,000']
Info added to the dataframe

Observation: 7
['Herrod Chandler', 'Sales Assistant', 'San Francisco', '59', '2012/08/06', '$137,500']
Info added to the dataframe

Observation: 8
['Rhona Davidson', 'Integration Specialist', 'Tokyo', '55', '2010/10/14', '$327,900']


In [12]:
df  # display the dataframe filled by the loop

Unnamed: 0,name,position,office,age,start date,salary
0,Tiger Nixon,System Architect,Edinburgh,61,2011/04/25,"$320,800"
1,Garrett Winters,Accountant,Tokyo,63,2011/07/25,"$170,750"
2,Ashton Cox,Junior Technical Author,San Francisco,66,2009/01/12,"$86,000"
3,Cedric Kelly,Senior Javascript Developer,Edinburgh,22,2012/03/29,"$433,060"
4,Airi Satou,Accountant,Tokyo,33,2008/11/28,"$162,700"
5,Brielle Williamson,Integration Specialist,New York,61,2012/12/02,"$372,000"
6,Herrod Chandler,Sales Assistant,San Francisco,59,2012/08/06,"$137,500"
7,Rhona Davidson,Integration Specialist,Tokyo,55,2010/10/14,"$327,900"
8,Colleen Hurst,Javascript Developer,San Francisco,39,2009/09/15,"$205,500"
9,Sonya Frost,Software Engineer,Edinburgh,23,2008/12/13,"$103,600"


And now this dataframe can be easily exported to a `csv` file, for example with `df.to_csv('employees.csv', index=False)`.