In [1]:
import requests #html requestor
from bs4 import BeautifulSoup #html parser
import pandas as pd #dataframe manipulator

In [2]:
# Address of the page we are going to request
url = 'https://datatables.net/examples/basic_init/zero_configuration.html'
# Headers sent along with the request (only a User-Agent entry)
headers = {
    'User-Agent': 'Mozilla/5.0',
}

Now we request the HTML page. A status code of 200 means that the request was successful.

In [3]:
# We pass the headers so that requests imitates the request of a browser.
# The timeout (in seconds) keeps this cell from waiting forever when the
# server does not answer; without it requests applies no time limit.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)
# Raise an error on a 4xx/5xx answer instead of parsing an error page below
response.raise_for_status()

ConnectionError: HTTPSConnectionPool(host='datatables.net', port=443): Max retries exceeded with url: /examples/basic_init/zero_configuration.html (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000020FF13C0408>: Failed to establish a new connection: [WinError 10013] An attempt was made to access a socket in a way forbidden by its access permissions'))

Now we parse the result with `BeautifulSoup`.

In [None]:
# Take the body of the response as text and parse it with the 'html.parser' backend
page_html = response.text
soup = BeautifulSoup(page_html, 'html.parser')

Printing the result:

In [None]:
# Display the parsed document so that we can inspect its tags
soup

We see that the information we want to extract is enclosed in tags named `tr`. Next, we collect every one of these tags.

In [None]:
# Gather every "tr" tag of the page and show how many we found
info = soup.find_all('tr')
n_rows = len(info)
print(n_rows)

In [None]:
# Printing every row floods the output. The first two rows and the last one
# are enough to compare the first and last rows with a regular data row.
preview = info[:2]
if len(info) > 2:
    preview = preview + info[-1:]
for row in preview:
    print(row, end='\n\n')

We note that every row except the first and the last contains the information we need. We also note that the name, position, office, age, start date, and salary always appear in the same order. We can use these patterns to extract the information into a dataframe.

In [None]:
# Names of the variables, in the order in which they appear on the page
column_names = ['name', 'position', 'office', 'age', 'start date', 'salary']
# An empty dataframe that only has these column names, and no rows yet
df = pd.DataFrame(columns=column_names)
df

In [None]:
# We first collect all the observations in a plain list and build the
# dataframe once at the end. Adding rows one at a time with df.loc is
# much slower and gets worse as the table grows.
rows = []
for item in info:
    # We previously saw that every data point was inside a "tr" tag.
    # Every variable (column) inside the "tr" tags is enclosed by tags
    # named "td". This is what we want to extract.
    data_point = item.find_all('td')
    # Rows without any "td" tag hold no observation (on this page that is
    # the first and the last row), so we skip them by content, not position.
    if not data_point:
        continue
    # .strip() removes whitespace and line breaks around each value
    rows.append([value.text.strip() for value in data_point])

# Reuse the column names of the dataframe created earlier; rows are numbered from 0
df = pd.DataFrame(rows, columns=df.columns)
print(len(rows), 'observations added to the dataframe')

In [None]:
# Display the dataframe, which now holds one row per observation
df

And now this dataframe can be easily exported to a `csv` file, for example with `df.to_csv('employees.csv', index=False)`.