In [39]:
import re, pickle, os, csv, requests
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError


PAGE_URL = "https://www.infoclimat.fr/observations-meteo/archives/2/janvier/2016/paris-montsouris/07156.html"
file_name = 'scraped_page.pickle'


# if the page has already been downloaded & saved
if os.path.exists(file_name):
    with open(file_name, 'rb') as f:
        print(f"Loading cached {file_name}")
        response = pickle.load(f)

# otherwise fetch it for the first time
else:
    print(f"Fetching {PAGE_URL} from the internet")
    try:
        response = requests.get(PAGE_URL)
        # If the response was successful, no Exception will be raised
        response.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        print('Request success!')
        with open(file_name, 'wb') as f:
            print(f"Writing cached {file_name}")
            pickle.dump(response, f)



source = response.text
soup = BeautifulSoup(source, 'html5lib')

Fetching https://www.infoclimat.fr/observations-meteo/archives/2/janvier/2016/paris-montsouris/07156.html from the internet
Request success!
Writing cached scraped_page.pickle


In [None]:
# classes range from class_='cdata-hour23' for 00h to class_='cdata-hour00' for 1h

In [91]:
hr_range = [f"{i:02d}" for i in list(range(23, -1, -1)) ]

In [124]:
page_data = ""
for h in hr_range:
    class_ = 'cdata-hour' + h
    #row = hour = temp = rain = humi = wind = 'NaN'
    try:
        row = soup.find('tr', class_= class_)
    except:
        row = 'NaN'
    try:
        hour = row.find('span', class_="tipsy-trigger").text
    except:
        hour = 'NaN'
    try:
        temp = row.find('span', text="°C", attrs={'class' : 'tab-units-v'}).previous_sibling.previous_sibling.text
    except:
        temp = 'NaN'
    try:
        rain = row.find('span', text="mm/1h", attrs={'class' : 'tab-units-v'}).find_parent('td').contents[0].replace(' ', '')
    except:
        rain = 'NaN'
    try:
        humi = row.find('span', text="%", attrs={'class' : 'tab-units-v'}).previous_sibling.text
    except:
        humi = 'NaN'
    try:
        wind = row.find('span', text="km/h", attrs={'class' : 'tab-units-v'}).previous_sibling.previous_sibling.text
    except:
        wind = 'NaN'
    result = ';'.join([hour, temp, rain, humi, wind]) + '\n'
    page_data += result

print(page_data)

00h;7.8;0;76;15
23h;8.4;0;73;13
22h;9.0;0;72;15
21h;9.1;0;77;17
20h;8.8;0.2;82;15
19h;8.7;0.6;85;17
18h;9.1;0.6;82;17
17h;10.1;0;73;19
16h;10.4;NaN;73;19
15h;10.8;0;70;22
14h;10.9;0;70;19
13h;10.8;0;71;22
12h;10.5;0;73;22
11h;10.1;0;78;19
10h;9.5;0;82;13
09h;9.0;0;88;13
08h;8.7;0;91;11
07h;8.6;0;90;9
06h;8.6;0;90;13
05h;8.4;0;89;15
04h;8.1;0.2;89;13
03h;7.9;0;88;13
02h;7.3;0;88;11
01h;7.4;0.2;88;11



In [113]:
h15 = soup.find('tr', class_='cdata-hour15')

In [114]:
h15.find('span', class_="tipsy-trigger").text

'16h'

In [115]:
h15.find('span', class_="tipsy-trigger").text


'16h'

In [116]:
h15.find('span', text="°C", attrs={'class' : 'tab-units-v'}).previous_sibling.previous_sibling.text


'10.4'

In [117]:
h15.find('span', text="mm/1h", attrs={'class' : 'tab-units-v'}).find_parent('td').contents[0].replace(' ', '')


AttributeError: 'NoneType' object has no attribute 'find_parent'

In [103]:
h15.find('span', text="%", attrs={'class' : 'tab-units-v'}).previous_sibling.text


'73'

In [111]:
h15.find('span', text="km/h", attrs={'class' : 'tab-units-v'}).previous_sibling.previous_sibling.texth15

'19'

---

Working examples

In [43]:
#print(soup.prettify())

In [44]:
h23 = soup.find('tr', class_='cdata-hour23')
#print(h23.prettify())

Working HOUR

In [46]:
h23.find('span', class_="tipsy-trigger").text

'00h'

In [73]:
#print(h23.prettify())

In [56]:
h23.find('span', text="°C", attrs={'class' : 'tab-units-v'})

<span class="tab-units-v">°C</span>

Working TEMPERATURE

In [59]:
h23.find('span', text="°C", attrs={'class' : 'tab-units-v'}).previous_sibling.previous_sibling.text

'7.8'

In [67]:
h23.find('span', text="mm/1h", attrs={'class' : 'tab-units-v'}).find_parent('td')

<td>0 <span class="tab-units-v">mm/1h</span><span class="color-heatmap" style="background-color:rgb(255,255,255)"></span></td>

Working PLUIE

In [71]:
h23.find('span', text="mm/1h", attrs={'class' : 'tab-units-v'}).find_parent('td').contents[0]

'0 '

In [64]:
h18 = soup.find('tr', class_='cdata-hour18')

In [72]:
h18.find('span', text="mm/1h", attrs={'class' : 'tab-units-v'}).find_parent('td').contents[0]

'0.6 '

Working HUMIDITE

In [76]:
h23.find('span', text="%", attrs={'class' : 'tab-units-v'}).previous_sibling.text

'76'

Working VENT MOYEN

In [80]:
h23.find('span', text="km/h", attrs={'class' : 'tab-units-v'}).previous_sibling.previous_sibling.text

'15'

---

several tries

In [None]:
soup.findAll('td', text=pattern, attrs={'class' : 'pos'})

In [31]:
soup.find("div", {"id": "articlebody"})

In [32]:
soup.find('div', class_='observation-table')

In [33]:
#history-observation-table
soup.find("div", {"id": "history-observation-table"})

In [34]:
table = soup.find('table', id="history-observation-table")

In [38]:
# bs4 supports most CSS selectors with the .select() method, therefore we can use an id selector :
soup.select('#history-observation-table')

[]

In [126]:
list(range(1, 32))

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31]