This example scrapes the BBC weather website for any specific city, and collects weather forecast for the next 14 days and saves it as a csv file.

In [1]:
import json                   # to convert API to json format
from urllib.parse import urlencode
import requests               # to get the webpage
from bs4 import BeautifulSoup # to parse the webpage
import pandas as pd
import re                     # regular expression operators
from datetime import datetime

Getting the webpage of interest from the server

In [2]:
required_city = "Mumbai"
location_url = 'https://locator-service.api.bbci.co.uk/locations?' + urlencode({
   'api_key': 'AGbFAKx58hyjQScCXIYrxuEwJh2W2cmv',
   's': required_city,
   'stack': 'aws',
   'locale': 'en',
   'filter': 'international',
   'place-types': 'settlement,airport,district',
   'order': 'importance',
   'a': 'true',
   'format': 'json'
})
location_url

'https://locator-service.api.bbci.co.uk/locations?api_key=AGbFAKx58hyjQScCXIYrxuEwJh2W2cmv&s=Mumbai&stack=aws&locale=en&filter=international&place-types=settlement%2Cairport%2Cdistrict&order=importance&a=true&format=json'

In [3]:
result = requests.get(location_url).json()
result

{'response': {'results': {'results': [{'container': 'India',
     'containerId': 1269750,
     'country': 'IN',
     'id': '1275339',
     'language': 'en',
     'latitude': 19.07283,
     'longitude': 72.88261,
     'name': 'Mumbai',
     'placeType': 'settlement',
     'timezone': 'Asia/Kolkata'}],
   'totalResults': 1}}}

In [12]:
# url      = 'https://www.bbc.com/weather/1275339' # url to BBC weather, corresponding to a specific city (Mumbai, in this example)
url      = 'https://www.bbc.com/weather/'+result['response']['results']['results'][0]['id']
response = requests.get(url)
url

'https://www.bbc.com/weather/1275339'

In [5]:
#Initiating BeautifulSoup
soup = BeautifulSoup(response.content,'html.parser') 

we want (daily high and low temp., and daily weather summary, are in specific blocks on the webpage. We need to find the block type, type of identifier, and the identifier name

In [6]:
#predicted highest temperatures for next 14 days
daily_high_values = soup.find_all('span', attrs={'class': 'wr-day-temperature__high-value'}) # block-type: span; identifier type: class; and class name: wr-day-temperature__high-value 
daily_high_values

[<span class="wr-day-temperature__high-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">30°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">86°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">30°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">86°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">31°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">88°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">30°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">86°</span></span></span>,
 <span class="wr-day-temperature__high-value"><span class="wr-value--temperature "><span cla

In [7]:
#predicted lowest temperatures for next 14 days
daily_low_values  = soup.find_all('span', attrs={'class': 'wr-day-temperature__low-value'})
daily_low_values

[<span class="wr-day-temperature__low-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">25°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">76°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">26°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">78°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">26°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">79°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature "><span class="wr-value--temperature--c">26°</span><span class="wr-hide"> </span><span class="wr-value--temperature--f">79°</span></span></span>,
 <span class="wr-day-temperature__low-value"><span class="wr-value--temperature "><span class="w

daily_high_values and daily_low values return a lot of junk, we want only the ˚C values, hence we run the following piece of code

In [13]:
daily_high_values_list = [daily_high_values[i].text.strip().split()[0] for i in range(len(daily_high_values))]
daily_high_values_list

['30°',
 '30°',
 '31°',
 '30°',
 '30°',
 '29°',
 '30°',
 '29°',
 '29°',
 '29°',
 '30°',
 '30°',
 '29°',
 '29°']

In [14]:
daily_low_values_list = [daily_low_values[i].text.strip().split()[0] for i in range(len(daily_low_values))]
daily_low_values_list

['25°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '26°',
 '25°']

In [15]:
#To return the highest ˚C and ˚F values of a particular day
daily_high_values[0].text.strip() #current day's values

'30° 86°'

In [16]:
#To return the lowest ˚C and ˚F values of a particular day
daily_low_values[0].text.strip() #current day's values

'25° 76°'

In [8]:
daily_summary = soup.find('div', attrs={'class': 'wr-day-summary'})
daily_summary

<div class="wr-day-summary"><div class="gel-wrap"><span class="">Light rain showers and a gentle breeze</span><span class="wr-hide">Thundery showers and a gentle breeze</span><span class="wr-hide">Thundery showers and a gentle breeze</span><span class="wr-hide">Thundery showers and a gentle breeze</span><span class="wr-hide">Thundery showers and a gentle breeze</span><span class="wr-hide">Thundery showers and a gentle breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span><span class="wr-hide">Thundery showers and a moderate breeze</span></div></div>

In [9]:
daily_summary.text

'Light rain showers and a gentle breezeThundery showers and a gentle breezeThundery showers and a gentle breezeThundery showers and a gentle breezeThundery showers and a gentle breezeThundery showers and a gentle breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breezeThundery showers and a moderate breeze'

In [17]:
#Splitting the summary of the 14 days by finding capitalised words
daily_summary_list = re.findall('[a-zA-Z][^A-Z]*', daily_summary.text) #split the string on uppercase
daily_summary_list

['Light rain showers and a gentle breeze',
 'Thundery showers and a gentle breeze',
 'Thundery showers and a gentle breeze',
 'Thundery showers and a gentle breeze',
 'Thundery showers and a gentle breeze',
 'Thundery showers and a gentle breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze',
 'Thundery showers and a moderate breeze']

In [18]:
#creating a date column using the panda's date range function
datelist = pd.date_range(datetime.today(), periods=len(daily_high_values)).tolist()
datelist

[Timestamp('2022-06-22 07:24:34.210324', freq='D'),
 Timestamp('2022-06-23 07:24:34.210324', freq='D'),
 Timestamp('2022-06-24 07:24:34.210324', freq='D'),
 Timestamp('2022-06-25 07:24:34.210324', freq='D'),
 Timestamp('2022-06-26 07:24:34.210324', freq='D'),
 Timestamp('2022-06-27 07:24:34.210324', freq='D'),
 Timestamp('2022-06-28 07:24:34.210324', freq='D'),
 Timestamp('2022-06-29 07:24:34.210324', freq='D'),
 Timestamp('2022-06-30 07:24:34.210324', freq='D'),
 Timestamp('2022-07-01 07:24:34.210324', freq='D'),
 Timestamp('2022-07-02 07:24:34.210324', freq='D'),
 Timestamp('2022-07-03 07:24:34.210324', freq='D'),
 Timestamp('2022-07-04 07:24:34.210324', freq='D'),
 Timestamp('2022-07-05 07:24:34.210324', freq='D')]

In [19]:
#Stripping off all the junk, and getting only date from above list
datelist = [datelist[i].date().strftime('%y-%m-%d') for i in range(len(datelist))]
datelist

['22-06-22',
 '22-06-23',
 '22-06-24',
 '22-06-25',
 '22-06-26',
 '22-06-27',
 '22-06-28',
 '22-06-29',
 '22-06-30',
 '22-07-01',
 '22-07-02',
 '22-07-03',
 '22-07-04',
 '22-07-05']

In [20]:
zipped = zip(datelist, daily_high_values_list, daily_low_values_list, daily_summary_list)

In [21]:
df = pd.DataFrame(list(zipped), columns=['Date', 'High','Low', 'Summary'])

In [22]:
display(df)

Unnamed: 0,Date,High,Low,Summary
0,22-06-22,30°,25°,Light rain showers and a gentle breeze
1,22-06-23,30°,26°,Thundery showers and a gentle breeze
2,22-06-24,31°,26°,Thundery showers and a gentle breeze
3,22-06-25,30°,26°,Thundery showers and a gentle breeze
4,22-06-26,30°,26°,Thundery showers and a gentle breeze
5,22-06-27,29°,26°,Thundery showers and a gentle breeze
6,22-06-28,30°,26°,Thundery showers and a moderate breeze
7,22-06-29,29°,26°,Thundery showers and a moderate breeze
8,22-06-30,29°,26°,Thundery showers and a moderate breeze
9,22-07-01,29°,26°,Thundery showers and a moderate breeze


In [23]:
# remove the 'degree' character
df.High = df.High.replace('\°','',regex=True).astype(float)
df.Low  = df.Low.replace('\°','',regex=True).astype(float)

In [24]:
display(df)

Unnamed: 0,Date,High,Low,Summary
0,22-06-22,30.0,25.0,Light rain showers and a gentle breeze
1,22-06-23,30.0,26.0,Thundery showers and a gentle breeze
2,22-06-24,31.0,26.0,Thundery showers and a gentle breeze
3,22-06-25,30.0,26.0,Thundery showers and a gentle breeze
4,22-06-26,30.0,26.0,Thundery showers and a gentle breeze
5,22-06-27,29.0,26.0,Thundery showers and a gentle breeze
6,22-06-28,30.0,26.0,Thundery showers and a moderate breeze
7,22-06-29,29.0,26.0,Thundery showers and a moderate breeze
8,22-06-30,29.0,26.0,Thundery showers and a moderate breeze
9,22-07-01,29.0,26.0,Thundery showers and a moderate breeze


In [25]:
#Extracting the name of the city for which data is gathered
#location = soup.find('div', attrs={'class':'wr-c-location'})
location = soup.find('h1', attrs={'id':'wr-location-name-id'})
location.text.split()



In [26]:
#Saving as csv
filename_csv = location.text.split()[0]+'.csv'
df.to_csv(filename_csv, index=None)

In [None]:
#Saving as xlsx
filename_xlsx = location.text.split()[0]+'.xlsx'
df.to_excel(filename_xlsx)