# Scraping weather data
Weather data for Melbourne is scraped from timeanddate.com, which stores historic weather (either from forecasts or actual conditions).  
For each 30-minute interval, a string is downloaded that contains details of the weather conditions.  
This string is split into components related to different weather variables, and saved into columns in a dataframe.  
The 30 minute data is then converted to hourly data.  

Rainfall data in mm/day is also available from: http://www.bom.gov.au/jsp/ncc/cdio/weatherData/av?p_nccObsCode=136&p_display_type=dailyDataFile&p_startYear=2014&p_c=-1487361950&p_stn_num=086232

In [8]:
import os

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from time import time, sleep
from selenium.webdriver.chrome.service import Service

import requests
from bs4 import BeautifulSoup

import joblib
import time
from selenium.common import exceptions  

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [38]:
def scrape_weather_data(year):
    """Scrape the raw weather strings for every month of ``year``.

    For each month the timeanddate.com historic-weather page for Melbourne is
    opened in Chrome. Every entry of the day drop-down (``wt-his-select``) is
    clicked in turn and the text of each row of the ``wt-his`` table is
    collected.

    Parameters
    ----------
    year : int
        Year substituted into the page URL.

    Returns
    -------
    dict
        Maps the month number (1-12) to a list of strings, one per table row.
        Each string is the row text followed by the two date tokens taken from
        the day's first row (read downstream as day-of-month and month
        abbreviation).

    Notes
    -----
    A single browser session is reused for all twelve months and is always
    closed on exit, including when scraping fails part-way through.
    """
    day_dict = {}
    url_template = 'https://www.timeanddate.com/weather/australia/melbourne/historic?month={}&year={}'

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    try:
        for month_value in range(1, 13):
            driver.get(url_template.format(month_value, year))

            # Drop down menu html: one <option> per selectable day
            select = driver.find_element(By.ID, 'wt-his-select')
            options = select.find_elements(By.TAG_NAME, 'option')

            values = []
            for option in options:
                option.click()
                # Give the page time to swap in the table for the selected day
                time.sleep(1)

                table = driver.find_element(By.ID, 'wt-his')
                hourly_record = []
                for row in table.find_elements(By.TAG_NAME, "tr"):
                    try:
                        hourly_record.append(row.text)
                    except exceptions.StaleElementReferenceException:
                        # The row was re-rendered while being read; skip it
                        continue

                # The first two rows and the last row are not data rows
                day_rows = hourly_record[2:-1]
                if not day_rows:
                    print(f"Warning, no weather rows found for an entry of month {month_value} in year {year}; skipping it.")
                    continue

                first_row = day_rows[0]
                # Tokens 1-2 of the day's first row identify the date
                date_label = ' '.join(first_row.split(' ')[1:3])

                # The first row carries the date between the time and the
                # readings; rebuild it as "<time> <readings>" so that it has
                # the same layout as the remaining rows of the day.
                first_row_tokens = first_row.split(' ')[3:]
                first_row_tokens.insert(0, first_row.split('\n', 1)[0])
                day_rows[0] = ' '.join(first_row_tokens)

                # Tag every row of the day with its date
                values.extend(row_text + ' ' + date_label for row_text in day_rows)

            day_dict[month_value] = values
    finally:
        # Always release the browser, even if a page failed to load or parse
        driver.quit()

    return day_dict


def _parse_weather_row(row_text):
    """Split one scraped weather string into a dict of raw field values.

    Returns ``None`` when the temperature token (second token) is ``'N/A'``,
    which marks rows where the readings are missing.
    """
    tokens = row_text.split(' ')
    if tokens[1] == 'N/A':
        return None

    # The conditions start at token 3 and may span several words. Keep adding
    # words until the wind information starts: a number (the wind speed),
    # 'No' (as in 'No wind') or the wind direction arrow.
    condition_words = [tokens[3]]
    for token in tokens[4:10]:
        if token in ('↑', 'No'):
            break
        try:
            int(token)
        except ValueError:
            condition_words.append(token)
        else:
            break

    # Wind speed is the token just before 'mph'; absent when listed as 'No wind'
    if 'mph' in row_text:
        windspeed = row_text.partition('mph')[0].split()[-1]
    else:
        windspeed = 0

    # Humidity is the number just before '%'; absent when listed as 'N/A'
    if '%' in row_text:
        humidity = row_text.split('%')[0].split()[-1]
    else:
        humidity = 0

    return {
        'Day': tokens[-2],
        'Month': tokens[-1],
        # Problem from 2022 data: the time token can carry extra text after a newline
        'Hour': tokens[0].split('\n')[0],
        'Humidity': humidity,
        'Temp': tokens[1],
        'Conditions': ' '.join(condition_words),
        'WindSpeed': windspeed,
        'Pressure': row_text.partition('mbar')[0].split()[-1],
    }


def clean_weather_data(monthly_weather_dict, year):
    """Turn the scraped weather strings of one year into hourly numeric data.

    Parameters
    ----------
    monthly_weather_dict : dict
        Maps month number (1-12) to the weather strings of that month, as
        produced by the scraping step. Missing months only trigger a warning.
    year : int
        Year the strings belong to (the strings carry only day and month).

    Returns
    -------
    pandas.DataFrame
        Indexed by hourly ``datetime``, with the columns ``Temp``,
        ``Humidity``, ``Pressure``, ``Rain`` and ``WindSpeed``. The numeric
        columns are the mean of the readings within each hour; ``Rain`` is 1
        if any reading in the hour mentions rain and 0 otherwise. An empty
        frame with these columns is returned when there are no usable rows.
    """
    output_columns = ['Temp', 'Humidity', 'Pressure', 'Rain', 'WindSpeed']
    month_numbers = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
                     'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}

    #########################################################################
    # Gather the weather strings of the whole year
    #########################################################################
    raw_rows = []
    for month in range(1, 13):
        try:
            month_rows = monthly_weather_dict[month]
        except KeyError:
            print(f"Warning, missing month {month} in year {year}. If this month is in the future then there's no problem, but otherwise somethingis going wrong.")
            continue
        # Going through a DataFrame keeps support for any input that
        # pd.DataFrame accepts; the strings are in its first column.
        month_frame = pd.DataFrame(month_rows)
        if not month_frame.empty:
            raw_rows.extend(month_frame.iloc[:, 0])

    #########################################################################
    # Split each string of weather info out into weather variables
    #########################################################################
    records = []
    for row_text in raw_rows:
        record = _parse_weather_row(row_text)
        if record is None:
            # Show the rows that were dropped because their values are N/A
            print(row_text.split(' '))
        else:
            records.append(record)

    if not records:
        return pd.DataFrame(columns=output_columns,
                            index=pd.DatetimeIndex([], name='datetime'))

    # Build the frame once instead of concatenating one row at a time
    yearly_df_formatted = pd.DataFrame(records)

    #########################################################################
    # Further formatting
    #########################################################################
    yearly_df_formatted['Year'] = year
    # Change months to month numbers
    yearly_df_formatted['Month'] = yearly_df_formatted['Month'].map(month_numbers)
    # Set other variables to integers
    for variable in ['Day', 'Temp', 'Humidity', 'Pressure', 'WindSpeed']:
        yearly_df_formatted[variable] = yearly_df_formatted[variable].astype('int')
    # Split hour into hour and minutes
    clock_times = pd.to_datetime(yearly_df_formatted['Hour'], format='%H:%M')
    yearly_df_formatted['Minute'] = clock_times.dt.minute
    yearly_df_formatted['Hour'] = clock_times.dt.hour
    # Create the datetime index
    yearly_df_formatted['datetime'] = pd.to_datetime(
        yearly_df_formatted[['Year', 'Month', 'Day', 'Hour', 'Minute']])
    yearly_df_formatted = yearly_df_formatted.set_index('datetime')

    #########################################################################
    # Add variable specifying whether rain has happened
    #########################################################################
    yearly_df_formatted['Rain'] = (
        yearly_df_formatted['Conditions'].str.contains('Rain|rain').astype(int))

    #########################################################################
    # Resample to be hourly values rather than half-hourly
    #########################################################################
    # An offset object is used rather than a frequency string because the
    # accepted spelling of the hourly alias differs between pandas versions.
    hourly = yearly_df_formatted[output_columns].resample(pd.offsets.Hour()).mean()
    # Anything over 0 means there was rain at some point in the hour
    hourly['Rain'] = np.where(hourly['Rain'] > 0, 1, 0)

    return hourly

## Scrape weather data, clean and save to csv for each year in 2011-2024

In [42]:
# Output directory for the cleaned weather files (created if it does not exist).
# NOTE: this name shadows the builtin ``dir``; it is kept because the rainfall
# cell further down builds its output path from it.
dir = "../../Cleaned_data/WeatherData/"
os.makedirs(dir, exist_ok=True)

# Collect one cleaned hourly dataframe per year and concatenate them once at
# the end, rather than growing a dataframe that starts out empty.
yearly_frames = []
for year in range(2011, 2024+1):
    print(year)
    # Scrape data
    weather_thisyear_monthly_dict = scrape_weather_data(year)
    # Clean data
    clean_thisyear = clean_weather_data(weather_thisyear_monthly_dict, year)
    yearly_frames.append(clean_thisyear)

# Dataframe containing all years' weather data
weather_all_years = pd.concat(yearly_frames)

# Save to csv (the datetime index is written as the first column)
weather_all_years.to_csv(dir+'weather_data_allyears.csv', index=True)

2022
2023
2024


### Check alternative weather data

In [43]:
# Load the daily rainfall file
rainfall = pd.read_csv("../../Data/RainfallData/melbourne_gardens_rainfall.csv")

# Keep only the records after 2010 and build a datetime from the date columns
rainfall_post2010 = rainfall.loc[rainfall['Year'] > 2010].copy()
rainfall_post2010['datetime'] = pd.to_datetime(
    rainfall_post2010[['Month', 'Day', 'Year']]
)

# Reduce to the date and the rainfall amount, then save without the row index
rainfall_post2010 = rainfall_post2010.loc[
    :, ['datetime', 'Rainfall amount (millimetres)']
]
rainfall_post2010.to_csv(dir+'DailyRainfallData.csv', index=False)