# Historical weather information scraping using Selenium

[Wunderground](https://www.wunderground.com) used to offer free API keys for collecting historical weather data. Well, not anymore. In this notebook, I am using [Selenium](https://www.seleniumhq.org/) to automate the control of the browser (Google Chrome in my case) and scrape weather data from the HTML source.

In [1]:
import time
import numpy as np
import pandas as pd
import datetime
import re
from selenium import webdriver  # Selenium also requires a driver to be able to control a browser,
                                # to be installed in the project directory; see
                                #  https://selenium-python.readthedocs.io/installation.html

In [2]:
# Date format: YYYY-MM-DD

In [3]:
def generate_dates_list(start_date,end_date):
    """ Lists every calendar day from 'start_date' to 'end_date' (both included).

    Both arguments are strings in 'YYYY-MM-DD' format and the result is a list of
    strings in that same format. The list is empty when 'end_date' is before 'start_date'.
    """
    fmt = '%Y-%m-%d'
    first_day = datetime.datetime.strptime(start_date, fmt).date()
    last_day = datetime.datetime.strptime(end_date, fmt).date()
    # number of one-day steps separating the two bounds (negative -> empty range below)
    n_steps = (last_day - first_day).days
    return [str(first_day + datetime.timedelta(days=offset)) for offset in range(n_steps + 1)]



In [4]:
# Lookup tables between month numbers (1-12) and English month names, in both directions
month_dictio = dict(enumerate(
    ('January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'),
    start=1))
inv_month_dictio = dict(zip(month_dictio.values(), month_dictio.keys()))


In [5]:
def generate_dates_df(d_list,f_mat ='%Y-%m-%d'):
    """Splits each date of 'd_list' into Day, Month and Year columns.

    Parameters:
        d_list: list of date strings, e.g. the list returned by generate_dates_list()
        f_mat: strptime-style format describing the strings in 'd_list'

    Returns:
        a pandas dataframe with one row per date and the columns 'date' (the original
        string), 'Day' (int), 'Month' (month name looked up in month_dictio) and 'Year' (int)
    """
    dates_df = pd.DataFrame(d_list,columns=['date'])
    parsed = pd.to_datetime(dates_df['date'], format=f_mat)
    # the .dt accessor extracts a component for the whole column at once,
    # replacing a cell-by-cell loop; astype(int) keeps the platform integer dtype
    dates_df['Day'] = parsed.dt.day.astype(int)
    dates_df['Month'] = parsed.dt.month.map(month_dictio)
    dates_df['Year'] = parsed.dt.year.astype(int)
    return dates_df

In [6]:
# Sanity check on a three-day range: one row per date with its Day, Month name and Year
generate_dates_df(generate_dates_list('2018-02-26','2018-02-28'),f_mat ='%Y-%m-%d')

Unnamed: 0,date,Day,Month,Year
0,2018-02-26,26,February,2018
1,2018-02-27,27,February,2018
2,2018-02-28,28,February,2018


I used _Inspect_ from the right-click menu to access the source code of the webpage and then get the XPath of each element I want to scrape.

In [7]:
# `By` names the locator strategy passed to find_element(); imported here so the cell is self-contained
from selenium.webdriver.common.by import By

# Column labels of the scraped table, defined once (kept verbatim, leading spaces included,
# so that any code indexing data_df by these names keeps working)
weather_columns = ['Time', 'Temperature (C)',' Dew Point(C)', 'Humidity(%)','Wind Speed(km/h)',
                   'Wind Gust(km/h)',' Pressure (mm)',' Precip(mm)','Precip Accum', 'Condition']
# Common xpath prefix of the month/day/year drop-downs and of the "View" button
date_selector_xpath = '//*[@id="inner-content"]/div[2]/div[1]/div/div[1]/div/div/date-selector/div'
max_attempts = 5  # how many times a date is requested before it is given up on


def _parse_observation_table(raw_text, date_str):
    """Converts the text of the observation table into a list of rows.

    Parameters:
        raw_text: the `.text` of the 'history-observation-table' element
        date_str: date of the observations, prepended to each time of day

    Returns:
        a list with one entry per observation; each entry is
        [date + ' ' + time + AM/PM marker, 8 further values, condition text]
    """
    # An observation starts on a new line with its hour (1 to 12) followed by ':'.
    # Mark those line starts with a blank so that splitting on '\n ' yields one chunk per observation.
    for hour in range(1,13):
        raw_text = raw_text.replace('\n%d:' % hour, '\n %d:' % hour)
    observations = raw_text.split('\n ')[1:]  # the first chunk is the table header
    rows = []
    for observation in observations:
        # '%' is blanked out first; split() then breaks on the newlines and
        # repeated blanks that separate the cells
        tokens = observation.replace('%',' ').split()
        am_pm = tokens[1]
        # keep every second token among the first 18; the skipped ones are not stored
        # (token 1 is the AM/PM marker re-attached below, the others are presumably units)
        values = tokens[:18:2]
        # everything after the 18th token is the free-text condition
        condition = ' '.join(tokens[18:])
        rows.append([date_str + ' ' + values[0] + am_pm] + values[1:] + [condition])
    return rows


dates_list_df = generate_dates_df(generate_dates_list('2018-09-29','2018-10-02'),f_mat ='%Y-%m-%d')
data_df = pd.DataFrame(columns=weather_columns)
daily_frames = []  # one dataframe per scraped date, concatenated once at the end

# Create the driver and launch the page
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
options.add_argument('window-size=1920,1080')
# NOTE: the positional driver path and `chrome_options=` are the Selenium 3 calling convention;
# recent Selenium releases expect `options=` (and a Service object for the driver path) instead.
driver = webdriver.Chrome('chromedriver_win32/chromedriver',chrome_options=options)
try:
    driver.get('https://www.wunderground.com/history/daily/CWTA/date/2018-9-19?req_city=Montreal&req_state=QC&req_statename=Quebec&reqdb.zip=00000&reqdb.magic=182&reqdb.wmo=71612')

    time.sleep(3) # crude wait for the page to load

    # change settings to Celsius since default is Fahrenheit
    driver.find_element(By.XPATH, '//*[@id="wuSettings"]/i').click()
    driver.find_element(By.XPATH, '//*[@id="wuSettings-quick"]/div/a[2]').click()

    time.sleep(3) # crude wait for the page to reload with the new units

    for ind, row in dates_list_df.iterrows():
        tables = []
        for attempt in range(max_attempts):
            # fill the date selector and request the page for that day
            driver.find_element(By.XPATH, date_selector_xpath + '/select[1]').send_keys(row['Month'])
            driver.find_element(By.XPATH, date_selector_xpath + '/select[2]').send_keys(str(row['Day']))
            driver.find_element(By.XPATH, date_selector_xpath + '/select[3]').send_keys(str(row['Year']))
            driver.find_element(By.XPATH, date_selector_xpath + '/input').click()

            time.sleep(2) # crude wait for the observation table to load
            # find_elements returns an empty list (instead of raising) while the table is missing
            tables = driver.find_elements(By.ID, 'history-observation-table')
            if tables:
                break
        if not tables:
            # report the gap explicitly instead of failing with an IndexError on tables[0]
            print('No observation table found for ' + row['date'] + ' after '
                  + str(max_attempts) + ' attempts; skipping this date')
            continue
        # preprocessing and organizing the acquired raw data
        daily_rows = _parse_observation_table(tables[0].text, row['date'])
        daily_frames.append(pd.DataFrame(daily_rows,columns=weather_columns))
finally:
    # always close the browser, even when a step above raised
    driver.quit()
    # keep whatever was scraped so far; pd.concat replaces the removed DataFrame.append
    if daily_frames:
        data_df = pd.concat(daily_frames,ignore_index=True)

In [8]:
# Rewrite each 'Time' entry from the scraped 12-hour form (AM/PM suffix) to 24-hour 'YYYY-MM-DD HH:MM'
data_df['Time'] = data_df['Time'].map(lambda stamp: pd.to_datetime(stamp).strftime('%Y-%m-%d %H:%M'))

In [9]:
# Preview the first rows of the scraped observations
data_df.head()

Unnamed: 0,Time,Temperature (C),Dew Point(C),Humidity(%),Wind Speed(km/h),Wind Gust(km/h),Pressure (mm),Precip(mm),Precip Accum,Condition
0,2018-09-29 12:00,14,9,72,20,0,1017.0,0.0,0.0,Partly Cloudy
1,2018-09-29 00:00,13,10,82,11,0,1011.6,0.0,0.0,Light Rain Shower
2,2018-09-29 00:26,13,11,88,13,0,1012.3,0.0,0.0,Light Rain Shower
3,2018-09-29 01:00,12,9,82,17,0,1012.6,0.0,0.0,Light Rain Shower
4,2018-09-29 01:53,12,11,94,17,0,1014.3,0.0,0.0,Light Rain
