In [1]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np

In [2]:
URL = "https://en.tutiempo.net/climate/ws-{}.html"

In [3]:
cities = {
    'Toronto': '716240',
    'Vancouver': '718920',
    'Quebec City':'717140',
    'Montreal':'716270',
    'Ottawa':'716280',
    'St John':'718010',
    'Charlottetown':'717060',
    'Halifax':'713950'
}

In [4]:
def get_df_for_city(city_id, city_name):
    req = requests.get(URL.format(city_id))
    soup = BeautifulSoup(req.content, 'lxml')
    table = soup.findAll('table', {'class': 'medias'})[0]
    table_arr = np.array([[cell.text for cell in row.find_all(["th","td"])] for row in table.find_all("tr")])
    tab_header = table_arr[0]
    tab_data = np.delete(table_arr, (0), axis=0)
    df_data = []
    for row_data in tab_data:
        row = []
        for val in row_data:
            if val == '-':
                row.append(np.nan)
            else:
                row.append(np.float(val))
        df_data.append(row)
    df = pd.DataFrame(columns=tab_header, data = df_data)
    ## Feature cleaning
    df = df[df.Year >= 1990]
    df = df[df.Year < 2020]
    med = df.median()
    df.fillna(value=med, inplace=True)
    df['City'] = city_name
    return df.reset_index(drop=True)   

In [5]:
df = get_df_for_city('713950', 'Toronto')

In [10]:
df.head(200)

Unnamed: 0,Year,T,TM,Tm,PP,V,RA,SN,TS,FG,TN,GR,City
0,1982.0,6.1,10.8,2.1,1330.65,16.8,154.0,94.0,10.0,185.0,0.0,0.0,Toronto
1,1983.0,7.2,11.4,3.7,1330.65,16.9,173.0,78.0,8.0,194.0,0.0,1.0,Toronto
2,1984.0,6.9,11.4,3.1,1330.65,15.9,166.0,91.0,11.0,211.0,0.0,0.0,Toronto
3,1985.0,5.6,10.1,1.9,0.0,16.4,150.0,101.0,9.0,175.0,0.0,1.0,Toronto
4,1986.0,5.7,10.3,1.8,0.0,16.4,181.0,85.0,7.0,212.0,0.0,0.0,Toronto
5,1987.0,6.8,12.2,2.3,1330.65,17.5,174.5,93.5,10.0,114.5,0.0,0.5,Toronto
6,1993.0,6.8,12.2,2.3,1330.65,17.5,174.5,93.5,10.0,114.5,0.0,0.5,Toronto
7,1994.0,6.4,11.2,2.5,0.0,15.4,168.0,92.0,12.0,187.0,0.0,0.0,Toronto
8,1995.0,6.5,11.0,2.7,0.0,13.9,170.0,120.0,10.0,185.0,0.0,1.0,Toronto
9,1996.0,6.4,10.6,2.7,1330.65,15.0,198.0,98.0,13.0,164.0,0.0,0.0,Toronto


In [7]:
dfs = []
for key, val in cities.items():
    dfs.append(get_df_for_city(val, key))

In [8]:
fdf = pd.concat(dfs)

In [9]:
fdf.to_csv('final_df.csv', index=False)

In [11]:
fdf.head(200)

Unnamed: 0,Year,T,TM,Tm,PP,V,RA,SN,TS,FG,TN,GR,City
0,1982.0,7.2,13.3,1.2,923.73,13.7,143.0,92.0,22.0,162.0,0.0,1.0,Toronto
1,1983.0,8.1,13.9,2.4,836.67,13.4,162.0,86.0,32.0,158.0,0.0,4.0,Toronto
2,1984.0,7.6,13.5,1.8,843.99,12.8,153.0,77.0,29.0,165.0,0.0,1.0,Toronto
3,1985.0,7.5,13.5,2.0,986.20,16.3,152.0,99.0,28.0,149.0,0.0,3.0,Toronto
4,1986.0,7.8,13.6,2.1,1091.85,14.1,154.0,76.0,33.0,166.0,0.0,2.0,Toronto
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,1987.0,4.5,9.7,0.2,1431.45,22.5,203.0,131.0,4.0,201.0,0.0,0.0,St John
6,1988.0,5.0,10.5,0.7,1517.42,24.6,223.0,132.0,4.0,225.0,0.0,0.0,St John
7,1989.0,4.4,10.3,-0.4,1233.08,24.9,196.0,133.0,2.0,197.0,0.0,0.0,St John
8,1990.0,4.7,10.3,-0.1,1517.07,23.1,205.0,129.0,4.0,214.0,0.0,1.0,St John
