In [1]:
import requests 
from bs4 import BeautifulSoup 
import pandas as pd
import numpy as np

In [2]:
URL = "https://en.tutiempo.net/climate/ws-{}.html"

In [3]:
cities = {
    'Toronto': '716240',
    'Vancouver': '718920',
    'Quebec City':'717140',
    'Montreal':'716270',
    'Ottawa':'716280',
    'St John':'718010',
    'Charlottetown':'717060',
    'Halifax':'713950'
}

In [4]:
def get_df_for_city(city_id, city_name):
    req = requests.get(URL.format(city_id))
    soup = BeautifulSoup(req.content, 'lxml')
    table = soup.findAll('table', {'class': 'medias'})[0]
    table_arr = np.array([[cell.text for cell in row.find_all(["th","td"])] for row in table.find_all("tr")])
    tab_header = table_arr[0]
    tab_data = np.delete(table_arr, (0), axis=0)
    df_data = []
    for row_data in tab_data:
        row = []
        for val in row_data:
            if val == '-':
                row.append(np.nan)
            else:
                row.append(np.float(val))
        df_data.append(row)
    df = pd.DataFrame(columns=tab_header, data = df_data)
    ## Feature cleaning
    df = df[df.Year >= 1990]
    df = df[df.Year < 2020]
    med = df.median()
    df.fillna(value=med, inplace=True)
    df['City'] = city_name
    return df.reset_index(drop=True)   

In [5]:
df = get_df_for_city('713950', 'Toronto')

In [6]:
df.head(200)

Unnamed: 0,Year,T,TM,Tm,PP,V,RA,SN,TS,FG,TN,GR,City
0,1993.0,6.85,12.6,2.4,1461.52,17.9,176.0,94.0,10.0,107.0,0.0,1.0,Toronto
1,1994.0,6.4,11.2,2.5,0.0,15.4,168.0,92.0,12.0,187.0,0.0,0.0,Toronto
2,1995.0,6.5,11.0,2.7,0.0,13.9,170.0,120.0,10.0,185.0,0.0,1.0,Toronto
3,1996.0,6.4,10.6,2.7,1461.52,15.0,198.0,98.0,13.0,164.0,0.0,0.0,Toronto
4,1997.0,5.8,10.4,1.9,0.0,15.0,161.0,124.0,14.0,114.0,0.0,1.0,Toronto
5,1998.0,7.6,11.9,4.1,0.0,16.2,181.0,85.0,13.0,115.0,0.0,0.0,Toronto
6,1999.0,8.3,14.0,3.4,1461.52,17.2,181.0,87.0,12.0,115.0,0.0,0.0,Toronto
7,2000.0,7.1,12.7,2.5,1329.36,17.1,184.0,94.0,12.0,120.0,0.0,1.0,Toronto
8,2001.0,7.4,13.0,2.5,1048.61,15.6,138.0,116.0,10.0,116.0,0.0,0.0,Toronto
9,2002.0,6.85,12.6,2.4,1461.52,17.9,165.0,112.0,14.0,117.0,0.0,1.0,Toronto


In [7]:
dfs = []
for key, val in cities.items():
    dfs.append(get_df_for_city(val, key))

In [8]:
fdf = pd.concat(dfs)

In [9]:
fdf.to_csv('final_df.csv', index=False)

In [10]:
fdf.head(200)

Unnamed: 0,Year,T,TM,Tm,PP,V,RA,SN,TS,FG,TN,GR,City
0,1990.0,8.90,15.1,3.1,887.63,15.7,168.0,80.0,22.0,166.0,0.0,0.0,Toronto
1,1991.0,9.10,15.2,3.3,830.07,14.6,149.0,78.0,30.0,127.0,0.0,1.0,Toronto
2,1992.0,7.30,12.9,1.7,967.70,12.5,171.0,91.0,32.0,85.0,0.0,1.0,Toronto
3,1993.0,7.50,13.5,1.5,762.67,14.1,165.0,88.0,34.0,130.0,0.0,1.0,Toronto
4,1994.0,7.50,14.0,1.6,951.45,14.5,153.0,86.0,38.0,145.0,1.0,1.0,Toronto
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,2005.0,6.25,11.3,1.7,1210.83,16.3,173.0,104.0,11.5,39.5,0.0,0.0,Charlottetown
16,2006.0,7.50,12.5,3.1,1159.51,15.7,190.0,76.0,13.0,39.0,2.0,0.0,Charlottetown
17,2007.0,5.70,10.7,1.4,1210.83,16.5,160.0,91.0,13.0,24.0,1.0,3.0,Charlottetown
18,2008.0,6.40,11.4,2.1,1487.15,16.1,193.0,93.0,14.0,40.0,0.0,0.0,Charlottetown
