In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [14]:
# Raw daily climate records; the path is relative to the current working directory.
CLIMATE_CSV = 'Data/climate-daily.csv'
df = pd.read_csv(CLIMATE_CSV)

In [3]:
# Display the column names available in the raw frame.
df.columns

Index(['x', 'y', 'STATION_NAME', 'STN_ID', 'CLIMATE_IDENTIFIER', 'ID',
       'LOCAL_DATE', 'PROVINCE_CODE', 'LOCAL_YEAR', 'LOCAL_MONTH', 'LOCAL_DAY',
       'MEAN_TEMPERATURE', 'MEAN_TEMPERATURE_FLAG', 'MIN_TEMPERATURE',
       'MIN_TEMPERATURE_FLAG', 'MAX_TEMPERATURE', 'MAX_TEMPERATURE_FLAG',
       'TOTAL_PRECIPITATION', 'TOTAL_PRECIPITATION_FLAG', 'TOTAL_RAIN',
       'TOTAL_RAIN_FLAG', 'TOTAL_SNOW', 'TOTAL_SNOW_FLAG', 'SNOW_ON_GROUND',
       'SNOW_ON_GROUND_FLAG', 'DIRECTION_MAX_GUST', 'DIRECTION_MAX_GUST_FLAG',
       'SPEED_MAX_GUST', 'SPEED_MAX_GUST_FLAG', 'COOLING_DEGREE_DAYS',
       'COOLING_DEGREE_DAYS_FLAG', 'HEATING_DEGREE_DAYS',
       'HEATING_DEGREE_DAYS_FLAG', 'MIN_REL_HUMIDITY', 'MIN_REL_HUMIDITY_FLAG',
       'MAX_REL_HUMIDITY', 'MAX_REL_HUMIDITY_FLAG'],
      dtype='object')

In [4]:
# Restrict the raw frame to the station name, the date parts and the
# measurement columns used later on (quality-flag columns are left out).
kept_columns = [
    'STATION_NAME',
    'LOCAL_YEAR', 'LOCAL_MONTH', 'LOCAL_DAY',
    'MEAN_TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE',
    'TOTAL_PRECIPITATION', 'TOTAL_RAIN', 'TOTAL_SNOW', 'SNOW_ON_GROUND',
    'DIRECTION_MAX_GUST', 'SPEED_MAX_GUST',
    'COOLING_DEGREE_DAYS', 'HEATING_DEGREE_DAYS',
    'MIN_REL_HUMIDITY', 'MAX_REL_HUMIDITY',
]
df0 = df[kept_columns]

In [5]:
# List the distinct station names, in order of first appearance.
station_names = df0['STATION_NAME'].drop_duplicates().tolist()
station_names

['MONTREAL/PIERRE ELLIOTT TRUDEAU INTL',
 'MONTREAL/ST-HUBERT',
 'MONTREAL/PIERRE ELLIOTT TRUDEAU INTL A']

In [6]:
# Rows of the first station; the station-name column (first column of df0)
# is constant after filtering, so it is dropped.
at_trudeau = df0['STATION_NAME'] == 'MONTREAL/PIERRE ELLIOTT TRUDEAU INTL'
df1 = df0.loc[at_trudeau].drop(columns='STATION_NAME')

In [7]:
# Rows of the St-Hubert station; the station-name column (first column of df0)
# is constant after filtering, so it is dropped.
at_st_hubert = df0['STATION_NAME'] == 'MONTREAL/ST-HUBERT'
df2 = df0.loc[at_st_hubert].drop(columns='STATION_NAME')

In [8]:
# Rows of the third station ("... INTL A"); the station-name column (first
# column of df0) is constant after filtering, so it is dropped.
at_trudeau_a = df0['STATION_NAME'] == 'MONTREAL/PIERRE ELLIOTT TRUDEAU INTL A'
df3 = df0.loc[at_trudeau_a].drop(columns='STATION_NAME')

In [9]:
def _threshold_day_counts(days):
    """Count the days of `days` that cross each temperature / precipitation threshold.

    `days` is a 2-D array laid out like the weekly chunks built in
    `aggregate_data_per_week`: column 2 is the daily minimum temperature,
    column 3 the daily maximum temperature and column 4 the total precipitation.
    Returns eight Python ints, in the order of the `nb_j_*` output columns.
    """
    t_min, t_max, precip = days[:, 2], days[:, 3], days[:, 4]
    masks = (
        t_max > 20, t_max > 25, t_max > 30,
        t_min < 10, t_min < 0, t_min < -5, t_min < -10,
        precip > 5,
    )
    # Comparisons against NaN are False, so missing readings are never counted.
    return [int(np.count_nonzero(mask)) for mask in masks]


def _window_summary(days):
    """Mean / min / max temperature plus threshold day counts over a multi-week window."""
    temperatures = [np.mean(days[:, 1]), np.min(days[:, 2]), np.max(days[:, 3])]
    return temperatures + _threshold_day_counts(days)


def aggregate_data_per_week(df):
    """Aggregate one station's daily records into per-week summary rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Daily records whose first three columns are LOCAL_YEAR, LOCAL_MONTH and
        LOCAL_DAY, followed by the 13 measurement columns in this order:
        MEAN_TEMPERATURE, MIN_TEMPERATURE, MAX_TEMPERATURE, TOTAL_PRECIPITATION,
        TOTAL_RAIN, TOTAL_SNOW, SNOW_ON_GROUND, DIRECTION_MAX_GUST,
        SPEED_MAX_GUST, COOLING_DEGREE_DAYS, HEATING_DEGREE_DAYS,
        MIN_REL_HUMIDITY, MAX_REL_HUMIDITY. Columns are read by position.

    Returns
    -------
    pandas.DataFrame
        One row per completed week with 55 columns: ``id_week``
        (``year*1000 + month*10 + week index within the month``), the 13 weekly
        aggregates, 8 threshold day counts for the week, and mean/min/max
        temperature plus the same 8 counts over the last 2, 3 and 4 weeks.
        An empty frame (with the same columns) is returned when the input
        contains no completed week.

    Notes
    -----
    * A week is closed when a row whose day-of-month is a multiple of 7 is
      read. Rows after the last such day of a month (days 29-31) are therefore
      never part of a completed week and are discarded. If the row for day
      7/14/21/28 is absent, its days are merged into the following week (or
      dropped if there is none in that month).
    * For the first weeks there is no history yet: the missing previous weeks
      are replaced by the first week itself.
    * NaN readings propagate into the mean/min/max/sum aggregates.
    * NOTE(review): DIRECTION_MAX_GUST is averaged arithmetically; if it is an
      angle this is not a circular mean -- confirm this is intended.
    """
    weekly_columns = [
        'MEAN_TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE',
        'TOTAL_PRECIPITATION', 'TOTAL_RAIN', 'TOTAL_SNOW', 'SNOW_ON_GROUND',
        'DIRECTION_MAX_GUST', 'SPEED_MAX_GUST', 'COOLING_DEGREE_DAYS',
        'HEATING_DEGREE_DAYS', 'MIN_REL_HUMIDITY', 'MAX_REL_HUMIDITY',
    ]
    # Base names of the threshold counters (the irregular 'nb_jt__inf_m5'
    # spelling is kept on purpose: downstream code uses these exact names).
    count_names = [
        'nb_j_t_sup20', 'nb_j_t_sup25', 'nb_j_t_sup30',
        'nb_j_t_inf_10', 'nb_j_t_inf_0', 'nb_jt__inf_m5', 'nb_j_t_inf_m10',
        'nb_j_rain_sup_5',
    ]
    columns = ['id_week'] + weekly_columns + [name + '_1w' for name in count_names]
    for n_weeks in (2, 3, 4):
        suffix = '_%dw' % n_weeks
        columns += ['mean_t' + suffix, 'min_t' + suffix, 'max_t' + suffix]
        columns += [name + suffix for name in count_names]

    # --- group the daily rows into weeks -----------------------------------
    weeks = []
    for year in df['LOCAL_YEAR'].unique():
        df_y = df.loc[df.LOCAL_YEAR == year]
        for month in df_y['LOCAL_MONTH'].unique():
            df_y_m = df_y.loc[df_y.LOCAL_MONTH == month]
            current_week = []   # reset every month: leftover days are dropped
            week_index = 0
            for line in df_y_m.values.tolist():
                day = line[2]
                id_week = int(year) * 1000 + int(month) * 10 + week_index
                current_week.append([id_week] + line[3:])
                if int(day) % 7 == 0:
                    week_index += 1
                    weeks.append(current_week)
                    current_week = []

    # Nothing to summarise (empty input or no completed week): return an empty
    # frame instead of failing on weeks[0].
    if not weeks:
        return pd.DataFrame(columns=columns)

    # --- summarise each week and the trailing 2/3/4-week windows -----------
    first_week = np.array(weeks[0])
    # The three previous weeks, oldest first; seeded with the first week.
    previous = [first_week, first_week, first_week]
    rows = []
    for week in weeks:
        week = np.array(week)
        summary = [int(week[0, 0])]
        summary += [
            np.mean(week[:, 1]),    # MEAN_TEMPERATURE
            np.min(week[:, 2]),     # MIN_TEMPERATURE
            np.max(week[:, 3]),     # MAX_TEMPERATURE
            np.sum(week[:, 4]),     # TOTAL_PRECIPITATION
            np.sum(week[:, 5]),     # TOTAL_RAIN
            np.sum(week[:, 6]),     # TOTAL_SNOW
            np.mean(week[:, 7]),    # SNOW_ON_GROUND
            np.mean(week[:, 8]),    # DIRECTION_MAX_GUST
            np.max(week[:, 9]),     # SPEED_MAX_GUST
            np.mean(week[:, 10]),   # COOLING_DEGREE_DAYS
            np.mean(week[:, 11]),   # HEATING_DEGREE_DAYS
            np.min(week[:, 12]),    # MIN_REL_HUMIDITY
            np.max(week[:, 13]),    # MAX_REL_HUMIDITY
        ]
        summary += _threshold_day_counts(week)
        # Windows made of the 1, 2 and 3 previous weeks followed by this week.
        for n_previous in (1, 2, 3):
            window = np.concatenate(previous[-n_previous:] + [week], axis=0)
            summary += _window_summary(window)

        previous = previous[1:] + [week]
        rows.append(summary)

    return pd.DataFrame(rows, columns=columns)

In [10]:
# Build the weekly summary of each station, in the same order as df1, df2, df3.
df1_w, df2_w, df3_w = [
    aggregate_data_per_week(station_days)
    for station_days in (df1, df2, df3)
]



In [11]:
# id_week is year*1000 + month*10 + week index, so 2012100 reads as
# 2012 / 10 / 0: the first week of October 2012. Check it sits at position 84.
df1_w['id_week'].unique()[84]

2012100

In [12]:
# Average the weekly summaries of the stations, week by week, starting at
# position 84 of the first station's weeks (2012100, first week of October 2012).
# The first station is always used; the other two contribute only for the weeks
# they report.

# Week ids known to the other stations, computed once instead of on every
# iteration of the loop below.
weeks_station2 = set(df2_w['id_week'])
weeks_station3 = set(df3_w['id_week'])

data_week = []
for week in df1_w.id_week.unique()[84:]:
    # Feature rows for this week (id_week column stripped), station 1 first.
    station_rows = [df1_w.loc[df1_w.id_week == week].values[:, 1:]]
    if week in weeks_station2:
        station_rows.append(df2_w.loc[df2_w.id_week == week].values[:, 1:])
    if week in weeks_station3:
        station_rows.append(df3_w.loc[df3_w.id_week == week].values[:, 1:])
    # Column-wise mean over the available stations; a NaN from any station
    # makes the averaged feature NaN.
    data_week.append(np.mean(np.concatenate(station_rows, axis=0), axis=0))

In [13]:
# Each row of `data_week` follows the column layout of the per-station weekly
# frames without their leading id_week: 13 weekly aggregates, then the derived
# 1- to 4-week features. Reuse those names instead of re-typing the list.
feature_columns = df1_w.columns[1:].tolist()
df_final = pd.DataFrame(data_week, columns=feature_columns)

# Keep six of the weekly aggregates (precipitation first) and every derived
# feature; the rain, snow, gust and humidity aggregates are left out.
weekly_kept = ['TOTAL_PRECIPITATION', 'MEAN_TEMPERATURE', 'MIN_TEMPERATURE', 'MAX_TEMPERATURE',
               'COOLING_DEGREE_DAYS', 'HEATING_DEGREE_DAYS']
derived_features = feature_columns[13:]  # everything after the 13 weekly aggregates
df_final = df_final[weekly_kept + derived_features]

# Forward-fill missing values with the previous week's value. `ffill()` is the
# supported spelling of the deprecated `fillna(method='ffill')`, and assigning
# the result avoids mutating a column-subset frame in place.
df_final = df_final.ffill()
df_final.to_csv('climate_per_week_final.csv')