<a href="https://colab.research.google.com/github/markuskunej/air-pollution-thesis/blob/master/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This file is used to train various models

## Unpickle dataframes from Google Drive and store them in the `dfs` dictionary

In [1]:
import pandas as pd
import os
import glob

DATA_PATH = "/content/drive/MyDrive/Air_Pollution_Data/dataframes"
# every pickle file that sits directly inside DATA_PATH
all_pickle_files = glob.glob(f"{DATA_PATH}/*.pkl")

# maps the file name up to its first dot (i.e. Y&E_15m) to the unpickled object
dfs = {}

for pkl_file in all_pickle_files:
  file_name = os.path.basename(pkl_file)
  loc_and_interval = file_name.partition(".")[0]
  dfs[loc_and_interval] = pd.read_pickle(pkl_file)

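Split each dataframe into two date ranges: everything up to 2022-10-03 11:00:00 ("July-Oct") and everything from 2022-10-04 11:00:00 onward ("Oct-Jan"). Data between those two timestamps is left out of both ranges.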

In [2]:
# Replace every dataframe in dfs by a dict holding its two date ranges.
# Keys are snapshotted first; only the values of dfs are reassigned.
for df_name, full_df in list(dfs.items()):
  before_gap = full_df[:'2022-10-03 11:00:00-04:00']
  after_gap = full_df['2022-10-04 11:00:00-04:00':]
  dfs[df_name] = {"July-Oct": before_gap, "Oct-Jan": after_gap}

In [3]:
# deal with NaN values
def remove_nan_variables(df):
  """Drop, in place, every column of df that contains at least one NaN.

  For each dropped column the number of missing values and a short message
  are printed.

  Args:
    df: pandas DataFrame to clean; it is modified in place.

  Returns:
    None
  """
  # Count the missing values once, up front: the frame is mutated inside the
  # loop, so it must not be iterated directly. Series.items() is used because
  # DataFrame.iteritems() no longer exists in pandas 2.x.
  missing_per_column = df.isnull().sum()
  for name, n_missing in missing_per_column.items():
    if n_missing > 0:
      # column contains NaN values, drop from dataframe
      print(n_missing)
      df.drop(columns=name, inplace=True)
      print("Dropping column " + name + " because it contains NaN values.")

# Clean every (interval, date range) dataframe; remove_nan_variables works in place.
for interval, frames_by_range in dfs.items():
  for date_range, frame in frames_by_range.items():
    print("Checking {} {} dataframe for NaN values\n".format(interval, date_range))
    remove_nan_variables(frame)

Checking Y&E_60m July-Oct dataframe for NaN values

1423
Dropping column NoiseLEQ because it contains NaN values.
1423
Dropping column NoiseMax because it contains NaN values.
1423
Dropping column WindGust because it contains NaN values.
Checking Y&E_60m Oct-Jan dataframe for NaN values

2127
Dropping column Noise because it contains NaN values.
Checking Y&E_15m July-Oct dataframe for NaN values

5693
Dropping column NoiseLEQ because it contains NaN values.
5693
Dropping column NoiseMax because it contains NaN values.
5694
Dropping column WindGust because it contains NaN values.
Checking Y&E_15m Oct-Jan dataframe for NaN values

8505
Dropping column Noise because it contains NaN values.
Checking Y&E_1m July-Oct dataframe for NaN values

403
Dropping column WindDirection because it contains NaN values.
85370
Dropping column NoiseLEQ because it contains NaN values.
85370
Dropping column NoiseMax because it contains NaN values.
85666
Dropping column WindGust because it contains NaN values

In [11]:
# test each column for stationarity; non-stationary columns are differenced later
# https://michael-fuchs-python.netlify.app/2020/10/29/time-series-analysis-regression-extension-techniques-for-forecasting-multivariate-variables/#stationarity
from statsmodels.tsa.stattools import adfuller

def Augmented_Dickey_Fuller_Test_func(timeseries , column_name):
    '''
    Runs the Augmented Dickey-Fuller test on one column, prints a report and
    says whether the column is treated as stationary.

    Args:
        timeseries: Values of the column to check; passed unchanged to adfuller
        column_name (str): Name of the column, used only in the printed header

    Returns:
        bool: True when the test's p-value is <= 0.05 (reported as stationary),
        False otherwise (reported as non-stationary)
    '''
    print (f'Results of Dickey-Fuller Test for column: {column_name}')
    # Open question carried over from the original author: why AIC vs BIC, t-stat, etc.?
    adf_result = adfuller(timeseries, autolag='AIC')

    # The first four entries are the scalar statistics; entry 4 maps a
    # significance level to its critical value.
    stat_labels = ['ADF Test Statistic','P-Value','# Lags Used','# Observations Used']
    report = pd.Series(adf_result[0:4], index=stat_labels)
    for level, critical_value in adf_result[4].items():
       report['Critical Value (%s)'%level] = critical_value
    print (report)

    print()
    print("Conclusion:")
    if adf_result[1] <= 0.05:
        print("Reject the null hypothesis")
        print('\033[92m' + "Data is stationary" + '\033[0m')
        return True

    print("Fail to reject the null hypothesis")
    print('\033[91m' + "Data is non-stationary" + '\033[0m')
    return False

# run the ADF stationarity test on each column of df
def adf_df_test(df):
  """Run the ADF stationarity test on every NaN-free column of df.

  Columns that contain NaN values are not tested; they are dropped from df
  in place (the missing-value count and a message are printed).

  Args:
    df: pandas DataFrame whose columns are tested; modified in place when
      NaN-containing columns are dropped.

  Returns:
    list: names of the tested columns that were reported as non-stationary.
  """
  non_station_cols = []
  # Iterate over a snapshot of the column names: columns may be dropped inside
  # the loop, and DataFrame.iteritems() no longer exists in pandas 2.x.
  for name in list(df.columns):
    column = df[name]
    n_missing = column.isnull().sum()
    if n_missing > 0:
      # column contains NaN values, drop from dataframe
      print(n_missing)
      df.drop(columns=name, inplace=True)
      print("Dropping column " + name + " because it contains NaN values.")
      continue

    is_stationary = Augmented_Dickey_Fuller_Test_func(column, name)
    print('\n')
    if not is_stationary:
      # add column to list of non-stationary ones
      non_station_cols.append(name)

  return non_station_cols


In [4]:
import numpy as np

# "<group>_<date range>" -> list of columns the ADF test reported as non-stationary
non_stat_cols_dict = {}
# test each df for stationarity
for group, split_dfs in dfs.items():
  for split_df_names, split_df in split_dfs.items():
    run_label = group + "_" + split_df_names
    print("STARTING ADF TEST FOR " + run_label + '\n')
    non_stat_cols_dict[run_label] = adf_df_test(split_df)

STARTING ADF TEST FOR Y&E_60m_July-Oct



NameError: ignored

In [None]:
# Show, per dataframe, which columns were collected as non-stationary.
print(non_stat_cols_dict)

{'Y&E_60m_July-Oct': ['Temperature'], 'Y&E_60m_Oct-Jan': [], 'Y&E_15m_July-Oct': [], 'Y&E_15m_Oct-Jan': [], 'Y&E_1m_July-Oct': ['Pressure'], 'Y&E_1m_Oct-Jan': []}


Difference non-stationary variables

In [4]:
# Non-stationary columns, hard-coded from the ADF results printed above and
# nested as {interval: {date_range: [columns]}}; only non-empty entries are kept.
non_stat_cols_dict = {
  'Y&E_60m': {'July-Oct': ['Temperature']},
  'Y&E_1m': {'July-Oct': ['Pressure']},
}

def difference_df(df, non_stationary_columns):
  """Return a copy of df in which the given columns are first-differenced.

  Leading rows that contain any NaN (differencing always produces one in the
  first row of each differenced column) are removed from the result. The
  input dataframe is not modified.

  Args:
    df: pandas DataFrame to difference.
    non_stationary_columns: column name, or list of column names, to replace
      by their first difference.

  Returns:
    pandas DataFrame: the differenced copy without its leading NaN rows
    (empty if every row contains a NaN).
  """
  if isinstance(non_stationary_columns, str):
    columns = [non_stationary_columns]
  else:
    columns = list(non_stationary_columns)

  differenced_df = df.copy()
  if columns:
    differenced_df[columns] = df[columns].diff()

  # Count the leading rows that still hold a NaN. The bound check keeps this
  # from running off the end when every row contains a NaN.
  row_has_nan = differenced_df.isnull().any(axis=1).values
  n_leading = 0
  while n_leading < len(row_has_nan) and row_has_nan[n_leading]:
    n_leading += 1

  # Return the dataframe itself (the original returned this function object).
  return differenced_df.iloc[n_leading:]


# {interval: {date_range: differenced dataframe}}, mirroring non_stat_cols_dict
diff_dfs = {}

for interval, cols_by_range in non_stat_cols_dict.items():
  diff_dfs[interval] = {}
  for date_range, cols_to_difference in cols_by_range.items():
    source_df = dfs[interval][date_range]
    diff_dfs[interval][date_range] = difference_df(source_df, cols_to_difference)


Split into train and validation set

In [5]:
# train_size is number of data points to use in training.
# i.e. one day with 1hr intervals would be train_size = 24.
def split_train_valid(df, df_stationary, train_size):
  """Cut df into a training head and a validation tail.

  Args:
    df: sliceable data (e.g. a dataframe) to split.
    df_stationary: optional second data set; only its training head is
      returned. Pass None when there is none.
    train_size: number of leading data points used for training; it is
      converted with int() before slicing.

  Returns:
    tuple: (train_df, valid_df, train_df_stationary), where the last item is
    None when df_stationary is None.
  """
  cutoff = int(train_size)
  train_df_stationary = None if df_stationary is None else df_stationary[:cutoff]
  return df[:cutoff], df[cutoff:], train_df_stationary


VAR model train

In [6]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.statespace.varmax import VARMAX

def train_var(df, df_stationary):
  """Pick a VAR lag order by AIC, then fit a VARMAX model of that order.

  Args:
    df: training dataframe. The pollutant columns listed in predicting_vars
      are the endogenous variables; every other column is used as an
      exogenous regressor.
    df_stationary: differenced version of df, used only for the lag-order
      selection, or None to run the selection on df itself.

  Returns:
    The fitted VARMAX results object.
  """
  predicting_vars = ["CO", "CO2", "NO", "NO2", "O3", "PM1", "PM10", "PM2"]
  # Keep the dataframe's own column order: a set difference would hand the
  # exogenous columns over in an arbitrary order.
  env_vars = [col for col in df.columns if col not in predicting_vars]

  def exog_of(frame):
    # No exogenous columns left -> pass None instead of an empty frame.
    return frame[env_vars] if env_vars else None

  # used to select best AIC lag order
  # see if the df was differenced; `is None` is required because the truth
  # value of a DataFrame is ambiguous and raises ValueError
  order_source = df if df_stationary is None else df_stationary
  model = VAR(order_source[predicting_vars], exog=exog_of(order_source))

  sorted_order=model.select_order(maxlags=20)
  print(sorted_order.summary())

  # The model itself is fitted on the non-differenced df.
  # NOTE(review): the original comment justified this with "VARMAX can do its
  # own automatic differencing"; its order argument here is (p, q) with no
  # differencing term, so please confirm that assumption.
  # second order is 0 since we're not using moving average here (MA part of VARMAX)
  var_model = VARMAX(df[predicting_vars], exog=exog_of(df), order=(sorted_order.selected_orders['aic'],0), enforce_stationarity=True)
  fitted_model = var_model.fit(disp=False)
  print(fitted_model.summary())

  return fitted_model


In [8]:
class Trained_Model:
  """A fitted model together with the metadata describing how it was trained."""

  def __init__(self, idx, model, model_type, train_size, variables_used,
               data_interval, date_range, device_location, training_time):
    """Store the model and its description.

    Args:
      idx: index identifying the model.
      model: the fitted model object.
      model_type: short model label (e.g. 'VAR').
      train_size: number of data points used for training.
      variables_used: the variables (columns) the model was trained on.
      data_interval: sampling interval label of the data.
      date_range: date-range label of the data.
      device_location: location label of the device.
      training_time: time spent training.
    """
    self.idx = idx
    self.model = model
    self.model_type = model_type
    self.train_size = train_size
    self.var_used = variables_used
    self.interval = data_interval
    self.date_range = date_range
    self.dev_loc = device_location
    self.train_time = training_time

  def getFileDescription(self, base_path):
    """Return an underscore-separated description of the model.

    Every part is converted with str(), so a numeric train_size no longer
    raises TypeError. A non-string collection of variables is joined with
    '-'; a plain string is used unchanged.

    Args:
      base_path: accepted for compatibility; not used in the result.

    Returns:
      str: '<type>_<location>_<interval>_<date range>_<train size>_<variables>'.
    """
    if isinstance(self.var_used, str):
      variables = self.var_used
    else:
      variables = '-'.join(str(name) for name in self.var_used)
    parts = [self.model_type, self.dev_loc, self.interval,
             self.date_range, self.train_size, variables]
    return '_'.join(str(part) for part in parts)

  def getCsvDict(self):
    """Return the model description as a dict keyed by CSV column name."""
    return {"Index":self.idx, "Model Type":self.model_type, "Device Location":self.dev_loc,
            "Data Interval":self.interval, "Date Range":self.date_range, "Training Size":self.train_size,
            "Variables Used":self.var_used, "Training Time":self.train_time}

  def getCsvRow(self):
    """Return the model description as a list, in the same order as getCsvDict."""
    return [self.idx, self.model_type, self.dev_loc, self.interval,
            self.date_range, self.train_size, self.var_used, self.train_time]


In [14]:
# create csv file to match trained model to an index
import csv

col_names = ["Index", "Model Type", "Device Location", "Data Interval", "Date Range", "Training Size", "Variables Used", "Training Time"]

# Mode 'w' starts the file over: running this cell again discards any rows
# already recorded. newline='' leaves line endings to the csv module, and the
# with-block closes the file, so no explicit close() is needed.
with open('/content/drive/MyDrive/Air_Pollution_Models/Model_Descriptions.csv', 'w', newline='') as f:
  writer = csv.writer(f)
  writer.writerow(col_names)


In [11]:
def get_num_rows_csv(csv_path='/content/drive/MyDrive/Air_Pollution_Models/Model_Descriptions.csv'):
  """Count the records in a CSV file, header row included.

  Args:
    csv_path: file to read; defaults to the model-description CSV.

  Returns:
    int: number of records the csv reader yields (0 for an empty file).
  """
  # newline='' lets the csv module handle line endings itself, so a quoted
  # field containing a line break still counts as one record.
  with open(csv_path, mode="r", newline='') as csv_file:
    return sum(1 for _row in csv.reader(csv_file))


In [12]:
def append_to_csv(row, csv_path='/content/drive/MyDrive/Air_Pollution_Models/Model_Descriptions.csv'):
  """Append one record to a CSV file.

  Args:
    row: iterable of field values to write as a single record.
    csv_path: file to append to; defaults to the model-description CSV.

  Returns:
    None
  """
  # newline='' leaves line endings to the csv module; the with-block closes
  # the file, so no explicit close() is needed.
  with open(csv_path, 'a', newline='') as csv_file:
    csv.writer(csv_file).writerow(row)

Various tests

In [15]:
import time
import pickle


def train_intervals(df_dict, diff_df_dict, train_size = 500):
  """Train one VAR model per (interval, date range) dataframe and save it.

  For every dataframe the first train_size rows are used for training. Each
  trained model is described by a row appended to the model-description CSV
  and pickled to '<model index>.obj' in the models folder.

  Args:
    df_dict: {"<location>_<interval>": {date_range: dataframe}}.
    diff_df_dict: differenced dataframes, looked up as
      diff_df_dict[interval][date_range] when the membership test passes.
    train_size: number of leading rows used for training.

  Returns:
    None
  """
  # The next free index is the current number of CSV rows (header included).
  model_idx = get_num_rows_csv()
  for df_interval in df_dict:
    for date_range in df_dict[df_interval]:
      full_df = df_dict[df_interval][date_range]

      # NOTE(review): `date_range in diff_df_dict` looks at the OUTER keys of
      # diff_df_dict. If that dict is nested as {interval: {date_range: df}},
      # as the lookup below assumes, a date range is never an outer key, so
      # this branch is skipped and None is always passed on. The test should
      # probably be against diff_df_dict[df_interval]; it is left unchanged
      # because correcting it starts feeding a differenced dataframe into
      # train_var - confirm that path works before changing it.
      if df_interval in diff_df_dict and date_range in diff_df_dict:
        stationary_df = diff_df_dict[df_interval][date_range]
      else:
        stationary_df = None
      train_df, _valid_df, train_df_stationary = split_train_valid(full_df, stationary_df, train_size)

      title = df_interval + "_" + date_range
      print("Starting training for {}".format(title))
      start_time = time.time()
      fitted_model = train_var(train_df, train_df_stationary)
      elapsed_time = time.time() - start_time

      # i.e. Y&E_1m -> ('Y&E', '1m'); split on the last underscore only so a
      # location that itself contains '_' does not break the unpacking
      dev_loc, interval = df_interval.rsplit('_', 1)
      trained_model = Trained_Model(model_idx, fitted_model, 'VAR', train_size,
                                    full_df.columns,
                                    interval, date_range, dev_loc, elapsed_time)
      # add the model description to the csv file tracking models to index
      append_to_csv(trained_model.getCsvRow())

      filename = '/content/drive/MyDrive/Air_Pollution_Models/' + str(model_idx) + '.obj'
      # with-block closes the file even if pickling raises
      with open(filename, 'wb') as fileObj:
        pickle.dump(trained_model, fileObj)

      model_idx += 1 #increment model index


# Train and save a model for every dataframe in dfs, using the default train_size of 500.
train_intervals(dfs, diff_dfs)

Starting training for Y&E_60m_July-Oct




 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        17.36       17.85   3.479e+07       17.56
1        7.615       8.659       2030.       8.026
2        7.024      8.624*       1124.      7.653*
3        7.030       9.186       1131.       7.877
4       7.014*       9.727      1115.*       8.080
5        7.093       10.36       1210.       8.379
6        7.160       10.99       1297.       8.664
7        7.223       11.61       1387.       8.946
8        7.302       12.24       1510.       9.243
9        7.414       12.91       1699.       9.574
10       7.500       13.55       1867.       9.879
11       7.576       14.18       2035.       10.17
12       7.627       14.79       2168.       10.44
13       7.675       15.40       2306.       10.71
14       7.712       15.99       2432.       10.97
15       7.821       16.66       2764.       11.29
16       7.937       17.33     



                                             Statespace Model Results                                             
Dep. Variable:     ['CO', 'CO2', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2']   No. Observations:                  500
Model:                                                            VARX(4)   Log Likelihood               -7734.060
                                                              + intercept   AIC                          16164.121
Date:                                                    Sat, 25 Feb 2023   BIC                          17630.804
Time:                                                            18:20:38   HQIC                         16739.645
Sample:                                                                 0                                         
                                                                    - 500                                         
Covariance Type:                                                      opg       



                                             Statespace Model Results                                             
Dep. Variable:     ['CO', 'CO2', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2']   No. Observations:                  500
Model:                                                            VARX(2)   Log Likelihood               -8182.327
                                                              + intercept   AIC                          16836.654
Date:                                                    Sat, 25 Feb 2023   BIC                          17831.301
Time:                                                            18:22:01   HQIC                         17226.952
Sample:                                                        10-04-2022                                         
                                                             - 10-25-2022                                         
Covariance Type:                                                      opg       



 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        13.76       14.24   9.418e+05       13.95
1      -0.3682      0.6753      0.6921     0.04196
2      -1.037*     0.5629*     0.3547*    -0.4081*
3      -0.9815       1.175      0.3753     -0.1338
4      -0.9103       1.803      0.4036      0.1561
5      -0.8172       2.452      0.4439      0.4679
6      -0.7523       3.074      0.4751      0.7516
7      -0.6465       3.736      0.5303       1.076
8      -0.5974       4.342      0.5600       1.344
9      -0.5548       4.941      0.5882       1.605
10     -0.5055       5.546      0.6230       1.873
11     -0.4621       6.146      0.6572       2.136
12     -0.3675       6.797      0.7310       2.449
13     -0.3189       7.403      0.7783       2.716
14     -0.1889       8.089      0.9008       3.065
15    -0.08202       8.752       1.021       3.391
16    -0.01326       9.378     



                                             Statespace Model Results                                             
Dep. Variable:     ['CO', 'CO2', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2']   No. Observations:                  500
Model:                                                            VARX(2)   Log Likelihood               -5744.265
                                                              + intercept   AIC                          11928.529
Date:                                                    Sat, 25 Feb 2023   BIC                          12855.743
Time:                                                            18:23:12   HQIC                         12292.366
Sample:                                                        07-23-2022                                         
                                                             - 07-28-2022                                         
Covariance Type:                                                      opg       



 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        15.04       15.66   3.395e+06       15.28
1        2.323       3.506       10.21       2.788
2       1.522*      3.261*      4.586*      2.206*
3        1.547       3.843       4.708       2.450
4        1.639       4.491       5.165       2.760
5        1.656       5.065       5.270       2.996
6        1.701       5.666       5.530       3.260
7        1.782       6.303       6.021       3.559
8        1.772       6.850       5.994       3.768
9        1.808       7.443       6.260       4.023
10       1.773       7.964       6.097       4.207
11       1.875       8.623       6.823       4.527
12       1.802       9.106       6.422       4.673
13       1.832       9.693       6.714       4.922
14       1.947       10.36       7.657       5.255
15       1.972       10.95       8.005       5.499
16       1.959       11.49     



                                             Statespace Model Results                                             
Dep. Variable:     ['CO', 'CO2', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2']   No. Observations:                  500
Model:                                                            VARX(2)   Log Likelihood               -6463.878
                                                              + intercept   AIC                          13399.756
Date:                                                    Sat, 25 Feb 2023   BIC                          14394.404
Time:                                                            18:25:36   HQIC                         13790.054
Sample:                                                        10-04-2022                                         
                                                             - 10-09-2022                                         
Covariance Type:                                                      opg       



 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0      -0.9938     -0.5764      0.3702     -0.8297
1       -8.069     -7.095*   0.0003130      -7.687
2      -8.369*      -6.838  0.0002322*     -7.767*
3       -8.323      -6.237   0.0002431      -7.503
4       -8.291      -5.647   0.0002515      -7.252
5       -8.191      -4.991   0.0002786      -6.933
6       -8.091      -4.335   0.0003085      -6.615
7       -7.991      -3.678   0.0003426      -6.295
8       -7.930      -3.061   0.0003658      -6.016
9       -7.833      -2.407   0.0004056      -5.700
10      -7.706      -1.723   0.0004645      -5.354
11      -7.612      -1.073   0.0005153      -5.041
12      -7.574     -0.4781   0.0005416      -4.784
13      -7.562     0.08976   0.0005554      -4.554
14      -7.496      0.7126   0.0006030      -4.269
15      -7.401       1.364   0.0006756      -3.955
16      -7.322       1.999   0.



                                             Statespace Model Results                                             
Dep. Variable:     ['CO', 'CO2', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2']   No. Observations:                  500
Model:                                                            VARX(2)   Log Likelihood               -4089.578
                                                              + intercept   AIC                           8603.157
Date:                                                    Sat, 25 Feb 2023   BIC                           9496.654
Time:                                                            18:27:49   HQIC                          8953.764
Sample:                                                        07-23-2022                                         
                                                             - 07-23-2022                                         
Covariance Type:                                                      opg       



 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        10.54       11.17   3.792e+04       10.79
1        3.723      4.905*       41.39      4.188*
2        3.615       5.354       37.17       4.298
3        3.634       5.929       37.92       4.536
4        3.553       6.405       35.04       4.674
5       3.397*       6.805      30.03*       4.736
6        3.537       7.502       34.68       5.096
7        3.639       8.161       38.57       5.416
8        3.685       8.763       40.59       5.681
9        3.788       9.422       45.31       6.002
10       3.838       10.03       48.08       6.272
11       3.879       10.63       50.62       6.532
12       3.888       11.19       51.70       6.759
13       3.994       11.86       58.35       7.084
14       4.051       12.47       62.77       7.359
15       4.067       13.04       65.07       7.594
16       4.170       13.70     



                                             Statespace Model Results                                             
Dep. Variable:     ['CO', 'CO2', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2']   No. Observations:                  500
Model:                                                            VARX(5)   Log Likelihood               -6542.972
                                                              + intercept   AIC                          13941.944
Date:                                                    Sat, 25 Feb 2023   BIC                          15745.797
Time:                                                            18:41:54   HQIC                         14649.773
Sample:                                                        10-04-2022                                         
                                                             - 10-04-2022                                         
Covariance Type:                                                      opg       