# This file is used to train various models

## Unpickle dataframes from google drive and store in df dictionary

In [1]:
import pandas as pd
import os
import glob

DATA_PATH = "/content/drive/MyDrive/Air_Pollution_Data/dataframes"
all_pickle_files = glob.glob(DATA_PATH + '/*.pkl')

dfs = {}

for pkl_file in all_pickle_files:
  loc_and_interval = os.path.basename(pkl_file).split(".")[0] # i.e. Y&E_15m
  dfs[loc_and_interval] = pd.read_pickle(pkl_file)

Split data from 2022-10-03 11:00:00 to 2022-10-04 11:00:00

In [2]:
for df_name in dfs:
  df1 = dfs[df_name][:'2022-10-03 11:00:00-04:00']
  df2 = dfs[df_name]['2022-10-04 11:00:00-04:00':]
  dfs[df_name] = {"July-Oct": df1, "Oct-Jan": df2}

In [3]:
# deal with NaN values
def remove_nan_variables(df):
  for name, col in df.iteritems():
    if df[name].isnull().values.any():
      # column contains NaN values, drop from dataframe
      print(df[name].isnull().sum())
      df.drop(columns=name, inplace=True)
      print("Dropping column " + name + " because it contains NaN values.")

for interval in dfs:
  for date_range in dfs[interval]:
    print("Checking {} {} dataframe for NaN values\n".format(interval, date_range))
    remove_nan_variables(dfs[interval][date_range])

Checking Y&E_60m July-Oct dataframe for NaN values

1423
Dropping column NoiseLEQ because it contains NaN values.
1423
Dropping column NoiseMax because it contains NaN values.
1423
Dropping column WindGust because it contains NaN values.
Checking Y&E_60m Oct-Jan dataframe for NaN values

2127
Dropping column Noise because it contains NaN values.
Checking Y&E_15m July-Oct dataframe for NaN values

5693
Dropping column NoiseLEQ because it contains NaN values.
5693
Dropping column NoiseMax because it contains NaN values.
5694
Dropping column WindGust because it contains NaN values.
Checking Y&E_15m Oct-Jan dataframe for NaN values

8505
Dropping column Noise because it contains NaN values.
Checking Y&E_1m July-Oct dataframe for NaN values

403
Dropping column WindDirection because it contains NaN values.
85370
Dropping column NoiseLEQ because it contains NaN values.
85370
Dropping column NoiseMax because it contains NaN values.
85666
Dropping column WindGust because it contains NaN values

  for name, col in df.iteritems():
  for name, col in df.iteritems():
  for name, col in df.iteritems():
  for name, col in df.iteritems():
  for name, col in df.iteritems():
  for name, col in df.iteritems():


In [4]:
# test for stationarity, difference if seasonality exists
# https://michael-fuchs-python.netlify.app/2020/10/29/time-series-analysis-regression-extension-techniques-for-forecasting-multivariate-variables/#stationarity
from statsmodels.tsa.stattools import adfuller

def Augmented_Dickey_Fuller_Test_func(timeseries , column_name):
    '''
    Calculates statistical values whether the available data are stationary or not 
    
    Args:
        series (float64): Values of the column for which stationarity is to be checked, numpy array of floats 
        column_name (str): Name of the column for which stationarity is to be checked
    
    Returns:
        p-value that indicates whether the data are stationary or not
    '''
    print (f'Results of Dickey-Fuller Test for column: {column_name}')
    adfTest = adfuller(timeseries, autolag='AIC')   # why AIC vs BIC, t-stat, etc.?
    dfResults = pd.Series(adfTest[0:4], index=['ADF Test Statistic','P-Value','# Lags Used','# Observations Used'])
    for key, value in adfTest[4].items():
       dfResults['Critical Value (%s)'%key] = value
    print (dfResults)
    if adfTest[1] <= 0.05:
        print()
        print("Conclusion:")
        print("Reject the null hypothesis")
        print('\033[92m' + "Data is stationary" + '\033[0m')
        return True
    else:
        print()
        print("Conclusion:")
        print("Fail to reject the null hypothesis")
        print('\033[91m' + "Data is non-stationary" + '\033[0m')
        return False

# check each column of df for seasonality
def adf_df_test(df):
  non_station_cols = []
  for name, col in df.iteritems():
    if not df[name].isnull().values.any():
      is_stationary = Augmented_Dickey_Fuller_Test_func(df[name],name)
      print('\n')
      if not is_stationary:
        # add column to list of non-stationary ones
        non_station_cols.append(name)
    else:
      # column contains NaN values, drop from dataframe
      print(df[name].isnull().sum())
      df.drop(columns=name, inplace=True)
      print("Dropping column " + name + " because it contains NaN values.")
      

  return non_station_cols


In [7]:
import numpy as np

non_stat_cols_dict = {}
# test each df for stationarity
for group in diff_dfs:
  for split_df_names in diff_dfs[group]:
    print("STARTING ADF TEST FOR " + group + "_" + split_df_names + '\n')
    non_station_cols = adf_df_test(diff_dfs[group][split_df_names])
    non_stat_cols_dict[group + "_" + split_df_names] = non_station_cols

STARTING ADF TEST FOR Y&E_60m_July-Oct

Results of Dickey-Fuller Test for column: CO
ADF Test Statistic     -6.210428e+00
P-Value                 5.524864e-08
# Lags Used             2.400000e+01
# Observations Used     1.708000e+03
Critical Value (1%)    -3.434184e+00
Critical Value (5%)    -2.863234e+00
Critical Value (10%)   -2.567672e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: CO2
ADF Test Statistic     -6.283325e+00
P-Value                 3.750010e-08
# Lags Used             2.400000e+01
# Observations Used     1.708000e+03
Critical Value (1%)    -3.434184e+00
Critical Value (5%)    -2.863234e+00
Critical Value (10%)   -2.567672e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: Humidity
ADF Test Statistic        -5.007428
P-Value                    0.000021
# Lags Used               25.000000
# Observations Used   

  for name, col in df.iteritems():


ADF Test Statistic     -5.662861e+00
P-Value                 9.293722e-07
# Lags Used             2.500000e+01
# Observations Used     1.707000e+03
Critical Value (1%)    -3.434187e+00
Critical Value (5%)    -2.863235e+00
Critical Value (10%)   -2.567672e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: Noise
ADF Test Statistic        -4.289911
P-Value                    0.000462
# Lags Used               25.000000
# Observations Used     1707.000000
Critical Value (1%)       -3.434187
Critical Value (5%)       -2.863235
Critical Value (10%)      -2.567672
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: O3
ADF Test Statistic        -5.125476
P-Value                    0.000012
# Lags Used               24.000000
# Observations Used     1708.000000
Critical Value (1%)       -3.434184
Critical Value (5%)       -2.863234
Critical V

  for name, col in df.iteritems():


ADF Test Statistic     -1.194664e+01
P-Value                 4.414334e-22
# Lags Used             6.600000e+01
# Observations Used     1.039450e+05
Critical Value (1%)    -3.430413e+00
Critical Value (5%)    -2.861568e+00
Critical Value (10%)   -2.566785e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: CO2
ADF Test Statistic     -1.032390e+01
P-Value                 2.960031e-18
# Lags Used             6.500000e+01
# Observations Used     1.039460e+05
Critical Value (1%)    -3.430413e+00
Critical Value (5%)    -2.861568e+00
Critical Value (10%)   -2.566785e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: Humidity
ADF Test Statistic     -9.180160e+00
P-Value                 2.263276e-15
# Lags Used             6.500000e+01
# Observations Used     1.039460e+05
Critical Value (1%)    -3.430413e+00
Critical Value (5%)    -2.8615

In [None]:
print(non_stat_cols_dict)

{'Y&E_60m_July-Oct': ['Temperature'], 'Y&E_60m_Oct-Jan': [], 'Y&E_15m_July-Oct': [], 'Y&E_15m_Oct-Jan': [], 'Y&E_1m_July-Oct': ['Pressure'], 'Y&E_1m_Oct-Jan': []}


Difference non-stationary variables

In [6]:
non_stat_cols_dict = {'Y&E_60m': {'July-Oct': ['Temperature']}, 'Y&E_1m': {'July-Oct': ['Pressure']}}

def difference_df(df, non_stationary_columns):
  differenced_df = df.copy()
  differenced_df[non_stationary_columns] = df[non_stationary_columns].apply(lambda x: x.diff())

  # drop nan rows from beginning, differencing produces a NaN for first value
  while(differenced_df.iloc[0].isnull().values.any() == True):
    differenced_df.drop(index=differenced_df.index[0], axis=0, inplace=True)

  return differenced_df


diff_dfs = {}

for interval in non_stat_cols_dict:
  diff_dfs[interval] = {}
  for date_range in non_stat_cols_dict[interval]:
    diff_df = difference_df(dfs[interval][date_range], non_stat_cols_dict[interval][date_range])
    diff_dfs[interval][date_range] = diff_df


In [None]:
import pickle
#save dfs and diff_dfs for later use
with open('/content/drive/MyDrive/Air_Pollution_Data/dataframes/Y&E_dict.pickle', 'wb') as handle:
  pickle.dump(dfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('/content/drive/MyDrive/Air_Pollution_Data/dataframes/Y&E_diff_dict.pickle', 'wb') as handle:
  pickle.dump(diff_dfs, handle, protocol=pickle.HIGHEST_PROTOCOL)

Split into train and validation set

In [None]:
# train_size is number of data points to use in training.
# i.e. one day with 1hr intervals would be train_size = 24.
def split_train_valid(df, df_stationary, train_size):
  if df_stationary is not None:
    train_df_stationary = df_stationary[:int(train_size)]
  else:
    train_df_stationary = None
  train_df = df[:int(train_size)]
  print(df.head())
  print(train_df.head())

  valid_df = df[int(train_size):]

  return train_df, valid_df, train_df_stationary


In [None]:
split_train_valid(dfs['Y&E_15m']['July-Oct'], None, 500)

                              CO         CO2  Humidity     NO    NO2  Noise  \
Time                                                                          
2022-07-23 00:15:00-04:00  315.1  446.777778     41.23  18.21  24.46  54.71   
2022-07-23 00:30:00-04:00  307.1  444.888889     42.31  17.75  24.97  54.99   
2022-07-23 00:45:00-04:00  303.4  441.444444     42.80  17.71  25.20  52.64   
2022-07-23 01:00:00-04:00  307.6  442.444444     43.50  17.41  25.64  54.58   
2022-07-23 01:15:00-04:00  316.8  442.777778     44.07  17.71  25.81  51.52   

                              O3    PM1    PM10    PM2  Pressure  Temperature  \
Time                                                                            
2022-07-23 00:15:00-04:00  46.66  7.424   9.576  8.955     994.0        26.99   
2022-07-23 00:30:00-04:00  47.08  7.774   9.975  9.369     994.2        26.77   
2022-07-23 00:45:00-04:00  45.83  7.855  10.122  9.526     994.2        26.68   
2022-07-23 01:00:00-04:00  45.19  7.958  

(                              CO         CO2   Humidity     NO    NO2  Noise  \
 Time                                                                           
 2022-07-23 00:15:00-04:00  315.1  446.777778  41.230000  18.21  24.46  54.71   
 2022-07-23 00:30:00-04:00  307.1  444.888889  42.310000  17.75  24.97  54.99   
 2022-07-23 00:45:00-04:00  303.4  441.444444  42.800000  17.71  25.20  52.64   
 2022-07-23 01:00:00-04:00  307.6  442.444444  43.500000  17.41  25.64  54.58   
 2022-07-23 01:15:00-04:00  316.8  442.777778  44.070000  17.71  25.81  51.52   
 ...                          ...         ...        ...    ...    ...    ...   
 2022-07-28 04:00:00-04:00  274.6  452.166667  82.700000  15.01  28.40  43.54   
 2022-07-28 04:15:00-04:00  264.2  450.833333  82.444444  16.01  28.82  43.28   
 2022-07-28 04:30:00-04:00  275.2  451.166667  82.755556  16.12  28.30  41.90   
 2022-07-28 04:45:00-04:00  273.2  455.166667  83.288889  16.08  29.27  42.13   
 2022-07-28 05:00:00-04:00  

VAR model train

In [None]:
from statsmodels.tsa.api import VAR
from statsmodels.tsa.statespace.varmax import VARMAX

def train_var(df, df_stationary):

  predicting_vars = ["CO", "CO2", "NO", "NO2", "O3", "PM1", "PM10", "PM2"]
  env_vars = list(set(df.columns) - set(predicting_vars))
  # used to select best AIC lag order
  # see if the df was differenced
  if not df_stationary.empty:
    #model = VAR(df_stationary[predicting_vars], exog=df_stationary[env_vars])
    model = VAR(df_stationary)
  else:
    #model = VAR(df[predicting_vars], exog=df[env_vars])
    model = VAR(df)

  sorted_order=model.select_order(maxlags=10)
  print(sorted_order.summary())

  # use the non differenced df since VARMAX can do its own automatic differencing
  # second order is 0 since we're not using moving average here (MAX part of VARMAX)
  #var_model = VARMAX(df[predicting_vars], exog=df[env_vars], order=(sorted_order.selected_orders['aic'],0), enforce_stationarity=True)
  var_model = VARMAX(df, order=(sorted_order.selected_orders['aic'],0), enforce_stationarity=True)
  fitted_model = var_model.fit(disp=False)
  print(fitted_model.summary())

  return fitted_model


In [None]:
pred_columns = ["PM1", "PM2", "PM10", "CO", "CO2", "O3", "NO", "NO2"]
metric_columns=["mape", "me", "mae", "mpe", "rmse", "corr", "minmax"]
def adjust(val, length= 6): 
  return str(val).ljust(length)


def get_metrics_df(pred_df_dict):
    metrics_df = pd.DataFrame()

    col_index = pd.MultiIndex.from_product([pred_columns, metric_columns], names=["pred_cols", "metric_cols"])
    metrics_df = pd.DataFrame(index=pred_df_dict.keys(), columns=col_index)
    for pred_range, pred_df in pred_df_dict.items():
        for col in pred_columns:
            print('\nForecast accuracy of ' + col)
            accuracy_prod = forecast_accuracy(pred_df[col + '_Prediction'].values, pred_df[col])
            for k, v in accuracy_prod.items():
                print(adjust(k), ': ', round(v, 4))
                metrics_df.at[pred_range, (col, k)] = round(v, 4)

    return metrics_df

In [None]:
!pip install pmdarima
import pmdarima as pm
import numpy as np
pred_columns = ["PM1", "PM2", "PM10", "CO", "CO2", "O3", "NO", "NO2"]

# get best p and q values
def get_p_q(df_stationary, df, test_df, model_id):
  pq = set()
  for col in pred_columns:
    print(f'Searching order of p and q for : {col}')
    stepwise_model = pm.auto_arima(df_stationary[col],start_p=1, start_q=1,max_p=5, max_q=5, seasonal=False,
      trace=True,error_action='ignore',suppress_warnings=True, stepwise=True,maxiter=1000)
    parameter = stepwise_model.get_params().get('order')
    print(f'optimal order for:{col} is: {parameter} \n\n')
    pq.add(parameter)

  mape_cols = ["MAPE " + pred_col for pred_col in pred_columns]
  df_results_VARMA = pd.DataFrame(columns=['p', 'q'] + mape_cols)

  for i in pq:
    if i[0]== 0 and i[2] ==0:
        pass
    else:
        print(f' Running for {i}')
        model = VARMAX(df_stationary, order=(i[0],i[2]), enforce_stationarity=False).fit(disp=False)
        forecast = model.forecast(steps = len(test_df))
        forecast = forecast.set_index(test_df.index)
        
        varma_dict = {}
        #calculate rmse for each air pollutant
        for col in pred_columns:
          varma_dict['MAPE ' + col] = np.mean(np.abs(forecast[col] - test_df[col])/np.abs(test_df[col]))  # MAPE
        varma_dict['p'] = i[0]
        varma_dict['q'] = i[2]
        print(varma_dict)
        df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)

  # average rmse for a given p and q
  df_results_VARMA["Average MAPE"] = df_results_VARMA[mape_cols].mean(axis=1)

  #sort by average rmse
  df_results_VARMA = df_results_VARMA.sort_values("Average MAPE")

  #save this dataframe as csv
  df_results_VARMA.to_csv(f'/content/drive/MyDrive/Air_Pollution_Models/P_Q_Table_{model_id}.csv')

  best_p = df_results_VARMA['p'].iloc[0]
  best_q = df_results_VARMA['q'].iloc[0]

  return best_p, best_q

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
class Trained_Model:
  def __init__(self, idx, model, model_type, train_size, variables_used, \
               data_interval, date_range, device_location, training_time):
    self.idx = idx
    self.model = model
    self.model_type = model_type
    self.train_size = train_size
    self.var_used = variables_used
    self.interval = data_interval
    self.date_range = date_range
    self.dev_loc = device_location
    self.train_time = training_time

  def getFileDescription(self):
    return self.model_type + '_' + self.dev_loc + '_' + self.interval + '_' + self.date_range + '_' + str(self.train_size) + '_' + self.var_used

  def getCsvDict(self):
    return {"Index":self.idx, "Model Type":self.model_type, "Device Location":self.dev_loc, \
            "Data Interval":self.interval, "Date Range":self.date_range, "Training Size":self.train_size, \
            "Variables Used":self.var_used, "Training Time":self.train_time}
  
  def getCsvRow(self):
    return [self.idx, self.model_type, self.dev_loc, self.interval, \
            self.date_range, self.train_size, self.var_used, self.train_time]


In [None]:
# # create csv file to match trained model to an index
# import csv

# col_names = ["Index", "Model Type", "Device Location", "Data Interval", "Date Range", "Training Size", "Variables Used", "Training Time"]

# with open('/content/drive/MyDrive/Air_Pollution_Models/Model_Descriptions.csv', 'w') as f:
#   writer = csv.writer(f)
#   writer.writerow(col_names)
#   f.close()


In [None]:
import csv
def get_num_rows_csv():
  cnt = 0
  with open('/content/drive/MyDrive/Air_Pollution_Models/Model_Descriptions.csv', mode="r") as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
      cnt += 1
    
    csv_file.close()
  
  return cnt


In [None]:
def append_to_csv(row):
  with open('/content/drive/MyDrive/Air_Pollution_Models/Model_Descriptions.csv', 'a') as csv_file:
    writer_obj = csv.writer(csv_file)
    writer_obj.writerow(row)
    csv_file.close()

In [None]:
import time
from statsmodels.tsa.statespace.varmax import VARMAX
import pickle

# try to train a single varma model
model_idx = get_num_rows_csv()
df_interval = 'Y&E_1m'
date_range = 'July-Oct'
train_size = 1000
# 1 week = 24*7 = 168 hours
test_size = 500
if df_interval in diff_dfs and date_range in diff_dfs[df_interval]:
  train_df, valid_df, train_df_stationary = split_train_valid(dfs[df_interval][date_range], \
                                                                    diff_dfs[df_interval][date_range], train_size)
else:
  train_df, valid_df, train_df_stationary = split_train_valid(dfs[df_interval][date_range], \
                                                              None, train_size)
title = "VARMA_" + df_interval + "_" + date_range
test_df = valid_df.iloc[:test_size]
print(test_df.head())
print("Starting training for {}".format(title))
start_time = time.time()
p, q = get_p_q(train_df_stationary, train_df, test_df, model_idx)
fitted_model = VARMAX(train_df, order=(int(p), int(q))).fit(disp=False)
end_time = time.time()
elapsed_time = end_time - start_time
dev_loc, interval = df_interval.split('_') # i.e. Y&E_1m
trained_model = Trained_Model(model_idx, fitted_model, 'VARMA', train_size, \
                              dfs[df_interval][date_range].columns, \
                              interval, date_range, dev_loc, elapsed_time)
csv_row = trained_model.getCsvRow()
append_to_csv(csv_row) # add the model description to the csv file tracking models to index
filename = '/content/drive/MyDrive/Air_Pollution_Models/' + str(model_idx) + '.obj'

fileObj = open(filename, 'wb')
pickle.dump(trained_model, fileObj)
fileObj.close()

                              CO         CO2  Humidity     NO    NO2  Noise  \
Time                                                                          
2022-07-23 00:01:00-04:00  297.8  445.000000     40.63  18.73  24.01  53.56   
2022-07-23 00:02:00-04:00  298.9  443.666667     40.74  17.66  23.83  55.45   
2022-07-23 00:03:00-04:00  304.0  444.000000     40.88  18.27  25.22  55.93   
2022-07-23 00:04:00-04:00  310.9  444.333333     40.92  18.88  24.02  54.21   
2022-07-23 00:05:00-04:00  380.5  445.111111     40.93  18.18  23.04  52.89   

                              O3    PM1   PM10    PM2  Pressure  Temperature  \
Time                                                                           
2022-07-23 00:01:00-04:00  47.69  7.215  9.504  8.731     994.0        27.13   
2022-07-23 00:02:00-04:00  47.20  7.255  9.639  8.757     994.0        27.10   
2022-07-23 00:03:00-04:00  47.12  7.238  9.368  8.693     993.9        27.10   
2022-07-23 00:04:00-04:00  46.89  7.220  9.265

  self._init_dates(dates, freq)


{'MAPE PM1': 0.5644870977009891, 'MAPE PM2': 0.5643337642938188, 'MAPE PM10': 0.5565665107565694, 'MAPE CO': 0.20741053723593791, 'MAPE CO2': 0.06188214047716334, 'MAPE O3': 0.15406863903068319, 'MAPE NO': 0.16254697113920233, 'MAPE NO2': 0.5331109723519146, 'p': 1, 'q': 0}
 Running for (1, 1, 3)


  df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)
  warn('Estimation of VARMA(p,q) models is not generically robust,'
  self._init_dates(dates, freq)


{'MAPE PM1': 0.5738067033038002, 'MAPE PM2': 0.5731320140944798, 'MAPE PM10': 0.5646565661239389, 'MAPE CO': 0.20757974856631684, 'MAPE CO2': 0.060645794743425754, 'MAPE O3': 0.1575746464133571, 'MAPE NO': 0.16855976744126946, 'MAPE NO2': 0.5349270190903472, 'p': 1, 'q': 3}
 Running for (5, 1, 1)


  df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)
  warn('Estimation of VARMA(p,q) models is not generically robust,'
  self._init_dates(dates, freq)


{'MAPE PM1': 0.6556025526408131, 'MAPE PM2': 0.6462489258971199, 'MAPE PM10': 0.631619897511498, 'MAPE CO': 0.16471233698253135, 'MAPE CO2': 0.0345727341677592, 'MAPE O3': 0.10201942392929476, 'MAPE NO': 0.15241756897468264, 'MAPE NO2': 0.403027415789639, 'p': 5, 'q': 1}
 Running for (3, 1, 2)


  df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)
  warn('Estimation of VARMA(p,q) models is not generically robust,'
  self._init_dates(dates, freq)


{'MAPE PM1': 0.633721330018632, 'MAPE PM2': 0.6280836008720073, 'MAPE PM10': 0.6154961421621148, 'MAPE CO': 0.18298631241172963, 'MAPE CO2': 0.0416519708149123, 'MAPE O3': 0.10932476654041054, 'MAPE NO': 0.1625789430770432, 'MAPE NO2': 0.4360372117533594, 'p': 3, 'q': 2}
 Running for (1, 1, 1)


  df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)
  warn('Estimation of VARMA(p,q) models is not generically robust,'
  self._init_dates(dates, freq)


{'MAPE PM1': 0.5883732165034777, 'MAPE PM2': 0.5866837577238723, 'MAPE PM10': 0.5773169495124598, 'MAPE CO': 0.20260011966057856, 'MAPE CO2': 0.05507827702761115, 'MAPE O3': 0.13833131255771777, 'MAPE NO': 0.16919779178142758, 'MAPE NO2': 0.5005117734067409, 'p': 1, 'q': 1}
 Running for (0, 1, 1)


  df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)
  self._init_dates(dates, freq)


{'MAPE PM1': 0.66511444666934, 'MAPE PM2': 0.6569244154113097, 'MAPE PM10': 0.6414205714049681, 'MAPE CO': 0.15522738965749072, 'MAPE CO2': 0.012605895366718772, 'MAPE O3': 0.2465050187785018, 'MAPE NO': 0.17596935083357088, 'MAPE NO2': 0.5921315713190436, 'p': 0, 'q': 1}


  df_results_VARMA = df_results_VARMA.append(varma_dict, ignore_index=True)
  warn('Estimation of VARMA(p,q) models is not generically robust,'
  self._init_dates(dates, freq)


In [None]:
import time
from statsmodels.tsa.statespace.varmax import VARMAX
import pickle

def train_different_sizes_varma(df_dict, diff_df_dict, train_sizes, df_interval, test_size):
  model_idx = get_num_rows_csv()
  for date_range in df_dict[df_interval]:
    for train_size in train_sizes:
      if df_interval in diff_df_dict and date_range in diff_df_dict:
        train_df, valid_df, train_df_stationary = split_train_valid(df_dict[df_interval][date_range], \
                                                                    diff_df_dict[df_interval][date_range], train_size)
      else:
        train_df, valid_df, train_df_stationary = split_train_valid(df_dict[df_interval][date_range], \
                                                                    None, train_size)
      title = df_interval + "_" + date_range
      print("Starting training for VARMA_{}".format(title))
      test_df = valid_df.iloc[:test_size]
      start_time = time.time()
      if train_df_stationary == None:
        train_df_stationary = train_df.copy()
      p, q = get_p_q(train_df_stationary, train_df, test_df, model_idx)
      fitted_model = VARMAX(train_df_stationary, order=(int(p), int(q)), enforce_stationarity=False).fit(disp=False)      
      end_time = time.time()
      elapsed_time = end_time - start_time
      dev_loc, interval = df_interval.split('_') # i.e. Y&E_1m
      trained_model = Trained_Model(model_idx, fitted_model, 'VARMA', train_size, \
                                    df_dict[df_interval][date_range].columns, \
                                    interval, date_range, dev_loc, elapsed_time)
      csv_row = trained_model.getCsvRow()
      append_to_csv(csv_row) # add the model description to the csv file tracking models to index
      filename = '/content/drive/MyDrive/Air_Pollution_Models/' + str(model_idx) + '.obj'

      fileObj = open(filename, 'wb')
      pickle.dump(trained_model, fileObj)
      fileObj.close()

      model_idx += 1 #increment model index  

In [None]:
# train various sizes
#sizes_15 = [100, 300, 500, 1000, 3000, 5000] #~ 1 day, 3 days, 1 week, 2 weeks, over a month, 2 months
sizes_part_2 = [3000, 5000]
train_different_sizes_varma(dfs, diff_dfs, sizes_part_2, 'Y&E_15m', test_size=672)

                              CO         CO2  Humidity     NO    NO2  Noise  \
Time                                                                          
2022-07-23 00:15:00-04:00  315.1  446.777778     41.23  18.21  24.46  54.71   
2022-07-23 00:30:00-04:00  307.1  444.888889     42.31  17.75  24.97  54.99   
2022-07-23 00:45:00-04:00  303.4  441.444444     42.80  17.71  25.20  52.64   
2022-07-23 01:00:00-04:00  307.6  442.444444     43.50  17.41  25.64  54.58   
2022-07-23 01:15:00-04:00  316.8  442.777778     44.07  17.71  25.81  51.52   

                              O3    PM1    PM10    PM2  Pressure  Temperature  \
Time                                                                            
2022-07-23 00:15:00-04:00  46.66  7.424   9.576  8.955     994.0        26.99   
2022-07-23 00:30:00-04:00  47.08  7.774   9.975  9.369     994.2        26.77   
2022-07-23 00:45:00-04:00  45.83  7.855  10.122  9.526     994.2        26.68   
2022-07-23 01:00:00-04:00  45.19  7.958  

  warn('Estimation of VARMA(p,q) models is not generically robust,'
  self._init_dates(dates, freq)


KeyboardInterrupt: ignored

Various tests

In [None]:
import time
import pickle


def train_intervals(df_dict, diff_df_dict, train_size = 500):
  model_idx = get_num_rows_csv()
  for df_interval in df_dict:
    for date_range in df_dict[df_interval]:
      if df_interval in diff_df_dict and date_range in diff_df_dict:
        train_df, valid_df, train_df_stationary = split_train_valid(df_dict[df_interval][date_range], \
                                                                    diff_df_dict[df_interval][date_range], train_size)
      else:
        train_df, valid_df, train_df_stationary = split_train_valid(df_dict[df_interval][date_range], \
                                                                    None, train_size)
      title = df_interval + "_" + date_range
      print("Starting training for {}".format(title))
      start_time = time.time()
      fitted_model = train_var(train_df, train_df_stationary)
      end_time = time.time()
      elapsed_time = end_time - start_time
      dev_loc, interval = df_interval.split('_') # i.e. Y&E_1m
      trained_model = Trained_Model(model_idx, fitted_model, 'VAR', train_size, \
                                    df_dict[df_interval][date_range].columns, \
                                    interval, date_range, dev_loc, elapsed_time)
      csv_row = trained_model.getCsvRow()
      append_to_csv(csv_row) # add the model description to the csv file tracking models to index
      filename = '/content/drive/MyDrive/Air_Pollution_Models/' + str(model_idx) + '.obj'

      fileObj = open(filename, 'wb')
      pickle.dump(trained_model, fileObj)
      fileObj.close()

      model_idx += 1 #increment model index


train_intervals(dfs, diff_dfs)

Starting training for Y&E_60m_July-Oct
 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        34.51       34.63   9.703e+14       34.56
1        7.756      9.553*       2336.       8.462
2       6.374*       9.850      587.7*      7.739*
3        6.426       11.58       621.3       8.449
4        6.597       13.43       743.5       9.279
5        6.805       15.31       928.3       10.15
6        7.109       17.30       1285.       11.11
7        7.284       19.15       1577.       11.94
8        7.435       20.98       1907.       12.75
9        7.539       22.76       2228.       13.52
10       7.716       24.61       2835.       14.35
--------------------------------------------------


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                                                                    Statespace Model Results                                                                                    
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'Noise', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindDirection', 'WindSpeed']   No. Observations:                  500
Model:                                                                                                                                           VAR(2)   Log Likelihood              -11138.641
                                                                                                                                            + intercept   AIC                          23299.283
Date:                                                                                                                                  Fri, 17 Mar 2023   BIC                          25452.948
Time:                              

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                                                                                  Statespace Model Results                                                                                                 
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindDirection', 'WindSpeed', 'NoiseLEQ', 'NoiseMax', 'WindGust']   No. Observations:                  500
Model:                                                                                                                                                                      VAR(2)   Log Likelihood              -10873.705
                                                                                                                                                                       + intercept   AIC                          23075.410
Date:                                                                                                                   

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                                                                    Statespace Model Results                                                                                    
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'Noise', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindDirection', 'WindSpeed']   No. Observations:                  500
Model:                                                                                                                                           VAR(2)   Log Likelihood               -8002.835
                                                                                                                                            + intercept   AIC                          17027.670
Date:                                                                                                                                  Fri, 17 Mar 2023   BIC                          19181.334
Time:                              

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                                                                                  Statespace Model Results                                                                                                 
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindDirection', 'WindSpeed', 'NoiseLEQ', 'NoiseMax', 'WindGust']   No. Observations:                  500
Model:                                                                                                                                                                      VAR(2)   Log Likelihood               -7379.666
                                                                                                                                                                       + intercept   AIC                          16087.332
Date:                                                                                                                   

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                                                                            Statespace Model Results                                                                           
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'Noise', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindSpeed']   No. Observations:                  500
Model:                                                                                                                          VAR(2)   Log Likelihood               -1911.405
                                                                                                                           + intercept   AIC                           4706.810
Date:                                                                                                                 Fri, 17 Mar 2023   BIC                           6569.667
Time:                                                                                                                   

  self._init_dates(dates, freq)


 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        12.65       12.79   3.120e+05       12.70
1       -11.93     -9.600*   6.603e-06     -11.01*
2      -12.32*      -7.804  4.460e-06*      -10.55
3       -12.09      -5.383   5.654e-06      -9.458
4       -12.03      -3.123   6.142e-06      -8.529
5       -12.08     -0.9893   5.938e-06      -7.726
6       -11.60       1.682   9.950e-06      -6.385
7       -11.26       4.220   1.479e-05      -5.178
8       -11.03       6.634   1.979e-05      -4.095
9       -10.76       9.095   2.832e-05      -2.965
10      -10.55       11.50   3.926e-05      -1.890
--------------------------------------------------


  self._init_dates(dates, freq)


                                                                                                  Statespace Model Results                                                                                                 
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindDirection', 'WindSpeed', 'NoiseLEQ', 'NoiseMax', 'WindGust']   No. Observations:                  500
Model:                                                                                                                                                                      VAR(2)   Log Likelihood               -7858.337
                                                                                                                                                                       + intercept   AIC                          17044.674
Date:                                                                                                                   

In [None]:
import time
from statsmodels.tsa.statespace.varmax import VARMAX
import pickle

def train_different_sizes_var(df_dict, diff_df_dict, train_sizes, df_interval, test_size):
  model_idx = get_num_rows_csv()
  for date_range in df_dict[df_interval]:
    for train_size in train_sizes:
      if df_interval in diff_df_dict and date_range in diff_df_dict:
        train_df, valid_df, train_df_stationary = split_train_valid(df_dict[df_interval][date_range], \
                                                                    diff_df_dict[df_interval][date_range], train_size)
      else:
        train_df, valid_df, train_df_stationary = split_train_valid(df_dict[df_interval][date_range], \
                                                                    None, train_size)
      title = df_interval + "_" + date_range
      print("Starting training for VAR_{}".format(title))
      test_df = valid_df.iloc[:test_size]
      start_time = time.time()
      if train_df_stationary == None:
        train_df_stationary = train_df.copy()
      fitted_model = train_var(train_df, train_df_stationary)
      end_time = time.time()
      elapsed_time = end_time - start_time
      dev_loc, interval = df_interval.split('_') # i.e. Y&E_1m
      trained_model = Trained_Model(model_idx, fitted_model, 'VAR', train_size, \
                                    df_dict[df_interval][date_range].columns, \
                                    interval, date_range, dev_loc, elapsed_time)
      csv_row = trained_model.getCsvRow()
      append_to_csv(csv_row) # add the model description to the csv file tracking models to index
      filename = '/content/drive/MyDrive/Air_Pollution_Models/' + str(model_idx) + '.obj'

      fileObj = open(filename, 'wb')
      pickle.dump(trained_model, fileObj)
      fileObj.close()

      model_idx += 1 #increment model index  

In [None]:
#sizes_15 = [100, 300, 500, 1000, 3000, 5000] #~ 1 day, 3 days, 1 week, 2 weeks, over a month, 2 months
sizes = [1000]
train_different_sizes_var(dfs, diff_dfs, sizes, 'Y&E_60m', test_size=500)

                              CO         CO2  Humidity     NO    NO2  Noise  \
Time                                                                          
2022-07-23 01:00:00-04:00  308.2  443.777778     42.47  17.76  25.07  54.22   
2022-07-23 02:00:00-04:00  304.7  439.444444     46.10  17.40  26.69  51.85   
2022-07-23 03:00:00-04:00  296.1  440.111111     50.04  17.33  27.87  50.52   
2022-07-23 04:00:00-04:00  262.4  433.333333     49.51  18.67  24.20  49.39   
2022-07-23 05:00:00-04:00  245.2  431.555556     50.63  19.19  21.87  47.24   

                              O3    PM1    PM10    PM2  Pressure  Temperature  \
Time                                                                            
2022-07-23 01:00:00-04:00  46.18  7.752   9.943  9.357     994.2        26.77   
2022-07-23 02:00:00-04:00  48.15  7.711   9.772  9.222     994.1        26.55   
2022-07-23 03:00:00-04:00  47.12  8.120  10.235  9.670     994.2        25.89   
2022-07-23 04:00:00-04:00  45.65  6.653  

  self._init_dates(dates, freq)


 VAR Order Selection (* highlights the minimums)  
       AIC         BIC         FPE         HQIC   
--------------------------------------------------
0        38.22       38.29   3.963e+16       38.24
1        9.644       10.68   1.543e+04       10.04
2        7.861      9.870*       2595.      8.625*
3       7.730*       10.71      2276.*       8.862
4        7.779       11.73       2394.       9.280
5        7.832       12.75       2529.       9.702
6        7.960       13.85       2882.       10.20
7        8.072       14.93       3234.       10.68
8        8.188       16.01       3647.       11.16
9        8.269       17.07       3979.       11.61
10       8.343       18.11       4315.       12.06
--------------------------------------------------


  self._init_dates(dates, freq)


                                                                                    Statespace Model Results                                                                                    
Dep. Variable:     ['CO', 'CO2', 'Humidity', 'NO', 'NO2', 'Noise', 'O3', 'PM1', 'PM10', 'PM2', 'Pressure', 'Temperature', 'WindDirection', 'WindSpeed']   No. Observations:                 1000
Model:                                                                                                                                           VAR(3)   Log Likelihood              -23132.450
                                                                                                                                            + intercept   AIC                          47678.900
Date:                                                                                                                                  Wed, 29 Mar 2023   BIC                          51148.683
Time:                              

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


KeyboardInterrupt: ignored