<a href="https://colab.research.google.com/github/markuskunej/air-pollution-thesis/blob/master/training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This file is used to train various models

## Unpickle dataframes from Google Drive and store them in the `dfs` dictionary

In [45]:
import pandas as pd
import os
import glob

DATA_PATH = "/content/drive/MyDrive/Air_Pollution_Data/dataframes"
all_pickle_files = glob.glob(DATA_PATH + '/*.pkl')

# Load every pickle into a dict keyed by the part of the file name before the
# first dot, i.e. Y&E_15m.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
dfs = {
  os.path.basename(pickle_path).split(".")[0]: pd.read_pickle(pickle_path)
  for pickle_path in all_pickle_files
}

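Split each dataframe into two periods: everything up to 2022-10-03 11:00:00 ("July-Oct") and everything from 2022-10-04 11:00:00 onward ("Oct-Jan"). The data between those two timestamps is excluded.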

In [46]:
# Boundaries of the window that is left out of both splits.
# Assumes each dataframe has a sorted, tz-aware DatetimeIndex, so that label
# slicing with these strings selects by time -- TODO confirm against the pickles.
SPLIT_GAP_START = '2022-10-03 11:00:00-04:00'
SPLIT_GAP_END = '2022-10-04 11:00:00-04:00'

for df_name, frame in dfs.items():
  if isinstance(frame, dict):
    # This entry was already split by an earlier run of the cell; slicing the
    # resulting dict again would raise, so leave it untouched.
    continue
  # Replace the single dataframe with its two time-period slices.
  dfs[df_name] = {
    "July-Oct": frame[:SPLIT_GAP_START],
    "Oct-Jan": frame[SPLIT_GAP_END:],
  }

In [47]:
# test for stationarity, difference if seasonality exists
# https://michael-fuchs-python.netlify.app/2020/10/29/time-series-analysis-regression-extension-techniques-for-forecasting-multivariate-variables/#stationarity
from statsmodels.tsa.stattools import adfuller

def Augmented_Dickey_Fuller_Test_func(timeseries, column_name):
    '''
    Run the Augmented Dickey-Fuller test on one series and print a report.

    Args:
        timeseries: Values of the column to check; passed as-is to ``adfuller``.
        column_name (str): Name of the column, used only in the printed header.

    Returns:
        bool: True if the test's p-value is <= 0.05 (printed as "stationary"),
        False otherwise (printed as "non-stationary").
    '''
    print(f'Results of Dickey-Fuller Test for column: {column_name}')
    # TODO: justify autolag='AIC' over the alternatives (BIC, t-stat, etc.)
    adf_result = adfuller(timeseries, autolag='AIC')

    # Entries 0-3 of the result are the headline figures; entry 4 is a dict of
    # critical values keyed by significance level.
    headline_labels = ['ADF Test Statistic', 'P-Value', '# Lags Used', '# Observations Used']
    report = pd.Series(adf_result[0:4], index=headline_labels)
    for level, critical_value in adf_result[4].items():
        report['Critical Value (%s)' % level] = critical_value
    print(report)

    is_stationary = bool(adf_result[1] <= 0.05)
    if is_stationary:
        decision = "Reject the null hypothesis"
        verdict = '\033[92m' + "Data is stationary" + '\033[0m'
    else:
        decision = "Fail to reject the null hypothesis"
        verdict = '\033[91m' + "Data is non-stationary" + '\033[0m'

    print()
    print("Conclusion:")
    print(decision)
    print(verdict)
    return is_stationary

# check each column of df for stationarity
def adf_df_test(df):
  '''
  Run the ADF stationarity check on every NaN-free column of ``df``.

  Columns that contain any NaN are not tested; their NaN count is printed and
  they are dropped from ``df`` in place (the caller's dataframe is modified).

  Args:
      df (pd.DataFrame): Dataframe whose columns are checked; mutated in place.

  Returns:
      list: Names of the tested columns reported as non-stationary, in column order.
  '''
  non_station_cols = []
  # Iterate over a snapshot of the labels: columns are dropped inside the loop,
  # and walking the live frame while it shrinks is unsafe. This also avoids
  # DataFrame.iteritems(), which was removed in pandas 2.0.
  for name in list(df.columns):
    column = df[name]
    if column.isnull().values.any():
      # column contains NaN values, drop from dataframe
      print(column.isnull().sum())
      df.drop(columns=name, inplace=True)
      print("Dropping column " + name + " because it contains NaN values.")
      continue

    is_stationary = Augmented_Dickey_Fuller_Test_func(column, name)
    print('\n')
    if not is_stationary:
      # add column to list of non-stationary ones
      non_station_cols.append(name)

  return non_station_cols


In [48]:
import numpy as np

# Maps "<group>_<split name>" to the list of columns flagged as non-stationary.
non_stat_cols_dict = {}
for group, split_frames in dfs.items():
  for split_name, split_df in split_frames.items():
    run_label = f"{group}_{split_name}"
    print(f"STARTING ADF TEST FOR {run_label}\n")
    # adf_df_test also drops NaN-containing columns from split_df in place.
    non_stat_cols_dict[run_label] = adf_df_test(split_df)

STARTING ADF TEST FOR Y&E_60m_July-Oct

Results of Dickey-Fuller Test for column: CO
ADF Test Statistic     -6.218195e+00
P-Value                 5.302048e-08
# Lags Used             2.400000e+01
# Observations Used     1.709000e+03
Critical Value (1%)    -3.434182e+00
Critical Value (5%)    -2.863233e+00
Critical Value (10%)   -2.567671e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: CO2
ADF Test Statistic     -6.317823e+00
P-Value                 3.118952e-08
# Lags Used             2.400000e+01
# Observations Used     1.709000e+03
Critical Value (1%)    -3.434182e+00
Critical Value (5%)    -2.863233e+00
Critical Value (10%)   -2.567671e+00
dtype: float64

Conclusion:
Reject the null hypothesis
[92mData is stationary[0m


Results of Dickey-Fuller Test for column: Humidity
ADF Test Statistic        -5.006961
P-Value                    0.000021
# Lags Used               25.000000
# Observations Used   

In [50]:
# Show the stationarity summary: one "<group>_<split>" key per tested dataframe,
# each mapped to the list of its columns flagged as non-stationary.
print(non_stat_cols_dict)

{'Y&E_60m_July-Oct': ['Temperature'], 'Y&E_60m_Oct-Jan': [], 'Y&E_15m_July-Oct': [], 'Y&E_15m_Oct-Jan': [], 'Y&E_1m_July-Oct': ['Pressure'], 'Y&E_1m_Oct-Jan': []}


In [None]:
def train(df):
  