# AIMS

In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported modules
# before each cell executes so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [11]:
import pandas as pd
import datetime
import plotly.express as px

from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima

import warnings

# Lift pandas' display truncation: show every column and every row of a frame.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Silence all Python warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [3]:
import sys
# Add the parent directory to the module search path so the `utils` imports below
# can resolve (assumes the notebook lives one level below the project root — TODO confirm).
sys.path.append("../") 

import utils.paths as path
from utils.paths2 import direcciones

In [4]:
# Create the Drive directory handles (original note: "create the Drive ones") by
# unpacking the seven values returned by direcciones('temperature').
# NOTE(review): judging by the names these are the raw / processed / interim / external
# data, models, reports and report-figures locations — confirm against utils.paths2.
G_raw, G_processed, G_interim, G_external, G_models, G_reports, G_reports_figures = direcciones('temperature')

In [9]:
def adf_test(dataset):
    """Run an Augmented Dickey-Fuller test on ``dataset`` and print a report.

    ``adfuller`` is called with ``autolag='AIC'``. Nothing is returned: the
    test statistic, p-value, number of lags, number of observations and the
    critical values are written to stdout, one per line.
    """
    results = adfuller(dataset, autolag='AIC')
    statistic, p_value, lags_used, n_obs, critical_values = results[:5]

    # Numbered scalar entries of the report, in print order.
    report = (
        ("1. ADF : ", statistic),
        ("2. P-Value : ", p_value),
        ("3. Num Of Lags : ", lags_used),
        ("4. Num Of Observations Used For ADF Regression and Critical Values Calculation :", n_obs),
    )
    for label, value in report:
        print(label, value)

    # Critical values are a mapping of significance level -> threshold.
    print("5. Critical Values :")
    for level in critical_values:
        print("\t", level, ": ", critical_values[level])

In [5]:
# Load the processed dataset from the G_processed directory and preview its first rows.
# (A local alternative is path.data_processed_dir('dataset.parquet.gzip').)
processed_file = G_processed / 'dataset.parquet.gzip'
df = pd.read_parquet(processed_file)
df.head()

Unnamed: 0,date,t_max,t_min,t_mean,fenomeno
0,2007-01-01,26.4,4.4,14.11,1
1,2007-02-01,25.4,-4.6,13.48,0
2,2007-03-01,24.0,5.1,14.23,0
3,2007-04-01,23.0,6.6,14.75,0
4,2007-05-01,21.9,5.9,13.26,0


In [10]:
# Stationarity check: print the Augmented Dickey-Fuller report for the t_mean column.
adf_test(df['t_mean'])

1. ADF :  -3.6125052887630544
2. P-Value :  0.00552640506700218
3. Num Of Lags :  13
4. Num Of Observations Used For ADF Regression and Critical Values Calculation : 166
5. Critical Values :
	 1% :  -3.4703698981001665
	 5% :  -2.8791138497902193
	 10% :  -2.576139407751488


In [12]:
# Let pmdarima search for an ARIMA order for t_mean and display the fitted summary.
# NOTE(review): the search is run on the full series, including the rows that are
# later held out as the test set — consider restricting it to the training rows.
target_series = df['t_mean']
stepwise_fit = auto_arima(target_series, suppress_warnings=True)
stepwise_fit.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,180.0
Model:,"SARIMAX(2, 1, 3)",Log Likelihood,-116.086
Date:,"Sun, 09 Oct 2022",AIC,244.173
Time:,08:42:22,BIC,263.297
Sample:,0,HQIC,251.928
,- 180,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.8609,0.060,14.354,0.000,0.743,0.978
ar.L2,-0.9297,0.051,-18.128,0.000,-1.030,-0.829
ma.L1,-1.3335,0.089,-14.986,0.000,-1.508,-1.159
ma.L2,1.2960,0.104,12.403,0.000,1.091,1.501
ma.L3,-0.5275,0.067,-7.889,0.000,-0.658,-0.396
sigma2,0.2128,0.022,9.686,0.000,0.170,0.256

0,1,2,3
Ljung-Box (L1) (Q):,0.14,Jarque-Bera (JB):,10.3
Prob(Q):,0.71,Prob(JB):,0.01
Heteroskedasticity (H):,0.52,Skew:,-0.24
Prob(H) (two-sided):,0.01,Kurtosis:,4.07


## Split Data into Training and Testing

In [14]:
# Hold out the last TEST_SIZE rows as the test set and train on everything before them.
# Assumes df rows are in chronological order — TODO confirm upstream sorting.
TEST_SIZE = 30
assert 0 < TEST_SIZE < len(df), "TEST_SIZE must leave at least one training row"

print(df.shape)
train = df.iloc[:-TEST_SIZE]
test = df.iloc[-TEST_SIZE:]
print(train.shape, test.shape)
# First and last held-out rows, to eyeball the period covered by the test set.
print(test.iloc[0], test.iloc[-1])

(180, 5)
(150, 5) (30, 5)
date        2019-08-01 00:00:00
t_max                      27.1
t_min                       4.5
t_mean                    14.36
fenomeno                      0
Name: 151, dtype: object date        2021-12-01 00:00:00
t_max                      26.6
t_min                       3.5
t_mean                    15.49
fenomeno                      2
Name: 179, dtype: object


## Train the Model

In [22]:
import statsmodels.api as sm

In [25]:
# Fit an ARIMA(2, 1, 3) on the training t_mean values and show the fit summary.
# `model` ends up holding the fitted results object (not the unfitted specification).
arima_order = (2, 1, 3)
arima_spec = sm.tsa.arima.ARIMA(train['t_mean'], order=arima_order)
model = arima_spec.fit()
model.summary()

0,1,2,3
Dep. Variable:,t_mean,No. Observations:,150.0
Model:,"ARIMA(2, 1, 3)",Log Likelihood,-94.985
Date:,"Sun, 09 Oct 2022",AIC,201.97
Time:,08:50:35,BIC,219.993
Sample:,0,HQIC,209.292
,- 150,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ar.L1,0.9840,0.012,82.840,0.000,0.961,1.007
ar.L2,-0.9963,0.008,-128.336,0.000,-1.011,-0.981
ma.L1,-1.4230,0.083,-17.129,0.000,-1.586,-1.260
ma.L2,1.4017,0.161,8.724,0.000,1.087,1.717
ma.L3,-0.4029,0.078,-5.150,0.000,-0.556,-0.250
sigma2,0.2031,0.031,6.582,0.000,0.143,0.264

0,1,2,3
Ljung-Box (L1) (Q):,0.1,Jarque-Bera (JB):,24.9
Prob(Q):,0.76,Prob(JB):,0.0
Heteroskedasticity (H):,0.49,Skew:,-0.12
Prob(H) (two-sided):,0.01,Kurtosis:,4.99


## Predictions

In [45]:
# Forecast one value per held-out row: positions len(train) .. len(train)+len(test)-1.
start = len(train)
end = len(train) + len(test) - 1
# The legacy `typ='levels'` keyword is dropped: the predict method of the
# sm.tsa.arima.ARIMA results object does not define it.
# NOTE(review): predictions are expected on the original (level) scale — confirm.
pred = model.predict(start=start, end=end).rename('ARIMA predictions')

In [42]:
# pred.plot(legend='ARIMA Predictions')
# test['t_mean'].plot(legend=True)

In [35]:
# Average t_mean over the held-out rows, as a scale reference for the forecast error.
test['t_mean'].mean()

14.887666666666664

In [36]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [38]:
# Root-mean-squared error of the forecasts against the held-out t_mean values.
# Arguments follow scikit-learn's documented order: mean_squared_error(y_true, y_pred).
rmse = sqrt(mean_squared_error(test['t_mean'], pred))
print(rmse)

0.437943982047642
