In [1]:
# preliminaries

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from sktime.classification.interval_based import TimeSeriesForestClassifier



In [8]:
import numpy as np
import pandas as pd

# load data and fix issues w original

weather = pd.read_csv('madweather.csv')
#del weather['Unnamed: 0']
weather.head()
#weather.dtypes

Unnamed: 0,DATE,PRCP,TAVG
0,1960-01,2.19,21.1
1,1960-02,1.15,19.7
2,1960-03,1.93,18.3
3,1960-04,4.02,48.1
4,1960-05,6.26,55.1


In [10]:
# change data from object to datetime
weather['DATE'] = pd.to_datetime(weather.DATE, format = '%Y/%m')
data = weather.drop(['DATE'], axis = 1)
data.index = pd.DatetimeIndex(weather.DATE).to_period('m')
data.head()

Unnamed: 0_level_0,PRCP,TAVG
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1960-01,2.19,21.1
1960-02,1.15,19.7
1960-03,1.93,18.3
1960-04,4.02,48.1
1960-05,6.26,55.1


In [11]:
# missing values
cols = data.columns

data = data.fillna(method="ffill")
data = data.iloc[0:,0:3]
data

Unnamed: 0_level_0,PRCP,TAVG
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
1960-01,2.19,21.1
1960-02,1.15,19.7
1960-03,1.93,18.3
1960-04,4.02,48.1
1960-05,6.26,55.1
...,...,...
1959-08,5.67,74.5
1959-09,3.44,63.7
1959-10,5.55,46.5
1959-11,2.29,27.1


In [6]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

In [7]:
data.head()


Unnamed: 0_level_0,PRCP,TAVG,TMAX
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1900-01,0.69,23.7,30.8
1900-02,1.26,13.0,21.5
1900-03,1.33,25.1,32.3
1900-04,1.31,48.0,57.8
1900-05,1.87,60.5,70.9


In [12]:
# split into test and train
train = data[:int(0.8*(len(data)))]
test = data[int(0.8*(len(data))):]


In [13]:
from statsmodels.tsa.vector_ar.var_model import VAR

In [14]:
# select order of VAR model
data_differenced = train.diff().dropna()
model = VAR(data_differenced)
for i in [1,2,3,4,5,6,7,8,9]:
    result = model.fit(i)
    print('Lag Order =', i)
    print('AIC : ', result.aic)

x = model.select_order(maxlags=12)
x.summary()
#minimums occur at p=12

Lag Order = 1
AIC :  5.891086819575578
Lag Order = 2
AIC :  5.82328379282249
Lag Order = 3
AIC :  5.632937626285908
Lag Order = 4
AIC :  5.414541446171959
Lag Order = 5
AIC :  5.218960899737916
Lag Order = 6
AIC :  5.025341434946806
Lag Order = 7
AIC :  4.886720697928114
Lag Order = 8
AIC :  4.78777902176837
Lag Order = 9
AIC :  4.664169327418437




0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,6.548,6.557,698.0,6.552
1.0,5.889,5.916,361.2,5.899
2.0,5.821,5.865,337.2,5.837
3.0,5.638,5.699,280.8,5.661
4.0,5.418,5.497,225.5,5.448
5.0,5.223,5.320,185.5,5.260
6.0,5.031,5.145,153.0,5.074
7.0,4.890,5.022,133.0,4.940
8.0,4.792,4.941,120.5,4.848


In [15]:
# train model with selected order
model_fitted = model.fit(12)
model_fitted.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 29, Nov, 2021
Time:                     20:50:57
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                    4.77721
Nobs:                     1148.00    HQIC:                   4.64041
Log likelihood:          -5823.86    FPE:                    95.3408
AIC:                      4.55744    Det(Omega_mle):         91.3201
--------------------------------------------------------------------
Results for equation PRCP
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const           -0.002535         0.052274           -0.049           0.961
L1.PRCP         -0.889325         0.029743          -29.900           0.000
L1.TAVG          0.015532         0.009750            1.593           0.111
L2.PR

In [17]:
#check for serial correlation of residuals
def adjust(val, length= 6): return str(val).ljust(length)

from statsmodels.stats.stattools import durbin_watson
out = durbin_watson(model_fitted.resid)

for col, val in zip(data.columns, out):
    print(adjust(col), ':', round(val, 2))

# no significant correlation

PRCP   : 2.01
TAVG   : 2.0
