In [1]:
# preliminaries

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

from sktime.classification.interval_based import TimeSeriesForestClassifier



In [3]:
import numpy as np
import pandas as pd

# load data and fix issues w original

weather = pd.read_csv('madweather.csv')
#del weather['Unnamed: 0']
weather.head()
#weather.dtypes

Unnamed: 0,DATE,PRCP,TAVG,TMAX,TMIN
0,1900-01,0.69,23.7,30.8,16.6
1,1900-02,1.26,13.0,21.5,4.5
2,1900-03,1.33,25.1,32.3,17.9
3,1900-04,1.31,48.0,57.8,38.3
4,1900-05,1.87,60.5,70.9,50.2


In [4]:
# change data from object to datetime
weather['DATE'] = pd.to_datetime(weather.DATE, format = '%Y/%m')
data = weather.drop(['DATE'], axis = 1)
data.index = pd.DatetimeIndex(weather.DATE).to_period('m')
data.head()

Unnamed: 0_level_0,PRCP,TAVG,TMAX,TMIN
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900-01,0.69,23.7,30.8,16.6
1900-02,1.26,13.0,21.5,4.5
1900-03,1.33,25.1,32.3,17.9
1900-04,1.31,48.0,57.8,38.3
1900-05,1.87,60.5,70.9,50.2


In [17]:
# missing values
cols = data.columns

data = data.fillna(method="ffill")
data = data.iloc[0:,0:3]
data

Unnamed: 0_level_0,PRCP
DATE,Unnamed: 1_level_1
1900-01,0.69
1900-02,1.26
1900-03,1.33
1900-04,1.31
1900-05,1.87
...,...
1974-09,0.54
1974-10,1.81
1974-11,1.59
1974-12,1.67


In [6]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen

In [7]:
data.head()


Unnamed: 0_level_0,PRCP,TAVG,TMAX
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1900-01,0.69,23.7,30.8
1900-02,1.26,13.0,21.5
1900-03,1.33,25.1,32.3
1900-04,1.31,48.0,57.8
1900-05,1.87,60.5,70.9


In [18]:
# split into test and train
train = data[:int(0.8*(len(data)))]
test = data[int(0.8*(len(data))):]


In [9]:
from statsmodels.tsa.vector_ar.var_model import VAR

In [19]:
# select order of VAR model
data_differenced = train.diff().dropna()
model = VAR(data_differenced)
for i in [1,2,3,4,5,6,7,8,9]:
    result = model.fit(i)
    print('Lag Order =', i)
    print('AIC : ', result.aic)

x = model.select_order(maxlags=12)
x.summary()
#minimums occur at p=12



ValueError: Only gave one variable to VAR

In [16]:
# train model with selected order
model_fitted = model.fit(12)
model_fitted.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Mon, 29, Nov, 2021
Time:                     12:08:30
--------------------------------------------------------------------
No. of Equations:         3.00000    BIC:                    6.81522
Nobs:                     4145.00    HQIC:                   6.70570
Log likelihood:          -31306.8    FPE:                    769.485
AIC:                      6.64572    Det(Omega_mle):         749.241
--------------------------------------------------------------------
Results for equation PRCP
              coefficient       std. error           t-stat            prob
---------------------------------------------------------------------------
const            0.001966         0.030155            0.065           0.948
L1.PRCP         -0.832015         0.015618          -53.273           0.000
L1.TAVG          0.016030         0.010395            1.542           0.123
L1.TM

In [12]:
#check for serial correlation of residuals
def adjust(val, length= 6): return str(val).ljust(length)

from statsmodels.stats.stattools import durbin_watson
out = durbin_watson(model_fitted.resid)

for col, val in zip(data.columns, out):
    print(adjust(col), ':', round(val, 2))

# no significant correlation

PRCP   : 2.01
TAVG   : 2.0
TMAX   : 2.0


In [13]:
prediction = model_fit.forecast(model_fit.y, steps = len(test))

NameError: name 'model_fit' is not defined

In [90]:
# dataframe of predictions
pred = pd.DataFrame(index=range(0,len(prediction)),columns=[cols])
for j in range(0,5):
    for i in range(0, len(prediction)):
       pred.iloc[i][j] = prediction[i][j]

In [91]:
# check rmse
from sklearn.metrics import mean_squared_error
for i in cols:
    print('rmse value for', i, 'is : ', np.sqrt(mean_squared_error(test[i], pred[i])))

rmse value for PRCP is :  2.5875148554341263
rmse value for SNOW is :  5.489238550905626
rmse value for TAVG is :  20.230711134422


KeyError: 'TMAX'

In [92]:
#final predictions
modelfin = VAR(endog=data)
modelfin_fit = modelfin.fit()
ypred = modelfin_fit.forecast(modelfin_fit.y, steps = 1)
print(ypred)

[[ 2.08015611  9.74504194 27.73572844]]


  obj = getattr(results, attr)


In [17]:
# evaluate a forecast in the format [chunk][variable][time]
def evaluate_forecasts(prediction, test):
	lead_times = get_lead_times()
	total_mae, times_mae = 0.0, [0.0 for _ in range(len(lead_times))]
	total_c, times_c = 0, [0 for _ in range(len(lead_times))]
	# enumerate test chunks
	for i in range(len(test_chunks)):
		# convert to forecasts
		actual = testset[i]
		predicted = predictions[i]
		# enumerate target variables
		for j in range(predicted.shape[0]):
			# enumerate lead times
			for k in range(len(lead_times)):
				# skip if actual in nan
				if isnan(actual[j, k]):
					continue
				# calculate error
				error = calculate_error(actual[j, k], predicted[j, k])
				# update statistics
				total_mae += error
				times_mae[k] += error
				total_c += 1
				times_c[k] += 1
	# normalize summed absolute errors
	total_mae /= total_c
	times_mae = [times_mae[i]/times_c[i] for i in range(len(times_mae))]
	return total_mae, times_mae

In [21]:
# summarize scores
def summarize_error(name, total_mae, times_mae):
	# print summary
	lead_times = get_lead_times()
	formatted = ['+%d %.3f' % (lead_times[i], times_mae[i]) for i in range(len(lead_times))]
	s_scores = ', '.join(formatted)
	print('%s: [%.3f MAE] %s' % (name, total_mae, s_scores))
	# plot summary
	pyplot.plot([str(x) for x in lead_times], times_mae, marker='.')
	pyplot.show()