In [1]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import max_error

In [2]:
# Read the pickled DataFrame and convert its 'Date' column to pandas datetimes
# in a single chain.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = (
    pd.read_pickle(r"../data/B00020S.pkl")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

In [None]:
#%matplotlib qt

# Lagged multiple linear regression of the GŁOGÓW 'B00020S' series on the
# 'B00020S' series of four other stations, each shifted by a fixed number of days:
#
#   Głogów(t) = a0 + a1 * Chałupki(t-3) + a2 * Krzyżanowice(t-3) + a3 * Brzeg(t-2) + a4 * Malczyce(t-1)

stations = ['GŁOGÓW', 'MALCZYCE', 'BRZEG', 'KRAPKOWICE', 'KRZYŻANOWICE', 'CHAŁUPKI']

# One value per (Date, Station): duplicates are averaged. groupby sorts its keys,
# so every per-station slice taken below is in ascending date order.
data_temp = data.groupby(['Date', 'Station'])['B00020S'].mean().reset_index()

# drop=True everywhere: the old row labels carry no information, and keeping them
# would add stale 'index' / 'level_0' columns to every derived frame.
in_2020 = data_temp['Date'].dt.year == 2020
data_202x = data_temp[in_2020 & data_temp['Station'].isin(stations)].reset_index(drop=True)

data_brzeg = data_202x[data_202x['Station'] == 'BRZEG'].reset_index(drop=True)
data_malczyce = data_202x[data_202x['Station'] == 'MALCZYCE'].reset_index(drop=True)
data_chalupki = data_202x[data_202x['Station'] == 'CHAŁUPKI'].reset_index(drop=True)
data_krapkowice = data_202x[data_202x['Station'] == 'KRAPKOWICE'].reset_index(drop=True)
data_krzyzanowice = data_202x[data_202x['Station'] == 'KRZYŻANOWICE'].reset_index(drop=True)
data_glogow = data_202x[data_202x['Station'] == 'GŁOGÓW'].reset_index(drop=True)

# Regressor windows: the calendar year 2020 moved back by the station's lag
# (3, 3, 2 and 1 days respectively), so each window matches 2020 after shifting.
testdata_chalupki = data_temp[(data_temp.Date.between('2019-12-29', '2020-12-28')) & (data_temp['Station'] == 'CHAŁUPKI')].reset_index(drop=True)
testdata_krzyzanowice = data_temp[(data_temp.Date.between('2019-12-29', '2020-12-28')) & (data_temp['Station'] == 'KRZYŻANOWICE')].reset_index(drop=True)
testdata_brzeg = data_temp[(data_temp.Date.between('2019-12-30', '2020-12-29')) & (data_temp['Station'] == 'BRZEG')].reset_index(drop=True)
testdata_malczyce = data_temp[(data_temp.Date.between('2019-12-31', '2020-12-30')) & (data_temp['Station'] == 'MALCZYCE')].reset_index(drop=True)

# Lag notes between station pairs, in days (the value in parentheses is the more likely one):
#   Chałupki - Krzyżanowice: 0-1
#   Chałupki - Brzeg: 1-3
#   Chałupki - Malczyce: 1-4 (2)
#   Chałupki - Głogów: 2-7 (3)
#
#   Krzyżanowice - Brzeg: 1-3 (1)
#   Krzyżanowice - Malczyce: 1-4 (2)
#   Krzyżanowice - Głogów: 2-6 (3)
#
#   Brzeg - Malczyce: 0-1 (open question: is the level at Brzeg almost always higher than at Malczyce?)
#   Brzeg - Głogów: 1-3 (2)
#
#   Malczyce - Głogów: 1-2

# (source frame, regressor column name in data_g, lag in days relative to GŁOGÓW)
lagged_sources = [
    (testdata_chalupki, "Chałupki", 3),
    (testdata_krzyzanowice, "Krzyżanowice", 3),
    (testdata_brzeg, "Brzeg", 2),
    (testdata_malczyce, "Malczyce", 1),
]
regressors = [column for _, column, _ in lagged_sources]
extracted_col = [frame["B00020S"] for frame, _, _ in lagged_sources]

# Attach each regressor by DATE rather than by row position: the value observed
# on day (t - lag) is re-labelled with day t and looked up from the GŁOGÓW dates.
# A missing day in any station then yields a NaN in that one row instead of
# silently shifting every later row by one position.
data_g = data_glogow.copy()
for frame, column, lag_days in lagged_sources:
    shifted = pd.Series(
        frame["B00020S"].to_numpy(),
        index=frame["Date"] + pd.Timedelta(days=lag_days),
    )
    data_g[column] = data_g["Date"].map(shifted)

# OLS cannot be fitted on NaNs; drop incomplete days and report it so a gap in
# the data is visible. This is a no-op when all series are complete.
rows_before = len(data_g)
data_g = data_g.dropna(subset=regressors + ['B00020S']).reset_index(drop=True)
if len(data_g) < rows_before:
    print('Dropped %d row(s) with missing values' % (rows_before - len(data_g)))

x = data_g[regressors]
y = data_g['B00020S']
x = sm.add_constant(x)  # adds the intercept column 'const' (a0 in the formula above)

model = sm.OLS(y, x).fit()

print(model.summary())

# Residual diagnostics: residuals in row order, their density estimate, and summary statistics.
residuals = pd.DataFrame(model.resid)
residuals.plot()
residuals.plot(kind='kde')
print(residuals.describe())

#plt.plot(data_malczyce['Date'], data_malczyce['B00020S'],'b', data_glogow['Date'], data_glogow['B00020S'], 'g')

In [None]:
# Hold-out evaluation of the lagged regression: fit on the first 80% of the
# rows of data_g and predict the remaining 20% (split by row position, no shuffling).
regressors = ["Chałupki", "Krzyżanowice", "Brzeg", "Malczyce"]
size = int(len(data_g) * 0.8)

train, test = data_g.iloc[:size], data_g.iloc[size:]

# has_constant='add' forces the intercept column even if a regressor happens to
# be constant within the slice, so train and test design matrices always match.
x = sm.add_constant(train[regressors], has_constant='add')
y = train['B00020S']

model = sm.OLS(y, x).fit()
print(model.params)

# Out-of-sample predictions from the fitted model (const + sum of coef * regressor).
x_test = sm.add_constant(test[regressors], has_constant='add')
test_values = model.predict(x_test)

plt.plot(train['Date'], train['B00020S'], 'b', label='faktyczne dane')
plt.plot(train['Date'], model.predict(), 'r', label='model')  # in-sample fitted values
plt.plot(test['Date'], test['B00020S'], 'g', label='faktyczne dane (test)')
plt.plot(test['Date'], test_values, 'm', label='model (test)')
plt.legend()
plt.grid()
plt.show()

# RMSE is derived from MSE instead of passing `squared=False`, which is
# deprecated and later removed in scikit-learn.
mse = mean_squared_error(y_true=test['B00020S'], y_pred=test_values)
rmse = mse ** 0.5
mape = mean_absolute_percentage_error(y_true=test['B00020S'], y_pred=test_values)
# Stored as `max_err`: assigning to `max_error` would overwrite the imported
# function and make this cell fail when it is run a second time.
max_err = max_error(y_true=test['B00020S'], y_pred=test_values)
print('Test MSE: %.3f' % mse)
print('Test RMSE: %.3f' % rmse)
print('Test MAPE: %.3f' % mape)
print('Test max error: %.3f' % max_err)