In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats as sts

from matplotlib import pyplot as plt
import seaborn as sns

# Global seaborn look applied to the figures drawn in this notebook.
sns.set_style('whitegrid')
# NOTE(review): no figure or axes have been created yet at this point, so this
# call presumably has no spines to remove — confirm, and consider calling it
# after plotting instead.
sns.despine()

import os
import glob

%matplotlib inline

## Data

35 по phi, 5 по p

10 по v, 40 по p

In [2]:
# Stack every CSV in data_sparse/ into one long frame, with the 't' column
# dropped and the remaining columns named v, phi, p, q by position.
# - drop(columns='t'): the axis must be given by keyword; the positional form
#   drop('t', 1) is rejected by pandas 2.x.
# - sorted(...): os.listdir returns names in arbitrary order, so sort them to
#   make the stacked row order reproducible between runs and machines.
data = pd.DataFrame(
    np.vstack([
        pd.read_csv(f'data_sparse/{fname}').drop(columns='t').values
        for fname in sorted(os.listdir('data_sparse'))
        if fname != '.DS_Store'  # skip the macOS Finder metadata file
    ]),
    columns=['v', 'phi', 'p', 'q'],
)

In [None]:
# Stack every CSV in data_test/ into one long frame, with the 't' column
# dropped and the remaining columns named v, phi, p, q by position.
# - drop(columns='t'): the axis must be given by keyword; the positional form
#   drop('t', 1) is rejected by pandas 2.x.
# - sorted(...): os.listdir returns names in arbitrary order, so sort them to
#   make the stacked row order reproducible between runs and machines.
data_test = pd.DataFrame(
    np.vstack([
        pd.read_csv(f'data_test/{fname}').drop(columns='t').values
        for fname in sorted(os.listdir('data_test'))
        if fname != '.DS_Store'  # skip the macOS Finder metadata file
    ]),
    columns=['v', 'phi', 'p', 'q'],
)

In [None]:
# Heatmaps of the inverted lagged cross-correlation matrices.
# For each (row, column) variable pair, entry [x, y] of the matrix is the
# correlation between the row variable shifted by x and the column variable
# shifted by y; the matrix is then inverted and drawn.
n_shifts = 50
# sharex / sharey are keyword-only in current matplotlib, so name them.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True, figsize=(20, 10))
pairs = [('p', 'v'), ('p', 'phi'), ('q', 'v'), ('q', 'phi')]
for ax, (row_var, col_var) in zip(axes.flatten(), pairs):
    # Shift each series once per lag instead of once per matrix entry.
    row_shifted = [data[row_var].shift(x) for x in range(n_shifts)]
    col_shifted = [data[col_var].shift(y) for y in range(n_shifts)]
    mtx = np.array([[r.corr(c) for c in col_shifted] for r in row_shifted])
    mtx_inv = np.linalg.inv(mtx)
    sns.heatmap(mtx_inv, cmap='RdYlBu', ax=ax)
    ax.set_xlabel(col_var + ' shifted')
    ax.set_ylabel(row_var + ' shifted')
# Lay the figure out after the heatmaps and labels exist so they are taken
# into account.
fig.tight_layout()

In [None]:
data = [pd.read_csv(f'data_sparse/{fname}').drop('t', 1) \
        for fname in os.listdir('data_sparse') if fname != '.DS_Store']

In [None]:
fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(20,20))
varnames = ['$V$', '$\\varphi$', '$P$', '$Q$']
varidx = ['v', 'phi', 'p', 'q']
for i, (iname, iidx, axrow) in enumerate(zip(varnames, varidx, axes)):
    for j, (jname, jidx, ax) in enumerate(zip(varnames, varidx, axrow)):
        lags = range(1, 150)
        corrs_all = list()
        for df in data:
            corrs = list()
            for lag in lags:
                corrs.append(df[iidx].corr(df[jidx].shift(lag)))
            corrs_all.append(corrs)
        corrs = np.sum(corrs_all, axis=0) / len(data)
        ax.plot(lags, corrs)
        bounds = ax.get_xbound()
        ax.hlines([sts.norm.cdf(0.95), -sts.norm.cdf(0.95)] / np.sqrt(len(data)),
                  *ax.get_xbound())
        ax.hlines([sts.norm.cdf(0.99), -sts.norm.cdf(0.99)] / np.sqrt(len(data)),
                  *ax.get_xbound(), linestyles='dashed')
        ax.vlines(30, sts.norm.cdf(0.99) / np.sqrt(len(data)), -sts.norm.cdf(0.99) / np.sqrt(len(data)), 'r')
        if i == 3:
            ax.set_xlabel('Lag')
        if j == 0:
            ax.set_ylabel('Correlation')
        ax.set_title(iname + ' / ' + jname)
        ax.set_xlim(bounds)

In [None]:
fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(20,20))
varnames = ['$V$', '$\\varphi$', '$P$', '$Q$']
varidx = ['v', 'phi', 'p', 'q']
for i, (iname, iidx, axrow) in enumerate(zip(varnames, varidx, axes)):
    for j, (jname, jidx, ax) in enumerate(zip(varnames, varidx, axrow)):
        lags = range(1, 150)
        corrs_all = list()
        for df in data:
            corrs = list()
            for lag in lags:
                corrs.append(df[iidx].corr(df[jidx].shift(lag).apply(lambda x: np.sin(x))))
            corrs_all.append(corrs)
        corrs = np.sum(corrs_all, axis=0) / len(data)
        ax.plot(lags, corrs)
        bounds = ax.get_xbound()
        ax.hlines([sts.norm.cdf(0.95), -sts.norm.cdf(0.95)] / np.sqrt(len(data)),
                  *ax.get_xbound())
        ax.hlines([sts.norm.cdf(0.99), -sts.norm.cdf(0.99)] / np.sqrt(len(data)),
                  *ax.get_xbound(), linestyles='dashed')
        ax.vlines(30, sts.norm.cdf(0.99) / np.sqrt(len(data)), -sts.norm.cdf(0.99) / np.sqrt(len(data)), 'r')
        if i == 3:
            ax.set_xlabel('Lag')
        if j == 0:
            ax.set_ylabel('Correlation')
        ax.set_title(iname + ' / sin(' + jname + ')')
        ax.set_xlim(bounds)

In [None]:
fig, axes = plt.subplots(4, 4, sharex=True, sharey=True, figsize=(20,20))
varnames = ['$V$', '$\\varphi$', '$P$', '$Q$']
varidx = ['v', 'phi', 'p', 'q']
for i, (iname, iidx, axrow) in enumerate(zip(varnames, varidx, axes)):
    for j, (jname, jidx, ax) in enumerate(zip(varnames, varidx, axrow)):
        lags = range(1, 150)
        corrs_all = list()
        for df in data:
            corrs = list()
            for lag in lags:
                corrs.append(df[iidx].corr(df[jidx].shift(lag).apply(lambda x: np.exp(x))))
            corrs_all.append(corrs)
        corrs = np.sum(corrs_all, axis=0) / len(data)
        ax.plot(lags, corrs)
        bounds = ax.get_xbound()
        ax.hlines([sts.norm.cdf(0.95), -sts.norm.cdf(0.95)] / np.sqrt(len(data)),
                  *ax.get_xbound())
        ax.hlines([sts.norm.cdf(0.99), -sts.norm.cdf(0.99)] / np.sqrt(len(data)),
                  *ax.get_xbound(), linestyles='dashed')
        ax.vlines(30, sts.norm.cdf(0.99) / np.sqrt(len(data)), -sts.norm.cdf(0.99) / np.sqrt(len(data)), 'r')
        if i == 3:
            ax.set_xlabel('Lag')
        if j == 0:
            ax.set_ylabel('Correlation')
        ax.set_title(iname + ' / sin(' + jname + ')')
        ax.set_xlim(bounds)

## ARIMA

In [None]:
# First difference of phi, appended as column 'phiD' (first row becomes NaN).
data = data.assign(phiD=data['phi'].diff())

In [None]:
# First difference of phi on the test frame, appended as column 'phiD'.
data_test = data_test.assign(phiD=data_test['phi'].diff())

In [None]:
# First difference of v, appended as column 'vD' (first row becomes NaN).
data = data.assign(vD=data['v'].diff())

In [None]:
# First difference of v on the test frame, appended as column 'vD'.
data_test = data_test.assign(vD=data_test['v'].diff())

In [None]:
# Discard every row that contains a NaN (the differenced columns start with one).
data = data.dropna(axis=0, how='any')

In [None]:
# Discard every row of the test frame that contains a NaN.
data_test = data_test.dropna(axis=0, how='any')

In [None]:
# Order-(30, 0, 0) model of p with the phi increment as the exogenous input.
m = sm.tsa.ARIMA(
    endog=data['p'].values,
    exog=data['phiD'].values,
    order=(30, 0, 0),
)

In [None]:
# Order-(10, 0, 0) model of p on the test frame, with the phi increment as
# the exogenous input.
m_test = sm.tsa.ARIMA(
    endog=data_test['p'].values,
    exog=data_test['phiD'].values,
    order=(10, 0, 0),
)

In [None]:
%%time
# Fit the order-(30, 0, 0) model `m`; the %%time cell magic above reports how
# long the fit takes.
results_30 = m.fit()

In [None]:
# AM (10, 0, 0)
plt.figure(figsize=(20,10))
plt.plot(m.predict(results_10.params)[:150], label='pred')
plt.plot(data['p'].values[:150], label='true')
plt.legend(loc='best', fontsize='xx-large')

In [None]:
err = [np.sqrt(np.sum((t - p) ** 2)) / np.sqrt(np.sum(t ** 2)) \
       for t, p in zip(data['p'].values[:len(data['p'].values) // 151 * 151].reshape((len(data['p'].values) // 151, 151)),
                       m.predict(results_10.params)[:len(data['p'].values) // 151 * 151].reshape((len(data['p'].values) // 151, 151)))]

In [None]:
print('\n'.join(['RMSE',
                 'mean %.4f' % np.mean(err).round(4),
                 'median %.4f' % np.median(err).round(4),
                 'max %.4f' % np.max(err).round(4),
                 'min %.4f' % np.min(err).round(4),
                 '95%% percentile %.4f' % np.percentile(err, 95).round(4)]))

In [None]:
# AM (30, 0, 0)
plt.figure(figsize=(20,10))
plt.plot(m.predict(results_30.params)[:150], label='pred')
plt.plot(data['p'].values[:150], label='true')
plt.legend(loc='best', fontsize='xx-large')

In [None]:
err = [np.sqrt(np.sum((t - p) ** 2)) / np.sqrt(np.sum(t ** 2)) \
       for t, p in zip(data['p'].values[:len(data['p'].values) // 151 * 151].reshape((len(data['p'].values) // 151, 151)),
                       m.predict(results_30.params)[:len(data['p'].values) // 151 * 151].reshape((len(data['p'].values) // 151, 151)))]

In [None]:
print('\n'.join(['RMSE',
                 'mean %.4f' % np.mean(err).round(4),
                 'median %.4f' % np.median(err).round(4),
                 'max %.4f' % np.max(err).round(4),
                 'min %.4f' % np.min(err).round(4),
                 '95%% percentile %.4f' % np.percentile(err, 95).round(4)]))