# Biblioteki

In [7]:
import numpy as np
import pandas as pd
import json
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import matplotlib.pyplot as plt
plt.style.use("bmh")
plt.rcParams['figure.figsize'] = [15, 10]
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tools.eval_measures import rmse
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [8]:
from matplotlib.dates import bytespdate2num, num2date
from matplotlib.ticker import Formatter

# Deklaracje

In [9]:
def plot_ts(ts):
    """Plot a series together with its 12-observation rolling mean and std.

    The three lines are drawn against integer positions
    (``use_index=False``); every x tick is then labelled with the index
    value of the observation at that position.

    Args:
        ts: pandas Series to plot; its index supplies the x tick labels.
    """
    # Local import: the file only imports ``Formatter`` from this module.
    from matplotlib.ticker import FuncFormatter

    fig, ax = plt.subplots()
    ts_roll_mean = ts.rolling(window=12).mean()
    ts_roll_std = ts.rolling(window=12).std()

    ts.plot(ax=ax, color='blue', label='Original', use_index=False)
    ts_roll_mean.plot(ax=ax, color='red', label="Rolling mean", use_index=False)
    ts_roll_std.plot(ax=ax, color='black', label="Rolling std", use_index=False)

    def _label_for(position, _tick_number):
        # Ticks may fall outside the data range; those get no label.
        idx = int(round(position))
        return str(ts.index[idx]) if 0 <= idx < len(ts) else ''

    # Map each tick position to the matching index entry instead of assigning
    # the first few index values to whatever ticks happen to exist.
    ax.xaxis.set_major_formatter(FuncFormatter(_label_for))
    fig.autofmt_xdate()
    # The lines carry labels, so show them.
    ax.legend()

    plt.xlabel('Date')
    plt.ylabel('Return ratio')
    plt.show()
    
def perform_dft(ts):
    """Run the Augmented Dickey-Fuller test on ``ts`` and display the result.

    The output is a one-row DataFrame (row label 'Wartość') holding the test
    statistic, p-value, number of lags used, number of observations used and
    one column per critical value reported by ``adfuller``.

    Args:
        ts: the series to test for stationarity.
    """
    print('Results of Dickey Fuller Test:')
    # Test the series passed in, not the notebook-global ``y_train``.
    dftest = adfuller(ts, autolag='AIC')

    columns = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    values = list(dftest[0:4])
    # dftest[4] maps a significance level to its critical value.
    for key, value in dftest[4].items():
        columns.append('Critical Value (%s)' % key)
        values.append(value)

    dfoutput = pd.DataFrame(data=[values], index=['Wartość'], columns=columns)
    display(dfoutput)

def decompose(ts, period=10):
    """Plot the seasonal decomposition of ``ts``.

    Args:
        ts: the series to decompose.
        period: number of observations per seasonal cycle; defaults to 10,
            the value this function previously hard-coded.
    """
    # NOTE(review): the ``freq`` keyword matches the statsmodels version this
    # file targets; newer releases name this argument ``period`` - confirm
    # before upgrading.
    decomposition = seasonal_decompose(ts, freq=period)
    decomposition.plot()
    plt.show()
    
def find_best_hparameters(ts, pmax, d, qmax):
    """Grid-search ARIMA orders on ``ts`` and rank them by fitted-value RMSE.

    Every ``(p, d, q)`` with ``0 <= p <= pmax`` and ``0 <= q <= qmax`` is
    fitted on ``ts`` with business-day frequency. Orders whose fit or scoring
    raises are skipped; their error messages are displayed as a DataFrame.

    Args:
        ts: series to fit the models on.
        pmax: largest autoregressive order to try (inclusive).
        d: differencing order, shared by all candidates.
        qmax: largest moving-average order to try (inclusive).

    Returns:
        A pair ``(best10, results)``: ``results`` is the list of
        ``(order, rmse)`` tuples sorted by ascending RMSE and ``best10`` is a
        tuple with the orders of its first ten entries.
    """
    candidates = [(p, d, q) for p in range(pmax + 1) for q in range(qmax + 1)]

    scored = []
    failures = {}
    for candidate in candidates:
        try:
            fitted = ARIMA(ts, order=candidate, freq='B').fit()
            scored.append((candidate, rmse(fitted.fittedvalues, ts)))
        except Exception as exc:
            # Best effort: record the failure and move on to the next order.
            failures[str(candidate)] = {'error_str': str(exc)}

    display(pd.DataFrame.from_dict(failures, orient='index'))

    ranking = sorted(scored, key=lambda pair: pair[1])
    top_orders = tuple(pair[0] for pair in ranking[:10])
    return top_orders, ranking

def select_best_orders_from_best10(ts, best10):
    """Fit each candidate order on ``ts`` and pick the one with lowest RMSE.

    Orders whose fit or scoring raises are skipped; their error messages are
    displayed as a DataFrame before the ranking is computed.

    Args:
        ts: series to fit the models on.
        best10: iterable of ARIMA ``(p, d, q)`` orders to evaluate.

    Returns:
        A pair ``(best, results)``: ``results`` is the list of
        ``(order, rmse)`` tuples sorted by ascending RMSE and ``best`` is the
        order of its first entry.

    Raises:
        ValueError: if ``best10`` is empty or none of its orders could be
            fitted, so there is nothing to rank.
    """
    errors_dict = {}
    scored = []
    for order in best10:
        try:
            fitted = ARIMA(ts, order=order, freq='B').fit()
            scored.append((order, rmse(fitted.fittedvalues, ts)))
        except Exception as e:
            # Best effort: record the failure and move on to the next order.
            errors_dict[str(order)] = {'error_str': str(e)}

    errors = pd.DataFrame.from_dict(errors_dict, orient='index')
    display(errors)

    # Fail with an explicit message instead of an opaque IndexError below.
    if not scored:
        raise ValueError('No ARIMA order could be fitted; see the displayed errors.')

    results = sorted(scored, key=lambda x: x[1])
    best = results[0][0]
    return best, results

def test_orders(ts, orders):
    """Fit each order on ``ts`` and rank the orders by fitted-value RMSE.

    Orders whose fit or scoring raises are skipped; their error messages are
    displayed as a DataFrame before the ranking is computed.

    Args:
        ts: series to fit the models on.
        orders: iterable of ARIMA ``(p, d, q)`` orders to evaluate.

    Returns:
        A pair ``(best, results)``: ``results`` is the list of
        ``(order, rmse)`` tuples sorted by ascending RMSE and ``best`` is the
        order of its first entry.

    Raises:
        ValueError: if ``orders`` is empty or none of them could be fitted,
            so there is nothing to rank.
    """
    errors_dict = {}
    scored = []
    for order in orders:
        try:
            fitted = ARIMA(ts, order=order, freq='B').fit()
            scored.append((order, rmse(fitted.fittedvalues, ts)))
        except Exception as e:
            # Best effort: record the failure and move on to the next order.
            errors_dict[str(order)] = {'error_str': str(e)}

    errors = pd.DataFrame.from_dict(errors_dict, orient='index')
    display(errors)

    # Fail with an explicit message instead of an opaque IndexError below.
    if not scored:
        raise ValueError('No ARIMA order could be fitted; see the displayed errors.')

    results = sorted(scored, key=lambda x: x[1])
    best = results[0][0]
    return best, results

def show_est_plot(ts, order=None, zero=False, naiwny=False, label="Wykres"):
    """Plot ``ts`` against one estimate of it and put the RMSE in the title.

    Exactly one estimate is drawn, chosen in this order of precedence: a
    constant-zero baseline (``zero``), a naive previous-value baseline
    (``naiwny``), otherwise the fitted values of an ARIMA model of the given
    ``order`` fitted on ``ts`` itself.

    Args:
        ts: pandas Series to compare against.
        order: ARIMA ``(p, d, q)`` order; required unless ``zero`` or
            ``naiwny`` is set.
        zero: if truthy, compare against an all-zero prediction.
        naiwny: if truthy, compare against the series shifted by one step
            (the first observation is dropped because it has no predecessor).
        label: legend label of the original series and prefix of the title;
            any object is accepted and converted with ``str``.

    Raises:
        ValueError: if neither baseline is selected and ``order`` is None.
    """
    # Callers pass order tuples as labels; stringify so the plotting call
    # always receives a plain text label (some matplotlib versions may treat
    # a sequence as one label per dataset - TODO confirm).
    label = str(label)

    if zero:
        actual = ts
        estimate = [0] * len(ts)
        estimate_label = "Zero"
    elif naiwny:
        actual = ts[1:]
        estimate = ts.shift()[1:]
        estimate_label = "Naiwny"
    else:
        if order is None:
            raise ValueError("order is required unless zero or naiwny is set")
        actual = ts
        estimate = ARIMA(ts, order=order, freq='B').fit().fittedvalues
        # Distinct from ``label`` so the two legend entries can be told apart.
        estimate_label = f'ARIMA{order}'

    rmse_value = rmse(actual, estimate)
    plt.plot(actual.index, estimate, label=estimate_label, color='red')
    plt.plot(actual.index, actual, label=label, color='blue')
    plt.title(f'{label} (RMSE: {rmse_value})')
    # The lines carry labels, so show them.
    plt.legend()
    plt.show()
        

In [10]:
# Per-split store ('train1', 'train2', 'train3') of the ranked (order, RMSE)
# lists returned by select_best_orders_from_best10 in the cells below.
best_results = {}

# Wczytanie zbiorów

In [11]:
# Read the five pre-split CSV files; the first column becomes the index.
train, val1, val2, val3, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'datasets/podzielone/ekonometryczne/train_ekon.csv',
        'datasets/podzielone/ekonometryczne/valid1_ekon.csv',
        'datasets/podzielone/ekonometryczne/valid2_ekon.csv',
        'datasets/podzielone/ekonometryczne/valid3_ekon.csv',
        'datasets/podzielone/ekonometryczne/test_ekon.csv',
    )
)

# Parse every index into datetimes.
train.index, val1.index, val2.index, val3.index, test.index = (
    pd.to_datetime(frame.index) for frame in (train, val1, val2, val3, test)
)

# Full data set in original (not re-sampled) form.
whole = pd.concat([train, val1, val2, val3, test])

# Expanding training windows on a business-day grid; gaps are back-filled.
# These must be built before the validation frames are re-sampled below.
train1 = train.asfreq('B', method='bfill')
train2, train3 = (
    pd.concat(parts).asfreq('B', method='bfill')
    for parts in ([train, val1], [train, val1, val2])
)

# Validation and test frames on the same business-day grid.
val1, val2, val3, test = (
    frame.asfreq('B', method='bfill') for frame in (val1, val2, val3, test)
)

FileNotFoundError: [Errno 2] File b'datasets/podzielone/ekonometryczne/train_ekon.csv' does not exist: b'datasets/podzielone/ekonometryczne/train_ekon.csv'

In [None]:
# Preview the first three rows of every loaded frame.
display(train.head(3))
display(val1.head(3))
display(val2.head(3))
display(val3.head(3))
display(test.head(3))
display(whole.head(3))

# Testy stacjonarności

## Trening 1

In [None]:
# Split 1: fit on train1, validate on val1; both use the 'return ratio' column.
y_train = train1['return ratio']
y_valid = val1['return ratio']
# Plot the training series with its rolling mean and rolling std.
plot_ts(y_train)

In [None]:
# Augmented Dickey-Fuller test on the split-1 training series.
perform_dft(y_train)

In [None]:
# Plot the seasonal decomposition of the split-1 training series.
decompose(y_train)

In [None]:
# Autocorrelation and partial autocorrelation plots for the first 20 lags.
plot_acf(y_train, lags=20, alpha=0.05)
plot_pacf(y_train, lags=20, alpha=0.05)
plt.show()

In [None]:
# Search grid: p and q from 0 to 3, no differencing.
pmax, qmax, d = 3, 3, 0

# Rank all grid orders on the training series and keep the ten best.
best10, results = find_best_hparameters(y_train, pmax=pmax, d=d, qmax=qmax)
print(best10, results, sep='\n')
print('####################################################################')
# Re-rank those ten orders on the validation series and store the ranking.
best, results = select_best_orders_from_best10(y_valid, best10)
best_results['train1'] = results
print(best, results, sep='\n')

## Trening 2

In [None]:
# Split 2: fit on train2, validate on val2; both use the 'return ratio' column.
y_train = train2['return ratio']
y_valid = val2['return ratio']
# Plot the training series with its rolling mean and rolling std.
plot_ts(y_train)

In [None]:
# Augmented Dickey-Fuller test on the split-2 training series.
perform_dft(y_train)

In [None]:
# Plot the seasonal decomposition of the split-2 training series.
decompose(y_train)

In [None]:
# Autocorrelation and partial autocorrelation plots for the first 20 lags.
plot_acf(y_train, lags=20, alpha=0.05)
plot_pacf(y_train, lags=20, alpha=0.05)
plt.show()

In [None]:
# Search grid: p and q from 0 to 3, no differencing.
pmax, qmax, d = 3, 3, 0

# Rank all grid orders on the training series and keep the ten best.
best10, results = find_best_hparameters(y_train, pmax=pmax, d=d, qmax=qmax)
print(best10, results, sep='\n')
print('###############################################')
# Re-rank those ten orders on the validation series and store the ranking.
best, results = select_best_orders_from_best10(y_valid, best10)
best_results['train2'] = results
print(best, results, sep='\n')

## Trening 3

In [None]:
# Split 3: fit on train3, validate on val3; both use the 'return ratio' column.
y_train = train3['return ratio']
y_valid = val3['return ratio']
# Plot the training series with its rolling mean and rolling std.
plot_ts(y_train)

In [None]:
# Augmented Dickey-Fuller test on the split-3 training series.
perform_dft(y_train)

In [None]:
# Plot the seasonal decomposition of the split-3 training series.
decompose(y_train)

In [None]:
# Autocorrelation and partial autocorrelation plots for the first 20 lags.
plot_acf(y_train, lags=20, alpha=0.05)
plot_pacf(y_train, lags=20, alpha=0.05)
plt.show()

In [None]:
# Search grid: p and q from 0 to 3, no differencing.
pmax, qmax, d = 3, 3, 0

# Rank all grid orders on the training series and keep the ten best.
best10, results = find_best_hparameters(y_train, pmax=pmax, d=d, qmax=qmax)
print(best10, results, sep='\n')
print('########################################')
# Re-rank those ten orders on the validation series and store the ranking.
best, results = select_best_orders_from_best10(y_valid, best10)
best_results['train3'] = results
print(best, results, sep='\n')

## Wyniki selekcji

In [None]:
# Ranked (order, RMSE) pairs stored for split 1.
print(best_results['train1'])

In [None]:
# Ranked (order, RMSE) pairs stored for split 2.
print(best_results['train2'])

In [None]:
# Ranked (order, RMSE) pairs stored for split 3.
print(best_results['train3'])

In [None]:
# Group the validation RMSE values by ARIMA order across all stored splits.
rmse_values_dict = {}
for split_results in best_results.values():
    for order, rmse_value in split_results:
        rmse_values_dict.setdefault(order, []).append(rmse_value)

# Keep only the orders that were ranked in every split (previously a
# hard-coded 3) and sort them by mean RMSE, best first.
chosen_one_list = sorted(
    (
        (order, np.mean(values))
        for order, values in rmse_values_dict.items()
        if len(values) == len(best_results)
    ),
    key=lambda x: x[1],
)
chosen_one_list

# Testowanie

In [None]:
# Fit every shortlisted order on the test series and rank them by the RMSE
# of their fitted values; best_of_the_best is the top-ranked order.
y_test = test['return ratio']
best_of_the_best, results = test_orders(y_test, [order for order, rmse_value in chosen_one_list])
print(results)

In [None]:
# Estimate-vs-actual plots on the test series: the selected order, a
# hard-coded (3, 0, 1) model, and the zero and naive baselines.
show_est_plot(y_test, best_of_the_best, label=best_of_the_best)
show_est_plot(y_test, (3, 0, 1), label=(3, 0, 1))
show_est_plot(y_test, zero=True, label="Zero")
show_est_plot(y_test, naiwny=True, label="Naiwny")