## Loading Packages

In [581]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from preprocessing import *
from utils import *
import warnings

# plt.style.use('dark_background')
warnings.filterwarnings("ignore")

## Loading Data

In [649]:
path = "Data/"
universe_dict = universe_select(path, "Cu")

## Preprocessing

In [650]:
# Renaming the columns to price
universe_dict = price_rename(universe_dict)
# Cleaning the dataset of any erroneous datapoints
universe_dict = clean_dict_gen(universe_dict)
# Making sure that all the points in the window have consistent lenght
universe_dict = truncate_window_length(universe_dict)

Included Instrument:
cu_shfe
cu_lme
cu_comex_p
cu_comex_s
peso
sol
bdi
ted
vix
skew
gsci


In [651]:
def generate_target(df_full, target_col="price_cu_lme", lag=5):
    """Generate the target variable"""
    df_target = df_full[[target_col]].apply(log_returns, lag=lag)
    df_target = df_target.shift(-lag)
    df_target.rename(columns={"price_cu_lme":target_col.replace("price_", str(lag) + "_day_forecast_")}, inplace=True)
    return df_target


def generate_lg_return(df_full, lag=1):
    """Returns a dictionary containing dataframes
    with the additional log returns column"""
    for col in df_full.columns:
        # Selecting out the dataframe of interest
        df = df_full[[col]]
        if lag==1: 
            df_full[col.replace('price_', "")] = log_returns(df[col], lag=lag)
            df_full.dropna(inplace=True)
        else:
            if ("price" in col) == True:
                df_full[col.replace('price', str(lag) + "_day_lg_return")] = log_returns(df[col], lag=lag)
                df_full.dropna(inplace=True)

    return df_full


def generate_dataset(universe_dict, target_col="price_cu_lme", lag=5, lg_returns_only=True):
    """Generates the full dataset"""
    # Renames the columns with the name of the instrument series
    universe_dict = column_rename(universe_dict)
    universe = [] 
    for df_name in universe_dict: universe.append(universe_dict[df_name])
    df_full = pd.concat(universe, axis = 1)
    # Must do log returns calculations after this forwards fill
    df_full.ffill(inplace=True)
    # Calculating the log returns
    df_full = generate_lg_return(df_full)
    
    # Fill in nan to allow inverse calculations
    df_full["target"] = np.nan
    # As target is forecast backdated the first row values should have
    # value and the last should have nulls
    df_full["target"][:-lag] = generate_target(df_full, target_col="price_cu_lme", lag=5)[:-lag].values.ravel()
    if lg_returns_only: df_full = df_full[df_full.columns.drop(list(df_full.filter(regex='price')))]
    return df_full

In [652]:
# df_full = generate_dataset(universe_dict)
df_full = generate_dataset(universe_dict, lg_returns_only=False)

The target column represents the log returns at one forecast length out in the future for the instrument of interest (aluminium or copper prices on the London Metals Exchange). 

To normalise the independent variables, the 1 day log returns between closing prices have been used.

In [653]:
# Visualise the plots
# visualise_universe(universe_dict)
df = df_full[["target"]]

In [654]:
df_full.head(5)
# df_full.tail(5)

Unnamed: 0_level_0,price_cu_shfe,price_cu_lme,price_cu_comex_p,price_cu_comex_s,price_peso,price_sol,price_bdi,price_ted,price_vix,price_skew,...,cu_comex_p,cu_comex_s,peso,sol,bdi,ted,vix,skew,gsci,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2006-09-13,69540.0,7484.5,3.3925,15748.0,537.35,3.253,4129.0,4.901,11.18,120.44,...,0.00059,0.055554,-0.000558,0.000307,0.029244,-0.004276,-0.064091,-0.048458,0.002624,-0.001404
2006-09-14,71350.0,7439.0,3.382,14180.0,537.73,3.2475,4207.0,4.926,11.55,119.47,...,-0.0031,-0.104881,0.000707,-0.001692,0.018715,0.005088,0.032559,-0.008086,-0.010694,0.021675
2006-09-15,69400.0,7281.0,3.3165,14775.0,537.1,3.248,4258.0,4.939,11.76,135.25,...,-0.019557,0.041104,-0.001172,0.000154,0.01205,0.002636,0.018019,0.12406,-0.004952,0.043012
2006-09-18,69830.0,7459.0,3.418,15263.0,537.1,3.245,4279.0,4.949,11.78,127.35,...,0.030146,0.032495,0.0,-0.000924,0.00492,0.002023,0.001699,-0.060186,0.007824,0.01345
2006-09-19,71470.0,7516.0,3.3755,17523.0,537.1,3.245,4275.0,4.948,11.98,125.03,...,-0.012512,0.138083,0.0,0.0,-0.000935,-0.000202,0.016835,-0.018385,-0.019642,0.025225


In [659]:
df = df_full[["target"]]
# Taking t-1 to be the value for t
df["persistance"] = df.shift(1)
df.dropna(inplace=True)
# Calculating metrics for these columns
MSE, MAE, MDE = evaluate(df, "target", "persistance")

In [660]:
print(df[:5])
print(df[-5:])

              target  persistance
date                             
2006-09-14  0.021675    -0.001404
2006-09-15  0.043012     0.021675
2006-09-18  0.013450     0.043012
2006-09-19  0.025225     0.013450
2006-09-20  0.024843     0.025225
              target  persistance
date                             
2019-06-24 -0.000589     0.004104
2019-06-25 -0.026401    -0.000589
2019-06-26 -0.010480    -0.026401
2019-06-27 -0.011358    -0.010480
2019-06-28 -0.015457    -0.011358


In [661]:
# Placing in results dataframe
results = pd.DataFrame(columns={"MSE", "MAE", "MDE"})
results.index.name = 'Name'
results.head()
results.loc["persistance"] = [MSE, MAE, MDE] 
results.head()

Unnamed: 0_level_0,MSE,MAE,MDE
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
persistance,0.000582,0.017105,0.465585
