## Loading Packages

In [30]:
from fbprophet import Prophet
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from utils import *
import warnings

# plt.style.use('dark_background')
warnings.filterwarnings("ignore")

## Loading Data

In [31]:
def universe_select(path, commodity_name):
    """Load the instruments believed to be of interest for a commodity.

    Args:
        path: Directory that holds one ``<instrument>.csv`` file per
            instrument. A trailing path separator is optional.
        commodity_name: ``"Al"`` (aluminium) or ``"Cu"`` (copper).

    Returns:
        A dictionary mapping each instrument name to its DataFrame. Every
        frame is indexed by the ``date`` column (parsed day-first) and
        sorted in ascending date order. For an unrecognised commodity a
        hint is printed and an empty dictionary is returned.
    """
    # One entry per supported commodity; the loading logic is shared.
    instruments_by_commodity = {
        "Al": ["al_shfe", "al_lme", "al_comex_p", "al_comex_s", "al_lme_s", "yuan",
               "bdi", "ted", "vix", "skew", "gsci"],
        "Cu": ["cu_shfe", "cu_lme", "cu_comex_p", "cu_comex_s", "peso", "sol",
               "bdi", "ted", "vix", "skew", "gsci"],
    }

    instruments = instruments_by_commodity.get(commodity_name)
    if instruments is None:
        print("Select an appropriate commodity")
        return {}

    universe_dict = {}
    for instrument in instruments:
        # os.path.join keeps "Data" and "Data/" equivalent for the caller.
        csv_path = os.path.join(path, instrument + ".csv")
        df = pd.read_csv(csv_path, index_col='date', parse_dates=['date'], dayfirst=True)
        universe_dict[instrument] = df.sort_index(ascending=True)
    return universe_dict


In [32]:
# Directory holding the per-instrument CSV files read by universe_select
path = "Data/"
# Load the copper ("Cu") instrument universe as a dict of DataFrames keyed by instrument name
universe_dict = universe_select(path, "Cu")

## Preprocessing

In [33]:
# NOTE(review): the helpers below are not defined in this notebook; presumably
# they come from `from utils import *` -- confirm their exact behaviour there.
# Renaming the columns to price
universe_dict = price_rename(universe_dict)
# Cleaning the dataset of any erroneous datapoints
universe_dict = clean_dict_gen(universe_dict)
# Making sure that all the instruments in the window have a consistent length
universe_dict = truncate_window_length(universe_dict)
# Generate the full training dataset (log returns only)
df_full = generate_dataset(universe_dict, lg_returns_only=True)

Included Instrument:
cu_shfe
cu_lme
cu_comex_p
cu_comex_s
peso
sol
bdi
ted
vix
skew
gsci


In [42]:
# Optional: visualise the instruments (call currently disabled)
# visualise_universe(universe_dict)
# Keep only the LME copper log-return column, as a single-column DataFrame
df = df_full[['lg_return_cu_lme']]

In [43]:
# Preview the first rows of the single-column LME copper log-return frame
df.head()

Unnamed: 0_level_0,lg_return_cu_lme
date,Unnamed: 1_level_1
2006-08-29,0.0
2006-08-30,0.007395
2006-08-31,0.033459
2006-09-01,-0.014616
2006-09-04,0.008117


In [22]:
def feature_spawn(df):
    """Spawn moving-average and volatility features for every column.

    For each column ``<col>`` of ``df`` the following columns are added:

    * ``<col>_ema_week``, ``<col>_ema_month``, ``<col>_ema_quarter``,
      ``<col>_ema_half_year``, ``<col>_ema_year``: exponentially weighted
      moving averages. The window length is passed to pandas as ``span``
      (not as a half-life).
    * ``<col>_roll_vol_week``, ``<col>_roll_vol_month``,
      ``<col>_roll_vol_quarter``: rolling population standard deviation
      (``ddof=0``).

    Args:
        df: DataFrame of numeric columns. It is NOT modified; the features
            are built on a copy so re-running the call is repeatable.

    Returns:
        A new DataFrame holding the original columns followed by the
        spawned features, with every row containing a NaN dropped. The
        rolling windows leave NaNs at the start, so the leading rows are
        among those removed.
    """
    # Window lengths in rows (presumably trading days -- confirm with the data).
    window_lengths = {"week": 5, "month": 22, "quarter": 66, "half_year": 130, "year": 261}
    # Rolling volatility is only computed for the three shortest windows.
    vol_windows = ("week", "month", "quarter")

    features = df.copy()
    for col in df.columns:
        series = df[col]
        for label, length in window_lengths.items():
            features[col + "_ema_" + label] = series.ewm(span=length).mean()
        for label in vol_windows:
            features[col + "_roll_vol_" + label] = (
                series.rolling(window=window_lengths[label]).std(ddof=0)
            )

    return features.dropna()

In [44]:
# Expand df_full with the spawned features. Re-running this cell feeds the
# already-expanded frame back in, so run it once per fresh df_full.
df_full = feature_spawn(df_full)

## Principal Component Analysis

In [47]:
def dimension_selector(df, thresh=0.99, max_dim=10):
    """Find the smallest number of principal components above a threshold.

    PCA is fitted once with the largest admissible number of components,
    and the cumulative explained-variance ratio is scanned from one
    component upwards.

    Args:
        df: 2-D data (samples x features) to analyse.
        thresh: The cumulative explained-variance ratio must be strictly
            greater than this value.
        max_dim: Largest number of components to consider. The search is
            also capped by the number of rows and columns of ``df``, since
            PCA cannot fit more components than that.

    Returns:
        The number of components (int) that first exceeds ``thresh``, or
        ``None`` (after printing a message) when no candidate does.
    """
    n_candidates = min(max_dim, *np.shape(df))
    if n_candidates >= 1:
        pca = PCA(n_components=n_candidates)
        pca.fit(df)
        cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
        for n_dim, explained in enumerate(cumulative_ratio, start=1):
            if explained > thresh:
                print("Number of dimensions:", n_dim)
                return n_dim
    print("No level of dimensionality reaches threshold variance level")
    return None


def dimension_reduce(df, n_dim):
    """Project ``df`` onto its first ``n_dim`` principal components.

    A PCA model is fitted on ``df`` and used to transform the same data.
    The per-component explained-variance ratios and their sum are printed.

    Returns:
        A DataFrame of the projected data that shares the index of ``df``.
    """
    model = PCA(n_components=n_dim)
    model.fit(df)
    projected = model.transform(df)
    ratios = model.explained_variance_ratio_
    print("Explained Variance:", ratios,
          "\nExplained Variance Sum:", sum(ratios))
    return pd.DataFrame(projected, index=df.index)


def inverse_pca(df_reduced, df, n_dim):
    """Map PCA-reduced data back into the original feature space.

    A PCA model with ``n_dim`` components is re-fitted on ``df`` and its
    inverse transform is applied to ``df_reduced``.

    Args:
        df_reduced: Data in principal-component space with ``n_dim``
            columns (DataFrame or array-like).
        df: The original DataFrame the reduction was fitted on; it
            supplies the PCA fit and the output column names.
        n_dim: Number of principal components used for the reduction.

    Returns:
        A DataFrame with the columns of ``df`` holding the reconstructed
        values. The index is taken from ``df_reduced`` when it has one,
        otherwise from ``df``.
    """
    pca = PCA(n_components=n_dim)
    pca.fit(df)
    # Work on plain arrays so pandas does not try to align the integer
    # component labels with the original column names.
    reconstructed = pca.inverse_transform(np.asarray(df_reduced))
    index = getattr(df_reduced, "index", df.index)
    return pd.DataFrame(np.asarray(reconstructed), index=index, columns=df.columns)

# Pick the smallest component count whose summed explained-variance ratio
# exceeds the default threshold (0.99), then project df_full onto it.
n_dim = dimension_selector(df_full)
df_reduced = dimension_reduce(df_full, n_dim)

Number of dimensions: 6
Explained Variance: [0.88530351 0.0773672  0.00956477 0.00848314 0.00755128 0.00682361] 
Explained Variance Sum: 0.9950935118227293
