## Loading Packages

In [3]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from utils import *
import warnings

# plt.style.use('dark_background')
warnings.filterwarnings("ignore")

## Loading Data

In [12]:
def universe_select(path, commodity_name):
    """Selects the instruments believed to be of
    interest for the commodity selected
    Returns: A dictionary of dataframes which are
    intruments of interest"""
    universe_dict = {}
    
    if commodity_name == "Al": 
        aluminium_list = ["al_shfe", "al_lme", "al_comex_p", "al_comex_s", "al_lme_s", "yuan",
                 "bdi", "ted", "vix", "skew", "gsci"]
        
        for instrument in aluminium_list:
            df = pd.read_csv(path + instrument + ".csv", index_col='date', parse_dates=['date'], dayfirst=True).sort_index(ascending=True)
            universe_dict[instrument] = df
            
    elif commodity_name == "Cu":
        copper_list = ["cu_shfe", "cu_lme", "cu_comex_p", "cu_comex_s", "peso", "sol",
                 "bdi", "ted", "vix", "skew", "gsci"]
        
        for instrument in copper_list:
            df = pd.read_csv(path + instrument + ".csv", index_col='date', parse_dates=['date'], dayfirst=True).sort_index(ascending=True)
            universe_dict[instrument] = df
    
    else: print("Select an appropriate commodity")
    return universe_dict


In [13]:
path = "Data/"
universe_dict = universe_select(path, "Cu")

## Preprocessing

In [14]:
# Renaming the columns to price
universe_dict = price_rename(universe_dict)
# Cleaning the dataset of any erroneous datapoints
universe_dict = clean_dict_gen(universe_dict)
# Making sure that all the points in the window have consistent lenght
universe_dict = truncate_window_length(universe_dict)
# Generate the full training dataset
df_full = generate_dataset(universe_dict, lg_returns_only=False)

Included Instrument:
cu_shfe
cu_lme
cu_comex_p
cu_comex_s
peso
sol
bdi
ted
vix
skew
gsci


In [19]:
# Visualise the plots
# visualise_universe(universe_dict)
df = df_full[["price_cu_lme"]]

In [25]:
df_full.head()
df.head()

Unnamed: 0_level_0,price_cu_shfe
date,Unnamed: 1_level_1
2006-08-29,68160.0
2006-08-30,67650.0
2006-08-31,67630.0
2006-09-01,70140.0
2006-09-04,71240.0


In [27]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

np.random.seed(1)

# Instanciate a Gaussian Process model
kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)

In [56]:
X = np.atleast_2d([1., 3., 5., 6., 7., 8.])

In [59]:
X = df.index[:20].values.T
# x_pred = df.index[20].T
y = df["price_cu_shfe"][:20].values.T

AttributeError: 'Timestamp' object has no attribute 'T'

In [60]:
X

array(['2006-08-29T00:00:00.000000000', '2006-08-30T00:00:00.000000000',
       '2006-08-31T00:00:00.000000000', '2006-09-01T00:00:00.000000000',
       '2006-09-04T00:00:00.000000000', '2006-09-05T00:00:00.000000000',
       '2006-09-06T00:00:00.000000000', '2006-09-07T00:00:00.000000000',
       '2006-09-08T00:00:00.000000000', '2006-09-11T00:00:00.000000000',
       '2006-09-12T00:00:00.000000000', '2006-09-13T00:00:00.000000000',
       '2006-09-14T00:00:00.000000000', '2006-09-15T00:00:00.000000000',
       '2006-09-18T00:00:00.000000000', '2006-09-19T00:00:00.000000000',
       '2006-09-20T00:00:00.000000000', '2006-09-21T00:00:00.000000000',
       '2006-09-22T00:00:00.000000000', '2006-09-25T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [53]:
X = np.linspace(0, 20, 21)
print(X)
x_pred = 21


[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20.]


In [54]:
print(x_pred)
print(type(x_pred))

21
<class 'int'>


In [55]:
# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(X, y)

# Make the prediction on the meshed x-axis (ask for MSE as well)
y_pred, sigma = gp.predict(x, return_std=True)

ValueError: Expected 2D array, got 1D array instead:
array=[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [30]:
X = df.iloc[1:1+10, :]
Y = df.iloc[1+5:1+10+5, -1]  

In [31]:
Y

date
2006-09-06    73990.0
2006-09-07    74860.0
2006-09-08    74330.0
2006-09-11    71760.0
2006-09-12    69430.0
2006-09-13    69540.0
2006-09-14    71350.0
2006-09-15    69400.0
2006-09-18    69830.0
2006-09-19    71470.0
Name: price_cu_shfe, dtype: float64