# 1. Import packages

In [1]:
import os
import numpy as np
import pandas as pd

# 2. Loading a pre-processed data set

In [2]:
# Location of the pre-processed toy data set, kept as directory + file name.
path = './dataset/toyset/'
filename = 'processed_toyset.csv'

# Read the whole CSV into memory.
df = pd.read_csv(os.path.join(path, filename))

#### Define dataset row/column sizes to begin with

In [3]:
# Both dimensions used by the matrices below are 3.
nrow = ncol = 3

In [4]:
# Process-noise covariance: diagonal, variance 0.0001**2 per component.
proc_covar = np.identity(nrow) * 0.0001 ** 2
# The state covariance starts out as the same array object as proc_covar.
post_covar = proc_covar

# Measurement-noise covariance: diagonal, variance 0.0003**2 per component.
meas_covar = np.identity(nrow) * 0.0003 ** 2
# pred_covar likewise starts as an alias of meas_covar.
pred_covar = meas_covar

# Identity matrix reused by the covariance update in the filter loop.
ident = np.identity(nrow)

In [5]:
# Initial latent state: a (3, 1) column of ones.
state_old = np.ones((3, 1))

# Emission matrix: each row maps the latent vector to one observed series
# as the difference of two latent components.
emit_mat = np.array([
    [1., -1, 0.],
    [1.,  0, -1.],
    [0., -1., 1.],
])

# Observations are every column except the two bookkeeping ones.
obs_mat = df.drop(columns=['timestamp', 'time_gap']).to_numpy()
time_gap = df['time_gap'].to_numpy()

# Output buffers, one row per data row; filled in by the filter loop.
n_samples = len(df)
latent_states = np.empty(shape=(n_samples, nrow))
predicted_obs = np.empty(shape=(n_samples, nrow))

## Kalman updates

#### Assumptions:
- the order of measurements is GBPJPY, GBPUSD, USDJPY.
- the order of latent variables is GBP, JPY, USD.

In [6]:
# Column names of the observed series, in measurement order.
ccy_list = [
    'gbpjpy',
    'gbpusd',
    'usdjpy',
]

# Column names of the latent variables, in state-vector order.
latent_cols = [
    'GBP',
    'JPY',
    'USD',
]

In [7]:
# Run the Kalman filter over every row of the data set, storing the filtered
# latent state and the observation implied by it for each step.
for i in range(df.shape[0]):
    # Predict step: grow the state covariance by the process noise scaled
    # with the time elapsed since the previous row.
    post_covar = post_covar + time_gap[i]*proc_covar

    # obs_mat[i, :] is a (3,) vector, so flatten the (3, 1) product to match.
    innovation = obs_mat[i, :] - (emit_mat@state_old).flatten()
    innovation_covar = emit_mat@post_covar@(emit_mat.T) + meas_covar
    kalman_gain = post_covar@(emit_mat.T)@np.linalg.inv(innovation_covar)

    # Reshape (kalman_gain@innovation) from (3,) to (3, 1) to match the state.
    state_new = state_old + (kalman_gain@innovation).reshape(-1, 1)
    post_covar = (ident - kalman_gain@emit_mat)@post_covar

    # Store the results as flat rows. The latent state must be *assigned*;
    # a bare subtraction here would leave latent_states uninitialised.
    predicted_obs[i] = (emit_mat@state_new).ravel()
    latent_states[i] = state_new.ravel()
    state_old = state_new

#### Remove the oldest timestamp and shift each row upward by one row.
- e.g. If the raw dataset starts at 2019-05-01 00:00:00 in row 0, the predictions start at 2019-05-01 00:01:00 in row 0 instead (a 'lead' of one minute). The last row has no following timestamp, so its new timestamp is missing.

In [8]:
# Timestamp of the *next* row, as an (n, 1) column; the last entry is missing.
new_idx = df['timestamp'].shift(-1).to_numpy().reshape(-1, 1)

# Prepend that shifted timestamp column to each result matrix.
predictions = np.concatenate([new_idx, predicted_obs], axis=1)
latent_estimates = np.concatenate([new_idx, latent_states], axis=1)

In [9]:
# Adding the timestamp column must not change the number of rows.
assert len(predictions) == len(predicted_obs), "We have some data loss."
assert len(latent_estimates) == len(latent_states), "We have some data loss."

#### Convert arrays to DataFrame instances

In [10]:
# Label the stacked arrays: the shifted timestamp first, then one column per
# observed series / latent variable.
predictions = pd.DataFrame(predictions, columns=['timestamp'] + ccy_list)
# The first column is named 'timestamp' (not 'timestampe') so both frames
# share the same key column.
latent_estimates = pd.DataFrame(latent_estimates,
                                columns=['timestamp'] + latent_cols)

## How good are the predictions?

#### It's worse than the baseline. Let's see what's going on:

As we expected, currency exchange rates are extremely volatile, and our naive approach fails. Then again, the covariance matrices are not estimated in this case, and there may be a setting that works better. But on the bright side, the graph of currency intrinsic values is really neat:

## Parameter Estimation: EM

### E step

In [13]:
def forward_pass(data, initial_state, emit_mat, meas_covar, proc_covar):
    '''
    Run one forward Kalman-filter pass over a batch of observations.

    Parameters:
      data(DataFrame): a (batch) dataset with a 'timestamp' column, a
        'time_gap' column, and one column per observed series. Every column
        other than 'timestamp' and 'time_gap' is treated as an observation.
      initial_state(ndarray): starting latent state as an (n_latent, 1)
        column vector.
      emit_mat(ndarray): (n_obs, n_latent) matrix mapping a latent state to
        the observations it implies.
      meas_covar(ndarray): (n_obs, n_obs) measurement-noise covariance.
      proc_covar(ndarray): (n_latent, n_latent) process-noise covariance. It
        is scaled by each row's time_gap and also used as the initial state
        covariance.

    Returns:
      (predictions, latent_estimates): two 2-D arrays with one row per row of
        `data`. Each holds the timestamp shifted up by one row, then the
        predicted observations (respectively the filtered latent state), then
        the time_gap. The last row's shifted timestamp is missing.
    '''
    obs_mat = data.drop(['timestamp', 'time_gap'], axis=1).to_numpy()
    time_gap = data.time_gap.to_numpy().reshape(-1, 1)

    # Take every dimension from the inputs instead of notebook-level globals.
    n_steps = data.shape[0]
    n_obs, n_latent = emit_mat.shape
    ident = np.identity(n_latent)

    latent_states = np.empty(shape=(n_steps, n_latent))
    predicted_obs = np.empty(shape=(n_steps, n_obs))

    state_old = initial_state
    post_covar = proc_covar

    for i in range(n_steps):
        # Predict step: grow the state covariance by the process noise scaled
        # with the time elapsed since the previous row.
        post_covar = post_covar + time_gap[i, 0]*proc_covar

        # obs_mat[i, :] is 1-D, so flatten the column-vector product to match.
        innovation = obs_mat[i, :] - (emit_mat@state_old).flatten()
        innovation_covar = emit_mat@post_covar@(emit_mat.T) + meas_covar
        kalman_gain = post_covar@(emit_mat.T)@np.linalg.inv(innovation_covar)

        # Reshape (kalman_gain@innovation) back into a column vector.
        state_new = state_old + (kalman_gain@innovation).reshape(-1, 1)
        post_covar = (ident - kalman_gain@emit_mat)@post_covar

        # Store the results as flat rows. The latent state must be assigned;
        # a bare subtraction would leave latent_states uninitialised.
        predicted_obs[i] = (emit_mat@state_new).ravel()
        latent_states[i] = state_new.ravel()
        state_old = state_new

    # Shifting by -1 leaves the last timestamp missing; callers are expected
    # to drop that row themselves (e.g. with dropna).
    new_idx = data.timestamp.shift(-1).to_numpy().reshape(-1, 1)
    predictions = np.hstack((new_idx, predicted_obs, time_gap))
    latent_estimates = np.hstack((new_idx, latent_states, time_gap))

    return predictions, latent_estimates
    

In [55]:
# Training window: index labels 10001 through 11000 (label slicing is inclusive).
training = df.loc[10001:11000]

# Filter only the first 100 rows of that window, starting from an all-ones state.
predictions, latent_estimates = forward_pass(
    data=training[:100],
    initial_state=np.ones((3, 1)),
    emit_mat=emit_mat,
    meas_covar=meas_covar,
    proc_covar=proc_covar,
)

# Wrap both raw arrays in labelled pandas DataFrames.
predictions = pd.DataFrame(
    predictions,
    columns=['timestamp', *ccy_list, 'time_gap'],
)
latent_estimates = pd.DataFrame(
    latent_estimates,
    columns=['timestamp', *latent_cols, 'time_gap'],
)

# Keep the intermediate results together for inspection:
# check[0] = predictions, check[1] = latent estimates.
check = [predictions, latent_estimates]

### M step

In [56]:
# Display the prediction rows that contain no missing value.
# The result is only shown, not stored; 'any' is dropna's default policy.
predictions.dropna()

Unnamed: 0,timestamp,gbpjpy,gbpusd,usdjpy,time_gap
0,2019-05-09 17:42:00,1.98516,0.105469,1.87969,1
1,2019-05-09 17:43:00,3.24472,0.172356,3.07237,1
2,2019-05-09 17:44:00,3.98418,0.211683,3.7725,1
3,2019-05-09 17:45:00,4.40759,0.23423,4.17336,1
4,2019-05-09 17:46:00,4.64805,0.247042,4.40101,1
...,...,...,...,...,...
94,2019-05-09 19:16:00,4.96242,0.264643,4.69777,1
95,2019-05-09 19:17:00,4.96239,0.264608,4.69778,1
96,2019-05-09 19:18:00,4.96236,0.264573,4.69778,1
97,2019-05-09 19:19:00,4.96239,0.264568,4.69782,1


In [150]:
def update_proc_covar(latent_estimates):
    """Return the scatter matrix of the mean-centred latent estimates.

    Rows containing any missing value are discarded. Each latent column is
    centred on its average weighted by `time_gap`, and the result is the
    (unnormalised) product centred.T @ centred.

    Parameters:
      latent_estimates(DataFrame): must hold 'timestamp', 'time_gap' and the
        columns named in the module-level `latent_cols`.

    Returns:
      DataFrame: square matrix indexed by `latent_cols` on both axes.
    """
    complete = latent_estimates.dropna(how='any')
    weights = complete.time_gap

    # Only the latent columns take part in the computation.
    values = complete.drop(['timestamp', 'time_gap'], axis=1)

    weighted_means = []
    for ccy in latent_cols:
        weighted_means.append(np.average(values[ccy], weights=weights))

    # Subtracting the list aligns it positionally with the selected columns.
    centred = values[latent_cols] - weighted_means

    return (centred.T)@centred

def update_meas_covar(pred, obs):
    """Return the scatter matrix of prediction errors (predicted - observed).

    Predictions and observations are matched on 'timestamp' with an inner
    join after rows with a missing timestamp are filtered out. The result is
    the (unnormalised) product errors.T @ errors.

    Parameters:
      pred(DataFrame): predictions with a 'timestamp' column and the columns
        named in the module-level `ccy_list`.
      obs(DataFrame): observations with the same columns.

    Returns:
      ndarray: square float matrix, one row/column per entry of `ccy_list`.
    """
    pred_stamps = pred['timestamp'].dropna().to_numpy()
    obs_stamps = obs['timestamp'].dropna().to_numpy()

    # union1d already yields the sorted, de-duplicated set of timestamps.
    known_stamps = np.union1d(pred_stamps, obs_stamps)

    # Keep only rows whose timestamp is a known (non-missing) value.
    obs_rows = obs[np.isin(obs.timestamp, known_stamps)]
    pred_rows = pred[np.isin(pred.timestamp, known_stamps)]

    joined = pd.merge(left=obs_rows, right=pred_rows, how='inner',
                      on='timestamp', suffixes=('_obs', '_pred'))

    pred_cols = [ccy+'_pred' for ccy in ccy_list]
    obs_cols = [ccy+'_obs' for ccy in ccy_list]

    predicted = np.array(joined[pred_cols], dtype=float)
    observed = np.array(joined[obs_cols], dtype=float)
    errors = predicted - observed

    return (errors.T)@errors

In [152]:
# check[1] holds the latent estimates; append the process scatter matrix
# computed from them (it becomes check[2]).
check.append(update_proc_covar(check[1]))
# check[0] holds the predictions; compare them with the same 100 training rows
# that were filtered and append the measurement scatter matrix (check[3]).
check.append(update_meas_covar(pred=check[0], obs=training[:100]))

### EM

In [155]:
def kalman_smoother(data, prior_state, meas_mat, prior_df, prior_meas_scale, prior_proc_scale):
    predictions, latent_estimates = forward_pass(data=data, initial_state=prior_state,
                           emit_mat=emit_mat, meas_cover=prior_meas_scale/prior_df,
                           proc_covar=prior_proce_scale/proir_df)
    
    proc_covar_new = update_proc_covar(latent_estimates)
    meas_covar_new = update_meas_covar(pred=predictions, obs=data)
    
    proc_covar_old = proc_covar_new*2
    meas_covar_old = meas_covar_new*2
    
    while np.sqrt(np.sum((proc_covar_old-proc_covar_new)^2) +
                  np.sum((meas_covar_old-meas_covar_new)^2)) > 10^(-16):
        print(np.sum((proc_covar_old - proc_covar_new)^2) + np.sum((meas_covar_old - meas_covar_new)^2))
        
        meas_covar_old = meas_covar_new
        proc_covar_old = proc_covar_new
        
        predictions, latent_estimates = forward_pass(data=data, initial_state=prior_state,
                           emit_mat=emit_mat, meas_cover=meas_covar_new,
                           proc_covar=proc_covar_new)
        
        proc_covar_new = update_proc_covar(latent_estimates)
        meas_covar_new = update_meas_covar(pred=predictions, obs=data)
    
    new_df = prior_df + data.shape[0] - emit_mat.shape[0] - 1