In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.tsa.stattools import adfuller, kpss
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from statsmodels.tsa.seasonal import seasonal_decompose
import seaborn as sns

df = pd.read_csv('spiff_data.csv',parse_dates=True, index_col=1)
df = df.iloc[:, 1:] # Get rid of index column
df.replace(1000, np.nan, inplace=True) # replace 1000 with NaN
df.rename(columns={'gurkor':'cucumbers'}, inplace=True)


  df = pd.read_csv('spiff_data.csv',parse_dates=True, index_col=1)


In [2]:
# Calculate returns
log_returns = np.log(df / df.shift(1))
clean_returns = log_returns.dropna()
returns = df.pct_change()

corr_matrix = clean_returns.corr()
# Display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

corr_matrix

  returns = df.pct_change()


Unnamed: 0,cucumbers,guitars,slingshots,stocks,sugar,water,tranquillity
cucumbers,1.0,-0.25847,-0.228873,-0.043198,-0.071552,0.574593,-0.137998
guitars,-0.25847,1.0,0.531217,0.022457,0.206163,-0.245264,0.229327
slingshots,-0.228873,0.531217,1.0,0.017324,0.116873,-0.21115,0.192116
stocks,-0.043198,0.022457,0.017324,1.0,-0.014031,-0.042846,0.007864
sugar,-0.071552,0.206163,0.116873,-0.014031,1.0,-0.078619,0.128811
water,0.574593,-0.245264,-0.21115,-0.042846,-0.078619,1.0,-0.161141
tranquillity,-0.137998,0.229327,0.192116,0.007864,0.128811,-0.161141,1.0


In [3]:
# Function for cross-correlation with lag
def crosscorr(a, b, lag=0):
    return a.corr(b.shift(lag))


# Autocorrelation lag
labels = clean_returns.columns.values
n = len(labels)

def lag_ccf(data,l,max_lag = 370):
    ccf = pd.DataFrame()
    for label in labels:
        correlation = np.zeros(max_lag)
        for h in range(max_lag):
            correlation[h] = crosscorr(data[l],data[label],lag=h)

        ccf.insert(len(ccf.columns),label, correlation)

    return ccf

strongest_ccf = pd.DataFrame()
strongest_lag = pd.DataFrame()

for label in labels:
    ccf = lag_ccf(clean_returns,label)

    strongest_vals = pd.Series(index=ccf.columns)
    strongest_indices = pd.Series(index=ccf.columns)

    for col in ccf.columns:
        col_series = ccf[col]
        abs_col_series = col_series.abs()
        idx = abs_col_series.idxmax()
        strongest_indices[col] = idx
        strongest_vals[col] = col_series.loc[idx]

    strongest_ccf = pd.concat([strongest_ccf,strongest_vals],axis=1)
    strongest_lag = pd.concat([strongest_lag,strongest_indices],axis=1)

strongest_ccf.columns = labels
strongest_lag.columns = labels

print(strongest_ccf) # showing the strongest cross-correlation for each pair
print(strongest_lag) # this shows the lag at which the cross-correlation is the strongest


              cucumbers   guitars  slingshots    stocks     sugar     water  \
cucumbers      1.000000 -0.258470   -0.228873  0.047994 -0.144956  0.574593   
guitars       -0.258470  1.000000    0.531217  0.052276  0.375472 -0.245264   
slingshots    -0.228873  0.531217    1.000000  0.054695  0.399244 -0.211150   
stocks        -0.043198 -0.041685    0.039115  1.000000  0.053134  0.049684   
sugar         -0.071552  0.206163    0.116873 -0.053931  1.000000 -0.078619   
water          0.574593 -0.245264   -0.211150  0.052977 -0.143326  1.000000   
tranquillity  -0.137998  0.229327    0.192116 -0.057417  0.128811 -0.161141   

              tranquillity  
cucumbers        -0.137998  
guitars           0.229327  
slingshots        0.192116  
stocks           -0.043538  
sugar             0.128811  
water            -0.161141  
tranquillity      1.000000  
              cucumbers  guitars  slingshots  stocks  sugar  water  \
cucumbers           0.0      0.0         0.0   340.0    1.0    0.

Here we see that the strongest correlation is between cucumber/water, then slingshots/guitars, slingshots/sugar and sugar/guitars. All other correlations can be deemed insignificant

In [4]:
# Check if returns are stationary
 # ADF: p-value < 0.05 --> stationary
 # KPSS: p-value > 0.05 --> stationary

stationary = pd.DataFrame()
for label in labels:
    adf = adfuller(clean_returns[label])
    kps = kpss(clean_returns[label])

    results = [adf[0], adf[1], kps[0],kps[1]]
    stationary.insert(len(stationary.columns),label,results)

stationary.index = ['ADF stat', 'ADF p-val','KPSS stat', 'KPSS p-val']
print(stationary)

look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])
look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])
look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])
look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])
look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])
look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])


            cucumbers       guitars    slingshots     stocks      sugar  \
ADF stat   -51.708874 -1.146766e+01 -1.856872e+01 -70.655806 -53.145186   
ADF p-val    0.000000  5.361688e-21  2.083667e-30   0.000000   0.000000   
KPSS stat    0.045447  1.410788e-01  4.156419e-02   0.097195   0.086293   
KPSS p-val   0.100000  1.000000e-01  1.000000e-01   0.100000   0.100000   

                water  tranquillity  
ADF stat   -42.072981    -71.833645  
ADF p-val    0.000000      0.000000  
KPSS stat    0.079399      0.080476  
KPSS p-val   0.100000      0.100000  


look-up table. The actual p-value is greater than the p-value returned.

  kps = kpss(clean_returns[label])


In summary: these results show that the returns are stationary for all time series