In [None]:
import pandas as pd
!pip install matplotlib
import matplotlib.pyplot as plt
#from statsmodels.tsa.stattools import adfuller
import numpy as np

In [None]:
# Semicolon-delimited price snapshots; the file names cover days -2, -1 and 0.
files = [
    "./round-1-island-data-bottle/prices_round_1_day_-2.csv",
    "./round-1-island-data-bottle/prices_round_1_day_-1.csv",
    "./round-1-island-data-bottle/prices_round_1_day_0.csv",
]

# Read each file into its own frame, then stack them with a fresh 0..n-1 index.
dfs = [pd.read_csv(path, sep=";") for path in files]
df = pd.concat(dfs, ignore_index=True)

# Single clock across all days: day -2 maps to offset 0, and every following day is
# shifted by another 1_000_000.
# NOTE(review): assumes the per-day "timestamp" stays below 1_000_000 - confirm.
df["global_timestamp"] = df["timestamp"] + 1_000_000 * (df["day"] + 2)

In [None]:
# Put rows in chronological order and discard the columns that are not used below.
# errors="ignore" keeps this cell re-runnable after the columns are already gone.
df = (
    df.sort_values("global_timestamp")
      .drop(columns=["day", "mid_price", "profit_and_loss", "timestamp"], errors="ignore")
)

# Names of every bid / ask price level present in the frame.
bid_cols = df.columns[df.columns.str.startswith("bid_price_")].tolist()
ask_cols = df.columns[df.columns.str.startswith("ask_price_")].tolist()

def get_lowest_bid(row):
    """Return the smallest non-null bid price found in ``row``.

    Looks at every column listed in the module-level ``bid_cols`` and ignores
    levels that are null. Returns ``None`` when no bid level is populated.
    """
    populated = (row[col] for col in bid_cols if pd.notnull(row[col]))
    return min(populated, default=None)

def get_highest_ask(row):
    """Return the largest non-null ask price found in ``row``.

    Looks at every column listed in the module-level ``ask_cols`` and ignores
    levels that are null. Returns ``None`` when no ask level is populated.
    """
    populated = (row[col] for col in ask_cols if pd.notnull(row[col]))
    return max(populated, default=None)

# Outermost quotes per row, computed column-wise rather than with a Python-level
# row-by-row apply. DataFrame.min / DataFrame.max skip NaN by default, so unpopulated
# book levels are ignored and a row without any quote on a side comes out as NaN.
df["lowest_bid"] = df[bid_cols].min(axis=1)
df["highest_ask"] = df[ask_cols].max(axis=1)

# Fair value = midpoint between the lowest bid and the highest ask of the row.
df["fair_value"] = (df["lowest_bid"] + df["highest_ask"]) / 2.0

# Independent per-product frames (.copy() so later column writes do not touch df).
df_kelp = df.loc[df["product"] == "KELP"].copy()
df_squid = df.loc[df["product"] == "SQUID_INK"].copy()

# Mean and standard deviation of each product's fair value, kept for later rescaling.
mean_kelp, std_kelp = df_kelp["fair_value"].mean(), df_kelp["fair_value"].std()
mean_squid, std_squid = df_squid["fair_value"].mean(), df_squid["fair_value"].std()


## Random-walk tests (ADF and Ljung-Box p-values)

In [None]:
!pip install statsmodels
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox

In [None]:
# Make sure both product frames are in chronological order before testing.
df_kelp = df_kelp.sort_values(by="global_timestamp")
df_squid = df_squid.sort_values(by="global_timestamp")

# Fair value of each product as a plain numpy array, with missing values removed.
kelp_values = df_kelp["fair_value"].dropna().to_numpy()
squid_values = df_squid["fair_value"].dropna().to_numpy()

In [None]:
# ADF (Augmented Dickey-Fuller) test
def run_adf_test(timeseries, label, p_threshold=0.05):
    """Run an Augmented Dickey-Fuller test on ``timeseries`` and print a summary.

    The ADF null hypothesis is that the series has a unit root (it behaves like
    a random walk); a small p-value is evidence against that.

    Parameters
    ----------
    timeseries : array-like
        Observations in chronological order, without missing values.
    label : str
        Name shown in the printed report.
    p_threshold : float, optional
        Significance level used for the printed interpretation (default 0.05).

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    # The first four entries of the adfuller result are: test statistic, p-value,
    # number of lags chosen (by AIC here) and number of observations used.
    adf_statistic, p_value, used_lag, nobs = adfuller(timeseries, autolag="AIC")[:4]

    print(f"\nADF Test for {label}:")
    print(f"  ADF Statistic: {adf_statistic:.4f}")
    print(f"  p-value:       {p_value:.20f}")
    print(f"  # Lags Used:   {used_lag}")
    print(f"  # Observations:{nobs}")

    # Interpretation relative to the chosen significance level
    if p_value < p_threshold:
        print(f"  --> p < {p_threshold} => Reject H0 (random walk), "
              "suggesting non-random/meaningful pattern.")
    else:
        print(f"  --> p >= {p_threshold} => Fail to Reject H0 (random walk), "
              "suggesting random/walk-like behavior.")

In [None]:
# The p-value is the probability of a test statistic at least this extreme if the
# series really were a random walk.
run_adf_test(timeseries=kelp_values, label="KELP")
run_adf_test(timeseries=squid_values, label="SQUID_INK")

In [None]:
# 2) Ljung-Box Test
def run_ljung_box_test(timeseries, label, lags=10, p_threshold=0.05):
    """Run a Ljung-Box autocorrelation test on ``timeseries`` and print a summary.

    Parameters
    ----------
    timeseries : array-like
        Observations in chronological order.
    label : str
        Name shown in the printed report.
    lags : int, optional
        Lag at which the statistic is reported (default 10). The choice is
        somewhat arbitrary; experiment with it.
    p_threshold : float, optional
        Significance level used for the printed interpretation (default 0.05).

    Returns
    -------
    None
        The report is printed; nothing is returned.
    """
    # Passing a one-element list asks for the result at that single lag only, so the
    # returned frame has exactly one row.
    lb_result = acorr_ljungbox(timeseries, lags=[lags], return_df=True)
    first_row = lb_result.iloc[0]
    test_stat = first_row["lb_stat"]
    p_value = first_row["lb_pvalue"]

    print(f"\nLjung-Box Test for {label} (lags={lags}):")
    print(f"  LB Statistic: {test_stat:.4f}")
    print(f"  p-value:      {p_value:.20f}")
    if p_value < p_threshold:
        print(f"  --> p < {p_threshold} => Suggests autocorrelation (non-random).")
    else:
        print(f"  --> p >= {p_threshold} => No strong evidence of autocorrelation (random-like).")

In [None]:
# Ljung-Box at lag 10 for both products (run on the fair-value levels).
run_ljung_box_test(timeseries=kelp_values, label="KELP", lags=10)
run_ljung_box_test(timeseries=squid_values, label="SQUID_INK", lags=10)

In [None]:
# EXPLANATIONS
#High Autocorrelation: Each data point in the series closely depends on previous points—so if yesterday’s price was high, today’s price is likely high too.

#Random-Walk-Like (ADF Test): The series doesn’t settle around a constant average but keeps “wandering” over time. The ADF test can’t rule out that the series is drifting randomly.

## Leading and Lagging Detection

In [None]:
# Rebuild the centred fair value from the untouched quote columns instead of updating
# "fair_value" from itself, so running this cell more than once no longer compounds
# the transformation. The midpoint below is the same formula that produced the
# original "fair_value", and mean_* / std_* were taken from that original column.
ALIGNMENT_LEVEL = 2000  # common level both transformed series are shifted to

kelp_centered = (df_kelp["lowest_bid"] + df_kelp["highest_ask"]) / 2.0 - mean_kelp
squid_centered = (df_squid["lowest_bid"] + df_squid["highest_ask"]) / 2.0 - mean_squid

# derek's scaling and alignment that supports the pair trading algorithm:
# SQUID_INK is mirrored around its mean, KELP is rescaled to SQUID_INK's standard
# deviation, and both are shifted to the same level.
df_squid["fair_value"] = (-1 * squid_centered) + ALIGNMENT_LEVEL
df_kelp["fair_value"] = kelp_centered * (std_squid / std_kelp) + ALIGNMENT_LEVEL

In [None]:
# Overlay both products' fair value on one set of axes.
fig, ax = plt.subplots()
ax.plot(df_kelp["global_timestamp"], df_kelp["fair_value"], label="KELP Fair Value")
ax.plot(df_squid["global_timestamp"], df_squid["fair_value"], label="SQUID_INK Fair Value")

# Optional reference line, left disabled:
# ax.axhline(y=2000, label="Mean Fair Value")

ax.set_xlabel("Timestamp")
ax.set_ylabel("Fair Value")
ax.set_title("KELP vs SQUID_INK Fair Value")
ax.legend()
plt.show()

In [None]:
# ratio_series must be defined before it is plotted, otherwise this cell raises a
# NameError on a fresh kernel. It is the KELP fair value divided by the SQUID_INK fair
# value, matched row-to-row on global_timestamp (using whatever "fair_value" holds at
# this point in the notebook).
# NOTE(review): assumes each product has at most one row per global_timestamp - confirm.
kelp_fair = df_kelp.set_index("global_timestamp")["fair_value"]
squid_fair = df_squid.set_index("global_timestamp")["fair_value"]
ratio_series = kelp_fair / squid_fair

plt.figure()
plt.plot(ratio_series.index, ratio_series.values, label="KELP / SQUID_INK")

# Reference line at parity (ratio == 1).
plt.axhline(
    y=1
)

plt.xlabel("Timestamp")
plt.ylabel("Ratio of Fair Values")
plt.title("Ratio: KELP Fair Value / SQUID_INK Fair Value")
plt.legend()
plt.show()

## Messing Around for Chat

In [None]:
# Quick look at which columns are available on the SQUID_INK frame.
df_squid.columns

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
import matplotlib.pyplot as plt

# SQUID_INK fair value against the global clock.
squid_time, squid_fair_value = df_squid["global_timestamp"], df_squid["fair_value"]
plt.plot(squid_time, squid_fair_value)
plt.title("SQUID_INK Fair Value over Time")
plt.xlabel("Timestamp")
plt.ylabel("Fair Value")
plt.show()

In [None]:
# Uses the "fair_value" column of df_squid as it stands at this point in the notebook.
# 1. Calculate returns (row-to-row differences of the fair value)
df_squid['returns'] = df_squid["fair_value"].diff()

# 2. Drop NaN from the first diff
#df_squid_no_start = df_squid[int(len(df) * 0.10):]
returns = df_squid['returns'].dropna()

# 3. Autocorrelation Plot
# plot_acf / plot_pacf open a figure of their own unless they are handed an Axes, so a
# bare plt.figure() beforehand only leaves an empty extra figure in the output.
fig_acf, ax_acf = plt.subplots()
plot_acf(returns, lags=50, ax=ax_acf)
ax_acf.set_title("Autocorrelation of SQUID_INK Returns")
plt.show()

# 4. Partial Autocorrelation Plot
fig_pacf, ax_pacf = plt.subplots()
plot_pacf(returns, lags=50, method='ywm', ax=ax_pacf)  # 'ywm' is often stable for financial time series
ax_pacf.set_title("Partial Autocorrelation of SQUID_INK Returns")
plt.show()

Approximate values of the biggest spikes:
- lag 0 - 1.00
- lag 1 - -0.10
- lag 2 - -0.07
- lag 3 - 0.05
- lag 12 - 0.05

In [None]:
# Using the same 'returns' from above
plt.figure()
returns.hist(bins=100)  # Adjust bins as you wish
plt.yscale('log')  # Set y-axis to log scale
plt.title("Distribution of SQUID_INK Returns")
plt.xlabel("Return")
plt.ylabel("Frequency")
plt.show()


In [None]:
from scipy.stats import normaltest

# scipy's normaltest on the returns (missing values removed): the null hypothesis is
# that the sample comes from a normal distribution.
k2, p_value = normaltest(returns.dropna())
print(f"Statistic: {k2}, p-value: {p_value}")

# Interpret at the 5% level.
print(
    "Reject null hypothesis of normality (data is likely not normal)."
    if p_value < 0.05
    else "Fail to reject null hypothesis of normality (data is likely normal)."
)


## Generating volatility score

In [None]:
import matplotlib.pyplot as plt

# Same SQUID_INK fair-value plot as earlier, repeated as context for the volatility score.
squid_time, squid_fair_value = df_squid["global_timestamp"], df_squid["fair_value"]
plt.plot(squid_time, squid_fair_value)
plt.title("SQUID_INK Fair Value over Time")
plt.xlabel("Timestamp")
plt.ylabel("Fair Value")
plt.show()

In [None]:
# Make sure fair_value is float
df_squid['fair_value'] = df_squid['fair_value'].astype(float)

# Use a rolling window (e.g., 30 rows)
window = 10
df_squid['fair_value_volatility'] = df_squid['fair_value'].rolling(window).std()
print(df_squid[['global_timestamp', 'fair_value', 'fair_value_volatility']].head(20))

In [None]:
# Sum top 3 bid and ask volumes
df_squid['total_bid_volume'] = df_squid[['bid_volume_1', 'bid_volume_2', 'bid_volume_3']].sum(axis=1)
df_squid['total_ask_volume'] = df_squid[['ask_volume_1', 'ask_volume_2', 'ask_volume_3']].sum(axis=1)

# Order book imbalance
df_squid['book_imbalance'] = (
    df_squid['total_bid_volume'] - df_squid['total_ask_volume']
) / (
    df_squid['total_bid_volume'] + df_squid['total_ask_volume']
)

# Rolling volatility of order book imbalance
df_squid['imbalance_volatility'] = df_squid['book_imbalance'].rolling(window).std()
print(df_squid[['global_timestamp', 'book_imbalance', 'imbalance_volatility']].head(20))

In [None]:
# Weighted blend of the two rolling volatilities: 60% price, 40% book imbalance.
# Rows without a complete window (NaN) contribute 0 to the score.
# NOTE(review): the two inputs are combined without rescaling - confirm the weights are
# meant to apply to the raw values.
df_squid['volatility_score'] = (
    df_squid['fair_value_volatility'].fillna(0).mul(0.6)
    .add(df_squid['imbalance_volatility'].fillna(0).mul(0.4))
)
print(
    df_squid[
        ['global_timestamp', 'fair_value_volatility', 'imbalance_volatility', 'volatility_score']
    ].head(20)
)

In [None]:
# Fair value on the left axis, volatility score on a secondary (right) axis.
# The x-axis is the frame's index, not the global_timestamp column.
df_squid[['fair_value', 'volatility_score']].plot(
    secondary_y='volatility_score',
    figsize=(12, 6),
)
plt.title('Fair Value and Volatility Score Over Time')
plt.show()