notebooks/crypto_exploration.ipynb

In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import acf, pacf
import glob
import os
from datetime import datetime

In [None]:
# Set style for better visualizations.
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and later removed, so use whichever name this install offers.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

In [None]:
# Read all parquet files
def load_all_crypto_data(data_dir="../data/processed"):
    """Load every ``*.parquet`` file in ``data_dir`` into a dict of DataFrames.

    The dict key is the part of the file name before the first underscore.

    Parameters
    ----------
    data_dir : str
        Directory that is searched (non-recursively) for ``*.parquet`` files.

    Returns
    -------
    dict
        Mapping of file-name prefix to the DataFrame read from that file.
        Empty when the directory contains no parquet files.
    """
    all_data = {}
    # glob.glob returns paths in arbitrary, platform-dependent order; sorting
    # makes the dict order reproducible between runs. If several files share
    # a prefix, the lexicographically last one wins.
    for file in sorted(glob.glob(os.path.join(data_dir, "*.parquet"))):
        symbol = os.path.basename(file).split('_')[0]
        all_data[symbol] = pq.read_table(file).to_pandas()
    return all_data

In [None]:
# Load the data
crypto_data = load_all_crypto_data()
# Fail fast with a clear message: with nothing loaded, the cells below would
# either draw empty figures or stop with an opaque error further down.
if not crypto_data:
    raise FileNotFoundError("No parquet files found in ../data/processed")

In [None]:
# 1. Price Series Visualization
# One line per symbol on a shared axis; closes are cast to float before plotting.
fig, ax = plt.subplots(figsize=(15, 10))
for symbol, prices in crypto_data.items():
    close_prices = prices['close'].astype(float)
    ax.plot(prices['timestamp'], close_prices, label=symbol)
ax.set_title('Crypto Price Series')
ax.set_xlabel('Time')
ax.set_ylabel('Price (USDT)')
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# 2. Returns Calculation and Correlation Analysis
# Per-symbol log returns: the first difference of log(close). The first row of
# each frame is NaN because it has no predecessor to difference against.
returns_data = {
    symbol: pd.DataFrame({
        'timestamp': df['timestamp'],
        'returns': np.log(df['close'].astype(float)).diff(),
    })
    for symbol, df in crypto_data.items()
}

In [None]:
# Create a combined returns dataframe (one column per symbol, indexed by timestamp).
# Each series is keyed by its own timestamps before joining, so rows line up by
# time rather than by row position; symbols missing a timestamp get NaN there,
# which DataFrame.corr() skips pairwise.
# NOTE(review): assumes timestamps are unique within each symbol -- confirm.
combined_returns = pd.concat(
    {
        symbol: returns.set_index('timestamp')['returns']
        for symbol, returns in returns_data.items()
    },
    axis=1,
)

In [None]:
# Plot correlation heatmap
# Pairwise correlation of the return columns, annotated to two decimals on a
# diverging palette centred at zero.
correlation_matrix = combined_returns.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='RdYlBu',
    center=0,
    ax=ax,
)
ax.set_title('Correlation Heatmap of Crypto Returns')
fig.tight_layout()
plt.show()

In [None]:
# 3. ACF and PACF Analysis
def _stem_with_confidence_band(ax, values, bound, title):
    """Draw ``values`` as a stem plot with a zero line and dashed +/-``bound`` guides."""
    ax.stem(range(len(values)), values)
    ax.axhline(y=0, linestyle='-', color='black')
    ax.axhline(y=-bound, linestyle='--', color='gray')
    ax.axhline(y=bound, linestyle='--', color='gray')
    ax.set_title(title)


def plot_acf_pacf(series, symbol, lags=50):
    """Plot the ACF and PACF of ``series`` as two stacked stem plots.

    Parameters
    ----------
    series : pandas.Series
        Values to analyse; NaNs are dropped before estimation.
    symbol : str
        Label used in the two panel titles.
    lags : int, default 50
        Requested number of lags. Reduced automatically for short series so
        that it stays below half the number of non-NaN observations.

    Raises
    ------
    ValueError
        If fewer than 4 non-NaN observations remain.
    """
    clean = series.dropna()
    n_obs = len(clean)
    if n_obs < 4:
        raise ValueError(
            f"Need at least 4 non-NaN observations to plot ACF/PACF for {symbol}, got {n_obs}"
        )

    # statsmodels' pacf refuses lags beyond about half the sample size, so cap
    # the request; both panels use the same cap to stay comparable.
    max_lags = min(lags, n_obs // 2 - 1)

    # Approximate 95% band under the white-noise null. It is based on the
    # number of observations actually used, not on the raw length with NaNs.
    bound = 1.96 / np.sqrt(n_obs)

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

    # ACF
    _stem_with_confidence_band(
        ax1, acf(clean, nlags=max_lags), bound,
        f'Autocorrelation Function for {symbol}',
    )

    # PACF
    _stem_with_confidence_band(
        ax2, pacf(clean, nlags=max_lags), bound,
        f'Partial Autocorrelation Function for {symbol}',
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Plot ACF and PACF for each crypto
# A text header is printed before each symbol's figure.
for symbol, symbol_returns in returns_data.items():
    print(f"\nAnalyzing {symbol}")
    plot_acf_pacf(symbol_returns['returns'], symbol)

In [None]:
# 4. Summary Statistics
# describe() output per symbol, extended with skewness and kurtosis rows, then
# assembled into one frame with a column per symbol.
stats_by_symbol = {}
for symbol, returns in returns_data.items():
    symbol_returns = returns['returns']
    stats = symbol_returns.describe()
    stats.loc['skewness'] = symbol_returns.skew()
    stats.loc['kurtosis'] = symbol_returns.kurtosis()
    stats_by_symbol[symbol] = stats
summary_stats = pd.DataFrame(stats_by_symbol)

In [None]:
# Heading and table in one call; sep="\n" puts the table on the line after the heading.
print("\nSummary Statistics for Returns:", summary_stats, sep="\n")

In [None]:
# 5. Volatility Analysis
# Rolling standard deviation of returns over a fixed window of rows, scaled by
# sqrt(window) to express it over the window's horizon.
VOL_WINDOW = 30  # rows per window; "30-minute" presumes 1-minute bars -- TODO confirm

fig, ax = plt.subplots(figsize=(15, 10))
for symbol, returns in returns_data.items():
    vol = returns['returns'].rolling(window=VOL_WINDOW).std() * np.sqrt(VOL_WINDOW)
    ax.plot(returns['timestamp'], vol, label=symbol)

# Decoration stays in the same cell as the drawing: the inline backend renders
# and closes the figure when a cell finishes, so title/labels/legend set in a
# later cell would be applied to a new, empty figure.
ax.set_title('30-Minute Rolling Volatility')
ax.set_xlabel('Time')
# The series is scaled by sqrt(window), not by an annualisation factor, so the
# axis is labelled with the horizon it actually represents.
ax.set_ylabel('Volatility (30-Minute Horizon)')
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()