In [2]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import stats
from statsmodels.graphics.tsaplots import plot_acf

# Put the parent of the current working directory at the front of the import
# path (presumably the project root, so local packages resolve - confirm).
sys.path[:0] = [str(Path.cwd().parent)]

# Input data and figure output locations, relative to the working directory.
DATA_DIR = Path('../data/processed')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(exist_ok=True)

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load a split file that was produced by this project's own pipeline.
with (DATA_DIR / 'train_test_split.pkl').open('rb') as f:
    data = pickle.load(f)

# Everything below works on the 'train' entry only, plus the list of tickers.
returns_df, TICKERS = data['train'], data['tickers']

In [4]:
print("SUMMARY STATISTICS")

# Output column label -> name of the DataFrame reduction that fills it.
# pandas' kurtosis() uses Fisher's definition, i.e. excess kurtosis
# (a normal distribution scores 0, not 3).
stat_methods = {
    'Mean': 'mean',
    'Std': 'std',
    'Min': 'min',
    'Max': 'max',
    'Skewness': 'skew',
    'Kurtosis': 'kurtosis',
}

# One row per column of returns_df, one column per statistic.
summary = pd.DataFrame({
    label: getattr(returns_df, method)()
    for label, method in stat_methods.items()
})

# Show the table in the notebook and persist it next to the processed data.
print(summary.to_string())
summary.to_csv(DATA_DIR / 'summary_statistics.csv')

SUMMARY STATISTICS
            Mean       Std       Min       Max  Skewness   Kurtosis
Ticker                                                             
AAPL    0.000679  0.016783 -0.060472  0.085237  0.060322   1.866926
AMZN    0.000334  0.022001 -0.151398  0.126949 -0.179520   5.141190
GOOGL   0.000756  0.019223 -0.099924  0.097348 -0.098439   3.084549
META    0.000765  0.028195 -0.306391  0.209308 -1.431237  28.536585
MSFT    0.000692  0.016258 -0.080296  0.079059 -0.110126   1.999852
NVDA    0.002246  0.032610 -0.105412  0.218088  0.383923   2.830875
TSLA    0.000773  0.037716 -0.131643  0.198187  0.098794   2.055923


In [6]:
# Figure 01: one time-series panel per ticker, laid out in a two-column grid.
# The grid is sized from TICKERS so the cell keeps working if the ticker list
# changes; with 7 tickers this is the same 4x2, 14x10 inch layout as before.
n_cols = 2
n_rows = max(1, -(-len(TICKERS) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 2.5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, ticker in zip(axes, TICKERS):
    ax.plot(returns_df.index, returns_df[ticker], linewidth=0.8, color='steelblue')
    # Faint zero line as a visual reference for positive vs. negative returns.
    ax.axhline(y=0, color='red', linestyle='--', linewidth=0.5, alpha=0.5)
    ax.set_title(f'{ticker} - Log Returns', fontsize=10, fontweight='bold')
    ax.set_ylabel('Log Return')
    ax.grid(True, alpha=0.3)

# Remove every panel that did not receive a ticker, rather than assuming
# that panel 7 is the only spare one.
for spare_ax in axes[len(TICKERS):]:
    spare_ax.remove()

fig.tight_layout()
fig.savefig(FIG_DIR / '01_returns_timeseries.png', dpi=300, bbox_inches='tight')
# Close this specific figure so it is neither displayed inline nor kept in memory.
plt.close(fig)

In [8]:
# Figure 02: per-ticker histogram of returns with a fitted normal density on top.
# The grid is sized from TICKERS so the cell keeps working if the ticker list
# changes; with 7 tickers this is the same 4x2, 14x10 inch layout as before.
n_cols = 2
n_rows = max(1, -(-len(TICKERS) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 2.5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, ticker in zip(axes, TICKERS):
    returns = returns_df[ticker]
    # density=True puts the histogram on the same scale as the pdf drawn below.
    ax.hist(returns, bins=50, density=True, alpha=0.6, color='steelblue', edgecolor='black')

    # Normal curve with the sample mean and standard deviation, evaluated
    # over the observed range of the data.
    mu, sigma = returns.mean(), returns.std()
    x = np.linspace(returns.min(), returns.max(), 100)
    ax.plot(x, stats.norm.pdf(x, mu, sigma), 'r-', linewidth=2, label='Normal')

    # pandas' kurtosis() is excess kurtosis, so 0 corresponds to a normal distribution.
    ax.set_title(f'{ticker} - Distribution (Kurtosis: {returns.kurtosis():.2f})', fontsize=10)
    ax.set_xlabel('Log Return')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

# Remove every panel that did not receive a ticker, rather than assuming
# that panel 7 is the only spare one.
for spare_ax in axes[len(TICKERS):]:
    spare_ax.remove()

fig.tight_layout()
fig.savefig(FIG_DIR / '02_distributions.png', dpi=300, bbox_inches='tight')
# Close this specific figure so it is neither displayed inline nor kept in memory.
plt.close(fig)

In [10]:
# Figure 03: autocorrelation of the returns themselves, used to judge whether
# an autoregressive mean component is needed.
# The grid is sized from TICKERS so the cell keeps working if the ticker list
# changes; with 7 tickers this is the same 4x2, 14x10 inch layout as before.
n_cols = 2
n_rows = max(1, -(-len(TICKERS) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 2.5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, ticker in zip(axes, TICKERS):
    plot_acf(returns_df[ticker], lags=20, ax=ax)
    # plot_acf sets its own title; override it so each panel names its ticker.
    ax.set_title(f'{ticker} - ACF of Returns', fontsize=10)
    ax.set_xlabel('Lag')

# Remove every panel that did not receive a ticker, rather than assuming
# that panel 7 is the only spare one.
for spare_ax in axes[len(TICKERS):]:
    spare_ax.remove()

fig.tight_layout()
fig.savefig(FIG_DIR / '03_acf_returns.png', dpi=300, bbox_inches='tight')
# Close this specific figure so it is neither displayed inline nor kept in memory.
plt.close(fig)

In [12]:
# Figure 04: autocorrelation of squared returns, used to judge whether a
# GARCH-type volatility model is needed.
# The grid is sized from TICKERS so the cell keeps working if the ticker list
# changes; with 7 tickers this is the same 4x2, 14x10 inch layout as before.
n_cols = 2
n_rows = max(1, -(-len(TICKERS) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 2.5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, ticker in zip(axes, TICKERS):
    squared_returns = returns_df[ticker] ** 2
    plot_acf(squared_returns, lags=20, ax=ax)
    # plot_acf sets its own title; override it so each panel names its ticker.
    ax.set_title(f'{ticker} - ACF of Squared Returns', fontsize=10)
    ax.set_xlabel('Lag')

# Remove every panel that did not receive a ticker, rather than assuming
# that panel 7 is the only spare one.
for spare_ax in axes[len(TICKERS):]:
    spare_ax.remove()

fig.tight_layout()
fig.savefig(FIG_DIR / '04_acf_squared_returns.png', dpi=300, bbox_inches='tight')
# Close this specific figure so it is neither displayed inline nor kept in memory.
plt.close(fig)

In [14]:
# Figure 05: rolling standard deviation of returns per ticker.
# The grid is sized from TICKERS so the cell keeps working if the ticker list
# changes; with 7 tickers this is the same 4x2, 14x10 inch layout as before.
vol_window = 60  # number of observations per rolling window (also shown in the titles)
n_cols = 2
n_rows = max(1, -(-len(TICKERS) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 2.5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, ticker in zip(axes, TICKERS):
    # The first (vol_window - 1) values are NaN, so each line starts late.
    rolling_vol = returns_df[ticker].rolling(window=vol_window).std()
    ax.plot(rolling_vol.index, rolling_vol, linewidth=1.5, color='darkgreen')
    ax.set_title(f'{ticker} - {vol_window}-Day Rolling Volatility', fontsize=10)
    ax.set_ylabel('Volatility')
    ax.grid(True, alpha=0.3)

# Remove every panel that did not receive a ticker, rather than assuming
# that panel 7 is the only spare one.
for spare_ax in axes[len(TICKERS):]:
    spare_ax.remove()

fig.tight_layout()
fig.savefig(FIG_DIR / '05_rolling_volatility.png', dpi=300, bbox_inches='tight')
# Close this specific figure so it is neither displayed inline nor kept in memory.
plt.close(fig)

In [16]:
# Pairwise correlation between the columns of returns_df
# (DataFrame.corr uses Pearson correlation unless told otherwise).
corr_matrix = returns_df.corr()

# Print the full table under a banner and persist it next to the processed data.
banner = "="*80
print("\n" + banner)
print("CORRELATION MATRIX")
print(banner)
print(corr_matrix.to_string())
corr_matrix.to_csv(DATA_DIR / 'correlation_matrix.csv')

# Figure 06: annotated heatmap of the same matrix, with the colour scale centred at 0.
heatmap_style = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, **heatmap_style)
ax.set_title('Correlation Matrix', fontsize=12, fontweight='bold')
fig.tight_layout()
fig.savefig(FIG_DIR / '06_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.close(fig)


CORRELATION MATRIX
Ticker      AAPL      AMZN     GOOGL      META      MSFT      NVDA      TSLA
Ticker                                                                      
AAPL    1.000000  0.560963  0.602766  0.486219  0.674799  0.545470  0.483848
AMZN    0.560963  1.000000  0.641713  0.584590  0.673793  0.560654  0.424995
GOOGL   0.602766  0.641713  1.000000  0.589152  0.697717  0.542436  0.378027
META    0.486219  0.584590  0.589152  1.000000  0.572446  0.486398  0.308455
MSFT    0.674799  0.673793  0.697717  0.572446  1.000000  0.629779  0.409270
NVDA    0.545470  0.560654  0.542436  0.486398  0.629779  1.000000  0.458799
TSLA    0.483848  0.424995  0.378027  0.308455  0.409270  0.458799  1.000000


In [24]:
print("\n" + "="*80)
print("ARCH TEST (Test if GARCH needed)")
print("="*80)

from statsmodels.stats.diagnostic import het_arch

# Engle's ARCH-LM test per ticker: a small p-value rejects "no ARCH effects",
# which is read here as evidence that a GARCH-type volatility model is needed.
ARCH_LAGS = 5      # number of lags included in the test
ARCH_ALPHA = 0.05  # significance level for the "GARCH needed" verdict

# NOTE(review): the test is run on raw returns, not on residuals of a fitted
# mean model - confirm this is intended.
arch_results = {}
for ticker in TICKERS:
    # het_arch returns (lm_stat, lm_pvalue, f_stat, f_pvalue); only the F-test
    # p-value is used. The lag count is passed as `nlags`: the old `maxlag`
    # spelling is deprecated and not part of the current signature.
    f_pvalue = het_arch(returns_df[ticker], nlags=ARCH_LAGS)[3]
    needs_garch = f_pvalue < ARCH_ALPHA
    arch_results[ticker] = {'p_value': f_pvalue, 'significant': needs_garch}
    status = "✓ GARCH needed" if needs_garch else "✗ No GARCH"
    print(f"{ticker}: p-value = {f_pvalue:.4f} {status}")


ARCH TEST (Test if GARCH needed)
AAPL: p-value = 0.0000 ✓ GARCH needed
MSFT: p-value = 0.0016 ✓ GARCH needed
GOOGL: p-value = 0.0106 ✓ GARCH needed
AMZN: p-value = 0.0000 ✓ GARCH needed
META: p-value = 0.8494 ✗ No GARCH
NVDA: p-value = 0.2719 ✗ No GARCH
TSLA: p-value = 0.1793 ✗ No GARCH


  lm_stat, lm_pvalue, f_stat, f_pvalue = het_arch(returns_df[ticker], maxlag=5)
