In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pywt
from scipy.stats import kurtosis

# === PATH SETTINGS ===
# Machine-specific absolute locations; edit both when running elsewhere.
# base_dir: parent folder that is joined with a window sub-folder and
# "label.csv" when the data is loaded below.
base_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\splits_SP500"
# output_dir: folder that receives every figure and the text summary.
output_dir = r"C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\EDA"
# Create the output folder (and any missing parents) up front; exist_ok=True
# makes re-running the cell harmless when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

# === LOAD DATA (choose one representative window) ===
# Name of the window sub-folder inside base_dir that this cell analyses.
subset_folder = "SP500_2015-10-05_2018-06-03"  # change if needed
label_path = os.path.join(base_dir, subset_folder, "label.csv")

# The first CSV column becomes the index (parsed as dates). Every missing cell
# is then replaced with 0, so the array extracted below has no missing values.
df = pd.read_csv(label_path, index_col=0, parse_dates=True).fillna(0)
returns = df.values

# === NORMALIZE RETURNS ===
# Z-score with ONE mean and ONE standard deviation taken over every cell of
# the matrix (not per column), so relative scale between columns is preserved.
global_mean = np.mean(returns)
global_std = np.std(returns)
returns_norm = (returns - global_mean) / global_std

# === BASIC STATS ===
# Columns are the variables (rowvar=False), giving a column-by-column matrix.
corr_matrix = np.corrcoef(returns_norm, rowvar=False)
# Average only the strictly-upper triangle: each pair once, diagonal excluded.
upper_rows, upper_cols = np.triu_indices_from(corr_matrix, k=1)
avg_corr = np.mean(corr_matrix[upper_rows, upper_cols])
# Fisher definition: a normal distribution scores 0. All cells are pooled
# into a single one-dimensional sample.
excess_kurt = kurtosis(returns_norm.ravel(), fisher=True)

# === PRINT SUMMARY ===
# Format each statistic once so the console and the file show the same text.
corr_line = f"Average cross-sectional correlation: {avg_corr:.4f}"
kurt_line = f"Excess kurtosis of normalized returns: {excess_kurt:.4f}"
for summary_line in (corr_line, kurt_line):
    print(summary_line)

# === SAVE SUMMARY TO TEXT FILE ===
# Overwrites any previous summary in the output folder.
summary_path = os.path.join(output_dir, "eda_summary.txt")
with open(summary_path, "w") as f:
    f.writelines(
        [
            "Exploratory Data Analysis Summary\n",
            "--------------------------------\n",
            corr_line + "\n",
            kurt_line + "\n",
        ]
    )

# === FEATURE CORRELATION HEATMAP ===
# Explicit figure/axes handles are used instead of pyplot's implicit
# "current figure"; the rendered image is the same.
heat_fig = plt.figure(figsize=(8, 6))
heat_ax = heat_fig.add_subplot(1, 1, 1)
# Fixed colour limits of [-1, 1] keep the diverging colormap centred on zero.
im = heat_ax.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
heat_ax.set_title("Feature Correlation Heatmap (Representative Window)", fontsize=12, pad=12)
heat_fig.colorbar(im, ax=heat_ax, label="Correlation")
heat_ax.set_xlabel("Feature Index")
heat_ax.set_ylabel("Feature Index")
heat_fig.tight_layout()
heatmap_path = os.path.join(output_dir, "feature_correlation_heatmap.png")
heat_fig.savefig(heatmap_path, dpi=300, bbox_inches='tight')
# Close the figure so it is released rather than kept open by pyplot.
plt.close(heat_fig)

# === WAVELET DECOMPOSITION EXAMPLE ===
# Decompose one z-scored series with a discrete wavelet transform and plot the
# approximation band followed by every detail band, one panel each.
example_series = df.iloc[:, 0].values  # first stock
example_series = (example_series - np.mean(example_series)) / np.std(example_series)

wavelet = 'db4'
level = 3
# pywt.wavedec returns [cA_n, cD_n, cD_(n-1), ..., cD_1]: the approximation
# first, then the details ordered from the COARSEST level (n) down to the
# finest level (1).
coeffs = pywt.wavedec(example_series, wavelet, level=level)
A = coeffs[0]
D = coeffs[1:]

fig, axes = plt.subplots(level + 1, 1, figsize=(10, 6))
axes[0].plot(A, lw=1)
axes[0].set_title("Approximation (Low-Frequency Component)", fontsize=11)
for i, d in enumerate(D):
    # D[0] holds the level-`level` detail and D[-1] the level-1 detail, so the
    # level shown in the title must count down, not up.
    detail_level = level - i
    axes[i + 1].plot(d, lw=1)
    axes[i + 1].set_title(f"Detail Level {detail_level} (High-Frequency Component)", fontsize=11)
for ax in axes:
    ax.set_xlabel("Time", fontsize=9)
    ax.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, "wavelet_decomposition_example.png"), dpi=300, bbox_inches='tight')
plt.close()

# === DISTRIBUTION OF NORMALIZED RETURNS ===
# Histogram of every normalized return pooled into one sample, scaled so the
# bars integrate to 1 (density=True).
hist_fig = plt.figure(figsize=(8, 5))
hist_ax = hist_fig.add_subplot(1, 1, 1)
pooled_returns = returns_norm.flatten()
hist_ax.hist(pooled_returns, bins=80, density=True, alpha=0.7)
hist_ax.set_title("Distribution of Normalized Returns", fontsize=12)
hist_ax.set_xlabel("Normalized Return")
hist_ax.set_ylabel("Density")
hist_ax.grid(alpha=0.3)
hist_fig.tight_layout()
hist_path = os.path.join(output_dir, "normalized_return_distribution.png")
hist_fig.savefig(hist_path, dpi=300, bbox_inches='tight')
# Close the figure so it is released rather than kept open by pyplot.
plt.close(hist_fig)

# === REPORT COMPLETION ===
# Tell the user where the artefacts went, then list them one per line.
print(f"\nEDA complete. Files saved in:\n{output_dir}")
print("\nSaved files:")
saved_names = (
    "feature_correlation_heatmap.png",
    "wavelet_decomposition_example.png",
    "normalized_return_distribution.png",
    "eda_summary.txt",
)
for saved_name in saved_names:
    print(f" - {saved_name}")


Average cross-sectional correlation: 0.2347
Excess kurtosis of normalized returns: 25.4657

EDA complete. Files saved in:
C:\Users\ns243\Documents\Academic\AI Master\Internship\Data\EDA

Saved files:
 - feature_correlation_heatmap.png
 - wavelet_decomposition_example.png
 - normalized_return_distribution.png
 - eda_summary.txt
