# Stock Clustering Exploratory Analysis

This notebook demonstrates the exploratory data analysis and clustering process for stock price data.

## Setup and Imports

In [None]:
# Install required packages if needed (%pip installs into the running kernel's environment)
# %pip install -r requirements.txt

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Make the project importable from this notebook.
# The project imports in this cell use the package-qualified form
# `from src.<module> import ...`, which only resolves when the directory that
# CONTAINS `src/` is on sys.path; with only `<root>/src` on the path Python
# would look for `<root>/src/src/<module>`.
# Assumes the kernel's working directory is the notebook's own folder, one
# level below the project root (same assumption the relative "../data" and
# "../reports" paths in this notebook make).
PROJECT_ROOT = Path().absolute().parent
# `<root>/src` is kept as well so anything relying on flat module imports
# continues to work; the root is inserted last so it ends up first on the path.
for _import_dir in (PROJECT_ROOT / "src", PROJECT_ROOT):
    _import_dir = str(_import_dir)
    if _import_dir not in sys.path:  # idempotent: re-running the cell adds nothing
        sys.path.insert(0, _import_dir)

from src.data_fetcher import DataFetcher
from src.feature_extractor import FeatureExtractor
from src.clustering import StockClustering
from src.visualizer import ClusterVisualizer

# Plot appearance: matplotlib's 'default' style sheet and seaborn's "husl" palette
plt.style.use("default")
sns.set_palette(palette="husl")

# Reaching this line means every import and style call above ran without raising
print("Libraries imported successfully!")

## 1. Data Fetching Exploration

In [None]:
# Fetcher configured with the raw-data directory (relative to this notebook) as its cache_dir
data_fetcher = DataFetcher(cache_dir="../data/raw")

# Small sample of ticker symbols for a quick exploratory run
test_symbols = ["AAPL", "MSFT", "GOOGL", "TSLA", "AMZN"]

print("Fetching data for test symbols...")
# 2 years of history for faster testing; symbol validation is requested explicitly
fetch_options = {"period": "2y", "validate_symbols": True}
stock_data_dict = data_fetcher.fetch_multiple_stocks_data(test_symbols, **fetch_options)

# Report how many symbols came back and which ones
fetched_symbols = list(stock_data_dict.keys())
print(f"Successfully fetched data for {len(stock_data_dict)} symbols")
print(f"Symbols: {fetched_symbols}")

In [None]:
# Merge the per-symbol results into a single frame and take a first look at it
if not stock_data_dict:
    print("No data fetched to analyze")
else:
    combined_data = data_fetcher.combine_all_data(stock_data_dict)
    print(f"Combined data shape: {combined_data.shape}")
    print(f"Columns: {list(combined_data.columns)}")

    # Span covered by the 'date' column
    date_values = combined_data['date']
    print(f"Date range: {date_values.min()} to {date_values.max()}")

    # Leading rows, rendered with the notebook's rich display
    display(combined_data.head())

    # Summary statistics from describe()
    print("\nBasic statistics:")
    display(combined_data.describe())

## 2. Feature Extraction Exploration

In [None]:
if stock_data_dict:
    # Build the extractor and derive features from the combined frame
    feature_extractor = FeatureExtractor()

    print("Extracting features...")
    features_with_data = feature_extractor.extract_features_for_clustering(combined_data)
    print(f"Features extracted. Shape: {features_with_data.shape}")

    # Any column other than the identifier/price/volume columns listed here is
    # reported as an extracted feature
    base_columns = ('symbol', 'date', 'open', 'high', 'low', 'close', 'volume')
    feature_cols = [name for name in features_with_data.columns if name not in base_columns]
    print(f"Feature columns: {feature_cols}")

    # Preview: identifiers plus the first ten feature columns
    print("\nSample of extracted features:")
    preview_columns = ['symbol', 'date'] + feature_cols[:10]
    display(features_with_data[preview_columns].head())

## 3. Fluctuation Analysis

In [None]:
if stock_data_dict:
    # Fluctuation statistics per symbol, using 30 and 70 as the percentage bounds
    fluctuation_bounds = {"min_pct": 30, "max_pct": 70}
    fluctuation_stats = feature_extractor.count_fluctuation_cycles(combined_data, **fluctuation_bounds)

    print("Fluctuation Analysis (30-70% ranges):")
    # One short report per symbol: count, average period, yearly frequency
    for ticker, ticker_stats in fluctuation_stats.items():
        print(f"\n{ticker}:")
        print(f"  Fluctuation count: {ticker_stats['fluctuation_count']}")
        print(f"  Average period: {ticker_stats['avg_fluctuation_period']:.1f} days")
        print(f"  Frequency: {ticker_stats['fluctuation_frequency']:.2f} per year")

## 4. Feature Matrix Creation

In [None]:
if stock_data_dict:
    # Build the feature matrix used for clustering (features_per_symbol=True)
    feature_matrix = feature_extractor.create_feature_matrix(features_with_data, features_per_symbol=True)

    matrix_columns = list(feature_matrix.columns)
    print(f"Feature matrix shape: {feature_matrix.shape}")
    print(f"Feature matrix columns: {matrix_columns}")

    # The preparation step returns a pair: the clustering-ready features and the scaler
    prepared = feature_extractor.prepare_features_for_clustering(feature_matrix)
    clustering_features, scaler = prepared

    print(f"Clustering features shape: {clustering_features.shape}")

    # Leading rows of the feature matrix
    display(feature_matrix.head())

## 5. Clustering Analysis

In [None]:
if stock_data_dict and len(feature_matrix) > 2:
    # Analyzer limited to 5 clusters -- smaller for testing
    clustering_analyzer = StockClustering(max_clusters=5)

    print("Finding optimal number of clusters...")
    # Candidate cluster counts: 2 through 5, always fewer than the number of matrix rows
    candidate_counts = range(2, min(6, len(feature_matrix)))
    cluster_results = clustering_analyzer.find_optimal_clusters(
        clustering_features,
        cluster_range=candidate_counts,
    )

    # One line of quality metrics per candidate cluster count
    print("\nCluster evaluation results:")
    for n_clusters, metrics in cluster_results.items():
        silhouette = metrics['silhouette_score']
        calinski_harabasz = metrics['calinski_harabasz_score']
        davies_bouldin = metrics['davies_bouldin_score']
        print(f"{n_clusters} clusters: Silhouette={silhouette:.3f}, "
              f"Calinski-Harabasz={calinski_harabasz:.1f}, "
              f"Davies-Bouldin={davies_bouldin:.3f}")

    # Let the analyzer pick the cluster count from the evaluation results
    best_n_clusters = clustering_analyzer.select_best_n_clusters(cluster_results)
    print(f"\nSelected optimal clusters: {best_n_clusters}")

    # Cluster with the chosen count; auto_optimize is off because the count is already selected
    cluster_labels = clustering_analyzer.perform_clustering(
        clustering_features, n_clusters=best_n_clusters, auto_optimize=False
    )
    print(f"\nClustering completed. Labels: {cluster_labels}")

    # Per-cluster statistics computed from the feature matrix and the labels
    cluster_analysis = clustering_analyzer.analyze_clusters(feature_matrix, cluster_labels)

    print("\nCluster analysis:")
    for cluster_id, cluster_stats in cluster_analysis.items():
        member_symbols = ', '.join(cluster_stats.get('symbols', []))
        print(f"\nCluster {cluster_id}:")
        print(f"  Size: {cluster_stats['size']} stocks")
        print(f"  Symbols: {member_symbols}")
        # Missing statistics fall back to 0 so the report line still prints
        print(f"  Avg volatility: {cluster_stats.get('volatility_252d_mean', 0):.3f}")
        print(f"  Avg return: {cluster_stats.get('return_mean', 0):.4f}")

## 6. Visualization

In [None]:
if stock_data_dict and len(feature_matrix) > 2:
    # Visualizer configured with the reports directory (relative to this notebook) as its output_dir
    visualizer = ClusterVisualizer(output_dir="../reports")

    # Cluster assignments derived from the feature matrix
    cluster_assignments = clustering_analyzer.get_cluster_assignments(feature_matrix)

    # Two-component PCA projection for plotting; the second return value is not used
    features_2d, _ = clustering_analyzer.reduce_dimensions(clustering_features, method='pca', n_components=2)

    # Descriptive names for the clusters
    descriptive_labels = clustering_analyzer.generate_cluster_labels(cluster_analysis)
    print("\nDescriptive cluster labels:")
    for cluster_id, cluster_name in descriptive_labels.items():
        print(f"  Cluster {cluster_id}: {cluster_name}")

    # Quality metrics for the final labelling, passed on to the report
    clustering_metrics = clustering_analyzer.evaluate_clustering_quality(clustering_features, cluster_labels)

    # Build the full report; the result maps plot names to their paths
    report_inputs = dict(
        features_2d=features_2d,
        cluster_labels=cluster_labels,
        cluster_assignments=cluster_assignments,
        cluster_analysis=cluster_analysis,
        clustering_metrics=clustering_metrics,
        stock_data=combined_data,
        method='PCA',
    )
    plots_created = visualizer.create_comprehensive_report(**report_inputs)

    print("\nVisualizations created:")
    for plot_name, plot_path in plots_created.items():
        print(f"  {plot_name}: {plot_path}")

## 7. Summary and Insights

In [None]:
# Final recap of everything computed in the cells above
if not stock_data_dict:
    print("No data available for analysis")
else:
    print("=== STOCK CLUSTERING ANALYSIS SUMMARY ===")

    # --- Data overview ---
    print("\nData Overview:")
    n_symbols = len(stock_data_dict)
    print(f"  - Symbols analyzed: {n_symbols}")
    print(f"  - Data points per symbol: {len(combined_data) // n_symbols}")
    first_day = combined_data['date'].min().strftime('%Y-%m-%d')
    last_day = combined_data['date'].max().strftime('%Y-%m-%d')
    print(f"  - Date range: {first_day} to {last_day}")

    # --- Feature engineering ---
    print("\nFeature Engineering:")
    # Columns other than the identifier/price/volume ones count as extracted features
    base_columns = ('symbol', 'date', 'open', 'high', 'low', 'close', 'volume')
    n_features = sum(1 for name in features_with_data.columns if name not in base_columns)
    print(f"  - Total features extracted: {n_features}")
    print(f"  - Feature matrix shape: {feature_matrix.shape}")

    # --- Clustering (same size guard as the clustering and visualization cells) ---
    if len(feature_matrix) > 2:
        print("\nClustering Results:")
        print(f"  - Optimal clusters found: {best_n_clusters}")
        print("  - Clustering quality metrics calculated")

        for cluster_id, cluster_stats in cluster_analysis.items():
            # Use a generic name when no descriptive label exists for this cluster
            cluster_name = descriptive_labels.get(cluster_id, f"Cluster {cluster_id}")
            print(f"  - {cluster_name}: {cluster_stats['size']} stocks")

    # --- Fluctuation analysis ---
    print("\nFluctuation Analysis (30-70% ranges):")
    for ticker, ticker_stats in fluctuation_stats.items():
        # The average period is only reported for symbols with at least one counted cycle
        summary_line = (
            f"  - {ticker}: {ticker_stats['fluctuation_count']} cycles, avg period {ticker_stats['avg_fluctuation_period']:.0f} days"
            if ticker_stats['fluctuation_count'] > 0
            else f"  - {ticker}: No significant fluctuations detected"
        )
        print(summary_line)

    # --- Outputs ---
    print("\nOutputs Generated:")
    # plots_created exists only if the visualization cell actually ran
    if 'plots_created' in locals():
        for plot_name, plot_path in plots_created.items():
            print(f"  - {plot_name}: {plot_path}")