In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os

# Silence library warnings so the notebook output stays readable.
# NOTE: this is a blanket filter, so deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

# Set plot style
try:
    plt.style.use('ggplot')  # Use 'ggplot' style for modern look
except OSError:
    # matplotlib signals an unknown/unavailable style with OSError; any other
    # exception (including KeyboardInterrupt) is a real problem and propagates.
    plt.style.use('default')  # Fallback to default if ggplot is unavailable
sns.set()  # Apply Seaborn's aesthetic enhancements

# Load the data
df = pd.read_csv('../data/daily_stock_price/sp500_top25_technical_indicators.csv')

# Convert date to datetime
df['date'] = pd.to_datetime(df['date'])

# Set up output directory for plots.
# exist_ok=True replaces the exists()-then-makedirs() pair, which could fail
# if the directory appeared between the check and the creation.
OUTPUT_DIR = 'eda_plots'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1. Data Quality Checks
def check_data_quality(df):
    """Print a basic data-quality report for ``df`` to stdout.

    The report lists, in order: the frame's shape, the null count of every
    column that has at least one null, the number of fully duplicated rows,
    the column dtypes, and the minimum and maximum of the ``date`` column.

    Args:
        df: DataFrame to inspect. It must contain a ``date`` column.

    Returns:
        None. Everything is emitted with ``print``.
    """
    print("=== Data Quality Checks ===")
    print(f"Shape: {df.shape}")

    # Null counts, restricted to the columns that actually have gaps
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts.loc[null_counts > 0]
    print("\nMissing Values:")
    print(columns_with_nulls)

    # Rows that are exact copies of an earlier row
    duplicate_count = df.duplicated().sum()
    print(f"\nNumber of duplicate rows: {duplicate_count}")

    # Column dtypes
    print("\nData Types:")
    print(df.dtypes)

    # Span covered by the date column
    date_column = df['date']
    first_date = date_column.min()
    last_date = date_column.max()
    print(f"\nDate Range: {first_date} to {last_date}")

check_data_quality(df)

# 2. Summary Statistics
def summary_statistics(df):
    """Print descriptive statistics for ``df`` to stdout.

    Two tables are printed: ``describe()`` over every numeric column, and
    the mean / std / min / max of ``Close`` grouped by ``symbol``.

    Args:
        df: DataFrame with at least one numeric column plus ``symbol`` and
            ``Close`` columns.

    Returns:
        None. Both tables are emitted with ``print``.
    """
    print("\n=== Summary Statistics ===")
    numeric_frame = df.select_dtypes(include=[np.number])
    print(numeric_frame.describe())

    # Per symbol summary
    print("\n=== Per Symbol Summary ===")
    close_by_symbol = df.groupby('symbol')['Close']
    print(close_by_symbol.agg(['mean', 'std', 'min', 'max']))

summary_statistics(df)

# 3. Time Series Visualization (for all symbols)
def plot_time_series(df):
    """Save one price chart per symbol into ``OUTPUT_DIR``.

    For every distinct value of ``symbol`` the chart shows ``Close`` and
    ``SMA_20`` as lines and shades the area between ``BB_Upper`` and
    ``BB_Lower``. Each chart is written to
    ``<OUTPUT_DIR>/<symbol>_time_series.png`` and its figure is closed.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``Close``, ``SMA_20``,
            ``BB_Upper`` and ``BB_Lower`` columns.

    Returns:
        None.
    """
    for ticker in df['symbol'].unique():
        fig, ax = plt.subplots(figsize=(12, 6))
        rows = df.loc[df['symbol'] == ticker]
        dates = rows['date']

        # Price line, moving average, then the shaded band between the bands
        ax.plot(dates, rows['Close'], label='Close Price')
        ax.plot(dates, rows['SMA_20'], label='SMA 20', alpha=0.7)
        ax.fill_between(
            dates,
            rows['BB_Upper'],
            rows['BB_Lower'],
            alpha=0.1,
            label='Bollinger Bands',
        )

        ax.set_title(f'{ticker} Close Price and Technical Indicators')
        ax.set_xlabel('Date')
        ax.set_ylabel('Price (USD)')
        ax.legend()
        ax.grid(True)

        fig.savefig(f'{OUTPUT_DIR}/{ticker}_time_series.png', dpi=300, bbox_inches='tight')
        # Close explicitly so figures do not accumulate across symbols
        plt.close(fig)

plot_time_series(df)

# 4. Correlation Analysis (for all symbols + overall)
def _unique_pair_correlations(corr_matrix):
    """Return every off-diagonal correlation exactly once, sorted ascending."""
    # Strict upper triangle (k=1): skips the diagonal (self-correlation) and
    # the mirrored lower half, so each column pair appears a single time.
    row_idx, col_idx = np.triu_indices(len(corr_matrix), k=1)
    pair_index = pd.MultiIndex.from_arrays(
        [corr_matrix.index[row_idx], corr_matrix.columns[col_idx]]
    )
    pairs = pd.Series(corr_matrix.to_numpy()[row_idx, col_idx], index=pair_index)
    # Undefined correlations (NaN) cannot be ranked, so drop them first.
    return pairs.dropna().sort_values()


def _save_correlation_heatmap(corr_matrix, title, path):
    """Draw ``corr_matrix`` as a heatmap titled ``title`` and save it to ``path``."""
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', center=0)
    plt.title(title)
    plt.savefig(path, dpi=300, bbox_inches='tight')
    plt.close()


def correlation_analysis(df):
    """Save correlation heatmaps and print the strongest column pairs.

    Steps:
      1. Correlate all numeric columns of ``df`` and save the heatmap to
         ``<OUTPUT_DIR>/overall_correlation_matrix.png``.
      2. Print the five highest and five lowest pairwise correlations.
         Each column pair is listed once: the diagonal and the mirrored
         half of the symmetric matrix are excluded by position, so
         perfectly (anti-)correlated pairs of distinct columns are kept.
      3. For every distinct ``symbol``, save a heatmap of that symbol's
         rows to ``<OUTPUT_DIR>/<symbol>_correlation_matrix.png``.

    Args:
        df: DataFrame with a ``symbol`` column and numeric feature columns.

    Returns:
        None.
    """
    # Overall correlation for the dataset
    print("\n=== Overall Correlation Analysis ===")
    corr_matrix = df.select_dtypes(include=[np.number]).corr()
    _save_correlation_heatmap(
        corr_matrix,
        'Overall Correlation Matrix of Numeric Features',
        f'{OUTPUT_DIR}/overall_correlation_matrix.png',
    )

    # Print top correlations (ascending order, so the strongest positive
    # pair is the last row of the first table).
    sorted_corr = _unique_pair_correlations(corr_matrix)
    print("\n=== Top 5 Positive Correlations ===")
    print(sorted_corr.tail())

    print("\n=== Top 5 Negative Correlations ===")
    print(sorted_corr.head())

    # Per-symbol correlation heatmaps
    for symbol in df['symbol'].unique():
        symbol_df = df[df['symbol'] == symbol]
        symbol_corr = symbol_df.select_dtypes(include=[np.number]).corr()
        _save_correlation_heatmap(
            symbol_corr,
            f'Correlation Matrix for {symbol}',
            f'{OUTPUT_DIR}/{symbol}_correlation_matrix.png',
        )

correlation_analysis(df)

# 5. Distribution Analysis
def distribution_analysis(df):
    """Save histograms of ``RSI_14`` and ``Daily_Return`` into ``OUTPUT_DIR``.

    Two files are written:
      * ``rsi_distribution.png`` - overlaid ``RSI_14`` histograms for the
        first five distinct values of ``symbol`` (in order of appearance).
      * ``daily_return_distribution.png`` - a 100-bin histogram of all
        non-null ``Daily_Return`` values.

    Args:
        df: DataFrame with ``symbol``, ``RSI_14`` and ``Daily_Return``
            columns.

    Returns:
        None.
    """
    # RSI Distribution
    rsi_fig, rsi_ax = plt.subplots(figsize=(10, 6))
    first_five = df['symbol'].unique()[:5]  # Limit to 5 symbols for clarity
    for ticker in first_five:
        rsi_values = df.loc[df['symbol'] == ticker, 'RSI_14']
        sns.histplot(rsi_values, label=ticker, alpha=0.5, ax=rsi_ax)

    rsi_ax.set_title('RSI Distribution for Top 5 Symbols')
    rsi_ax.set_xlabel('RSI (14-day)')
    rsi_ax.set_ylabel('Count')
    rsi_ax.legend()
    rsi_fig.savefig(f'{OUTPUT_DIR}/rsi_distribution.png', dpi=300, bbox_inches='tight')
    plt.close(rsi_fig)

    # Daily Return Distribution
    ret_fig, ret_ax = plt.subplots(figsize=(10, 6))
    returns = df['Daily_Return'].dropna()
    sns.histplot(returns, bins=100, ax=ret_ax)
    ret_ax.set_title('Distribution of Daily Returns (All Symbols)')
    ret_ax.set_xlabel('Daily Return')
    ret_ax.set_ylabel('Count')
    ret_fig.savefig(f'{OUTPUT_DIR}/daily_return_distribution.png', dpi=300, bbox_inches='tight')
    plt.close(ret_fig)

distribution_analysis(df)

# 6. Volatility Analysis
def volatility_analysis(df):
    """Save a bar chart of mean ``Volatility_20`` per symbol.

    The per-symbol means are sorted ascending and drawn as bars; the chart
    is written to ``<OUTPUT_DIR>/volatility_by_symbol.png``.

    Args:
        df: DataFrame with ``symbol`` and ``Volatility_20`` columns.

    Returns:
        None.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    per_symbol = df.groupby('symbol')['Volatility_20']
    mean_volatility = per_symbol.mean().sort_values()
    mean_volatility.plot(kind='bar', ax=ax)

    ax.set_title('Average 20-day Volatility by Symbol')
    ax.set_xlabel('Symbol')
    ax.set_ylabel('Average Volatility (USD)')
    # Tilt the symbol labels; applies to the figure created above, which is
    # still the current one.
    plt.xticks(rotation=45)

    fig.savefig(f'{OUTPUT_DIR}/volatility_by_symbol.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

volatility_analysis(df)

# 7. Box Plot of Key Indicators
def box_plot_indicators(df):
    """Save a box plot comparing ``RSI_14``, ``MACD`` and ``ATR_14``.

    The three columns are reshaped to long form (one row per observation
    and indicator) and drawn as one box per indicator on a shared y-axis.
    The chart is written to ``<OUTPUT_DIR>/indicator_boxplot.png``.

    Args:
        df: DataFrame with ``symbol``, ``RSI_14``, ``MACD`` and ``ATR_14``
            columns.

    Returns:
        None.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Wide -> long so seaborn can group the values by indicator name
    long_form = df.melt(
        id_vars=['symbol'],
        value_vars=['RSI_14', 'MACD', 'ATR_14'],
        var_name='Indicator',
        value_name='Value',
    )

    sns.boxplot(data=long_form, x='Indicator', y='Value', ax=ax)
    ax.set_title('Distribution of Key Technical Indicators')
    fig.savefig(f'{OUTPUT_DIR}/indicator_boxplot.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

box_plot_indicators(df)

# 8. Rolling Statistics
def rolling_statistics(df, symbol='AAPL'):
    """Save a two-panel chart of 20-row rolling statistics for one symbol.

    Top panel: ``Close``, its 20-row rolling mean, and a shaded band of
    one rolling standard deviation either side of that mean.
    Bottom panel: the 20-row rolling standard deviation of
    ``Daily_Return``.
    The chart is written to ``<OUTPUT_DIR>/<symbol>_rolling_stats.png``.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``Close`` and
            ``Daily_Return`` columns.
        symbol: Value of the ``symbol`` column to plot. Defaults to
            ``'AAPL'``.

    Returns:
        None.
    """
    symbol_df = df.loc[df['symbol'] == symbol].set_index('date')

    fig, (price_ax, vol_ax) = plt.subplots(2, 1, figsize=(12, 8))

    # --- Top panel: price with rolling mean and +/- 1 std band ---
    close = symbol_df['Close']
    rolling_mean = close.rolling(window=20).mean()
    rolling_std = close.rolling(window=20).std()
    lower_band = rolling_mean - rolling_std
    upper_band = rolling_mean + rolling_std

    price_ax.plot(symbol_df.index, close, label='Close Price')
    price_ax.plot(rolling_mean.index, rolling_mean, label='20-day Rolling Mean')
    price_ax.fill_between(
        rolling_std.index,
        lower_band,
        upper_band,
        alpha=0.1,
        label='±1 Std Dev',
    )
    price_ax.set_title(f'{symbol} Rolling Statistics')
    price_ax.set_xlabel('Date')
    price_ax.set_ylabel('Price (USD)')
    price_ax.legend()

    # --- Bottom panel: rolling volatility of daily returns ---
    return_volatility = symbol_df['Daily_Return'].rolling(window=20).std()
    vol_ax.plot(symbol_df.index, return_volatility, label='20-day Return Volatility')
    vol_ax.set_xlabel('Date')
    vol_ax.set_ylabel('Volatility')
    vol_ax.legend()

    fig.tight_layout()
    fig.savefig(f'{OUTPUT_DIR}/{symbol}_rolling_stats.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

rolling_statistics(df)

# 9. MACD Analysis
def macd_analysis(df, symbol='AAPL'):
    """Save a two-panel price / MACD chart for one symbol.

    Top panel: ``Close`` against ``date``.
    Bottom panel: ``MACD`` and ``MACD_Signal`` as lines, ``MACD_Hist`` as
    bars, and a dashed horizontal reference line at zero.
    The chart is written to ``<OUTPUT_DIR>/<symbol>_macd_analysis.png``.

    Args:
        df: DataFrame with ``symbol``, ``date``, ``Close``, ``MACD``,
            ``MACD_Signal`` and ``MACD_Hist`` columns.
        symbol: Value of the ``symbol`` column to plot. Defaults to
            ``'AAPL'``.

    Returns:
        None.
    """
    symbol_df = df.loc[df['symbol'] == symbol]
    dates = symbol_df['date']

    fig, (price_ax, macd_ax) = plt.subplots(2, 1, figsize=(12, 8))

    # --- Top panel: closing price ---
    price_ax.plot(dates, symbol_df['Close'], label='Close Price')
    price_ax.set_title(f'{symbol} Price and MACD')
    price_ax.set_ylabel('Price (USD)')
    price_ax.legend()

    # --- Bottom panel: MACD, signal line and histogram ---
    macd_ax.plot(dates, symbol_df['MACD'], label='MACD')
    macd_ax.plot(dates, symbol_df['MACD_Signal'], label='Signal Line')
    macd_ax.bar(dates, symbol_df['MACD_Hist'], alpha=0.3, label='MACD Histogram')
    macd_ax.axhline(0, color='black', linestyle='--', alpha=0.3)
    macd_ax.set_ylabel('MACD')
    macd_ax.legend()

    fig.tight_layout()
    fig.savefig(f'{OUTPUT_DIR}/{symbol}_macd_analysis.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

macd_analysis(df)

# 10. Save summary report
def save_summary_report(df):
    """Write a plain-text EDA summary to ``<OUTPUT_DIR>/eda_summary.txt``.

    The file is overwritten on every call and contains: the frame's shape,
    the min/max of ``date``, the number of distinct ``symbol`` values, the
    null count of every column that has nulls, and a fixed list of
    observations.

    Args:
        df: DataFrame with ``date`` and ``symbol`` columns.

    Returns:
        None.
    """
    report_path = f'{OUTPUT_DIR}/eda_summary.txt'
    with open(report_path, 'w') as report:
        # Header: overall dimensions of the dataset
        report.write("=== Exploratory Data Analysis Summary ===\n\n")
        report.write(f"Dataset Shape: {df.shape}\n")
        report.write(f"Date Range: {df['date'].min()} to {df['date'].max()}\n")
        symbol_count = len(df['symbol'].unique())
        report.write(f"Number of Symbols: {symbol_count}\n")

        # Only the columns that actually contain nulls
        report.write("\n=== Missing Values ===\n")
        null_counts = df.isnull().sum()
        report.write(str(null_counts[null_counts > 0]))

        # Fixed, hand-written observations
        report.writelines((
            "\n=== Key Observations ===\n",
            "- High correlation between price-based indicators (Close, SMA_20, BB_Upper/Lower)\n",
            "- RSI distributions vary by symbol, indicating different momentum characteristics\n",
            "- Volatility differs significantly across symbols\n",
            "- Daily returns show fat-tailed distributions, suggesting potential for extreme movements\n",
        ))

save_summary_report(df)

print("EDA completed. Visualizations saved in 'eda_plots' directory.")

=== Data Quality Checks ===
Shape: (141151, 27)

Missing Values:
SMA_20             475
RSI_14             325
BB_Upper           475
BB_Lower           475
ATR_14             325
Close_Lag_1         25
Close_Lag_2         50
Close_Lag_3         75
Close_Lag_5        125
Volume_Lag_1        25
Volume_Lag_3        75
Daily_Return        25
Volatility_20      475
MACD_Hist_Slope     25
dtype: int64

Number of duplicate rows: 0

Data Types:
symbol                      object
date                datetime64[ns]
Open                       float64
High                       float64
Low                        float64
Close                      float64
Volume                       int64
SMA_20                     float64
RSI_14                     float64
MACD                       float64
MACD_Signal                float64
MACD_Hist                  float64
BB_Upper                   float64
BB_Lower                   float64
ATR_14                     float64
OBV                          int6