# In [None]:
# 02_data_exploration.ipynb

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import scipy.stats as stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Set plotting style
plt.style.use('default')
sns.set_theme()

# Load preprocessed data
def load_preprocessed_data():
    """Read the preprocessed dataset from disk.

    Returns:
        The DataFrame parsed from ``../data/processed/preprocessed_data.csv``
        (path is relative to the current working directory).

    Raises:
        FileNotFoundError: If the CSV file does not exist at that path.
    """
    csv_location = Path('../data/processed/preprocessed_data.csv')

    if csv_location.exists():
        df = pd.read_csv(csv_location)
        print(f"Loaded data with shape: {df.shape}")
        return df

    # Reaching this point means the preprocessing output is missing.
    raise FileNotFoundError("Preprocessed data not found. Please run 01_data_preprocessing first.")

# Correlation analysis with depression scores
def explore_correlations(df, dep_col='depression_score', corr_threshold=0.3, top_n=20):
    """Rank numeric features by their correlation with a target column.

    Correlates every numeric column (except the subject identifier and the
    target itself) with ``dep_col`` using ``DataFrame.corrwith``, keeps the
    ones whose absolute correlation is at least ``corr_threshold``, prints
    the strongest positive and negative ones, and shows one bar chart per
    non-empty group.

    Args:
        df: DataFrame holding the target column and the candidate features.
        dep_col: Name of the target column to correlate against.
        corr_threshold: Minimum absolute correlation for a feature to be kept.
        top_n: Maximum number of features listed and plotted per sign.

    Returns:
        Series indexed by feature name containing every correlation that
        passes the threshold, sorted from most positive to most negative.

    Raises:
        KeyError: If ``dep_col`` is not a column of ``df``.
    """
    print("\nAnalyzing correlations with depression scores...")

    # Exclude the identifier and whichever column is the target; leaving the
    # target in would report a meaningless self-correlation of 1.0.
    excluded = {'src_subject_id', dep_col}
    numeric_cols = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col not in excluded
    ]

    correlations = df[numeric_cols].corrwith(df[dep_col]).sort_values(ascending=False)

    # NaN correlations (e.g. from constant columns) compare False and drop out.
    high_corr = correlations[correlations.abs() >= corr_threshold]

    # Split by sign so the "negative" listing can never contain positive
    # values and the two groups never overlap. Negatives are re-sorted so the
    # strongest (most negative) comes first.
    positive = high_corr[high_corr > 0].head(top_n)
    negative = high_corr[high_corr < 0].sort_values().head(top_n)

    print(f"\nFound {len(high_corr)} variables with |correlation| >= {corr_threshold}")
    print(f"\nTop {top_n} positive correlations:")
    print(positive)
    print(f"\nTop {top_n} negative correlations:")
    print(negative)

    for sign_label, subset in (('Positive', positive), ('Negative', negative)):
        if subset.empty:
            # pandas raises when asked to draw a bar chart from no data.
            continue
        plt.figure(figsize=(12, 8))
        subset.plot(kind='bar')
        plt.title(f'Top {top_n} {sign_label} Correlations with Depression Score')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()

    return high_corr

# Depression score analysis
def analyze_depression_scores(df):
    """Report summary statistics, a histogram and percentiles of the target.

    Prints the ``describe()`` summary of the ``depression_score`` column,
    shows a 30-bin histogram of it, then prints the 25th, 50th, 75th, 90th,
    95th and 99th percentiles formatted to two decimals.

    Args:
        df: DataFrame containing a ``depression_score`` column.
    """
    print("\nDepression Score Analysis:")

    # Summary statistics
    print("\nBasic Statistics:")
    scores = df['depression_score']
    print(scores.describe())

    # Histogram of the target
    plt.figure(figsize=(10, 6))
    sns.histplot(data=df, x='depression_score', bins=30)
    plt.title('Distribution of Depression Scores')
    plt.show()

    # Selected percentiles, reported in ascending order
    print("\nPercentiles:")
    for p in (25, 50, 75, 90, 95, 99):
        value = scores.quantile(p/100)
        print(f"{p}th percentile: {value:.2f}")

# Feature importance analysis
def analyze_feature_importance(df, top_n=100):
    """Rank features by Random Forest importance for the depression score.

    Fits a ``RandomForestRegressor`` (100 trees, ``random_state=42``) on the
    standardized feature columns with ``depression_score`` as the target,
    prints and plots the ``top_n`` highest ``feature_importances_`` values,
    and writes the full ranking to
    ``../data/processed/feature_importance.csv``.

    Args:
        df: DataFrame with a ``depression_score`` column plus feature
            columns; a ``src_subject_id`` column, if present, is excluded.
        top_n: Number of top-ranked features to print and plot. Clamped to
            the number of available features.

    Returns:
        DataFrame with columns ``feature`` and ``importance`` covering every
        feature used, sorted by importance in descending order.

    Raises:
        KeyError: If ``df`` has no ``depression_score`` column.
        ValueError: If no numeric/boolean feature column remains to fit on.
    """
    print("\nFeature Importance Analysis:")

    # Prepare data
    y = df['depression_score']
    X = df.drop(columns=['src_subject_id', 'depression_score'], errors='ignore')

    # StandardScaler cannot handle text/datetime columns, so keep only the
    # numeric and boolean ones and say which columns were left out.
    usable = X.select_dtypes(include=['number', 'bool'])
    skipped = [col for col in X.columns if col not in usable.columns]
    if skipped:
        print(f"Skipping {len(skipped)} non-numeric column(s): {skipped}")
    X = usable
    if X.shape[1] == 0:
        raise ValueError("No numeric feature columns available for feature importance analysis.")

    # Scale features (kept so results stay comparable with earlier runs)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Train Random Forest
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_scaled, y)

    # Get feature importance
    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)

    # Honor top_n everywhere, never exceeding the number of features we have.
    n_shown = max(0, min(top_n, len(importance)))
    top_features = importance.head(n_shown)

    print(f"\nTop {n_shown} Most Important Features:")
    print(top_features.to_string())

    # Plot the top features across up to 4 stacked panels for readability.
    if n_shown > 0:
        max_panels = 4
        # Ceiling division so no feature is dropped when n_shown is not a
        # multiple of the panel count.
        per_panel = -(-n_shown // max_panels)
        n_panels = -(-n_shown // per_panel)

        plt.figure(figsize=(20, 5 * n_panels))
        for panel in range(n_panels):
            start_idx = panel * per_panel
            end_idx = min(start_idx + per_panel, n_shown)

            plt.subplot(n_panels, 1, panel + 1)
            sns.barplot(data=top_features.iloc[start_idx:end_idx], x='importance', y='feature')
            plt.title(f'Features {start_idx+1}-{end_idx} by Importance')

        # One layout pass after all panels exist is sufficient.
        plt.tight_layout()
        plt.show()

    # Save importance scores to CSV
    output_path = Path('../data/processed/feature_importance.csv')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    importance.to_csv(output_path, index=False)
    print(f"\nFeature importance scores saved to: {output_path}")

    return importance

# Main execution
# Runs the exploration steps in order when this file is executed as a script
# (or as a notebook cell, where __name__ is "__main__"). The results are bound
# to the names df, high_correlations and feature_importance.
if __name__ == "__main__":
    # Load the preprocessed dataset; raises FileNotFoundError if it is missing
    print("Loading preprocessed data...")
    df = load_preprocessed_data()
    
    # Step 1: summary statistics, histogram and percentiles of the target
    print("\n=== Depression Score Analysis ===")
    analyze_depression_scores(df)
    
    # Step 2: features whose |correlation| with the target meets the default threshold
    print("\n=== Correlation Analysis ===")
    high_correlations = explore_correlations(df)
    
    # Step 3: Random Forest feature ranking (also written to a CSV file)
    print("\n=== Feature Importance Analysis ===")
    feature_importance = analyze_feature_importance(df)
    
    print("\nExploration complete!")