In [None]:
# import relevant libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.seasonal import seasonal_decompose
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset CSV (path is relative to the notebook's working directory) into `df`.
df = pd.read_csv('TEST_Trader_Quant_dataset.csv')

In [23]:
df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,47,48,49,50,51,52,53,54,55,56
0,74.23525,124.000,23.000,149.187,7.459,7.872,257.347,77.510,86.753,0,...,15582,6.8649,44,533,201481,85873,42474,237989,69445,33461
1,74.17525,33.105,280.280,133.749,0.709,31.305,87.454,51.044,130.774,2,...,13398,6.8589,45012,63355,230732,84898,45959,155027,45111,44533
2,74.18325,375.086,323.644,170.037,3.999,25.476,168.794,72.876,270.396,2,...,12777,6.8595,27710,55091,247450,98765,43705,206995,74398,42840
3,74.17625,48.775,25.853,93.927,39.872,14.148,72.699,65.654,352.091,0,...,14498,6.8595,32364,39759,237211,95343,35553,217658,75365,53509
4,74.17125,48.774,301.886,90.637,30.003,20.829,201.224,24.241,96.640,0,...,14704,6.8583,17667,52192,236698,102647,44195,196868,74185,19152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8683,65.77675,497.010,578.836,67.163,152.204,7.749,99.030,47.483,84.494,2,...,5742,6.2493,55771,203724,183714,10791,10770,48708,10358,7681
8684,65.76575,384.242,257.090,90.231,9.971,61.501,30.629,8.503,33.369,2,...,6359,6.2415,63770,59750,104302,32718,19958,155624,56757,13380
8685,65.80125,540.689,237.266,51.585,6.927,15.746,36.759,6.136,9.268,1,...,5693,6.2427,68479,26975,55850,32381,6030,123323,56775,6422
8686,65.83125,309.421,182.045,60.519,18.490,4.964,27.717,21.620,16.830,0,...,6330,6.2418,49254,49670,52459,33657,3869,124079,67859,4275


In [None]:
"""
Comprehensive Time Series Data Analysis Code
============================================

This script provides a complete framework for analyzing time series data with unknown relationships.
Each section includes interpretation guidelines and what to look for in the outputs.

Usage: python time-series-analysis-code.py
"""


# ========================================================================
# 1. DATA LOADING AND INITIAL EXPLORATION
# ========================================================================

def load_and_explore_data(file_path_or_data):
    """
    Obtain a DataFrame and print a quick structural overview of it.

    Parameters
    ----------
    file_path_or_data : str or DataFrame-like
        A string is read with ``pd.read_csv``; any other object is used
        unchanged as the frame to describe.

    Returns
    -------
    The loaded (or passed-through) frame.

    INTERPRETATION GUIDE:
    - Look for missing values, data types, and basic statistics
    - Check dataset dimensions (n_samples vs n_features)
    - High-dimensional data (p >> n) requires different approaches
    """

    # Strings are treated as CSV paths; everything else is already data.
    df = pd.read_csv(file_path_or_data) if isinstance(file_path_or_data, str) else file_path_or_data

    banner = "=" * 60
    print(banner)
    print("📊 DATASET OVERVIEW")
    print(banner)

    print(f"Shape: {df.shape}")
    size_mb = df.memory_usage().sum() / 1024**2
    print(f"Memory usage: {size_mb:.2f} MB")
    missing_total = df.isnull().sum().sum()
    print(f"Missing values: {missing_total}")

    # Count of columns per dtype, then the usual describe() summary.
    for heading, table in (
        ("\nData types:", df.dtypes.value_counts),
        ("\nBasic statistics:", df.describe),
    ):
        print(heading)
        print(table())

    # WHAT TO LOOK FOR:
    print("\n💡 INTERPRETATION GUIDELINES:")
    for hint in (
        "• Large shape (many features): Consider dimensionality reduction",
        "• Missing values: Choose appropriate handling strategy",
        "• Mixed data types: May need separate analysis approaches",
        "• Extreme min/max values: Potential outliers or data errors",
    ):
        print(hint)

    return df

# ========================================================================
# 2. TIME COLUMN DETECTION
# ========================================================================

def detect_time_column(df):
    """
    Score every column on time-index heuristics and return the best candidate.

    Points are awarded for: monotonic values, unique values, a datetime dtype,
    at most three distinct consecutive differences, a coefficient of variation
    below 5%, being the first column, and near-constant spacing of the sorted
    values. Columns scoring above 15 are printed in ranked order.

    Numeric-only checks (intervals, CV, spacing) are skipped for non-numeric
    and boolean columns so that string, datetime, or boolean columns do not
    raise; their CV is reported as ``inf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    column label or None
        Label of the highest-scoring candidate, or ``None`` when no column
        scores above 15.

    INTERPRETATION GUIDE:
    - Time columns typically have: monotonic values, low variability, unique entries
    - Sequential integers (1,2,3...) often indicate row indices used as time
    - Small coefficient of variation suggests stable time intervals
    - First column is often time index by convention
    """

    print("\n" + "="*60)
    print("🕰️  TIME COLUMN DETECTION")
    print("="*60)

    candidates = []

    for position, col in enumerate(df.columns):
        series = df[col]
        score = 0
        reasons = []

        # Arithmetic checks only make sense on real numbers; booleans count as
        # "numeric" for pandas but cannot be differenced by numpy.
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )

        # Check monotonic property
        if series.is_monotonic_increasing:
            score += 30
            reasons.append("monotonic increasing")
        elif series.is_monotonic_decreasing:
            score += 25
            reasons.append("monotonic decreasing")

        # Check uniqueness
        if series.is_unique:
            score += 20
            reasons.append("unique values")

        # A datetime-typed column is the most direct evidence of a time axis
        if pd.api.types.is_datetime64_any_dtype(series):
            score += 40
            reasons.append("datetime dtype")

        # Check for consistent intervals
        if is_numeric:
            diffs = series.diff().dropna()
            if len(diffs.unique()) <= 3:  # Allow for 2-3 different intervals
                score += 25
                reasons.append("regular intervals")

        # Check variability (time indices usually have low CV)
        cv = np.inf
        if is_numeric:
            mean_value = series.mean()
            if mean_value != 0:
                cv = series.std() / abs(mean_value) * 100
        if cv < 5:
            score += 15
            reasons.append("low variability")

        # Position bonus (time often first column)
        if position == 0:
            score += 10
            reasons.append("first column")

        # Check for sequential pattern
        if is_numeric:
            sorted_vals = sorted(series.dropna())
            if len(sorted_vals) > 1:
                intervals = np.diff(sorted_vals)
                mean_interval = np.mean(intervals)
                # A zero mean interval means a constant column: not a time axis.
                if mean_interval > 0 and np.std(intervals) / mean_interval < 0.1:
                    score += 20
                    reasons.append("very regular spacing")

        if score > 15:
            candidates.append((col, score, reasons, cv))

    # Display candidates, best first
    candidates.sort(key=lambda x: x[1], reverse=True)

    print("TIME COLUMN CANDIDATES (ranked by score):")
    for col, score, reasons, cv in candidates:
        print(f"• {col}: Score {score}, CV={cv:.3f}% - {', '.join(reasons)}")

    print("\n💡 INTERPRETATION:")
    print("• Higher scores indicate stronger time column characteristics")
    print("• Look for monotonic + unique + low CV combinations")
    print("• Verify detected column makes sense for your domain")
    print("• Sequential integers often indicate observation order")

    return candidates[0][0] if candidates else None

# ========================================================================
# 3. STATIONARITY TESTING
# ========================================================================

def test_stationarity(df, time_col=None, max_features=10):
    """
    Test stationarity of time series using ADF and KPSS tests
    
    INTERPRETATION GUIDE:
    - Stationary series have constant mean/variance over time
    - ADF tests null hypothesis of non-stationarity (p < 0.05 = stationary)
    - KPSS tests null hypothesis of stationarity (p > 0.05 = stationary)
    - Non-stationary series need differencing or detrending
    """
    
    print("\n" + "="*60)
    print("📈 STATIONARITY ANALYSIS")
    print("="*60)
    
    if time_col and time_col in df.columns:
        df_ts = df.set_index(time_col)
    else:
        df_ts = df
    
    stationary_results = {}
    
    # Test high-variance columns first (more likely to be interesting)
    numeric_cols = df_ts.select_dtypes(include=[np.number]).columns
    col_variances = [(col, df_ts[col].var()) for col in numeric_cols]
    col_variances.sort(key=lambda x: x[1], reverse=True)
    
    test_cols = [col for col, _ in col_variances[:max_features]]
    
    for col in test_cols:
        series = df_ts[col].dropna()
        
        if len(series) < 3 or series.std() == 0:
            print(f"\n{col}: Insufficient data or constant series")
            continue
            
        try:
            # ADF Test
            adf_stat, adf_pval, _, _, adf_crit, _ = adfuller(series)
            
            # KPSS Test
            kpss_stat, kpss_pval, _, kpss_crit = kpss(series)
            
            # Interpretation
            adf_stationary = adf_pval < 0.05
            kpss_stationary = kpss_pval > 0.05
            
            if adf_stationary and kpss_stationary:
                status = "STATIONARY"
            elif not adf_stationary and not kpss_stationary:
                status = "NON-STATIONARY"
            else:
                status = "UNCERTAIN"
            
            stationary_results[col] = {
                'adf_pvalue': adf_pval,
                'kpss_pvalue': kpss_pval,
                'status': status
            }
            
            print(f"\n{col}:")
            print(f"  ADF p-value: {adf_pval:.6f} ({'Stationary' if adf_stationary else 'Non-stationary'})")
            print(f"  KPSS p-value: {kpss_pval:.6f} ({'Stationary' if kpss_stationary else 'Non-stationary'})")
            print(f"  Overall: {status}")
            
        except Exception as e:
            print(f"\n{col}: Error in testing - {str(e)}")
    
    print("\n💡 INTERPRETATION:")
    print("• STATIONARY: Can use ARIMA, linear regression directly")
    print("• NON-STATIONARY: Apply differencing, log transforms, or detrending")
    print("• UNCERTAIN: Collect more data or try different transformations")
    print("• Both tests should agree for confidence in result")
    
    return stationary_results

# ========================================================================
# 4. TREND AND SEASONALITY ANALYSIS
# ========================================================================

def analyze_trends(df, time_col=None, max_features=8):
    """
    Fit a straight line to each of the highest-variance numeric columns.

    The regressor is the observation position (0, 1, 2, ...) after dropping
    missing values, not the values of ``time_col``; the slope is therefore a
    change per observation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    time_col : column label, optional
        If given and present in ``df``, it becomes the index and is excluded
        from the analysis. Any label other than ``None`` is honoured,
        including falsy ones such as the integer ``0``.
    max_features : int
        Number of columns (ranked by variance, descending) to analyse.

    Returns
    -------
    dict
        ``{column: {'slope', 'r_squared', 'p_value', 'direction', 'strength'}}``.

    INTERPRETATION GUIDE:
    - Linear trends: Consistent increase/decrease over time
    - R² > 0.7: Strong trend, R² 0.3-0.7: Moderate, R² < 0.3: Weak
    - Slope indicates rate of change per time unit
    - P-value < 0.05 indicates significant trend
    """

    print("\n" + "="*60)
    print("📊 TREND ANALYSIS")
    print("="*60)

    # `is not None` rather than truthiness: a column labelled 0 is a valid time column.
    if time_col is not None and time_col in df.columns:
        df_ts = df.set_index(time_col)
    else:
        df_ts = df

    trend_results = {}

    # Focus on high-variance columns
    numeric_cols = df_ts.select_dtypes(include=[np.number]).columns
    col_variances = [(col, df_ts[col].var()) for col in numeric_cols]
    col_variances.sort(key=lambda x: x[1], reverse=True)

    for col, _ in col_variances[:max_features]:
        series = df_ts[col].dropna()

        if len(series) < 3:
            continue

        # Linear trend analysis against observation position
        x = np.arange(len(series))
        slope, _intercept, r_value, p_value, _std_err = stats.linregress(x, series)

        # Categorize trend strength
        r_squared = r_value ** 2
        if r_squared > 0.7:
            strength = "STRONG"
        elif r_squared > 0.3:
            strength = "MODERATE"
        else:
            strength = "WEAK"

        direction = "INCREASING" if slope > 0 else "DECREASING" if slope < 0 else "FLAT"
        is_significant = p_value < 0.05

        trend_results[col] = {
            'slope': slope,
            'r_squared': r_squared,
            'p_value': p_value,
            'direction': direction,
            'strength': strength
        }

        print(f"\n{col}:")
        print(f"  Slope: {slope:.6f} (rate of change per time unit)")
        print(f"  R-squared: {r_squared:.3f}")
        print(f"  P-value: {p_value:.6f}")
        print(f"  Trend: {strength} {direction} ({'Significant' if is_significant else 'Not significant'})")

    print("\n💡 INTERPRETATION:")
    print("• Strong trends (R² > 0.7): Predictable, good for forecasting")
    print("• Moderate trends (R² 0.3-0.7): Some predictability with noise")
    print("• Weak trends (R² < 0.3): Mostly noise, consider other patterns")
    print("• Significant p-values (< 0.05): Trend is statistically reliable")
    print("• Slope magnitude indicates how fast change occurs")

    return trend_results

# ========================================================================
# 5. CHANGE POINT DETECTION
# ========================================================================

def detect_change_points(df, time_col=None, max_features=6):
    """
    Find the largest step change in each of the highest-variance numeric columns.

    For every analysed column the consecutive differences are computed; the
    largest absolute difference is reported together with the number of
    differences exceeding ``mean + 2 * std`` of the absolute differences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    time_col : column label, optional
        If given and present in ``df``, it becomes the index (so reported
        positions are time labels) and is excluded from the analysis. Any
        label other than ``None`` is honoured, including the integer ``0``.
    max_features : int
        Number of non-constant columns (ranked by variance) to analyse.

    Returns
    -------
    dict
        ``{column: {'max_change', 'max_change_position',
        'n_significant_changes', 'change_magnitude_ratio'}}``.

    INTERPRETATION GUIDE:
    - Change points indicate regime shifts, system changes, or anomalies
    - Large changes (> 2x std dev) suggest structural breaks
    - Multiple change points indicate unstable/switching systems
    - Timing of changes may correspond to known events
    """

    print("\n" + "="*60)
    print("📍 CHANGE POINT DETECTION")
    print("="*60)

    # `is not None` rather than truthiness: a column labelled 0 is a valid time column.
    if time_col is not None and time_col in df.columns:
        df_ts = df.set_index(time_col)
    else:
        df_ts = df

    change_results = {}

    # Focus on volatile features
    numeric_cols = df_ts.select_dtypes(include=[np.number]).columns
    col_vars = [(col, df_ts[col].var()) for col in numeric_cols if df_ts[col].std() > 0]
    col_vars.sort(key=lambda x: x[1], reverse=True)

    for col, _ in col_vars[:max_features]:
        series = df_ts[col].dropna()

        if len(series) < 3:
            continue

        # Calculate consecutive differences
        diffs = series.diff().dropna()
        abs_diffs = np.abs(diffs)

        if len(abs_diffs) == 0:
            continue

        # Locate the largest change by position: label-based lookup returns
        # several rows when the time index contains repeated labels.
        max_pos = int(np.argmax(abs_diffs.to_numpy()))
        max_change_idx = abs_diffs.index[max_pos]
        max_change = diffs.iloc[max_pos]
        mean_change = abs_diffs.mean()
        std_change = abs_diffs.std()

        # Identify significant changes (> 2 standard deviations)
        threshold = mean_change + 2 * std_change if std_change > 0 else mean_change * 2
        significant_changes = abs_diffs[abs_diffs > threshold]

        magnitude_ratio = abs(max_change) / mean_change if mean_change > 0 else np.inf

        change_results[col] = {
            'max_change': max_change,
            'max_change_position': max_change_idx,
            'n_significant_changes': len(significant_changes),
            'change_magnitude_ratio': magnitude_ratio
        }

        print(f"\n{col}:")
        print(f"  Largest change: {max_change:.3f} at position {max_change_idx}")
        print(f"  Change magnitude: {abs(max_change):.3f}")
        print(f"  Significant changes: {len(significant_changes)}")
        print(f"  Magnitude ratio: {magnitude_ratio:.1f}x average" if mean_change > 0 else "  Magnitude ratio: ∞")

        if abs(max_change) > threshold:
            print(f"  ⚠️  POTENTIAL REGIME CHANGE detected!")

    print("\n💡 INTERPRETATION:")
    print("• Large changes (>2x average): Investigate for system events")
    print("• Multiple significant changes: Unstable or switching regimes")
    print("• Timing patterns: Look for periodic or seasonal change points")
    print("• Magnitude ratio >3x: Strong evidence of structural break")
    print("• Consider domain knowledge to validate detected changes")

    return change_results

# ========================================================================
# 6. CORRELATION AND RELATIONSHIP ANALYSIS
# ========================================================================

def analyze_relationships(df, time_col=None, correlation_threshold=0.5):
    """
    Compute pairwise correlations between numeric features and bucket them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    time_col : column label, optional
        If given and present in ``df``, it is dropped before correlating. Any
        label other than ``None`` is honoured, including the integer ``0``.
    correlation_threshold : float
        Lower bound (exclusive) on ``|r|`` for a pair to count as "moderate".
        Pairs with ``|r| > 0.7`` are always classed as "strong".

    Returns
    -------
    tuple
        ``(corr_matrix, strong_pairs, moderate_pairs)`` where each pair list
        holds ``(col1, col2, r)`` tuples in column order.

    INTERPRETATION GUIDE:
    - Correlation > 0.7: Strong relationship, possible redundancy
    - Correlation 0.3-0.7: Moderate relationship, useful for modeling
    - Correlation < 0.3: Weak relationship, mostly independent
    - Positive: Variables move together, Negative: Variables move opposite
    """

    print("\n" + "="*60)
    print("🔗 CORRELATION ANALYSIS")
    print("="*60)

    # `is not None` rather than truthiness: a column labelled 0 is a valid time column.
    if time_col is not None and time_col in df.columns:
        df_numeric = df.drop(columns=[time_col]).select_dtypes(include=[np.number])
    else:
        df_numeric = df.select_dtypes(include=[np.number])

    # Calculate correlation matrix
    corr_matrix = df_numeric.corr()
    labels = corr_matrix.columns
    corr_values = corr_matrix.to_numpy()

    # Strict upper triangle = every unordered pair exactly once, in row-major order
    rows, cols = np.triu_indices(len(labels), k=1)

    strong_pairs = []
    moderate_pairs = []
    for i, j in zip(rows, cols):
        corr_val = corr_values[i, j]
        # NaN correlations (e.g. constant columns) fail both comparisons and are skipped
        if abs(corr_val) > 0.7:
            strong_pairs.append((labels[i], labels[j], corr_val))
        elif abs(corr_val) > correlation_threshold:
            moderate_pairs.append((labels[i], labels[j], corr_val))

    print(f"STRONG CORRELATIONS (|r| > 0.7):")
    if strong_pairs:
        for col1, col2, corr in sorted(strong_pairs, key=lambda x: abs(x[2]), reverse=True):
            print(f"  {col1} ↔ {col2}: {corr:.3f}")
    else:
        print("  None found")

    print(f"\nMODERATE CORRELATIONS ({correlation_threshold} < |r| < 0.7):")
    if moderate_pairs:
        for col1, col2, corr in sorted(moderate_pairs, key=lambda x: abs(x[2]), reverse=True)[:10]:
            print(f"  {col1} ↔ {col2}: {corr:.3f}")
    else:
        print("  None found")

    # Summary statistics; undefined (NaN) correlations are left out of the average
    all_corrs = np.abs(corr_values[rows, cols])
    defined_corrs = all_corrs[~np.isnan(all_corrs)]
    average_corr = defined_corrs.mean() if defined_corrs.size else float("nan")

    print(f"\nCORRELATION SUMMARY:")
    print(f"  Average absolute correlation: {average_corr:.3f}")
    print(f"  Strong correlations (>0.7): {len(strong_pairs)}")
    print(f"  Moderate correlations (>{correlation_threshold}): {len(moderate_pairs)}")
    print(f"  Total feature pairs: {len(all_corrs)}")

    print("\n💡 INTERPRETATION:")
    print("• Strong correlations: Consider removing redundant features")
    print("• Moderate correlations: Useful for feature engineering")
    print("• No correlations: Features are independent, good for diversification")
    print("• High average correlation: Dataset has underlying structure")
    print("• Many pairs with same correlation: Potential data generation pattern")

    return corr_matrix, strong_pairs, moderate_pairs

# ========================================================================
# 7. VOLATILITY AND MOMENTUM ANALYSIS
# ========================================================================

def analyze_volatility_momentum(df, time_col=None, max_features=8):
    """
    Summarise volatility, net change and direction consistency per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    time_col : column label, optional
        If given and present in ``df``, it becomes the index and is excluded
        from the analysis. Any label other than ``None`` is honoured,
        including the integer ``0``.
    max_features : int
        Number of non-constant columns (ranked by variance) to analyse.

    Returns
    -------
    dict
        ``{column: {'volatility', 'total_change_pct', 'consistency', 'direction'}}``.
        ``volatility`` is the standard deviation of finite period-over-period
        returns; ``total_change_pct`` is 0 when the first value is 0.

    INTERPRETATION GUIDE:
    - High volatility: Unpredictable, risky, but potentially high-information
    - Low volatility: Stable, predictable, but potentially low-information
    - Positive momentum: Upward trend, growth pattern
    - Negative momentum: Downward trend, decline pattern
    - Consistency: How often changes go in same direction
    """

    print("\n" + "="*60)
    print("⚡ VOLATILITY & MOMENTUM ANALYSIS")
    print("="*60)

    # `is not None` rather than truthiness: a column labelled 0 is a valid time column.
    if time_col is not None and time_col in df.columns:
        df_ts = df.set_index(time_col)
    else:
        df_ts = df

    results = {}

    # Focus on variable features
    numeric_cols = df_ts.select_dtypes(include=[np.number]).columns
    col_vars = [(col, df_ts[col].var()) for col in numeric_cols if df_ts[col].std() > 0]
    col_vars.sort(key=lambda x: x[1], reverse=True)

    for col, _ in col_vars[:max_features]:
        series = df_ts[col].dropna()

        if len(series) < 2:
            continue

        # Volatility metrics. A zero followed by a non-zero value gives an
        # infinite return, which would turn the standard deviation into NaN.
        returns = series.pct_change().replace([np.inf, -np.inf], np.nan).dropna()
        volatility = returns.std() if len(returns) > 0 else 0

        # Momentum metrics
        total_change = series.iloc[-1] - series.iloc[0]
        total_change_pct = (total_change / series.iloc[0]) * 100 if series.iloc[0] != 0 else 0

        # Direction consistency. `series` has no NaN and at least two values,
        # so there is always at least one difference to count.
        diffs = series.diff().dropna()
        positive_changes = (diffs > 0).sum()
        negative_changes = (diffs < 0).sum()
        consistency = max(positive_changes, negative_changes) / len(diffs)
        if positive_changes > negative_changes:
            dominant_direction = "Upward"
        elif negative_changes > positive_changes:
            dominant_direction = "Downward"
        else:
            dominant_direction = "Sideways"

        results[col] = {
            'volatility': volatility,
            'total_change_pct': total_change_pct,
            'consistency': consistency,
            'direction': dominant_direction
        }

        print(f"\n{col}:")
        print(f"  Volatility (std of returns): {volatility:.4f}")
        print(f"  Total change: {total_change_pct:+.2f}%")
        print(f"  Direction consistency: {consistency:.1%}")
        print(f"  Dominant trend: {dominant_direction}")

        # Risk/reward categorization
        if abs(total_change_pct) > 50 and volatility > 1:
            print(f"  ⚠️  HIGH RISK/HIGH REWARD pattern")
        elif abs(total_change_pct) < 10 and volatility < 0.5:
            print(f"  ✅ LOW RISK/STABLE pattern")

    print("\n💡 INTERPRETATION:")
    print("• High volatility + high momentum: Explosive growth or decline")
    print("• High volatility + low momentum: Random walk, difficult to predict")
    print("• Low volatility + high momentum: Steady trend, good for forecasting")
    print("• High consistency (>70%): Directional trend, momentum trading opportunity")
    print("• Low consistency (<50%): Mean-reverting, range-bound behavior")

    return results

# ========================================================================
# 8. DIMENSIONALITY REDUCTION ANALYSIS
# ========================================================================

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import numpy as np

def analyze_dimensionality(df, time_col=None, target_variance=0.8, max_components=40):
    """
    Standardize the numeric features, fit PCA and print explained variance.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    time_col : column label, optional
        If given and present in ``df``, it is dropped before PCA. Any label
        other than ``None`` is honoured, including the integer ``0``.
    target_variance : float
        Cumulative explained-variance fraction to reach, e.g. 0.8 for 80%.
    max_components : int
        Upper bound on the number of principal components fitted.

    Returns
    -------
    tuple or None
        ``(pca, explained_var, cum_var)``, or ``None`` when fewer than two
        numeric features or fewer than two complete rows are available.
    """
    print("\n" + "="*60)
    print("🧩 DIMENSIONALITY REDUCTION (PCA)")
    print("="*60)

    # Select numeric features only.
    # `is not None` rather than truthiness: a column labelled 0 is a valid time column.
    if time_col is not None and time_col in df.columns:
        df_numeric = df.drop(columns=[time_col]).select_dtypes(include=[np.number])
    else:
        df_numeric = df.select_dtypes(include=[np.number])
    if df_numeric.shape[1] < 2:
        print("Need at least 2 numeric features for PCA")
        return None

    # PCA cannot be fitted on missing values, so keep complete rows only.
    n_rows_before = df_numeric.shape[0]
    df_numeric = df_numeric.dropna()
    n_rows_dropped = n_rows_before - df_numeric.shape[0]
    if n_rows_dropped > 0:
        print(f"Dropped {n_rows_dropped} rows with missing values before PCA")
    if df_numeric.shape[0] < 2:
        print("Need at least 2 complete rows for PCA")
        return None

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_numeric)

    # Fit PCA with enough components
    n_components = min(max_components, df_numeric.shape[1], df_numeric.shape[0])
    pca = PCA(n_components=n_components)
    pca.fit(X_scaled)
    explained_var = pca.explained_variance_ratio_
    cum_var = np.cumsum(explained_var)

    # Print explained and cumulative variance
    print("PC\tExplained Var\tCumulative Var")
    for i, (ev, cv) in enumerate(zip(explained_var, cum_var), 1):
        print(f"{i}\t{ev:.3f}\t\t{cv:.3f}")

    # Find number of components to reach target variance. argmax of an
    # all-False mask is 0, so the "never reached" case is handled explicitly.
    target_reached = cum_var >= target_variance
    if target_reached.any():
        n_needed = int(np.argmax(target_reached)) + 1
        print(f"\nComponents needed for {target_variance*100:.0f}% variance: {n_needed}")
    else:
        print(
            f"\nComponents needed for {target_variance*100:.0f}% variance: "
            f"more than {n_components} (only {cum_var[-1]*100:.1f}% captured; "
            f"increase max_components)"
        )

    # Show top contributing features for first 3 PCs
    print("\nTop contributing features per PC:")
    for i in range(min(3, len(explained_var))):
        loadings = pca.components_[i]
        top_idx = np.argsort(np.abs(loadings))[-5:][::-1]
        print(f"PC{i+1}: " + ", ".join(f"{df_numeric.columns[j]} ({loadings[j]:+.2f})" for j in top_idx))

    return pca, explained_var, cum_var

# ========================================================================
# 9. OUTLIER DETECTION
# ========================================================================

def detect_outliers(df, time_col=None, method='iqr', threshold=1.5):
    """
    Flag outliers in every numeric column using the IQR or z-score rule.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is not modified.
    time_col : column label, optional
        If given and present in ``df``, it is excluded from the analysis and
        used to report when each outlier occurred. Any label other than
        ``None`` is honoured, including the integer ``0``.
    method : {'iqr', 'zscore'}
        Detection rule.
    threshold : float
        IQR multiplier for ``method='iqr'``. It is not used by
        ``method='zscore'``, which applies a fixed cut-off of 3.

    Returns
    -------
    dict
        ``{column: {'count', 'rate', 'values', 'indices'}}`` for each column
        containing at least one outlier; ``indices`` are index labels of ``df``.

    Raises
    ------
    ValueError
        If ``method`` is not ``'iqr'`` or ``'zscore'``.

    INTERPRETATION GUIDE:
    - IQR method: Values beyond 1.5*IQR from Q1/Q3 quartiles
    - Z-score: Values more than 3 standard deviations from mean
    - High outlier rates: Data quality issues or natural extreme events
    - Temporal clustering: Outliers occurring together may indicate events
    """

    # Fail early: an unknown method would otherwise leave `outliers` undefined.
    if method not in ('iqr', 'zscore'):
        raise ValueError(f"Unknown method {method!r}; expected 'iqr' or 'zscore'")

    print("\n" + "="*60)
    print("⚠️  OUTLIER DETECTION")
    print("="*60)

    # `is not None` rather than truthiness: a column labelled 0 is a valid time column.
    if time_col is not None and time_col in df.columns:
        df_numeric = df.drop(columns=[time_col]).select_dtypes(include=[np.number])
        time_index = df[time_col]
    else:
        df_numeric = df.select_dtypes(include=[np.number])
        time_index = None

    outlier_results = {}
    total_outliers = 0

    for col in df_numeric.columns:
        series = df_numeric[col].dropna()

        if len(series) < 4:  # Need minimum data for outlier detection
            continue

        if method == 'iqr':
            Q1 = series.quantile(0.25)
            Q3 = series.quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - threshold * IQR
            upper_bound = Q3 + threshold * IQR
            outliers = series[(series < lower_bound) | (series > upper_bound)]
        else:  # method == 'zscore'
            z_scores = np.abs(stats.zscore(series))
            outliers = series[z_scores > 3]

        if len(outliers) > 0:
            outlier_rate = len(outliers) / len(series) * 100
            outlier_results[col] = {
                'count': len(outliers),
                'rate': outlier_rate,
                'values': outliers.tolist(),
                'indices': outliers.index.tolist()
            }
            total_outliers += len(outliers)

            print(f"\n{col}:")
            print(f"  Outliers: {len(outliers)} ({outlier_rate:.1f}% of data)")
            print(f"  Outlier values: {outliers.tolist()}")

            if time_index is not None:
                # `outliers.index` holds labels, so the lookup must be label-based;
                # positional lookup is only correct for a default 0..n-1 index.
                outlier_times = time_index.loc[outliers.index].tolist()
                print(f"  Times: {outlier_times}")

    # Guard the average against an empty frame (no rows or no numeric columns).
    n_cells = df_numeric.shape[0] * df_numeric.shape[1]
    average_rate = (total_outliers / n_cells) * 100 if n_cells else 0.0

    print(f"\nOUTLIER SUMMARY:")
    print(f"  Total outliers: {total_outliers}")
    print(f"  Features with outliers: {len(outlier_results)}")
    print(f"  Average outlier rate: {average_rate:.2f}%")

    # Find features with highest outlier rates
    if outlier_results:
        sorted_features = sorted(outlier_results.items(), key=lambda x: x[1]['rate'], reverse=True)
        print(f"\n  Highest outlier rates:")
        for col, info in sorted_features[:5]:
            print(f"    {col}: {info['rate']:.1f}%")

    print("\n💡 INTERPRETATION:")
    print("• High outlier rates (>10%): Check data quality or natural extreme events")
    print("• Clustered outlier times: May indicate system events or measurement issues")
    print("• Consistent outliers across features: Potential data collection problems")
    print("• Isolated outliers: May be genuine extreme values worth investigating")
    print("• Consider robust statistical methods if many outliers present")

    return outlier_results

# ========================================================================
# 10. COMPREHENSIVE ANALYSIS RUNNER
# ========================================================================

def run_comprehensive_analysis(df, time_col=None):
    """
    Run every analysis function in sequence and collect their results.

    Each step is best-effort: if one raises, the failure is printed and the
    corresponding entry in the result stays ``None`` so later steps still run.

    Parameters
    ----------
    df : str or pandas.DataFrame
        Passed to ``load_and_explore_data``, which reads a string as a CSV
        path and uses any other object as the data.
    time_col : column label, optional
        Time column to use. When ``None`` it is chosen by
        ``detect_time_column``, which may itself return ``None``.

    Returns
    -------
    dict
        Keys: ``'time_column'``, ``'stationarity'``, ``'trends'``,
        ``'change_points'``, ``'correlations'``, ``'volatility'``, ``'pca'``,
        ``'outliers'``.

    OVERALL INTERPRETATION GUIDE:
    - Start with data exploration to understand basic properties
    - Time column detection ensures proper temporal analysis
    - Stationarity guides choice of modeling approaches
    - Trends and change points reveal system behavior
    - Correlations show feature relationships and redundancy
    - Volatility/momentum identifies high-information features
    - Dimensionality analysis reveals data structure
    - Outliers indicate data quality and extreme events
    """

    def _run_step(label, func, *args):
        """Call one analysis step; print and swallow any failure, returning None."""
        try:
            return func(*args)
        except Exception as e:
            print(f"⚠️  {label} failed: {e}")
            return None

    print("🚀 STARTING COMPREHENSIVE TIME SERIES ANALYSIS")
    print("="*80)

    # 1. Data exploration
    df = load_and_explore_data(df)

    # 2. Detect time column if not provided
    if time_col is None:
        time_col = detect_time_column(df)
        if time_col is None:
            print("\n🕰️  No time column detected; analysing rows in their stored order")
        else:
            print(f"\n🕰️  Using '{time_col}' as time column")

    # Store all results
    analysis_results = {
        'time_column': time_col,
        'stationarity': None,
        'trends': None,
        'change_points': None,
        'correlations': None,
        'volatility': None,
        'pca': None,
        'outliers': None
    }

    # 3. Run all analyses
    analysis_results['stationarity'] = _run_step("Stationarity analysis", test_stationarity, df, time_col)
    analysis_results['trends'] = _run_step("Trend analysis", analyze_trends, df, time_col)
    analysis_results['change_points'] = _run_step("Change point analysis", detect_change_points, df, time_col)

    correlation_output = _run_step("Correlation analysis", analyze_relationships, df, time_col)
    if correlation_output is not None:
        corr_matrix, strong_pairs, moderate_pairs = correlation_output
        analysis_results['correlations'] = {
            'matrix': corr_matrix,
            'strong_pairs': strong_pairs,
            'moderate_pairs': moderate_pairs
        }

    analysis_results['volatility'] = _run_step("Volatility analysis", analyze_volatility_momentum, df, time_col)

    # analyze_dimensionality returns None (not a tuple) when PCA is not
    # applicable; that is a skipped step, not a failure.
    pca_output = _run_step("PCA analysis", analyze_dimensionality, df, time_col)
    if pca_output is not None:
        pca_result, explained_var, cum_var = pca_output
        analysis_results['pca'] = {
            'model': pca_result,
            'explained_variance': explained_var,
            'cumulative_variance': cum_var
        }

    analysis_results['outliers'] = _run_step("Outlier detection", detect_outliers, df, time_col)

    # Final summary
    print("\n" + "="*80)
    print("📋 ANALYSIS COMPLETE - KEY TAKEAWAYS")
    print("="*80)

    print("\n🎯 NEXT STEPS BASED ON YOUR RESULTS:")
    print("1. Review each section's interpretation guidelines")
    print("2. Focus on features with high volatility/momentum for predictive modeling")
    print("3. Use PCA components if high dimensionality detected")
    print("4. Apply appropriate transformations based on stationarity results")
    print("5. Investigate outliers and change points for domain insights")
    print("6. Use correlation analysis for feature selection and engineering")

    return analysis_results

# ========================================================================
# ADDITIONAL HELPER FUNCTIONS
# ========================================================================

def create_analysis_dashboard(df, time_col, results):
    """
    Show an empty 2x3 figure titled 'Time Series Analysis Dashboard'.

    This is a placeholder: ``df``, ``time_col`` and ``results`` are accepted
    but not used, and no panel is drawn yet. Fill the panels in to suit your
    own analysis.
    """

    plt.style.use('default')
    figure, _panels = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
    figure.suptitle('Time Series Analysis Dashboard', fontsize=16)

    # Panels still to be implemented, for example:
    #   * time series plots of key features
    #   * correlation heatmap
    #   * PCA explained variance
    #   * outlier distribution
    #   * change point visualization
    #   * volatility comparison

    plt.tight_layout()
    plt.show()

def export_insights_report(results, filename='analysis_insights.txt'):
    """
    Write a fixed report skeleton to ``filename`` and print a confirmation.

    This is a template: ``results`` is accepted but not used, and the file
    contains only placeholder findings and recommendations to be replaced.
    """

    # Report skeleton, written in order: title, rule, findings, recommendations.
    report_parts = [
        "TIME SERIES ANALYSIS INSIGHTS REPORT\n",
        "="*50 + "\n\n",
        "KEY FINDINGS:\n",
        "- [Add your key insights here]\n",
        "- [Based on the analysis results]\n",
        "\nRECOMMENDATIONS:\n",
        "- [Add actionable recommendations]\n",
    ]

    with open(filename, 'w') as report_file:
        for part in report_parts:
            report_file.write(part)

    print(f"📄 Insights report saved to '{filename}'")

# ========================================================================
# END OF ANALYSIS FRAMEWORK
# ========================================================================

In [30]:
# Re-read the dataset from disk so this cell works without relying on an earlier cell's `df`.
df = pd.read_csv('TEST_Trader_Quant_dataset.csv')
# No time_col is passed, so run_comprehensive_analysis picks one via detect_time_column.
results = run_comprehensive_analysis(df)

🚀 STARTING COMPREHENSIVE TIME SERIES ANALYSIS
📊 DATASET OVERVIEW
Shape: (8688, 56)
Memory usage: 3.71 MB
Missing values: 0

Data types:
float64    39
int64      17
Name: count, dtype: int64

Basic statistics:
                 1             2             3            4            5  \
count  8688.000000   8688.000000   8688.000000  8688.000000  8688.000000   
mean     72.298593    781.216361    807.324287   106.064478    39.628097   
std       3.600288   1137.844449   1262.210062   165.048474    81.746966   
min      60.277750      0.127000      0.006000     0.402000     0.016000   
25%      71.519875     97.043750    100.329500    37.691000    10.568000   
50%      73.713000    351.978000    344.176000    73.808500    24.075000   
75%      74.201750   1003.126750    996.374500   126.193250    46.198750   
max      78.947000  13359.016000  18239.051000  4426.142000  3255.730000   

                 6            7            8            9           10  ...  \
count  8688.000000  8688.00