In [1]:
# YEARLY MARKET DATA ANALYSIS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
import scipy.stats as stats

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

def generate_annual_analysis_report(data_file, analysis_years, output_file='annual_market_analysis.pdf'):
    """
    Generate comprehensive yearly analysis report for market data
    
    Parameters:
    -----------
    data_file : str
        Path to market data CSV file
    analysis_years : list
        List of years to analyze
    output_file : str
        Output PDF file name
    """
    
    print("=" * 70)
    print("ANNUAL MARKET DATA ANALYSIS REPORT GENERATOR")
    print("=" * 70)
    
    # Load data
    print("\nLoading market data...")
    market_data = pd.read_csv(data_file, parse_dates=True)
    
    # Prepare datetime index
    market_data['DateTime'] = pd.to_datetime(market_data['DateTime'], format='%d-%m-%Y %H.%M')
    market_data.set_index('DateTime', inplace=True)
    
    # Filter to market hours
    market_data = market_data.between_time('09:15', '15:30')
    
    print(f"Data loaded: {len(market_data)} records")
    print(f"Date range: {market_data.index[0]} to {market_data.index[-1]}")
    print(f"\nYears to analyze: {analysis_years}")
    
    # Create PDF document
    pdf_document = PdfPages(output_file)
    
    # Create summary page
    plt.figure(figsize=(10, 6))
    plt.axis('off')
    plt.text(0.5, 0.9, 'ANNUAL MARKET ANALYSIS REPORT', 
             fontsize=16, fontweight='bold', ha='center')
    plt.text(0.5, 0.8, f'Dataset: {data_file}', 
             fontsize=12, ha='center')
    plt.text(0.5, 0.7, f'Analysis Period: {min(analysis_years)} - {max(analysis_years)}', 
             fontsize=12, ha='center')
    plt.text(0.5, 0.6, f'Total Records: {len(market_data):,}', 
             fontsize=12, ha='center')
    plt.text(0.5, 0.5, f'Report Generated: {pd.Timestamp.now().strftime("%Y-%m-%d %H:%M")}', 
             fontsize=12, ha='center')
    pdf_document.savefig()
    plt.close()
    
    # Analyze each year
    for year in analysis_years:
        print(f"\nAnalyzing {year}...")
        
        # Filter data for the year
        year_data = market_data[market_data.index.year == year].copy()
        
        if len(year_data) == 0:
            print(f"  No data available for {year}")
            continue
        
        print(f"  Records: {len(year_data):,}")
        
        # Calculate derived metrics
        year_data['Minute_Return'] = year_data['Close'].pct_change().fillna(0)
        year_data['Volatility_30min'] = year_data['Minute_Return'].rolling(window=30).std() * (30**0.5)
        year_data['Month'] = year_data.index.month
        year_data['Price_Range'] = (year_data['High'] - year_data['Low']) / year_data['Close']
        year_data['Daily_High'] = year_data['High'].resample('D').max()
        year_data['Daily_Low'] = year_data['Low'].resample('D').min()
        
        # Create year summary page
        plt.figure(figsize=(10, 6))
        plt.axis('off')
        
        # Summary statistics
        summary_text = f"""
        {year} MARKET ANALYSIS SUMMARY
        {'=' * 40}
        
        Data Points: {len(year_data):,}
        Trading Days: {len(year_data.resample('D').last()):,}
        
        PRICE STATISTICS:
          Average Price: {year_data['Close'].mean():.2f}
          Price Range: {year_data['Close'].min():.2f} - {year_data['Close'].max():.2f}
          Price Volatility: {year_data['Close'].std():.2f}
        
        RETURN STATISTICS:
          Average Return: {year_data['Minute_Return'].mean():.6f}
          Return Volatility: {year_data['Minute_Return'].std():.6f}
          Skewness: {stats.skew(year_data['Minute_Return'].dropna()):.3f}
          Kurtosis: {stats.kurtosis(year_data['Minute_Return'].dropna()):.3f}
        
        VOLATILITY:
          Average 30-min Volatility: {year_data['Volatility_30min'].mean():.6f}
          Max Daily Range: {(year_data['Price_Range'] * 100).max():.2f}%
        """
        
        plt.text(0.1, 0.95, summary_text, fontsize=10, fontfamily='monospace',
                verticalalignment='top', transform=plt.gca().transAxes)
        
        pdf_document.savefig()
        plt.close()
        
        # 1. Price and Return Distribution
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        
        # Price distribution
        axes[0, 0].hist(year_data['Close'], bins=60, alpha=0.7, color='steelblue')
        axes[0, 0].axvline(year_data['Close'].mean(), color='red', linestyle='--', 
                          label=f'Mean: {year_data["Close"].mean():.2f}')
        axes[0, 0].set_title(f'{year} Price Distribution', fontweight='bold')
        axes[0, 0].set_xlabel('Closing Price')
        axes[0, 0].set_ylabel('Frequency')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.2)
        
        # Return distribution with KDE
        returns_clean = year_data['Minute_Return'].dropna()
        axes[0, 1].hist(returns_clean, bins=60, alpha=0.6, color='forestgreen', density=True)
        sns.kdeplot(data=returns_clean, ax=axes[0, 1], color='darkgreen', linewidth=2)
        axes[0, 1].axvline(returns_clean.mean(), color='red', linestyle='--',
                          label=f'Mean: {returns_clean.mean():.6f}')
        axes[0, 1].set_title(f'{year} Return Distribution', fontweight='bold')
        axes[0, 1].set_xlabel('Minute Return')
        axes[0, 1].set_ylabel('Density')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.2)
        
        # Price KDE
        axes[1, 0].clear()
        sns.kdeplot(data=year_data['Close'], ax=axes[1, 0], fill=True, color='mediumpurple')
        axes[1, 0].set_title(f'{year} Price Density', fontweight='bold')
        axes[1, 0].set_xlabel('Closing Price')
        axes[1, 0].set_ylabel('Density')
        axes[1, 0].grid(True, alpha=0.2)
        
        # Monthly price distribution
        monthly_data = [year_data[year_data['Month']==m]['Close'].values for m in range(1,13)]
        axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
        axes[1, 1].set_title(f'{year} Monthly Price Distribution', fontweight='bold')
        axes[1, 1].set_xlabel('Month')
        axes[1, 1].set_ylabel('Price Level')
        axes[1, 1].grid(True, alpha=0.2)
        
        plt.tight_layout()
        pdf_document.savefig()
        plt.close()
        
        # 2. Time Series Analysis
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        
        # Rolling statistics
        rolling_window = year_data['Minute_Return'].rolling(window=60)
        axes[0, 0].plot(rolling_window.mean(), label='60-min Mean', linewidth=1.5)
        axes[0, 0].plot(rolling_window.std(), label='60-min Std Dev', linewidth=1.5)
        axes[0, 0].set_title(f'{year} Rolling Return Statistics', fontweight='bold')
        axes[0, 0].set_xlabel('Time')
        axes[0, 0].set_ylabel('Statistic Value')
        axes[0, 0].legend()
        axes[0, 0].grid(True, alpha=0.2)
        
        # Volatility pattern
        axes[0, 1].plot(year_data.index, year_data['Volatility_30min'], color='crimson')
        axes[0, 1].set_title(f'{year} 30-Minute Volatility', fontweight='bold')
        axes[0, 1].set_xlabel('Time')
        axes[0, 1].set_ylabel('Volatility')
        axes[0, 1].grid(True, alpha=0.2)
        
        # Autocorrelation analysis
        plot_acf(returns_clean, lags=40, ax=axes[1, 0])
        axes[1, 0].set_title(f'{year} Return Autocorrelation', fontweight='bold')
        axes[1, 0].set_xlabel('Lag')
        axes[1, 0].set_ylabel('Autocorrelation')
        axes[1, 0].grid(True, alpha=0.2)
        
        # Partial autocorrelation
        plot_pacf(returns_clean, lags=40, ax=axes[1, 1])
        axes[1, 1].set_title(f'{year} Return Partial Autocorrelation', fontweight='bold')
        axes[1, 1].set_xlabel('Lag')
        axes[1, 1].set_ylabel('Partial Autocorrelation')
        axes[1, 1].grid(True, alpha=0.2)
        
        plt.tight_layout()
        pdf_document.savefig()
        plt.close()
        
        # 3. Statistical Tests and QQ Plot
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        
        # QQ Plot
        stats.probplot(returns_clean, dist="norm", plot=axes[0, 0])
        axes[0, 0].set_title(f'{year} QQ Plot - Returns vs Normal', fontweight='bold')
        axes[0, 0].grid(True, alpha=0.2)
        
        # Price range distribution
        price_range_pct = (year_data['High'] - year_data['Low']) / year_data['Close'] * 100
        axes[0, 1].hist(price_range_pct, bins=40, alpha=0.7, color='darkorange')
        axes[0, 1].axvline(price_range_pct.mean(), color='red', linestyle='--',
                          label=f'Mean: {price_range_pct.mean():.2f}%')
        axes[0, 1].set_title(f'{year} Intraday Price Range Distribution', fontweight='bold')
        axes[0, 1].set_xlabel('Price Range (%)')
        axes[0, 1].set_ylabel('Frequency')
        axes[0, 1].legend()
        axes[0, 1].grid(True, alpha=0.2)
        
        # Statistical tests table
        axes[1, 0].axis('off')
        
        # Perform statistical tests
        try:
            # Ljung-Box test
            lb_test = acorr_ljungbox(returns_clean.dropna(), lags=[10, 20, 30], return_df=True)
            
            # Normality tests
            if len(returns_clean) > 5000:
                sample_returns = returns_clean.sample(5000, random_state=42)
            else:
                sample_returns = returns_clean
            
            shapiro_stat, shapiro_p = stats.shapiro(sample_returns)
            
            # Prepare test results table
            test_results = f"""
            STATISTICAL TESTS - {year}
            {'=' * 40}
            
            LJUNG-BOX TEST (Autocorrelation):
            Lag 10:  Stat={lb_test.loc[10, 'lb_stat']:.2f}, p={lb_test.loc[10, 'lb_pvalue']:.4f}
            Lag 20:  Stat={lb_test.loc[20, 'lb_stat']:.2f}, p={lb_test.loc[20, 'lb_pvalue']:.4f}
            Lag 30:  Stat={lb_test.loc[30, 'lb_stat']:.2f}, p={lb_test.loc[30, 'lb_pvalue']:.4f}
            
            NORMALITY TESTS:
            Shapiro-Wilk:  Stat={shapiro_stat:.4f}, p={shapiro_p:.4e}
            
            INTERPRETATION:
            - Low p-value (<0.05) in Ljung-Box indicates autocorrelation
            - Low p-value (<0.05) in Shapiro-Wilk indicates non-normality
            """
            
            axes[1, 0].text(0.1, 0.95, test_results, fontsize=9, fontfamily='monospace',
                           verticalalignment='top', transform=axes[1, 0].transAxes)
            
        except Exception as e:
            axes[1, 0].text(0.1, 0.5, f"Statistical tests could not be performed\nError: {str(e)}",
                           fontsize=10, transform=axes[1, 0].transAxes)
        
        # Daily pattern analysis
        axes[1, 1].clear()
        hour_returns = year_data.groupby(year_data.index.hour)['Minute_Return'].mean()
        axes[1, 1].plot(hour_returns.index, hour_returns.values * 10000, 
                       marker='o', linewidth=2, markersize=6, color='darkblue')
        axes[1, 1].axhline(y=0, color='gray', linestyle='--', alpha=0.5)
        axes[1, 1].set_title(f'{year} Average Returns by Hour', fontweight='bold')
        axes[1, 1].set_xlabel('Hour of Day')
        axes[1, 1].set_ylabel('Return (bps)')
        axes[1, 1].grid(True, alpha=0.2)
        
        plt.tight_layout()
        pdf_document.savefig()
        plt.close()
        
        # 4. Advanced Analysis
        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        
        # Cumulative returns
        cumulative_returns = (1 + returns_clean).cumprod()
        axes[0, 0].plot(cumulative_returns.index, cumulative_returns.values, color='darkgreen')
        axes[0, 0].set_title(f'{year} Cumulative Returns', fontweight='bold')
        axes[0, 0].set_xlabel('Date')
        axes[0, 0].set_ylabel('Cumulative Return')
        axes[0, 0].grid(True, alpha=0.2)
        
        # Volume analysis (if available)
        if 'volume' in year_data.columns and year_data['volume'].sum() > 0:
            daily_volume = year_data['volume'].resample('D').sum()
            axes[0, 1].plot(daily_volume.index, daily_volume.values, color='purple')
            axes[0, 1].set_title(f'{year} Daily Trading Volume', fontweight='bold')
            axes[0, 1].set_xlabel('Date')
            axes[0, 1].set_ylabel('Volume')
            axes[0, 1].grid(True, alpha=0.2)
        else:
            axes[0, 1].text(0.5, 0.5, 'Volume data not available', 
                           fontsize=12, ha='center', transform=axes[0, 1].transAxes)
            axes[0, 1].set_title(f'{year} Volume Analysis', fontweight='bold')
        
        # Return outliers
        return_outliers = returns_clean[np.abs(returns_clean) > returns_clean.std() * 3]
        axes[1, 0].hist(returns_clean * 100, bins=60, alpha=0.6, color='lightblue', label='All Returns')
        axes[1, 0].hist(return_outliers * 100, bins=20, alpha=0.8, color='red', label='Outliers (>3σ)')
        axes[1, 0].set_title(f'{year} Return Outliers Analysis', fontweight='bold')
        axes[1, 0].set_xlabel('Return (%)')
        axes[1, 0].set_ylabel('Frequency')
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.2)
        
        # Seasonality analysis
        monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()
        monthly_df = pd.DataFrame({
            'Year': monthly_returns.index.year,
            'Month': monthly_returns.index.month,
            'Return': monthly_returns.values
        })
        
        if not monthly_df.empty:
            monthly_pivot = monthly_df.pivot(index='Month', columns='Year', values='Return')
            im = axes[1, 1].imshow(monthly_pivot * 100, cmap='RdYlGn', aspect='auto')
            axes[1, 1].set_title(f'{year} Monthly Returns Heatmap', fontweight='bold')
            axes[1, 1].set_xlabel('Year')
            axes[1, 1].set_ylabel('Month')
            plt.colorbar(im, ax=axes[1, 1], label='Return (%)')
        else:
            axes[1, 1].text(0.5, 0.5, 'Insufficient data for heatmap', 
                           fontsize=12, ha='center', transform=axes[1, 1].transAxes)
        
        plt.tight_layout()
        pdf_document.savefig()
        plt.close()
        
        print(f"  Analysis completed for {year}")
    
    # Close PDF document
    pdf_document.close()
    
    print(f"\n{'=' * 70}")
    print(f"REPORT GENERATION COMPLETE")
    print(f"Report saved as: {output_file}")
    print(f"{'=' * 70}")
    
    return True

# Execute analysis
if __name__ == "__main__":
    # Configuration
    DATA_FILE = 'NIFTY 50_minute_data.csv'
    ANALYSIS_YEARS = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
    OUTPUT_FILE = 'nifty_annual_analysis_report.pdf'
    
    # Generate report
    success = generate_annual_analysis_report(DATA_FILE, ANALYSIS_YEARS, OUTPUT_FILE)
    
    if success:
        print("\nAnalysis Summary:")
        print("-" * 40)
        print(f"• Analyzed {len(ANALYSIS_YEARS)} years of market data")
        print(f"• Generated comprehensive PDF report")
        print(f"• Each year analysis includes:")
        print("  - Price and return distributions")
        print("  - Statistical tests and QQ plots")
        print("  - Time series analysis")
        print("  - Volatility patterns")
        print("  - Monthly performance heatmaps")
        print("\nReport ready for review!")

ANNUAL MARKET DATA ANALYSIS REPORT GENERATOR

Loading market data...
Data loaded: 932334 records
Date range: 2015-01-09 09:15:00 to 2025-02-07 15:29:00

Years to analyze: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]

Analyzing 2015...
  Records: 89,906


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2015

Analyzing 2016...
  Records: 92,250


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2016

Analyzing 2017...
  Records: 92,586


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2017

Analyzing 2018...
  Records: 91,875


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2018

Analyzing 2019...
  Records: 91,481


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2019

Analyzing 2020...
  Records: 94,120


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2020

Analyzing 2021...
  Records: 92,303


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2021

Analyzing 2022...
  Records: 92,622


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2022

Analyzing 2023...
  Records: 91,862


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2023

Analyzing 2024...
  Records: 92,454


  axes[1, 1].boxplot(monthly_data, labels=list(range(1,13)), showfliers=False)
  monthly_returns = cumulative_returns.resample('M').last().pct_change().dropna()


  Analysis completed for 2024

REPORT GENERATION COMPLETE
Report saved as: nifty_annual_analysis_report.pdf

Analysis Summary:
----------------------------------------
• Analyzed 10 years of market data
• Generated comprehensive PDF report
• Each year analysis includes:
  - Price and return distributions
  - Statistical tests and QQ plots
  - Time series analysis
  - Volatility patterns
  - Monthly performance heatmaps

Report ready for review!
