In [2]:
#!/usr/bin/env python3
"""
Statistical Analysis Helper for AI P0 Counting Research
Null Hypothesis Testing & Standard Deviation Calculations
"""

import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import ttest_1samp, chi2_contingency, normaltest
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple

class StatisticalAnalysisHelper:
    """
    Statistical analysis for AI counting accuracy research.

    Holds a mapping of AI system name -> reported count together with one
    reference ("true") count, and offers descriptive statistics, hypothesis
    tests, effect sizes, Excel verification formulas and an Excel export.
    Every analysis method prints a report and also returns the underlying
    data; computed results are cached in ``self.results`` under the keys
    'descriptive', 'hypothesis_tests' and 'effect_sizes'.
    """

    def __init__(self):
        self.ai_counts = {}
        self.true_count = None
        self.results = {}

    def set_data(self, ai_systems_counts: Dict[str, int], true_count: int):
        """
        Load the data to analyse and echo it to stdout.

        Args:
            ai_systems_counts: Mapping of system name to the count it reported.
            true_count: Reference count the AI counts are compared against.
        """
        self.ai_counts = ai_systems_counts
        self.true_count = true_count

        print("📊 DATA LOADED")
        print("=" * 40)
        print(f"True Count (Deterministic): {true_count}")
        for system, count in ai_systems_counts.items():
            print(f"{system}: {count}")

    def _require_data(self) -> None:
        """
        Ensure usable data has been loaded before computing anything.

        Raises:
            ValueError: If no counts / no true count were set, or if the true
                count is zero (percent errors divide by it).
        """
        if self.true_count is None or not self.ai_counts:
            raise ValueError(
                "No data loaded: call set_data() with at least one AI count first"
            )
        if self.true_count == 0:
            raise ValueError(
                "true_count must be non-zero: percent errors are undefined otherwise"
            )

    def _percent_error(self, count) -> float:
        """Return the absolute error of *count* as a percentage of the true count."""
        return (abs(count - self.true_count) / self.true_count) * 100

    def formulate_null_hypothesis(self) -> Dict[str, str]:
        """
        Print and return the null/alternative hypothesis statements.

        Returns:
            Dict keyed '<topic>_null' / '<topic>_alternative' for the topics
            accuracy, consistency, equivalence and practical.
        """
        hypotheses = {
            'accuracy_null': "H₀: AI systems can accurately count P0 failures in documents (μ_error = 0)",
            'accuracy_alternative': "H₁: AI systems cannot accurately count P0 failures in documents (μ_error ≠ 0)",

            'consistency_null': "H₀: AI systems produce consistent counts across different approaches (σ² = 0)",
            'consistency_alternative': "H₁: AI systems produce inconsistent counts across different approaches (σ² > 0)",

            'equivalence_null': "H₀: AI counting performance is equivalent to deterministic methods (|μ_AI - μ_true| ≤ δ)",
            'equivalence_alternative': "H₁: AI counting performance is not equivalent to deterministic methods (|μ_AI - μ_true| > δ)",

            'practical_null': "H₀: AI counting errors are within acceptable tolerance for research/clinical use (error ≤ 5%)",
            'practical_alternative': "H₁: AI counting errors exceed acceptable tolerance for research/clinical use (error > 5%)"
        }

        print("\n🔬 NULL HYPOTHESES FORMULATED")
        print("=" * 50)
        for key, hypothesis in hypotheses.items():
            print(f"\n{key.upper()}:")
            print(f"  {hypothesis}")

        return hypotheses

    def calculate_descriptive_statistics(self) -> Dict:
        """
        Compute, store and print descriptive statistics.

        Statistics are produced for the raw AI counts, the absolute errors
        and the percent errors. Standard deviation and variance use the
        sample estimator (ddof=1), so they are NaN for a single system.

        Returns:
            Dict with sub-dicts 'ai_counts', 'absolute_errors' and
            'percent_errors', plus 'true_count' and 'n_systems'. The same
            dict is stored in ``self.results['descriptive']``.

        Raises:
            ValueError: If no data is loaded or the true count is zero.
        """
        self._require_data()

        ai_values = list(self.ai_counts.values())
        errors = [abs(count - self.true_count) for count in ai_values]
        percent_errors = [self._percent_error(count) for count in ai_values]

        stats_dict = {
            'ai_counts': {
                'mean': np.mean(ai_values),
                'median': np.median(ai_values),
                'std_dev': np.std(ai_values, ddof=1),  # Sample standard deviation
                'variance': np.var(ai_values, ddof=1),
                'min': np.min(ai_values),
                'max': np.max(ai_values),
                'range': np.max(ai_values) - np.min(ai_values)
            },
            'absolute_errors': {
                'mean': np.mean(errors),
                'median': np.median(errors),
                'std_dev': np.std(errors, ddof=1),
                'min': np.min(errors),
                'max': np.max(errors)
            },
            'percent_errors': {
                'mean': np.mean(percent_errors),
                'median': np.median(percent_errors),
                'std_dev': np.std(percent_errors, ddof=1),
                'min': np.min(percent_errors),
                'max': np.max(percent_errors)
            },
            'true_count': self.true_count,
            'n_systems': len(ai_values)
        }

        self.results['descriptive'] = stats_dict

        print("\n📈 DESCRIPTIVE STATISTICS")
        print("=" * 40)
        print(f"True Count: {self.true_count}")
        print(f"AI Systems (n={len(ai_values)}): {ai_values}")
        print()
        print("AI COUNTS:")
        print(f"  Mean: {stats_dict['ai_counts']['mean']:.2f}")
        print(f"  Std Dev: {stats_dict['ai_counts']['std_dev']:.2f}")
        print(f"  Range: {stats_dict['ai_counts']['min']} - {stats_dict['ai_counts']['max']}")
        print()
        print("ABSOLUTE ERRORS:")
        print(f"  Mean: {stats_dict['absolute_errors']['mean']:.2f}")
        print(f"  Std Dev: {stats_dict['absolute_errors']['std_dev']:.2f}")
        print()
        print("PERCENT ERRORS:")
        print(f"  Mean: {stats_dict['percent_errors']['mean']:.1f}%")
        print(f"  Std Dev: {stats_dict['percent_errors']['std_dev']:.1f}%")

        return stats_dict

    def perform_hypothesis_tests(self, alpha: float = 0.05) -> Dict:
        """
        Run, store and print the hypothesis tests.

        Tests performed:
          * 'accuracy_test'  - one-sample t-test of the signed errors against
            0 (only with more than one system).
          * 'tolerance_test' - how many systems exceed 5% percent error.
          * 'normality_test' - D'Agostino test on the signed errors (only
            with at least 8 systems).

        Args:
            alpha: Significance level used for the reject / fail-to-reject
                decisions and shown in the printed report.

        Returns:
            Dict of test name -> result record; also stored in
            ``self.results['hypothesis_tests']``.

        Raises:
            ValueError: If no data is loaded or the true count is zero.
        """
        self._require_data()

        ai_values = list(self.ai_counts.values())
        errors = [count - self.true_count for count in ai_values]  # Signed errors
        percent_errors = [self._percent_error(count) for count in ai_values]

        tests = {}

        # Test 1: One-sample t-test for accuracy (H₀: μ_error = 0)
        if len(errors) > 1:
            t_stat, p_value = ttest_1samp(errors, 0)
            tests['accuracy_test'] = {
                'test': 'One-sample t-test',
                'null_hypothesis': 'AI error mean = 0 (accurate)',
                't_statistic': t_stat,
                'p_value': p_value,
                'significant': p_value < alpha,
                'conclusion': 'Reject H₀ (AI is inaccurate)' if p_value < alpha else 'Fail to reject H₀'
            }

        # Test 2: Test if errors exceed practical tolerance (5%)
        tolerance_violations = sum(1 for error in percent_errors if error > 5)
        tests['tolerance_test'] = {
            'test': 'Practical tolerance assessment',
            'null_hypothesis': 'AI errors ≤ 5% (acceptable)',
            'violations': tolerance_violations,
            'total_systems': len(percent_errors),
            'violation_rate': (tolerance_violations / len(percent_errors)) * 100,
            'conclusion': f"{tolerance_violations}/{len(percent_errors)} systems exceed 5% tolerance"
        }

        # Test 3: Normality test for errors
        if len(errors) >= 8:  # Need sufficient sample size
            normality_stat, normality_p = normaltest(errors)
            tests['normality_test'] = {
                'test': 'D\'Agostino normality test',
                # The report loop below prints H₀ for every test, so this
                # record needs the key just like the other two.
                'null_hypothesis': 'Errors are normally distributed',
                'statistic': normality_stat,
                'p_value': normality_p,
                'normal': normality_p > alpha,
                'conclusion': 'Errors are normally distributed' if normality_p > alpha else 'Errors are not normally distributed'
            }

        self.results['hypothesis_tests'] = tests

        print("\n🧪 HYPOTHESIS TESTS")
        print("=" * 40)

        for test_name, test_data in tests.items():
            print(f"\n{test_name.upper()}:")
            print(f"  Test: {test_data['test']}")
            print(f"  H₀: {test_data.get('null_hypothesis', 'N/A')}")

            if 'p_value' in test_data:
                print(f"  p-value: {test_data['p_value']:.6f}")
                # Show the alpha actually used instead of a fixed 0.05.
                print(f"  Significant (α={alpha}): {test_data.get('significant', 'N/A')}")

            print(f"  Conclusion: {test_data['conclusion']}")

        return tests

    def calculate_effect_sizes(self) -> Dict:
        """
        Compute, store and print effect-size style metrics.

        Metrics:
          * 'cohens_d' - mean signed error divided by its sample standard
            deviation (only with more than one system). When the errors have
            zero spread the ratio is defined here as 0.0 for a zero mean and
            +/-inf otherwise, rather than NaN.
          * 'coefficient_of_variation' - sample std / mean of the AI counts,
            in percent (skipped when the mean is 0).
          * 'mape' - mean absolute percentage error.

        Returns:
            Dict of metric name -> {'value', 'interpretation'}; also stored
            in ``self.results['effect_sizes']``.

        Raises:
            ValueError: If no data is loaded or the true count is zero.
        """
        self._require_data()

        ai_values = list(self.ai_counts.values())
        errors = [count - self.true_count for count in ai_values]

        effect_sizes = {}

        # Cohen's d for error magnitude
        if len(errors) > 1:
            mean_error = np.mean(errors)
            error_spread = np.std(errors, ddof=1)
            if error_spread == 0:
                # Avoid 0/0 -> NaN, which would be mislabelled "Large effect".
                if mean_error == 0:
                    cohens_d = 0.0
                else:
                    cohens_d = float('inf') if mean_error > 0 else float('-inf')
            else:
                cohens_d = mean_error / error_spread
            effect_sizes['cohens_d'] = {
                'value': cohens_d,
                'interpretation': self._interpret_cohens_d(abs(cohens_d))
            }

        # Coefficient of variation for consistency
        if np.mean(ai_values) != 0:
            cv = (np.std(ai_values, ddof=1) / np.mean(ai_values)) * 100
            effect_sizes['coefficient_of_variation'] = {
                'value': cv,
                'interpretation': 'High variability' if cv > 20 else 'Moderate variability' if cv > 10 else 'Low variability'
            }

        # Mean Absolute Percentage Error (MAPE)
        mape = np.mean([self._percent_error(count) for count in ai_values])
        effect_sizes['mape'] = {
            'value': mape,
            'interpretation': 'Poor accuracy' if mape > 20 else 'Fair accuracy' if mape > 10 else 'Good accuracy' if mape > 5 else 'Excellent accuracy'
        }

        # Cache like the other analysis methods do.
        self.results['effect_sizes'] = effect_sizes

        print("\n📏 EFFECT SIZES")
        print("=" * 30)
        for metric, data in effect_sizes.items():
            print(f"{metric}: {data['value']:.3f} ({data['interpretation']})")

        return effect_sizes

    def _interpret_cohens_d(self, d: float) -> str:
        """Map a non-negative Cohen's d to a label (cut-offs 0.2 / 0.5 / 0.8)."""
        if d < 0.2:
            return "Negligible effect"
        elif d < 0.5:
            return "Small effect"
        elif d < 0.8:
            return "Medium effect"
        else:
            return "Large effect"

    def generate_excel_formulas(self) -> Dict[str, str]:
        """
        Print and return Excel formulas for cross-checking the statistics.

        Cell references follow the column layout of the sheets written by
        ``export_to_excel``: header in row 1, counts in column B, the true
        count in column C and signed errors in column D, one row per system
        starting at row 2. The row range covers exactly the loaded systems;
        when no data is loaded a 4-row range (rows 2-5) is used.

        Returns:
            Dict of statistic name -> Excel formula string.
        """
        n_systems = len(self.ai_counts)
        last_row = n_systems + 1 if n_systems else 5
        counts = f"B2:B{last_row}"
        errors = f"D2:D{last_row}"
        truth = "$C$2"  # first data cell of the True_Count column

        formulas = {
            'standard_deviation': f'=STDEV.S({counts})',
            'mean': f'=AVERAGE({counts})',
            'median': f'=MEDIAN({counts})',
            'variance': f'=VAR.S({counts})',
            'range': f'=MAX({counts})-MIN({counts})',
            'mean_absolute_error': f'=AVERAGE(ABS({counts}-{truth}))',
            'mean_percent_error': f'=AVERAGE(ABS(({counts}-{truth})/{truth})*100)',
            'coefficient_of_variation': f'=(STDEV.S({counts})/AVERAGE({counts}))*100',
            # Excel has no one-sample T.TEST (it needs two equal-length
            # arrays), so compute the two-tailed p-value from the t statistic.
            't_test_one_sample': (
                f'=T.DIST.2T(ABS(AVERAGE({errors})/(STDEV.S({errors})/SQRT(COUNT({errors})))),'
                f'COUNT({errors})-1)'
            ),
        }

        print("\n📋 EXCEL FORMULAS FOR VERIFICATION")
        print("=" * 50)
        print("Copy these formulas into Excel to double-check calculations:")
        print()
        for name, formula in formulas.items():
            print(f"{name}: {formula}")

        return formulas

    def create_summary_table(self) -> pd.DataFrame:
        """
        Build the per-system summary table.

        Returns:
            DataFrame with one row per AI system (count, true count, signed /
            absolute / percent error, 5% tolerance flag) followed by a final
            'SUMMARY' row holding the column means and the within-tolerance
            tally.

        Raises:
            ValueError: If no data is loaded or the true count is zero.
        """
        self._require_data()

        ai_values = list(self.ai_counts.values())

        summary_data = []
        for name, count in self.ai_counts.items():
            error = count - self.true_count
            abs_error = abs(error)
            percent_error = self._percent_error(count)

            summary_data.append({
                'AI_System': name,
                'Count': count,
                'True_Count': self.true_count,
                'Error': error,
                'Absolute_Error': abs_error,
                'Percent_Error': percent_error,
                'Within_5pct_Tolerance': 'Yes' if percent_error <= 5 else 'No'
            })

        df = pd.DataFrame(summary_data)

        # Add summary statistics row
        summary_row = {
            'AI_System': 'SUMMARY',
            'Count': np.mean(ai_values),
            'True_Count': self.true_count,
            'Error': np.mean([row['Error'] for row in summary_data]),
            'Absolute_Error': np.mean([row['Absolute_Error'] for row in summary_data]),
            'Percent_Error': np.mean([row['Percent_Error'] for row in summary_data]),
            'Within_5pct_Tolerance': f"{sum(1 for row in summary_data if row['Within_5pct_Tolerance'] == 'Yes')}/{len(summary_data)}"
        }

        df = pd.concat([df, pd.DataFrame([summary_row])], ignore_index=True)

        return df

    def export_to_excel(self, filename: str = 'ai_counting_analysis.xlsx'):
        """
        Write the analysis to an Excel workbook.

        Sheets written: 'Summary' (see ``create_summary_table``),
        'Descriptive_Stats' (only if ``calculate_descriptive_statistics`` has
        been run) and 'Raw_Data' (one row per system, no summary row).

        Args:
            filename: Path of the workbook to create.

        Returns:
            The *filename* that was written.

        Raises:
            ValueError: If no data is loaded or the true count is zero.
        """
        # Validate before opening the writer so a failure leaves no file behind.
        self._require_data()

        with pd.ExcelWriter(filename, engine='openpyxl') as writer:
            # Summary table
            summary_df = self.create_summary_table()
            summary_df.to_excel(writer, sheet_name='Summary', index=False)

            # Descriptive statistics
            if 'descriptive' in self.results:
                desc_data = []
                for category, category_stats in self.results['descriptive'].items():
                    # Scalar entries (true_count, n_systems) are skipped;
                    # only the nested per-category dicts are flattened.
                    if isinstance(category_stats, dict):
                        for stat_name, value in category_stats.items():
                            desc_data.append({
                                'Category': category,
                                'Statistic': stat_name,
                                'Value': value
                            })

                pd.DataFrame(desc_data).to_excel(writer, sheet_name='Descriptive_Stats', index=False)

            # Raw data for Excel calculations
            raw_data = {
                'AI_System': list(self.ai_counts.keys()),
                'Count': list(self.ai_counts.values()),
                'True_Count': [self.true_count] * len(self.ai_counts),
                'Error': [count - self.true_count for count in self.ai_counts.values()],
                'Absolute_Error': [abs(count - self.true_count) for count in self.ai_counts.values()],
                'Percent_Error': [self._percent_error(count) for count in self.ai_counts.values()]
            }

            pd.DataFrame(raw_data).to_excel(writer, sheet_name='Raw_Data', index=False)

        print(f"\n💾 Analysis exported to Excel: {filename}")
        return filename

def main():
    """Run the whole analysis pipeline once on built-in sample data."""

    print("📊 STATISTICAL ANALYSIS HELPER")
    print("=" * 50)

    # Sample data - replace with your actual values
    # (an optional fourth entry, 'Claude_Workup': 200, is left out here)
    sample_counts = dict(
        ChatGPT_Scan=78,
        Claude_Addition=91,
        Claude_Fabrication=115,
    )
    reference_count = 290  # Replace with your deterministic count

    helper = StatisticalAnalysisHelper()
    helper.set_data(sample_counts, reference_count)

    # Each step prints its own report; the return values are not needed here.
    pipeline = (
        helper.formulate_null_hypothesis,
        helper.calculate_descriptive_statistics,
        helper.perform_hypothesis_tests,
        helper.calculate_effect_sizes,
        helper.generate_excel_formulas,
        helper.export_to_excel,
    )
    for step in pipeline:
        step()

    closing_lines = (
        "\n✅ ANALYSIS COMPLETE",
        "📋 Use the Excel file to verify calculations",
        "🧪 Statistical evidence for your research paper ready",
    )
    for line in closing_lines:
        print(line)

# Run the sample analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

📊 STATISTICAL ANALYSIS HELPER
📊 DATA LOADED
True Count (Deterministic): 290
ChatGPT_Scan: 78
Claude_Addition: 91
Claude_Fabrication: 115

🔬 NULL HYPOTHESES FORMULATED

ACCURACY_NULL:
  H₀: AI systems can accurately count P0 failures in documents (μ_error = 0)

ACCURACY_ALTERNATIVE:
  H₁: AI systems cannot accurately count P0 failures in documents (μ_error ≠ 0)

CONSISTENCY_NULL:
  H₀: AI systems produce consistent counts across different approaches (σ² = 0)

CONSISTENCY_ALTERNATIVE:
  H₁: AI systems produce inconsistent counts across different approaches (σ² > 0)

EQUIVALENCE_NULL:
  H₀: AI counting performance is equivalent to deterministic methods (|μ_AI - μ_true| ≤ δ)

EQUIVALENCE_ALTERNATIVE:
  H₁: AI counting performance is not equivalent to deterministic methods (|μ_AI - μ_true| > δ)

PRACTICAL_NULL:
  H₀: AI counting errors are within acceptable tolerance for research/clinical use (error ≤ 5%)

PRACTICAL_ALTERNATIVE:
  H₁: AI counting errors exceed acceptable tolerance for resea