# Case Study 1: Debug Analysis
## Iceland vs Eurozone - Data Validation & Testing

**Purpose:** Internal debugging and validation of analysis methodology

In [11]:
# Setup with debugging capabilities
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Debug switch: set to False to silence the [DEBUG] trace lines
DEBUG = True


def debug_print(message):
    """Print ``message`` prefixed with ``[DEBUG]`` when the DEBUG flag is truthy."""
    if not DEBUG:
        return
    print(f"[DEBUG] {message}")

def validate_data(df, name):
    """Print a short health report for ``df`` and hand the frame back unchanged.

    The report lists the frame's shape, the total number of null cells and
    the number of fully duplicated rows.

    Args:
        df: pandas DataFrame to inspect.
        name: label shown in the report header.

    Returns:
        The same ``df`` object, so the call can be used inline in an assignment.
    """
    null_cells = df.isnull().sum().sum()
    duplicate_rows = df.duplicated().sum()
    report_lines = (
        f"\n=== {name} VALIDATION ===",
        f"Shape: {df.shape}",
        f"Missing values: {null_cells}",
        f"Duplicates: {duplicate_rows}",
    )
    for line in report_lines:
        print(line)
    return df


print("Debug environment ready")

Debug environment ready


## 1. Data Loading with Validation

In [12]:
# Read the BOP extract and pass it straight through the validation report
debug_print("Loading BOP data...")
case_one_raw = validate_data(
    pd.read_csv("../../data/case_study_1_data_july_24_2025.csv"),
    "BOP Data",
)

# Coverage of the key dimensions: countries, indicators, time periods
country_names = sorted(case_one_raw['COUNTRY'].unique())
indicator_count = len(case_one_raw['INDICATOR'].unique())
first_period = case_one_raw['TIME_PERIOD'].min()
last_period = case_one_raw['TIME_PERIOD'].max()

print(f"\nCountries: {country_names}")
print(f"Indicators: {indicator_count}")
print(f"Time range: {first_period} to {last_period}")

[DEBUG] Loading BOP data...

=== BOP Data VALIDATION ===
Shape: (15248, 8)
Missing values: 2808
Duplicates: 0

Countries: ['Austria', 'Belgium', 'Finland', 'France', 'Germany', 'Iceland', 'Ireland', 'Italy', 'Luxembourg', 'Netherlands, The', 'Portugal', 'Spain']
Indicators: 6
Time range: 1999-Q1 to 2025-Q1


In [13]:
# Read the GDP extract and pass it straight through the validation report
debug_print("Loading GDP data...")
gdp_raw = validate_data(
    pd.read_csv("../../data/dataset_2025-07-24T18_28_31.898465539Z_DEFAULT_INTEGRATION_IMF.RES_WEO_6.0.0.csv"),
    "GDP Data",
)

# Compare the country sets of the two sources
bop_countries = set(case_one_raw['COUNTRY'].unique())
gdp_countries = set(gdp_raw['COUNTRY'].unique())
common = bop_countries & gdp_countries

print(f"\nCountry overlap: {len(common)} countries")
print(f"Common: {sorted(common)}")

# Countries present in the BOP file but absent from the GDP file
bop_without_gdp = bop_countries - gdp_countries
if bop_without_gdp:
    print(f"BOP only: {bop_without_gdp}")

[DEBUG] Loading GDP data...

=== GDP Data VALIDATION ===
Shape: (5571, 7)
Missing values: 27
Duplicates: 0

Country overlap: 12 countries
Common: ['Austria', 'Belgium', 'Finland', 'France', 'Germany', 'Iceland', 'Ireland', 'Italy', 'Luxembourg', 'Netherlands, The', 'Portugal', 'Spain']


## 2. Data Processing Pipeline

In [14]:
# Replicate main analysis pipeline
debug_print("Processing BOP data...")

# Build the combined indicator label: text before the first comma of the
# accounting entry, followed by ' - ' and the indicator name
case_one_clean = case_one_raw.assign(
    ENTRY_FIRST_WORD=lambda frame: frame['BOP_ACCOUNTING_ENTRY'].str.extract(r'^([^,]+)', expand=False),
    FULL_INDICATOR=lambda frame: frame['ENTRY_FIRST_WORD'] + ' - ' + frame['INDICATOR'],
)

# Split TIME_PERIOD on '-' into an integer YEAR and an integer QUARTER
# (QUARTER keeps only the digits of the second part), then drop helper columns
columns_to_drop = ['BOP_ACCOUNTING_ENTRY', 'INDICATOR', 'ENTRY_FIRST_WORD', 'FREQUENCY', 'SCALE']
period_parts = case_one_clean['TIME_PERIOD'].str.split('-', expand=True)
case_one_clean = (
    case_one_clean
    .drop(columns=columns_to_drop)
    .assign(
        YEAR=period_parts[0].astype(int),
        QUARTER=period_parts[1].str.extract(r'(\d+)', expand=False).astype(int),
    )
    .drop(columns='TIME_PERIOD')
)

full_indicator_count = len(case_one_clean['FULL_INDICATOR'].unique())
first_year = case_one_clean['YEAR'].min()
last_year = case_one_clean['YEAR'].max()

print(f"Cleaned BOP shape: {case_one_clean.shape}")
print(f"Indicators: {full_indicator_count}")
print(f"Year range: {first_year}-{last_year}")

[DEBUG] Processing BOP data...
Cleaned BOP shape: (15248, 6)
Indicators: 13
Year range: 1999-2025


In [15]:
# Process and join data
debug_print("Pivoting and joining data...")

# Both sources are reshaped the same way: one column per indicator,
# keeping the first observation found for each index/indicator pair
pivot_options = {'values': 'OBS_VALUE', 'aggfunc': 'first'}

# BOP: one row per country / year / quarter / unit
bop_wide = case_one_clean.pivot_table(
    index=['COUNTRY', 'YEAR', 'QUARTER', 'UNIT'],
    columns='FULL_INDICATOR',
    **pivot_options,
)
bop_pivoted = bop_wide.reset_index()

# GDP: one row per country / time period
gdp_clean = gdp_raw[['COUNTRY', 'TIME_PERIOD', 'INDICATOR', 'OBS_VALUE']].copy()
gdp_wide = gdp_clean.pivot_table(
    index=['COUNTRY', 'TIME_PERIOD'],
    columns='INDICATOR',
    **pivot_options,
)
gdp_pivoted = gdp_wide.reset_index()

# Left join keeps every BOP row; the GDP row is matched on country and year
merged_data = bop_pivoted.merge(
    gdp_pivoted,
    how='left',
    left_on=['COUNTRY', 'YEAR'],
    right_on=['COUNTRY', 'TIME_PERIOD'],
)
merged_data = merged_data.drop(columns='TIME_PERIOD', errors='ignore')

gdp_column_present = 'Gross domestic product (GDP), Current prices, US dollar' in merged_data.columns
print(f"Merged data shape: {merged_data.shape}")
print(f"GDP column available: {gdp_column_present}")

[DEBUG] Pivoting and joining data...
Merged data shape: (1186, 18)
GDP column available: True


## 3. Normalization and Grouping

In [16]:
# Normalize to % of GDP
debug_print("Normalizing to % of GDP...")

metadata_cols = ['COUNTRY', 'YEAR', 'QUARTER', 'UNIT']
gdp_col = 'Gross domestic product (GDP), Current prices, US dollar'
passthrough_cols = metadata_cols + [gdp_col]
bop_cols = [name for name in merged_data.columns if name not in passthrough_cols]

# Each BOP column becomes "<name>_PGDP": value * 4, divided by GDP, times 100
normalized_data = merged_data[passthrough_cols].assign(
    **{
        f"{name}_PGDP": (merged_data[name] * 4 / merged_data[gdp_col]) * 100
        for name in bop_cols
    }
)
normalized_data['UNIT'] = "% of GDP (annualized)"

print(f"Normalized data shape: {normalized_data.shape}")
print(f"BOP indicators normalized: {len(bop_cols)}")

# Check for normalization issues: a zero or missing denominator
gdp_zeros = merged_data[gdp_col].eq(0).sum()
gdp_nulls = merged_data[gdp_col].isnull().sum()
print(f"GDP zero values: {gdp_zeros}")
print(f"GDP null values: {gdp_nulls}")

[DEBUG] Normalizing to % of GDP...
Normalized data shape: (1186, 18)
BOP indicators normalized: 13
GDP zero values: 0
GDP null values: 0


In [17]:
# Create country groups
debug_print("Creating country groups...")

# Two groups only: Iceland on its own, every other country labelled 'Eurozone'
normalized_data['GROUP'] = [
    'Iceland' if country == 'Iceland' else 'Eurozone'
    for country in normalized_data['COUNTRY']
]

# Remove Luxembourg
keep_rows = normalized_data['COUNTRY'] != 'Luxembourg'
final_data = normalized_data[keep_rows].copy()

print(f"Final data shape: {final_data.shape}")
print(f"Group distribution:")
print(final_data['GROUP'].value_counts())

# Show countries by group
for group in ['Iceland', 'Eurozone']:
    members = final_data.loc[final_data['GROUP'] == group, 'COUNTRY']
    countries = sorted(members.unique())
    print(f"{group}: {countries}")

# The indicators analysed downstream are the normalized '_PGDP' columns
analysis_indicators = [name for name in final_data.columns if name.endswith('_PGDP')]
print(f"\nAnalysis indicators: {len(analysis_indicators)}")

[DEBUG] Creating country groups...
Final data shape: (1093, 19)
Group distribution:
GROUP
Eurozone    988
Iceland     105
Name: count, dtype: int64
Iceland: ['Iceland']
Eurozone: ['Austria', 'Belgium', 'Finland', 'France', 'Germany', 'Ireland', 'Italy', 'Netherlands, The', 'Portugal', 'Spain']

Analysis indicators: 13


## 4. Statistical Testing and Validation

In [18]:
# Perform statistical tests with validation
# Trace marker for the start of the variance tests; debug_print only prints when DEBUG is truthy.
debug_print("Running statistical tests...")

def test_volatility_with_validation(data, indicator):
    """Test volatility with comprehensive validation"""
    iceland_data = data[data['GROUP'] == 'Iceland'][indicator].dropna()
    eurozone_data = data[data['GROUP'] == 'Eurozone'][indicator].dropna()
    
    if len(iceland_data) < 2 or len(eurozone_data) < 2:
        return {'error': 'Insufficient data'}
    
    # Basic statistics
    iceland_stats = {
        'n': len(iceland_data),
        'mean': iceland_data.mean(),
        'std': iceland_data.std(),
        'var': iceland_data.var()
    }
    
    eurozone_stats = {
        'n': len(eurozone_data),
        'mean': eurozone_data.mean(),
        'std': eurozone_data.std(),
        'var': eurozone_data.var()
    }
    
    # F-test
    f_stat = iceland_stats['var'] / eurozone_stats['var'] if eurozone_stats['var'] != 0 else np.inf
    df1, df2 = iceland_stats['n'] - 1, eurozone_stats['n'] - 1
    p_value = 2 * min(stats.f.cdf(f_stat, df1, df2), 1 - stats.f.cdf(f_stat, df1, df2))
    
    return {
        'iceland_stats': iceland_stats,
        'eurozone_stats': eurozone_stats,
        'f_statistic': f_stat,
        'p_value': p_value,
        'iceland_more_volatile': iceland_stats['var'] > eurozone_stats['var']
    }

# Run the variance test for every indicator; results carrying an 'error' key are skipped
test_results = []
for indicator in analysis_indicators:
    outcome = test_volatility_with_validation(final_data, indicator)
    if 'error' in outcome:
        continue
    test_results.append({
        'Indicator': indicator.replace('_PGDP', ''),
        'F_Statistic': outcome['f_statistic'],
        'P_Value': outcome['p_value'],
        'Iceland_More_Volatile': outcome['iceland_more_volatile'],
        'Significant': outcome['p_value'] < 0.05
    })

results_df = pd.DataFrame(test_results)
tested_count = len(results_df)

print(f"\nTEST RESULTS SUMMARY:")
print(f"Indicators tested: {tested_count}")
if tested_count > 0:
    iceland_higher = results_df['Iceland_More_Volatile'].sum()
    significant = results_df['Significant'].sum()
    print(f"Iceland higher volatility: {iceland_higher}/{tested_count} ({iceland_higher/tested_count*100:.1f}%)")
    print(f"Statistically significant: {significant}/{tested_count} ({significant/tested_count*100:.1f}%)")

    print(f"\nTop 5 by F-statistic:")
    top_5 = results_df.nlargest(5, 'F_Statistic')
    for row in top_5.itertuples(index=False):
        # Names longer than 30 characters are cut and marked with an ellipsis
        label = row.Indicator
        if len(label) > 30:
            label = label[:30] + '...'
        print(f"  {label:<33} F={row.F_Statistic:6.2f} p={row.P_Value:7.4f}")

[DEBUG] Running statistical tests...

TEST RESULTS SUMMARY:
Indicators tested: 13
Iceland higher volatility: 9/13 (69.2%)
Statistically significant: 13/13 (100.0%)

Top 5 by F-statistic:
  Liabilities - Portfolio invest... F=  7.93 p= 0.0000
  Net (net acquisition of financ... F=  3.81 p= 0.0000
  Net (net acquisition of financ... F=  3.39 p= 0.0000
  Assets - Other investment, Deb... F=  2.82 p= 0.0000
  Liabilities - Other investment... F=  2.70 p= 0.0000


## 5. Data Quality Checks

In [19]:
# Comprehensive data quality assessment for ALL indicators
debug_print("Performing comprehensive data quality assessment...")

print("\nCOMPREHENSIVE DATA QUALITY ASSESSMENT:")
print("=" * 60)

# 1. Missing data analysis for ALL indicators
print("\n1. MISSING DATA ANALYSIS (All Indicators):")
print("-" * 50)

# Group slices and denominators are the same for every indicator, so build them once
iceland_rows = final_data[final_data['GROUP'] == 'Iceland']
eurozone_rows = final_data[final_data['GROUP'] == 'Eurozone']
total_obs = len(final_data)
iceland_total = len(iceland_rows)
eurozone_total = len(eurozone_rows)

missing_data_summary = []
for indicator in analysis_indicators:
    clean_name = indicator.replace('_PGDP', '')

    # Null share over all rows
    total_missing = final_data[indicator].isnull().sum()
    missing_pct = (total_missing / total_obs) * 100

    # Null share within each group (0 when the group has no rows)
    iceland_missing = iceland_rows[indicator].isnull().sum()
    if iceland_total > 0:
        iceland_missing_pct = (iceland_missing / iceland_total) * 100
    else:
        iceland_missing_pct = 0

    eurozone_missing = eurozone_rows[indicator].isnull().sum()
    if eurozone_total > 0:
        eurozone_missing_pct = (eurozone_missing / eurozone_total) * 100
    else:
        eurozone_missing_pct = 0

    missing_data_summary.append({
        'Indicator': clean_name,
        'Total_Missing_Pct': missing_pct,
        'Iceland_Missing_Pct': iceland_missing_pct,
        'Eurozone_Missing_Pct': eurozone_missing_pct,
        'Total_Missing_Count': total_missing
    })

    # Long names are cut to 35 characters for the printed table
    display_name = clean_name if len(clean_name) <= 35 else clean_name[:35] + '...'
    print(f"{display_name:<38} Total: {missing_pct:5.1f}% Iceland: {iceland_missing_pct:5.1f}% Eurozone: {eurozone_missing_pct:5.1f}%")

# Indicators with more than 20% of all rows missing
high_missing = [entry for entry in missing_data_summary if entry['Total_Missing_Pct'] > 20]
if not high_missing:
    print(f"\n✓ No indicators with excessive missing data (>20%)")
else:
    print(f"\n⚠️  HIGH MISSING DATA (>20%): {len(high_missing)} indicators")
    for entry in high_missing:
        print(f"   - {entry['Indicator'][:50]}: {entry['Total_Missing_Pct']:.1f}%")

# 2. Outlier detection for ALL indicators
print("\n\n2. OUTLIER DETECTION (All Indicators - IQR Method):")
print("-" * 55)

outlier_summary = []
for indicator in analysis_indicators:
    clean_name = indicator.replace('_PGDP', '')
    data_values = final_data[indicator].dropna()

    # Nothing to measure for an indicator without any observation
    if len(data_values) == 0:
        continue

    # Quartiles and interquartile range
    Q1 = data_values.quantile(0.25)
    Q3 = data_values.quantile(0.75)
    IQR = Q3 - Q1

    # Regular outliers lie more than 1.5 * IQR outside the quartiles
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    below_lower = data_values < lower_bound
    above_upper = data_values > upper_bound
    outliers = (below_lower | above_upper).sum()
    outlier_pct = (outliers / len(data_values)) * 100

    # Extreme outliers lie more than 3 * IQR outside the quartiles
    extreme_lower = Q1 - 3 * IQR
    extreme_upper = Q3 + 3 * IQR
    below_extreme = data_values < extreme_lower
    above_extreme = data_values > extreme_upper
    extreme_outliers = (below_extreme | above_extreme).sum()

    outlier_summary.append({
        'Indicator': clean_name,
        'Total_Observations': len(data_values),
        'Outliers_Count': outliers,
        'Outliers_Pct': outlier_pct,
        'Extreme_Outliers': extreme_outliers,
        'Min_Value': data_values.min(),
        'Max_Value': data_values.max(),
        'Q1': Q1,
        'Q3': Q3
    })

    display_name = clean_name if len(clean_name) <= 35 else clean_name[:35] + '...'
    print(f"{display_name:<38} Outliers: {outliers:3d} ({outlier_pct:4.1f}%) Extreme: {extreme_outliers:2d}")

# Indicators where more than 15% of observations are outliers
high_outliers = [entry for entry in outlier_summary if entry['Outliers_Pct'] > 15]
if not high_outliers:
    print(f"\n✓ No indicators with excessive outliers (>15%)")
else:
    print(f"\n⚠️  HIGH OUTLIER RATES (>15%): {len(high_outliers)} indicators")
    for entry in high_outliers:
        print(f"   - {entry['Indicator'][:50]}: {entry['Outliers_Pct']:.1f}%")

# 3. Data distribution analysis for ALL indicators
print("\n\n3. DATA DISTRIBUTION ANALYSIS (All Indicators):")
print("-" * 50)


def _shape_stats(values):
    """Mean, std, skewness and kurtosis of one group (skew/kurtosis need more than 2 values)."""
    has_enough = len(values) > 2
    return {
        'mean': values.mean(),
        'std': values.std(),
        'skew': stats.skew(values) if has_enough else np.nan,
        'kurtosis': stats.kurtosis(values) if has_enough else np.nan
    }


def _cv_percent(summary):
    """Coefficient of variation in percent; infinite when the mean is exactly zero."""
    if summary['mean'] == 0:
        return np.inf
    return (summary['std'] / abs(summary['mean'])) * 100


distribution_summary = []
for indicator in analysis_indicators:
    clean_name = indicator.replace('_PGDP', '')

    iceland_data = final_data.loc[final_data['GROUP'] == 'Iceland', indicator].dropna()
    eurozone_data = final_data.loc[final_data['GROUP'] == 'Eurozone', indicator].dropna()

    # Both groups need at least one observation to be compared
    if len(iceland_data) == 0 or len(eurozone_data) == 0:
        continue

    iceland_stats = _shape_stats(iceland_data)
    eurozone_stats = _shape_stats(eurozone_data)

    iceland_cv = _cv_percent(iceland_stats)
    eurozone_cv = _cv_percent(eurozone_stats)
    cv_ratio = iceland_cv / eurozone_cv if eurozone_cv != 0 else np.inf

    distribution_summary.append({
        'Indicator': clean_name,
        'Iceland_CV': iceland_cv,
        'Eurozone_CV': eurozone_cv,
        'CV_Ratio': cv_ratio,
        'Iceland_Skew': iceland_stats['skew'],
        'Eurozone_Skew': eurozone_stats['skew'],
        'Iceland_Kurtosis': iceland_stats['kurtosis'],
        'Eurozone_Kurtosis': eurozone_stats['kurtosis']
    })

    display_name = clean_name if len(clean_name) <= 30 else clean_name[:30] + '...'
    print(f"{display_name:<33} CV: Ice={iceland_cv:6.1f}% Euro={eurozone_cv:6.1f}% Ratio={cv_ratio:5.2f}")

# 4. Sample size adequacy by indicator
print("\n\n4. SAMPLE SIZE ADEQUACY BY INDICATOR:")
print("-" * 45)


def _adequacy_mark(count):
    """Marker for a sample size: >= 30 passes, >= 10 is a warning, below 10 fails."""
    if count >= 30:
        return "✓"
    if count >= 10:
        return "⚠️"
    return "❌"


sample_size_issues = []
for indicator in analysis_indicators:
    clean_name = indicator.replace('_PGDP', '')

    # Non-null observations available per group
    iceland_n = final_data.loc[final_data['GROUP'] == 'Iceland', indicator].notna().sum()
    eurozone_n = final_data.loc[final_data['GROUP'] == 'Eurozone', indicator].notna().sum()

    iceland_adequate = _adequacy_mark(iceland_n)
    eurozone_adequate = _adequacy_mark(eurozone_n)

    # Record the indicator when either group falls below 30 observations
    if not (iceland_n >= 30 and eurozone_n >= 30):
        sample_size_issues.append({
            'Indicator': clean_name,
            'Iceland_N': iceland_n,
            'Eurozone_N': eurozone_n
        })

    # Sample sizes are printed for every indicator, flagged or not
    display_name = clean_name if len(clean_name) <= 30 else clean_name[:30] + '...'
    print(f"{display_name:<33} Iceland: {iceland_n:3d}{iceland_adequate} Eurozone: {eurozone_n:3d}{eurozone_adequate}")

if sample_size_issues:
    print(f"\n⚠️  SAMPLE SIZE CONCERNS: {len(sample_size_issues)} indicators with n<30")
    for entry in sample_size_issues:
        print(f"   - {entry['Indicator'][:45]}: Iceland={entry['Iceland_N']}, Eurozone={entry['Eurozone_N']}")

# 5. Time coverage consistency
print("\n\n5. TIME COVERAGE BY GROUP:")
print("-" * 30)

for group in ['Iceland', 'Eurozone']:
    group_data = final_data[final_data['GROUP'] == group]
    years = group_data['YEAR']
    min_year, max_year = years.min(), years.max()
    year_span = max_year - min_year + 1

    # Fewer distinct years than the span means there are gaps
    unique_years = years.nunique()
    quarters_per_year = group_data.groupby('YEAR')['QUARTER'].nunique().mean()

    total_observations = len(group_data)
    if group == 'Iceland':
        countries = 1
    else:
        # Per-country average only applies to the multi-country group
        countries = group_data['COUNTRY'].nunique()
        obs_per_country = total_observations / countries

    print(f"{group}:")
    print(f"   Time span: {min_year}-{max_year} ({year_span} years)")
    print(f"   Unique years: {unique_years}/{year_span} ({unique_years/year_span*100:.1f}% coverage)")
    print(f"   Avg quarters/year: {quarters_per_year:.1f}")
    if group == 'Eurozone':
        print(f"   Countries: {countries}, Avg obs/country: {obs_per_country:.1f}")
    print(f"   Total observations: {total_observations}\n")

# 6. Extreme value analysis
print("\n6. EXTREME VALUES ANALYSIS:")
print("-" * 35)

# Magnitude (in % of GDP) beyond which an observation is treated as a likely data error
IMPLAUSIBLE_PCT_OF_GDP = 1000

extreme_values = []
for indicator in analysis_indicators:
    clean_name = indicator.replace('_PGDP', '')
    data_values = final_data[indicator].dropna()

    if len(data_values) > 0:
        # Tail counts are reported for context only. Roughly 1% of any sample
        # lies beyond its own 1st/99th percentile, so using them as the flag
        # would mark virtually every indicator as problematic.
        p1 = data_values.quantile(0.01)
        p99 = data_values.quantile(0.99)

        extreme_low = (data_values < p1).sum()
        extreme_high = (data_values > p99).sum()

        # Values that might be data errors (e.g., >1000% of GDP)
        very_extreme = (abs(data_values) > IMPLAUSIBLE_PCT_OF_GDP).sum()

        # Flag the indicator only when it contains implausibly large values
        if very_extreme > 0:
            extreme_values.append({
                'Indicator': clean_name,
                'Extreme_Low': extreme_low,
                'Extreme_High': extreme_high,
                'Very_Extreme': very_extreme,
                'Min': data_values.min(),
                'Max': data_values.max(),
                'P1': p1,
                'P99': p99
            })

if extreme_values:
    print(f"Found extreme values in {len(extreme_values)} indicators:")
    for item in extreme_values:
        display_name = item['Indicator'][:30] + '...' if len(item['Indicator']) > 30 else item['Indicator']
        print(f"  {display_name:<33} Min: {item['Min']:8.1f} Max: {item['Max']:8.1f} VeryExtreme: {item['Very_Extreme']}")
else:
    print(f"✓ No implausible values found (|value| > {IMPLAUSIBLE_PCT_OF_GDP}% of GDP)")

# 7. Export comprehensive quality assessment
print("\n\n7. EXPORTING COMPREHENSIVE QUALITY ASSESSMENT:")
print("-" * 50)

# Look the per-indicator summaries up by name rather than by list position.
# The outlier and distribution loops skip indicators without usable data, so
# their lists can be shorter than missing_data_summary; positional indexing
# would then attach metrics to the wrong indicator.
outliers_by_indicator = {entry['Indicator']: entry for entry in outlier_summary}
distribution_by_indicator = {entry['Indicator']: entry for entry in distribution_summary}

iceland_mask = final_data['GROUP'] == 'Iceland'
eurozone_mask = final_data['GROUP'] == 'Eurozone'

quality_rows = []
# missing_data_summary holds one entry per analysis indicator, in the same order
for indicator, missing_entry in zip(analysis_indicators, missing_data_summary):
    indicator_name = missing_entry['Indicator']
    # An empty dict makes every lookup below fall back to NaN
    outlier_entry = outliers_by_indicator.get(indicator_name, {})
    distribution_entry = distribution_by_indicator.get(indicator_name, {})

    quality_rows.append({
        'Indicator': indicator_name,
        'Missing_Data_Pct': missing_entry['Total_Missing_Pct'],
        'Iceland_Missing_Pct': missing_entry['Iceland_Missing_Pct'],
        'Eurozone_Missing_Pct': missing_entry['Eurozone_Missing_Pct'],
        'Outliers_Pct': outlier_entry.get('Outliers_Pct', np.nan),
        'Extreme_Outliers': outlier_entry.get('Extreme_Outliers', np.nan),
        'Iceland_CV': distribution_entry.get('Iceland_CV', np.nan),
        'Eurozone_CV': distribution_entry.get('Eurozone_CV', np.nan),
        'CV_Ratio': distribution_entry.get('CV_Ratio', np.nan),
        'Iceland_Sample_Size': final_data.loc[iceland_mask, indicator].notna().sum(),
        'Eurozone_Sample_Size': final_data.loc[eurozone_mask, indicator].notna().sum(),
        'Min_Value': outlier_entry.get('Min_Value', np.nan),
        'Max_Value': outlier_entry.get('Max_Value', np.nan)
    })

# Create comprehensive quality DataFrame
quality_assessment = pd.DataFrame(quality_rows)

quality_assessment.to_csv('comprehensive_data_quality_assessment.csv', index=False)
print(f"✓ Comprehensive quality assessment saved: comprehensive_data_quality_assessment.csv")

# Summary of quality issues
print("\n\nQUALITY ASSESSMENT SUMMARY:")
print("=" * 40)

total_indicators = len(analysis_indicators)

# Each check pairs the list of flagged indicators with its summary message;
# an empty list means the check raised nothing
issue_checks = [
    (high_missing, f"High missing data: {len(high_missing)} indicators (>20%)"),
    (high_outliers, f"High outlier rates: {len(high_outliers)} indicators (>15%)"),
    (sample_size_issues, f"Sample size concerns: {len(sample_size_issues)} indicators (n<30)"),
    (extreme_values, f"Extreme values: {len(extreme_values)} indicators with very high/low values"),
]
issues_found = [message for flagged, message in issue_checks if flagged]

if not issues_found:
    print(f"✅ EXCELLENT: No major data quality issues detected across {total_indicators} indicators")
    print(f"✅ DATA READY: All indicators suitable for statistical analysis")
else:
    print(f"⚠️  POTENTIAL ISSUES IDENTIFIED:")
    for issue in issues_found:
        print(f"   - {issue}")

    print(f"\n📊 RECOMMENDATION: Review the exported CSV for detailed quality metrics")
    print(f"🔍 NEXT STEP: Consider excluding problematic indicators or using robust statistics")

# Up to two kinds of issue still count as acceptable
overall_verdict = 'acceptable' if len(issues_found) <= 2 else 'concerning'
print(f"\n📈 OVERALL ASSESSMENT: Data quality is {overall_verdict} for research purposes")

[DEBUG] Performing comprehensive data quality assessment...

COMPREHENSIVE DATA QUALITY ASSESSMENT:

1. MISSING DATA ANALYSIS (All Indicators):
--------------------------------------------------
Assets - Direct investment, Total f... Total:   0.0% Iceland:   0.0% Eurozone:   0.0%
Assets - Other investment, Debt ins... Total:   2.7% Iceland:   0.0% Eurozone:   2.9%
Assets - Other investment, Debt ins... Total:   5.7% Iceland:   0.0% Eurozone:   6.3%
Assets - Portfolio investment, Debt... Total:   0.0% Iceland:   0.0% Eurozone:   0.0%
Assets - Portfolio investment, Equi... Total:   0.0% Iceland:   0.0% Eurozone:   0.0%
Assets - Portfolio investment, Tota... Total:   0.0% Iceland:   0.0% Eurozone:   0.0%
Liabilities - Direct investment, To... Total:   0.0% Iceland:   0.0% Eurozone:   0.0%
Liabilities - Other investment, Deb... Total:   6.5% Iceland:   0.0% Eurozone:   7.2%
Liabilities - Portfolio investment,... Total:   0.4% Iceland:   0.0% Eurozone:   0.4%
Liabilities - Portfolio investm

## 6. Export and Summary

In [20]:
# Final summary and export
debug_print("Generating final summary...")

print("\nDEBUG ANALYSIS SUMMARY:")
print("=" * 40)

print(f"\n✓ Data loaded and validated")
print(f"✓ Processing pipeline replicated")
print(f"✓ {len(analysis_indicators)} indicators normalized")
print(f"✓ Statistical tests completed")
print(f"✓ Data quality assessed")

if len(results_df) > 0:
    iceland_higher_pct = results_df['Iceland_More_Volatile'].mean() * 100
    significant_pct = results_df['Significant'].mean() * 100

    # The variance test is two-sided, so a significant result can also mean
    # the Eurozone is MORE volatile. Only results that are significant and
    # in Iceland's direction count as support for the hypothesis.
    supports_hypothesis = (
        results_df['Iceland_More_Volatile'].astype(bool)
        & results_df['Significant'].astype(bool)
    )
    supporting_pct = supports_hypothesis.mean() * 100

    print(f"\nKEY FINDINGS:")
    print(f"  • {iceland_higher_pct:.1f}% of indicators show Iceland has higher volatility")
    print(f"  • {significant_pct:.1f}% of tests are statistically significant")
    print(f"  • {supporting_pct:.1f}% of indicators show significantly higher volatility for Iceland")

    if iceland_higher_pct >= 60 and supporting_pct >= 50:
        print(f"  • CONCLUSION: Strong support for Hypothesis 1")
    else:
        print(f"  • CONCLUSION: Mixed evidence for Hypothesis 1")

# Export results; track what was written so the message below stays truthful
exported_files = []
final_data.to_csv('debug_final_dataset.csv', index=False)
exported_files.append('debug_final_dataset.csv')
if len(results_df) > 0:
    results_df.to_csv('debug_test_results.csv', index=False)
    exported_files.append('debug_test_results.csv')

print(f"\n✓ Files exported: {', '.join(exported_files)}")
print(f"\n🔍 Debug analysis complete - methodology validated")

[DEBUG] Generating final summary...

DEBUG ANALYSIS SUMMARY:

✓ Data loaded and validated
✓ Processing pipeline replicated
✓ 13 indicators normalized
✓ Statistical tests completed
✓ Data quality assessed

KEY FINDINGS:
  • 69.2% of indicators show Iceland has higher volatility
  • 100.0% of tests are statistically significant
  • CONCLUSION: Strong support for Hypothesis 1

✓ Files exported: debug_final_dataset.csv, debug_test_results.csv

🔍 Debug analysis complete - methodology validated


---

## Debug Analysis Complete

This notebook provides:

✅ **Data loading validation**  
✅ **Processing pipeline verification**  
✅ **Statistical testing with checks**  
✅ **Data quality assessment**  
✅ **Results export for review**  

Use this to validate the main analysis and check for any methodology issues.