# Health Indicator Analysis
## Pakistan, India, and Bangladesh (40+ Years) - Analysis & Visualization

### Overview
This notebook continues from 'Long-term indicators' (Part 1) and performs a comprehensive analysis of long-term health indicators. It assumes that the `long_term` DataFrame and the other variables it relies on (`df`, `year_columns_str`) have already been created.

### Objectives
1. Select the most relevant health indicator from the `long_term` DataFrame
2. Perform comprehensive time series analysis
3. Compare trends across the three countries
4. Analyze long-term patterns and policy implications
5. Generate insights and recommendations

**Prerequisites:** Run `long_term_indicators.ipynb` first to create the necessary DataFrames.


## Setup and Data Loading

First, let's import libraries and load the data prepared in Part 1. If you haven't run Part 1 yet, please run `long_term_indicators.ipynb` first.


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.dates as mdates
warnings.filterwarnings('ignore')

# Figure defaults shared by every plot in this notebook: stock matplotlib
# style, seaborn's "Set2" colour cycle, and a larger canvas / font.
plt.style.use('default')
sns.set_palette("Set2")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 11,
})

print("Libraries loaded successfully")

# Load the data prepared in Part 1.
# If the Part 1 variables are not in the kernel, fall back to the processed CSV.
FILTERED_DATA_PATH = '/home/jovyan/work/data/processed/filtered_data.csv'

try:
    # Touch both Part 1 objects *before* printing anything, so a missing name
    # never leaves a half-printed "found" banner in the output.
    part1_df_shape = df.shape
    part1_indicator_count = len(long_term)
except NameError:
    print("❌ Part 1 variables not found. Please run Part 1 first or load the data manually.")
    print("Loading data manually...")

    # Load the filtered dataset
    df = pd.read_csv(FILTERED_DATA_PATH)

    # This is a simplified version - ideally run Part 1 first.
    # NOTE: `long_term` is NOT recreated here; the cells that use it still
    # require Part 1 to have been run.
    print("⚠️  Basic data loaded. For full functionality, please run Part 1 first.")
else:
    print(f"Using existing variables from Part 1:")
    print(f"- df shape: {part1_df_shape}")
    print(f"- long_term indicators: {part1_indicator_count}")
    print("✅ Part 1 variables found and ready for analysis")

# The data-preparation cell always needs the year column labels, so derive them
# from `df` whenever they are missing - not only on the manual-load path.
if 'year_columns_str' not in globals():
    year_columns = sorted(
        int(col) for col in df.columns if isinstance(col, str) and col.isdigit()
    )
    year_columns_str = [str(year) for year in year_columns]


In [None]:
# Keep the long-term indicators whose name mentions a health or development
# theme (case-insensitive match on any keyword below).
health_keywords = ['mortality', 'life expectancy', 'infant', 'child', 'maternal', 'birth', 'death', 'health', 'population', 'fertility']
keyword_pattern = '|'.join(health_keywords)

print("=== Health and Development Indicators with 40+ Years Data ===")
name_matches = long_term['Indicator_Name'].str.contains(keyword_pattern, case=False, na=False)
health_indicators = long_term[name_matches].copy()

if len(health_indicators) == 0:
    # Nothing matched: show the first ten long-term indicators instead.
    print("No health indicators found with 40+ years. Let's look at top indicators by data coverage:")
    coverage_preview = long_term[['Indicator_Name', 'Year_Range', 'Data_Density']].head(10)
    print(coverage_preview.to_string())
else:
    print(f"Found {len(health_indicators)} health-related indicators:")
    for _, indicator in health_indicators.iterrows():
        # One four-line summary per indicator, preceded by a blank line.
        summary_lines = [
            f"\n{indicator['Indicator_Name']}",
            f"  Code: {indicator['Indicator_Code']}",
            f"  Years: {indicator['Min_Year']}-{indicator['Max_Year']} ({indicator['Year_Range']} years)",
            f"  Data completeness: {indicator['Data_Density']}%",
        ]
        print("\n".join(summary_lines))


In [None]:
# Select the best indicator for analysis

def announce_selection(indicator, qualifier=""):
    """Print the four-line banner describing the chosen indicator row.

    `indicator` is a row exposing Indicator_Name, Indicator_Code, Min_Year,
    Max_Year and Data_Density; `qualifier` is appended to the headline
    (e.g. " (Best Available)").
    """
    print(f"🎯 SELECTED INDICATOR FOR ANALYSIS{qualifier}:")
    print(f"   {indicator['Indicator_Name']}")
    print(f"   Code: {indicator['Indicator_Code']}")
    print(f"   Period: {indicator['Min_Year']}-{indicator['Max_Year']}")
    print(f"   Data completeness: {indicator['Data_Density']}%")


if len(health_indicators) > 0:
    # Takes the first matching row; this relies on the row order of `long_term`
    # (presumably ranked by coverage in Part 1 - TODO confirm).
    selected_indicator = health_indicators.iloc[0]
    announce_selection(selected_indicator)
elif len(long_term) > 0:
    # No health indicator matched: use the first long-term indicator instead.
    selected_indicator = long_term.iloc[0]
    announce_selection(selected_indicator, " (Best Available)")
else:
    print("❌ No indicators found with 40+ years of data for all three countries")
    print("Let's look for indicators with at least 30 years of data...")

    # Relax the threshold to 30 years; this needs the Part 1 helper function.
    try:
        _, medium_term = analyze_indicator_coverage(df, min_years=30)
    except NameError:
        print("analyze_indicator_coverage function not available. Please run Part 1 first.")
    else:
        if len(medium_term) > 0:
            selected_indicator = medium_term.iloc[0]
            announce_selection(selected_indicator, " (30+ years)")
        else:
            print("No suitable indicators found even with 30+ years criteria.")


In [None]:
# Extract and prepare the selected indicator data

def reshape_indicator(source_df, indicator_code, year_cols):
    """Slice one indicator out of the wide table and reshape it to long format.

    Parameters
    ----------
    source_df : DataFrame with 'Country Name', 'Country Code',
        'Indicator Name', 'Indicator Code' and one column per year.
    indicator_code : value matched against the 'Indicator Code' column.
    year_cols : list of year column labels to melt (e.g. ['1960', '1961']).

    Returns
    -------
    (wide, long, clean) : the filtered wide rows, the melted frame with an
        integer 'Year' and numeric 'Value' (non-numeric values become NaN),
        and the melted frame with rows lacking a value removed.
    """
    wide = source_df[source_df['Indicator Code'] == indicator_code].copy()
    long = wide.melt(
        id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'],
        value_vars=year_cols,
        var_name='Year',
        value_name='Value'
    )
    long['Year'] = long['Year'].astype(int)
    long['Value'] = pd.to_numeric(long['Value'], errors='coerce')
    clean = long.dropna(subset=['Value']).copy()
    return wide, long, clean


def report_indicator_coverage(indicator_name, clean):
    """Print overall size, per-country coverage and basic statistics."""
    print(f"=== Data Preparation Complete ===")
    print(f"Indicator: {indicator_name}")
    print(f"Total data points: {len(clean)}")
    print(f"Year range: {clean['Year'].min()} - {clean['Year'].max()}")
    print(f"Countries: {sorted(clean['Country Name'].unique())}")

    # Show data coverage by country
    print(f"=== Data Coverage by Country ===")
    for country in sorted(clean['Country Name'].unique()):
        country_data = clean[clean['Country Name'] == country]
        print(f"{country}: {len(country_data)} data points ({country_data['Year'].min()}-{country_data['Year'].max()})")

    # Basic statistics
    print(f"=== Basic Statistics ===")
    print(clean.groupby('Country Name')['Value'].agg(['count', 'mean', 'std', 'min', 'max']).round(2))


try:
    selected_code = selected_indicator['Indicator_Code']
    selected_name = selected_indicator['Indicator_Name']

    indicator_data, indicator_long, indicator_clean = reshape_indicator(
        df, selected_code, year_columns_str
    )
    report_indicator_coverage(selected_name, indicator_clean)

except NameError as missing_name:
    print("No suitable indicator was selected. Please run the previous cell first.")
    # Name the missing variable: it may be `df` or `year_columns_str` rather
    # than `selected_indicator`.
    print(f"  (details: {missing_name})")

except Exception as e:
    print(f"Error processing data: {e}")
    print("Let's manually select a common indicator...")

    # Manually try some common long-term indicators
    common_indicators = [
        'SP.DYN.LE00.IN',  # Life expectancy at birth
        'SP.DYN.IMRT.IN',  # Infant mortality rate
        'SP.DYN.TFRT.IN',  # Total fertility rate
        'SP.POP.TOTL',     # Total population
        'NY.GDP.PCAP.PP.KD' # GDP per capita PPP
    ]

    for code in common_indicators:
        test_data = df[df['Indicator Code'] == code]
        if test_data.empty:
            continue
        print(f"Found data for {code}: {test_data['Indicator Name'].iloc[0]}")

        # Rebuild the prepared frames for the fallback indicator. Without this,
        # later cells would run on a missing or stale `indicator_clean` while
        # showing the new indicator's name.
        try:
            fallback_frames = reshape_indicator(df, code, year_columns_str)
        except Exception as fallback_error:
            print(f"Could not prepare {code}: {fallback_error}")
            continue

        # Only switch the selection once the data is actually available.
        selected_code = code
        selected_name = test_data['Indicator Name'].iloc[0]
        indicator_data, indicator_long, indicator_clean = fallback_frames
        report_indicator_coverage(selected_name, indicator_clean)
        break
    else:
        print("None of the common indicators could be prepared from the loaded data.")


In [None]:
# Comprehensive Time Series Visualization

def trend_title(indicator_name, years):
    """Build the two-line figure title from the indicator name and the years plotted.

    The year range comes from the data, so the title cannot claim a period
    the series does not cover.
    """
    return f'{indicator_name}\nLong-term Trends ({int(years.min())}-{int(years.max())})'


if 'indicator_clean' in locals() and len(indicator_clean) > 0:

    # Main time series plot
    plt.figure(figsize=(16, 10))

    colors = ['#2E86AB', '#A23B72', '#F18F01']
    markers = ['o', 's', '^']
    country_names = sorted(indicator_clean['Country Name'].unique())

    for i, country in enumerate(country_names):
        country_data = indicator_clean[indicator_clean['Country Name'] == country].sort_values('Year')

        # Cycle through the palette so more than three countries do not raise
        # an IndexError.
        plt.plot(country_data['Year'], country_data['Value'],
                 marker=markers[i % len(markers)], linewidth=3, markersize=6,
                 label=country, color=colors[i % len(colors)], alpha=0.8)

    plt.title(trend_title(selected_name, indicator_clean['Year']),
              fontsize=18, fontweight='bold', pad=20)
    plt.xlabel('Year', fontsize=14)
    # Axis label: indicator name without its parenthesised unit suffix.
    plt.ylabel(selected_name.split('(')[0].strip(), fontsize=14)
    plt.legend(fontsize=13, loc='best')
    plt.grid(True, alpha=0.3)

    # Add dashed first-degree (linear) trend lines, drawn after the legend so
    # they do not get legend entries.
    for i, country in enumerate(country_names):
        country_data = indicator_clean[indicator_clean['Country Name'] == country].sort_values('Year')
        if len(country_data) > 1:
            z = np.polyfit(country_data['Year'], country_data['Value'], 1)
            p = np.poly1d(z)
            plt.plot(country_data['Year'], p(country_data['Year']),
                     linestyle='--', color=colors[i % len(colors)], alpha=0.6, linewidth=2)

    plt.tight_layout()
    plt.show()

    # Summary statistics table: mean and observation count per country and decade
    print("=== Decade-wise Analysis ===")
    indicator_clean['Decade'] = (indicator_clean['Year'] // 10) * 10
    decade_stats = indicator_clean.groupby(['Country Name', 'Decade'])['Value'].agg(['mean', 'count']).round(2)
    print(decade_stats.to_string())

else:
    print("No data available for visualization. Please check the data preparation step.")


In [None]:
# Statistical Analysis and Trend Detection

# A regression is only fitted when a country has MORE than this many observations.
MIN_POINTS_FOR_REGRESSION = 10


def fit_country_trend(country_data):
    """Fit Value ~ Year by ordinary least squares for one country.

    Parameters
    ----------
    country_data : DataFrame with numeric 'Year' and 'Value' columns
        (row order does not matter; rows are sorted by year here).

    Returns
    -------
    dict with keys 'slope', 'intercept', 'r_squared', 'p_value',
    'total_change' (slope x year span), 'percent_change' (total_change
    relative to the earliest observed value, 0 when that value is 0)
    and 'years_span'.
    """
    ordered = country_data.sort_values('Year')
    slope, intercept, r_value, p_value, std_err = stats.linregress(ordered['Year'], ordered['Value'])

    years_span = ordered['Year'].max() - ordered['Year'].min()
    total_change = slope * years_span
    first_value = ordered['Value'].iloc[0]
    percent_change = (total_change / first_value) * 100 if first_value != 0 else 0

    return {
        'slope': slope,
        'intercept': intercept,
        'r_squared': r_value**2,
        'p_value': p_value,
        'total_change': total_change,
        'percent_change': percent_change,
        'years_span': years_span
    }


if 'indicator_clean' in locals() and len(indicator_clean) > 0:

    print("=== Long-term Trend Analysis (Linear Regression) ===")

    country_names = sorted(indicator_clean['Country Name'].unique())
    n_panels = len(country_names)

    # One panel per country (20 inches wide for three, scaled otherwise).
    # squeeze=False keeps `axes` indexable even for a single country.
    fig, axes = plt.subplots(1, n_panels, figsize=(20 * n_panels / 3, 6), squeeze=False)
    axes = axes.ravel()
    trend_results = {}
    colors = ['#2E86AB', '#A23B72', '#F18F01']

    for i, country in enumerate(country_names):
        country_data = indicator_clean[indicator_clean['Country Name'] == country].sort_values('Year')
        color = colors[i % len(colors)]

        if len(country_data) > MIN_POINTS_FOR_REGRESSION:
            result = fit_country_trend(country_data)
            trend_results[country] = result

            slope = result['slope']
            intercept = result['intercept']
            r_squared = result['r_squared']
            p_value = result['p_value']
            years_span = result['years_span']
            total_change = result['total_change']
            percent_change = result['percent_change']

            # Visualization: observations plus the fitted line
            axes[i].scatter(country_data['Year'], country_data['Value'], alpha=0.6, s=30, color=color)
            axes[i].plot(country_data['Year'], slope * country_data['Year'] + intercept,
                         color=color, linewidth=3, alpha=0.8)

            axes[i].set_title(f'{country}\nSlope: {slope:.3f}/year\nR² = {r_squared:.3f}',
                              fontweight='bold', fontsize=12)
            axes[i].set_xlabel('Year')
            axes[i].set_ylabel('Value')
            axes[i].grid(True, alpha=0.3)

            # Statistical significance stars at the 0.001 / 0.01 / 0.05 levels
            significance = "***" if p_value < 0.001 else "**" if p_value < 0.01 else "*" if p_value < 0.05 else "ns"
            trend_direction = "↗" if slope > 0 else "↘"

            print(f"\n{country}:")
            print(f"  Annual change: {slope:.4f} units/year {trend_direction}")
            print(f"  Total change over {years_span} years: {total_change:.2f} units ({percent_change:+.1f}%)")
            print(f"  R² = {r_squared:.3f}, p-value = {p_value:.4f} {significance}")
            print(f"  Statistical significance: {'Significant' if p_value < 0.05 else 'Not significant'}")
        else:
            # Label the panel instead of leaving it silently blank.
            axes[i].set_title(f'{country}\nInsufficient data for regression',
                              fontweight='bold', fontsize=12)

    plt.tight_layout()
    plt.show()

    # Comparative analysis
    print("="*60)
    print("                 COMPARATIVE ANALYSIS")
    print("="*60)

    if len(trend_results) >= 2:
        countries = list(trend_results.keys())

        # Compare slopes.
        # NOTE(review): heuristic - if any slope is negative the indicator is
        # treated as "lower is better" (steepest decline = fastest improving);
        # otherwise the steepest rise wins. Confirm this suits the indicator.
        slopes = [trend_results[country]['slope'] for country in countries]
        any_declining = min(slopes) < 0
        fastest_improving = countries[np.argmin(slopes)] if any_declining else countries[np.argmax(slopes)]
        slowest_improving = countries[np.argmax(slopes)] if any_declining else countries[np.argmin(slopes)]

        print(f"\n📈 Trend Comparison:")
        print(f"  Fastest improving: {fastest_improving}")
        print(f"  Slowest improving: {slowest_improving}")

        # Compare R-squared values
        r_squared_values = [trend_results[country]['r_squared'] for country in countries]
        most_consistent = countries[np.argmax(r_squared_values)]

        print(f"\n📊 Trend Consistency:")
        print(f"  Most consistent trend: {most_consistent} (R² = {max(r_squared_values):.3f})")

else:
    print("No data available for statistical analysis.")


In [None]:
# Final Summary and Conclusions

# Target year for the simple linear extrapolation below.
PROJECTION_YEAR = 2030


def country_endpoints(frame):
    """Return each country's own first and last observation.

    Parameters
    ----------
    frame : DataFrame with 'Country Name', 'Year' and 'Value' columns.

    Returns
    -------
    dict mapping country name (in sorted order) to a tuple
    (first_year, first_value, last_year, last_value).

    Using per-country endpoints avoids an IndexError when a country has no
    value in the overall earliest or latest year of the data.
    """
    endpoints = {}
    for country, rows in frame.sort_values('Year').groupby('Country Name'):
        first, last = rows.iloc[0], rows.iloc[-1]
        endpoints[country] = (
            int(first['Year']), float(first['Value']),
            int(last['Year']), float(last['Value']),
        )
    return endpoints


if 'indicator_clean' in locals() and len(indicator_clean) > 0:

    print("\n" + "="*80)
    print("                    COMPREHENSIVE ANALYSIS SUMMARY")
    print("="*80)

    # Current status: countries that have a value in the overall latest year
    latest_year = indicator_clean['Year'].max()
    latest_data = indicator_clean[indicator_clean['Year'] == latest_year]

    print(f"\n📊 CURRENT STATUS ({latest_year}):")
    print(f"   Indicator: {selected_name}")
    for _, row in latest_data.iterrows():
        print(f"   • {row['Country Name']}: {row['Value']:.2f}")

    # Historical perspective
    earliest_year = indicator_clean['Year'].min()
    earliest_data = indicator_clean[indicator_clean['Year'] == earliest_year]
    endpoints = country_endpoints(indicator_clean)

    print(f"\n📈 HISTORICAL PERSPECTIVE ({earliest_year} vs {latest_year}):")
    for country, (first_year, early_val, last_year, late_val) in endpoints.items():
        total_change = late_val - early_val
        percent_change = (total_change / early_val) * 100 if early_val != 0 else 0

        # Flag countries whose own span differs from the headline period.
        span_note = "" if (first_year, last_year) == (earliest_year, latest_year) else f" [{first_year}-{last_year}]"
        print(f"   • {country}: {early_val:.2f} → {late_val:.2f} ({total_change:+.2f}, {percent_change:+.1f}%){span_note}")

    # Key insights
    print(f"\n🔍 KEY INSIGHTS:")

    # Ranking by current values.
    # NOTE(review): ascending order is labelled "best to worst", i.e. lower is
    # assumed better; this is inverted for indicators such as life expectancy.
    current_ranking = latest_data.sort_values('Value', ascending=True)
    print(f"   • Current ranking (best to worst): {', '.join(current_ranking['Country Name'].tolist())}")

    # Improvement ranking.
    # Assume lower values are better for most health indicators, so a positive
    # (early - late) difference counts as improvement.
    improvement_scores = {
        country: early_val - late_val
        for country, (_, early_val, _, late_val) in endpoints.items()
    }

    best_improver = max(improvement_scores, key=improvement_scores.get)
    worst_improver = min(improvement_scores, key=improvement_scores.get)

    print(f"   • Best long-term improvement: {best_improver}")
    print(f"   • Least improvement: {worst_improver}")

    # Data quality assessment
    print(f"\n📋 DATA QUALITY ASSESSMENT:")
    print(f"   • Analysis period: {earliest_year}-{latest_year} ({latest_year-earliest_year+1} years)")
    print(f"   • Total data points: {len(indicator_clean)}")

    for country in sorted(indicator_clean['Country Name'].unique()):
        country_data = indicator_clean[indicator_clean['Country Name'] == country]
        coverage = (len(country_data) / (latest_year - earliest_year + 1)) * 100
        print(f"   • {country}: {len(country_data)} data points ({coverage:.1f}% coverage)")

    # Future projections (simple linear extrapolation from each country's own
    # latest observation, using the slope fitted in the trend-analysis cell)
    if 'trend_results' in locals() and len(trend_results) > 0:
        print(f"\n🔮 SIMPLE PROJECTIONS (Linear Extrapolation to {PROJECTION_YEAR}):")
        for country, (_, _, last_year, current_val) in endpoints.items():
            if country not in trend_results:
                continue
            years_ahead = PROJECTION_YEAR - last_year
            if years_ahead <= 0:
                # Data already reaches the target year; nothing to extrapolate.
                continue
            slope = trend_results[country]['slope']
            projected_val = current_val + (slope * years_ahead)

            print(f"   • {country}: {current_val:.2f} → {projected_val:.2f} (by {PROJECTION_YEAR})")

    # Recommendations
    print(f"\n💡 POLICY RECOMMENDATIONS:")
    print(f"   • Continue monitoring this indicator as it shows significant long-term trends")
    print(f"   • Countries with slower improvement should study best practices from {best_improver}")
    print(f"   • Focus on understanding the drivers behind periods of rapid change")
    print(f"   • Consider complementary indicators for a more comprehensive view")

    print("\n" + "="*80)
    print("✅ ANALYSIS COMPLETE - Part 2 of long-term health indicator analysis")
    print("="*80)

else:
    print("\n❌ ANALYSIS INCOMPLETE")
    print("No data was available for comprehensive analysis.")
    print("Please check that Part 1 was run successfully and the data is available.")
