# Data Preparation - Long-term Indicator
## Pakistan, India, and Bangladesh (40+ Years)

### Overview
This notebook prepares data for Pakistan, India, and Bangladesh covering a period of 40+ years. This is Part 1, which focuses on loading the data, filtering it, and identifying indicators with comprehensive historical coverage.

### Objectives (Part 1)
1. Load and explore the filtered dataset
2. Identify indicators with 40+ years of data availability
3. Filter for health and development related indicators
4. Create the `long_term` DataFrame for analysis


To carry out the analysis for the objectives above, I first import the necessary libraries and set up the plotting preferences used for visualization.


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.dates as mdates
warnings.filterwarnings('ignore')

# Set up plotting preferences
# The style is applied first because it resets rcParams; the overrides follow.
plt.style.use('default')
sns.set_palette("Set2")
# Larger default canvas and font size for every figure in the notebook
plt.rcParams.update({'figure.figsize': (14, 8), 'font.size': 11})

print("Libraries loaded successfully")


Libraries loaded successfully


In [2]:
# Load the filtered dataset
df = pd.read_csv('/home/jovyan/work/data/processed/filtered_data.csv')

print(f"Dataset shape: {df.shape}")
print(f"Countries: {df['Country Name'].unique()}")
print(f"Total indicators: {df['Indicator Name'].nunique()}")

# Identify year columns: headers made up solely of digits.
# Keep them as sorted ints for arithmetic and as strings for column selection.
year_columns = sorted(int(col) for col in df.columns if col.isdigit())
year_columns_str = list(map(str, year_columns))
print(f"Year range in dataset: {min(year_columns)} - {max(year_columns)}")
print(f"Total years available: {len(year_columns)} years")

# Calculate the cutoff year for 40+ years of data, counted back from the
# latest year present in the dataset.
# NOTE(review): cutoff_year..current_year covers 41 calendar years inclusive,
# while the coverage analysis counts spans inclusively — confirm the intent.
current_year = max(year_columns)
cutoff_year = current_year - 40
print(f"Looking for indicators with data from {cutoff_year} or earlier to {current_year}")


Dataset shape: (4797, 64)
Countries: ['Bangladesh' 'India' 'Pakistan']
Total indicators: 1599
Year range in dataset: 1960 - 2018
Total years available: 59 years
Looking for indicators with data from 1978 or earlier to 2018


In [4]:
# Function to analyze data availability for each indicator
def analyze_indicator_coverage(df, min_years=40, year_columns=None,
                               required_countries=('Bangladesh', 'India', 'Pakistan')):
    """
    Analyze data coverage for each indicator across all countries.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide-format data with 'Country Name', 'Indicator Name' and
        'Indicator Code' columns plus one column per year.
    min_years : int, default 40
        Minimum ``Year_Range`` an indicator needs to count as long-term.
    year_columns : sequence, optional
        Year column labels exactly as they appear in ``df.columns``. When
        omitted, every column whose label consists only of digits is used,
        so the function no longer depends on a notebook-level global.
    required_countries : iterable of str
        Countries that must each have at least one non-missing value for an
        indicator to be flagged ``Has_All_Countries``.

    Returns
    -------
    (results_df, long_term_indicators) : tuple of pandas.DataFrame
        ``results_df`` has one row per indicator with at least one numeric
        value. ``long_term_indicators`` is the subset with
        ``Year_Range >= min_years`` and ``Has_All_Countries``, sorted by
        ``Year_Range`` then ``Data_Density`` (both descending). Both frames
        are empty, but carry the full set of columns, when no indicator has
        any data.

    Notes
    -----
    ``Year_Range`` is pooled across countries: it runs from the first to the
    last year in which *any* country reports a value. It therefore does not
    by itself guarantee that each country individually spans ``min_years``;
    check ``Data_Density`` alongside it.
    """
    result_columns = [
        'Indicator_Name', 'Indicator_Code', 'Year_Range', 'Min_Year', 'Max_Year',
        'Countries_with_Data', 'Has_All_Countries', 'Total_Data_Points', 'Data_Density',
    ]

    if year_columns is None:
        # Keep the original labels (needed for column selection), ordered by year.
        year_columns = sorted(
            (col for col in df.columns if str(col).isdigit()), key=int
        )
    else:
        year_columns = list(year_columns)

    required = set(required_countries)
    results = []

    # One pass over the frame instead of a boolean mask per indicator;
    # sort=False keeps indicators in order of first appearance.
    # With no year columns there is nothing to measure, so skip the loop.
    indicator_groups = df.groupby('Indicator Name', sort=False) if year_columns else []
    for indicator_name, indicator_data in indicator_groups:
        indicator_code = indicator_data['Indicator Code'].iloc[0]

        # Melt the data to long format (only the country label is needed downstream)
        indicator_long = indicator_data.melt(
            id_vars=['Country Name'],
            value_vars=year_columns,
            var_name='Year',
            value_name='Value'
        )

        # Convert to numeric and remove missing values
        indicator_long['Year'] = indicator_long['Year'].astype(int)
        indicator_long['Value'] = pd.to_numeric(indicator_long['Value'], errors='coerce')
        indicator_clean = indicator_long.dropna(subset=['Value'])

        if indicator_clean.empty:
            continue

        # Calculate coverage statistics (span is inclusive of both end years)
        min_year = indicator_clean['Year'].min()
        max_year = indicator_clean['Year'].max()
        year_range = max_year - min_year + 1
        countries_in_data = set(indicator_clean['Country Name'].unique())
        countries_with_data = len(countries_in_data)
        total_data_points = len(indicator_clean)

        # Data density: share of the country x year grid that is filled.
        # The grid is never empty here because at least one value exists.
        possible_points = countries_with_data * year_range
        data_density = (total_data_points / possible_points) * 100

        results.append({
            'Indicator_Name': indicator_name,
            'Indicator_Code': indicator_code,
            'Year_Range': year_range,
            'Min_Year': min_year,
            'Max_Year': max_year,
            'Countries_with_Data': countries_with_data,
            'Has_All_Countries': required.issubset(countries_in_data),
            'Total_Data_Points': total_data_points,
            'Data_Density': round(data_density, 1)
        })

    # Passing the column list keeps the schema stable even with zero rows,
    # so the filter below cannot fail with a KeyError.
    results_df = pd.DataFrame(results, columns=result_columns)
    if results_df.empty:
        return results_df, results_df.copy()

    # Filter for indicators with enough years and all required countries
    long_term_indicators = results_df[
        (results_df['Year_Range'] >= min_years) &
        results_df['Has_All_Countries'].astype(bool)
    ].sort_values(['Year_Range', 'Data_Density'], ascending=[False, False])

    return results_df, long_term_indicators

# Analyze data coverage
# Run the coverage scan over every indicator, keeping both the full summary
# and the subset that meets the long-term criteria.
print("Analyzing data coverage for all indicators...")
all_indicators, long_term = analyze_indicator_coverage(df, min_years=40)

print(f"Found {len(long_term)} indicators with 40+ years of data for all three countries:")
# Preview the ten best-covered indicators with the key coverage columns only
print(
    long_term.head(10)[
        ['Indicator_Name', 'Year_Range', 'Min_Year', 'Max_Year', 'Data_Density']
    ].to_string()
)


Analyzing data coverage for all indicators...
Found 798 indicators with 40+ years of data for all three countries:
                                                          Indicator_Name  Year_Range  Min_Year  Max_Year  Data_Density
50         Adolescent fertility rate (births per 1,000 women ages 15-19)          58      1960      2017         100.0
56                    Age dependency ratio (% of working-age population)          58      1960      2017         100.0
57               Age dependency ratio, old (% of working-age population)          58      1960      2017         100.0
58             Age dependency ratio, young (% of working-age population)          58      1960      2017         100.0
70            Agriculture, forestry, and fishing, value added (% of GDP)          58      1960      2017         100.0
72   Agriculture, forestry, and fishing, value added (constant 2010 US$)          58      1960      2017         100.0
73        Agriculture, forestry, and fishing, value 