In [None]:
from pathlib import Path

import pandas as pd

# Extract the rows for the countries of interest from the raw WDI dump and
# persist them as a much smaller CSV for the rest of the notebook.
# Paths are relative to the notebook's working directory.
RAW_WDI_CSV = Path('data/raw/world_bank/wdi-csv-zip-57-mb-/WDIData.csv')
FILTERED_CSV = Path('data/processed/filtered_data.csv')
COUNTRIES = ['Bangladesh', 'India', 'Pakistan']

# Read large file in chunks for memory efficiency
chunk_size = 10000
filtered_data = []

for chunk in pd.read_csv(RAW_WDI_CSV, chunksize=chunk_size):
    # Keep only the rows whose 'Country Name' is one of COUNTRIES
    filtered_chunk = chunk[chunk['Country Name'].isin(COUNTRIES)]
    if not filtered_chunk.empty:
        filtered_data.append(filtered_chunk)

# Combine all chunks and save
if filtered_data:
    final_df = pd.concat(filtered_data, ignore_index=True)
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing (no-op when it is already there).
    FILTERED_CSV.parent.mkdir(parents=True, exist_ok=True)
    final_df.to_csv(FILTERED_CSV, index=False)
    print(f"Filtered data saved with {len(final_df)} rows")
else:
    # Nothing matched: say so explicitly instead of finishing silently, because
    # a filtered file left over from an earlier run would otherwise be picked
    # up downstream as if it were fresh.
    print(f"No rows matched {COUNTRIES} in {RAW_WDI_CSV}; {FILTERED_CSV} was not written")

In [4]:
# Load the filtered data.
# Use the same working-directory-relative location that the extraction cell
# writes to ('data/processed/filtered_data.csv') rather than an absolute
# container path, so the read and write locations cannot drift apart and the
# notebook is not tied to one machine's directory layout.
file_path = 'data/processed/filtered_data.csv'
df = pd.read_csv(file_path)

# Search for GDP, mortality, and literacy indicators.
# Each search is a case-insensitive substring match on 'Indicator Name';
# na=False makes rows with a missing name count as non-matches instead of
# producing NaN in the boolean mask.
gdp_indicators = df[df['Indicator Name'].str.contains('GDP', case=False, na=False)]
mortality_indicators = df[df['Indicator Name'].str.contains('mortality', case=False, na=False)]
literacy_indicators = df[df['Indicator Name'].str.contains('literacy', case=False, na=False)]

In [None]:

# Header first, then one row per distinct (name, code) pair among the GDP matches.
print('--- GDP Indicators ---')
gdp_catalog = gdp_indicators.loc[:, ['Indicator Name', 'Indicator Code']].drop_duplicates()
# to_string() renders the whole table as text rather than an abbreviated view.
print(gdp_catalog.to_string())


In [6]:

# Header first, then one row per distinct (name, code) pair among the mortality matches.
print('\n--- Mortality Indicators ---')
mortality_catalog = mortality_indicators.loc[:, ['Indicator Name', 'Indicator Code']].drop_duplicates()
# to_string() renders the whole table as text rather than an abbreviated view.
print(mortality_catalog.to_string())



--- Mortality Indicators ---
                                                                                                                  Indicator Name     Indicator Code
807                                                         Maternal mortality ratio (modeled estimate, per 100,000 live births)        SH.STA.MMRT
808                                                        Maternal mortality ratio (national estimate, per 100,000 live births)     SH.STA.MMRT.NE
849                                                                 Mortality caused by road traffic injury (per 100,000 people)     SH.STA.TRAF.P5
850                                                 Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70 (%)     SH.DYN.NCOM.ZS
851                                         Mortality from CVD, cancer, diabetes or CRD between exact ages 30 and 70, female (%)  SH.DYN.NCOM.FE.ZS
852                                           Mortality from CVD, cancer, diabetes

In [5]:

# Header first, then one row per distinct (name, code) pair among the literacy matches.
print('\n--- Literacy Indicators ---')
literacy_catalog = literacy_indicators.loc[:, ['Indicator Name', 'Indicator Code']].drop_duplicates()
# to_string() renders the whole table as text rather than an abbreviated view.
print(literacy_catalog.to_string())



--- Literacy Indicators ---
                                                   Indicator Name        Indicator Code
773  Literacy rate, adult female (% of females ages 15 and above)     SE.ADT.LITR.FE.ZS
774      Literacy rate, adult male (% of males ages 15 and above)     SE.ADT.LITR.MA.ZS
775    Literacy rate, adult total (% of people ages 15 and above)        SE.ADT.LITR.ZS
776  Literacy rate, youth (ages 15-24), gender parity index (GPI)  SE.ADT.1524.LT.FM.ZS
777         Literacy rate, youth female (% of females ages 15-24)  SE.ADT.1524.LT.FE.ZS
778             Literacy rate, youth male (% of males ages 15-24)  SE.ADT.1524.LT.MA.ZS
779           Literacy rate, youth total (% of people ages 15-24)     SE.ADT.1524.LT.ZS
