In [1]:
import pandas as pd
import os
import numpy as np
import altair as alt

## Cleaning and exploration

#### Join datasets

In [2]:
def join_datasets(folder_path="/Users/paulacadena/CAPP30239-SP/data", encoding="ISO-8859-1"):
    """Read every ``.csv`` file in a folder and stack them into one DataFrame.

    Parameters
    ----------
    folder_path : str or path-like, optional
        Directory to scan (non-recursively) for files ending in ``.csv``.
        The default is the absolute local path the notebook was written
        with; pass your own directory when running on another machine.
    encoding : str, optional
        Text encoding handed to ``pd.read_csv``. Defaults to "ISO-8859-1".

    Returns
    -------
    pandas.DataFrame
        Rows of all readable CSV files, concatenated in alphabetical file
        order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file in ``folder_path`` could be read.
    """
    dataframes = []

    # os.listdir returns names in arbitrary order; sort so that the row
    # order of the combined frame is reproducible between runs/machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".csv"):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            dataframes.append(pd.read_csv(file_path, encoding=encoding))
        except UnicodeDecodeError:
            # Best effort: one undecodable file should not abort the whole load.
            print(f"Could not decode {filename}. Skipping.")

    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # raise the same exception type but say which folder was empty.
    if not dataframes:
        raise ValueError(f"No CSV files could be read from {folder_path!r}")

    return pd.concat(dataframes, ignore_index=True)

#### Clean and unify

In [3]:
def clean_dataset():
    """Load the joined dataset and discard rows that cannot be identified.

    The cleaning steps are, in order:
      1. turn the ".." placeholder into NaN,
      2. drop rows missing any of "Series Code", "Country Code" or
         "Series Name",
      3. keep only rows whose "Country Code" is at most three characters.

    Returns
    -------
    pandas.DataFrame
        The filtered frame (original index labels are kept).
    """
    key_columns = ["Series Code", "Country Code", "Series Name"]

    identified = (
        join_datasets()
        # ".." is the marker used for missing values in the raw files
        .replace("..", np.nan)
        # rows without identifying columns cannot be used
        .dropna(subset=key_columns)
    )

    # Codes longer than three characters are treated as wrongly identified
    has_short_code = identified["Country Code"].str.len() <= 3
    return identified[has_short_code]

##### Reshape from wide to long format for easier use in visualizations

In [4]:
def wide_long_wb():
    """Reshape the cleaned dataset from wide (one column per year) to long.

    Every column whose name contains "YR" is treated as a year column and
    unpivoted; all remaining columns are kept as identifiers.

    Returns
    -------
    pandas.DataFrame
        One row per identifier combination and year, with an integer
        'YEAR' column and a numeric 'Value' column (entries that cannot be
        parsed as numbers become NaN).
    """
    cleaned = clean_dataset()

    # Split the columns into year columns and identifier columns in one pass
    year_columns, key_columns = [], []
    for name in cleaned.columns:
        target = year_columns if "YR" in name else key_columns
        target.append(name)

    melted = cleaned.melt(
        id_vars=key_columns,
        value_vars=year_columns,
        var_name='YEAR',
        value_name='Value',
    )

    # The year is the first run of four digits in the former column name
    year_digits = melted['YEAR'].str.extract(r'(\d{4})')[0]
    melted['YEAR'] = year_digits.astype(int)

    # Non-numeric entries are coerced to NaN instead of raising
    melted['Value'] = pd.to_numeric(melted['Value'], errors='coerce')

    return melted

In [5]:
# Build the long-format table once; histogram_missing() below reads this
# notebook-level name.
world_bank = wide_long_wb()

#### Exploration

In [6]:
# Summarize row count, column dtypes and memory footprint of the long table.
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8018304 entries, 0 to 8018303
Data columns (total 6 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Country Name  object 
 1   Country Code  object 
 2   Series Name   object 
 3   Series Code   object 
 4   YEAR          int64  
 5   Value         float64
dtypes: float64(1), int64(1), object(4)
memory usage: 367.0+ MB


In [7]:
def histogram_missing(variable, bins, data=None):
    """Display a histogram of how many missing 'Value' entries each group has.

    Parameters
    ----------
    variable : str
        Column to group by (for example 'Series Name' or 'Country Name').
    bins : int
        Maximum number of histogram bins (passed to Altair as ``maxbins``).
    data : pandas.DataFrame, optional
        Frame containing ``variable`` and a 'Value' column. When omitted,
        the notebook-level ``world_bank`` frame is used, which matches the
        original behaviour.

    Returns
    -------
    None
        The chart is rendered through ``Chart.display()``.
    """
    frame = world_bank if data is None else data

    # Vectorized count: flag NaNs once, then sum the flags within each group.
    # This avoids calling a Python lambda per group via .apply.
    missing_values = (
        frame['Value']
        .isna()
        .groupby(frame[variable])
        .sum()
        .rename('MissingValues')
        .reset_index()
    )

    histogram = alt.Chart(missing_values).mark_bar().encode(
        alt.X('MissingValues:Q', bin=alt.Bin(maxbins=bins), title='Number of Missing Values'),
        alt.Y('count()', title=f"Count of {variable}")
    ).properties(
        title=f"Histogram of Missing Values by {variable}",
        width=800,
        height=400
    )

    histogram.display()

In [8]:
# Distribution of missing-value counts, one count per 'Series Name'.
histogram_missing('Series Name',40)

In [9]:
# Distribution of missing-value counts, one count per 'Country Name'.
histogram_missing('Country Name',40)

## Visualizations