In [None]:
import pandas as pd
import os
import numpy as np
import altair as alt
from vega_datasets import data
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

## Cleaning and exploration

#### Join datasets

In [None]:
def join_datasets(folder_path='/Users/paulacadena/CAPP30239-SP/data'):
    """Concatenate every CSV file found directly inside ``folder_path``.

    Parameters
    ----------
    folder_path : str, optional
        Directory scanned (non-recursively) for files ending in ``.csv``.
        Defaults to the location this notebook was originally written for.

    Returns
    -------
    pandas.DataFrame
        All readable CSVs stacked row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no CSV file could be read from ``folder_path``.
    """
    dataframes = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the combined table reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            dataframes.append(pd.read_csv(file_path, encoding='ISO-8859-1'))
        except UnicodeDecodeError:
            print(f'Could not decode {filename}. Skipping.')

    # pd.concat([]) fails with an opaque "No objects to concatenate"; say
    # which folder was empty instead.
    if not dataframes:
        raise ValueError(f'No CSV files could be read from {folder_path}')

    return pd.concat(dataframes, ignore_index=True)

#### Clean and unify

In [4]:
def clean_dataset():
    """Load the joined World Bank tables and drop rows that cannot be identified.

    Returns the rows of ``join_datasets()`` after: turning the '..'
    placeholder into NaN, removing rows missing a series code, country code
    or series name, and keeping only country codes of at most 3 characters.
    """
    raw = join_datasets()

    # '..' is the placeholder used for missing values in the source files
    with_nans = raw.replace('..', np.nan)

    # Rows without these identifying columns cannot be used downstream
    identified = with_nans.dropna(subset=['Series Code', 'Country Code', 'Series Name'])

    # Anything longer than 3 characters is not a valid country code
    has_valid_code = identified['Country Code'].str.len() <= 3
    return identified[has_valid_code]

##### Reshape to long format for easier use in visualizations

In [5]:
def wide_long_wb():
    """Reshape the cleaned table from one column per year to one row per year.

    Every column whose name contains 'YR' is treated as a year column. The
    result has an integer 'YEAR' column (the 4-digit year taken from the
    original column name) and a numeric 'Value' column; values that cannot
    be parsed as numbers become NaN.
    """
    wide = clean_dataset()

    # Split the columns into per-year measurements and identifying keys
    year_columns = [name for name in wide.columns if 'YR' in name]
    key_columns = [name for name in wide.columns if name not in year_columns]

    tidy = wide.melt(id_vars=key_columns,
                     value_vars=year_columns,
                     var_name='YEAR',
                     value_name='Value')

    # Keep only the 4-digit year found in the original column label
    four_digit_year = tidy['YEAR'].str.extract(r'(\d{4})')[0]
    tidy['YEAR'] = four_digit_year.astype(int)

    # Measurements arrive as text; unparseable entries are coerced to NaN
    tidy['Value'] = pd.to_numeric(tidy['Value'], errors='coerce')

    return tidy

##### Add region, sub-region, and decade details

In [6]:
def add_continents(df, continents_path='/Users/paulacadena/CAPP30239-SP/data/old/continents2.csv'):
    """Attach region, sub-region and numeric country code to each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Country Code' (matched against the lookup's 'alpha-3')
        and 'Country Name'. It is not modified.
    continents_path : str, optional
        CSV providing the 'alpha-3', 'region', 'sub-region' and
        'country-code' columns. Defaults to the path this notebook was
        originally written for.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-joined with the lookup. Rows without a match get
        'country-code' -1; their 'region' is 'Aggregated data', except
        Kosovo and Channel Islands, which are assigned to 'Europe'.
    """
    continents = pd.read_csv(continents_path,
                             usecols=['alpha-3', 'region', 'sub-region', 'country-code'])

    merged = df.merge(continents, left_on='Country Code', right_on='alpha-3', how='left')
    merged = merged.drop(columns='alpha-3')

    # Unmatched rows have NaN here; -1 keeps the column integer-typed
    merged['country-code'] = merged['country-code'].fillna(-1).astype(int)

    # These two are assigned a region by hand before the generic fallback
    europe_by_hand = merged['Country Name'].isin(['Kosovo', 'Channel Islands'])
    merged.loc[europe_by_hand, 'region'] = 'Europe'

    # Whatever is still unmatched is labelled as aggregated data
    merged['region'] = merged['region'].fillna('Aggregated data')
    return merged

In [7]:
def add_decade(df):
    """Return a copy of ``df`` with an integer 'YEAR' and a derived 'DECADE'.

    'DECADE' is the year rounded down to a multiple of ten (1999 -> 1990).
    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.
    """
    years = df['YEAR'].astype(int)
    return df.assign(YEAR=years, DECADE=(years // 10) * 10)

In [8]:
def world_bank_complete():
    """Build the analysis-ready table used by the charts in this notebook.

    Runs the long-format reshape, adds continent and decade columns, renames
    the main columns to shorter display names, and rounds 'Value' to two
    decimals when that column is present.
    """
    display_names = {
        'Country Name': 'Country',
        'Series Name': 'Series',
        'YEAR': 'Year',
        'region': 'Region',
        'sub-region': 'Sub-region',
    }

    table = add_decade(add_continents(wide_long_wb()))
    table = table.rename(columns=display_names)

    if 'Value' not in table.columns:
        return table

    table['Value'] = table['Value'].round(2)
    return table

In [85]:
# Notebook-level copy of the full table; histogram_missing below reads this
# global rather than rebuilding the data itself.
world_bank = world_bank_complete()

#### Exploration

In [10]:
# Column dtypes, row count and memory footprint of the combined table
world_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8018304 entries, 0 to 8018303
Data columns (total 10 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Country       object 
 1   Country Code  object 
 2   Series        object 
 3   Series Code   object 
 4   Year          int64  
 5   Value         float64
 6   country-code  int64  
 7   Region        object 
 8   Sub-region    object 
 9   DECADE        int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 611.7+ MB


In [11]:
# Distinct indicator names available in the 'Series' column
world_bank['Series'].unique()

array(['GDP (constant 2015 US$)',
       'Average working hours of children, study and work, ages 7-14 (hours per week)',
       'Average working hours of children, study and work, female, ages 7-14 (hours per week)',
       'Average working hours of children, study and work, male, ages 7-14 (hours per week)',
       'Average working hours of children, working only, ages 7-14 (hours per week)',
       'Average working hours of children, working only, female, ages 7-14 (hours per week)',
       'Average working hours of children, working only, male, ages 7-14 (hours per week)',
       'Child employment in agriculture (% of economically active children ages 7-14)',
       'Child employment in agriculture, female (% of female economically active children ages 7-14)',
       'Child employment in agriculture, male (% of male economically active children ages 7-14)',
       'Child employment in manufacturing (% of economically active children ages 7-14)',
       'Child employment in manufact

In [18]:
def histogram_missing(variable, bins, df=None):
    """Display a histogram of how many 'Value' entries are missing per group.

    Parameters
    ----------
    variable : str
        Column to group by (for example 'Series' or 'Country').
    bins : int
        Maximum number of histogram bins.
    df : pandas.DataFrame, optional
        Table with ``variable`` and 'Value' columns. When omitted, the
        notebook-level ``world_bank`` frame is used, as before.
    """
    if df is None:
        # Backward-compatible fallback to the global assigned earlier in the notebook
        df = world_bank

    # Count NaNs per group with a vectorized sum over the boolean mask
    # instead of calling a Python lambda once per group.
    missing_values = (
        df['Value'].isna()
        .groupby(df[variable])
        .sum()
        .rename('MissingValues')
        .reset_index()
    )

    histogram = alt.Chart(missing_values).mark_bar().encode(
        alt.X('MissingValues:Q', bin=alt.Bin(maxbins=bins), title='Number of Missing Values'),
        alt.Y('count()', title=f'Count of {variable}')
    ).properties(
        title=f'Histogram of Missing Values by {variable}',
        width=800,
        height=400
    )

    histogram.display()

In [19]:
# Missing-value counts grouped by indicator ('Series'), at most 40 bins
histogram_missing('Series',40)

In [20]:
# Missing-value counts grouped by 'Country', at most 40 bins
histogram_missing('Country',40)

## Visualizations

#### Theme for altair

In [112]:
import altair as alt

def custom_theme():
    """Return the Altair theme configuration shared by the notebook's charts.

    The result is a dict with a single 'config' key holding title, axis,
    legend, default view size and colour-range settings.
    """
    dark_grey = "#333333"

    # Chart titles: left-aligned Helvetica
    title_cfg = {
        "fontSize": 16,
        "font": "Helvetica",
        "anchor": "start",
        "color": dark_grey,
    }

    # Axis labels and axis titles share font and colour
    axis_cfg = {
        "labelFontSize": 12,
        "titleFontSize": 14,
        "labelFont": "Arial",
        "titleFont": "Arial",
        "labelColor": dark_grey,
        "titleColor": dark_grey,
    }

    legend_cfg = {
        "labelFontSize": 10,
        "titleFontSize": 12,
        "labelFont": "Arial",
        "titleFont": "Arial",
    }

    # Default chart size when a chart does not set its own
    view_cfg = {"width": 600, "height": 400}

    # Palettes per scale kind
    range_cfg = {
        "category": [
            "#742183", "#166417", "#f8c7cc", "#edae49", "#81a684",
            "#113447", "#AACEBE", "#c6e2e9", "#2c8c99", "#73eedc",
        ],
        "diverging": {"scheme": "purplegreen"},
        "heatmap": {"scheme": "purplegreen"},
        "ordinal": ["#166417", "#742183"],
    }

    config = {
        "title": title_cfg,
        "axis": axis_cfg,
        "legend": legend_cfg,
        "view": view_cfg,
        "range": range_cfg,
    }
    return {"config": config}

# Register the theme under a name, then make it the active theme for the
# charts rendered after this cell.
alt.themes.register("custom_theme", custom_theme)
alt.themes.enable("custom_theme")


ThemeRegistry.enable('custom_theme')

1. Map

In [73]:
def _map_color_scale(min_value, max_value):
    """Build the colour scale shared by both maps from the observed value range."""
    if pd.isna(min_value) or pd.isna(max_value):
        # Nothing to plot: let Altair choose the domain instead of emitting NaN
        return alt.Scale()
    low, high = float(min_value), float(max_value)
    if low < 0 < high:
        # Data straddle zero: pin zero as the midpoint of the colour ramp
        return alt.Scale(domain=[low, 0, high])
    # A [min, 0, max] domain would not be monotonic here, so use the plain range
    return alt.Scale(domain=[low, high])


def _choropleth(source, background, lookup_df, color_scale, title):
    """Layer country shapes coloured by 'Value' on top of the grey background map."""
    shapes = (
        alt.Chart(source)
        .mark_geoshape(stroke='black', strokeWidth=0.15)
        .encode(
            color=alt.Color('Value:Q', scale=color_scale, legend=alt.Legend()),
            tooltip=[
                alt.Tooltip('Country:N', title='Country'),
                alt.Tooltip('Year:O', title='Year'),
                alt.Tooltip('Value:Q', title='Value')
            ]
        )
        .transform_lookup(
            lookup='id',
            from_=alt.LookupData(lookup_df, 'country-code', ['Value', 'Country', 'Year'])
        )
    )
    return (background + shapes).properties(title=title).project('equalEarth')


def world_map(series):
    """Return two stacked world maps for ``series``: first and last available year.

    For every country the earliest and the latest year with a non-missing
    value are shown, one map each, sharing a single colour scale.

    Parameters
    ----------
    series : str
        Indicator name, matched exactly against the 'Series' column.

    Returns
    -------
    altair.VConcatChart
        The first-year map above the last-year map.
    """
    world_bank = world_bank_complete()

    # Rows whose country code is -1 had no match in the country lookup, so
    # they can never be joined to a map shape; leaving them in would only
    # distort the legend range.
    observed = world_bank[(world_bank['Series'] == series) &
                          (world_bank['country-code'] != -1)]

    # Drop missing values BEFORE picking the first/last row. groupby().first()
    # takes the first non-null entry of each column independently, which
    # paired a value with the year of a different row.
    observed = observed.dropna(subset=['Value']).sort_values(by=['Country', 'Year'])
    lookup_columns = ['country-code', 'Value', 'Country', 'Year']
    first_data = observed.drop_duplicates(subset='Country', keep='first')[lookup_columns]
    last_data = observed.drop_duplicates(subset='Country', keep='last')[lookup_columns]

    # One scale for both maps so colours are comparable between them
    combined_data = pd.concat([first_data, last_data])
    color_scale = _map_color_scale(combined_data['Value'].min(),
                                   combined_data['Value'].max())

    # Base world map
    source = alt.topo_feature(data.world_110m.url, 'countries')
    background = alt.Chart(source).mark_geoshape(fill='lightgray', stroke='white')

    first_map = _choropleth(source, background, first_data, color_scale,
                            'First Available Year by Country')
    last_map = _choropleth(source, background, last_data, color_scale,
                           'Last Available Year by Country')

    final_map = alt.vconcat(
        first_map,
        last_map
    ).configure_view(strokeWidth=0).properties(title=f'{series}')

    return final_map

# First/last-available-year maps for annual population growth
world_map('Population growth (annual %)')

2. Double axis

3. Heatmap

In [89]:
def heat_map_series(series, y_title='% Population'):
    """Display a 2D histogram of ``series`` values over time.

    Each cell counts the country-year observations that fall into a
    (year bin, value bin) pair. Aggregated regions and years from 2023
    onwards (incomplete data) are excluded.

    Parameters
    ----------
    series : str
        Indicator name, matched exactly against the 'Series' column.
    y_title : str, optional
        Title of the value axis. Defaults to the original '% Population'.
    """
    world_bank = world_bank_complete()

    # Filter data by series, exclude aggregate observations and incomplete years
    keep = ((world_bank['Series'] == series) &
            (world_bank['Region'] != 'Aggregated data') &
            (world_bank['Year'] < 2023))

    # Only one series is selected, so 'Value' already is the column to plot.
    # Pivoting it into a column named after the indicator was redundant,
    # raises on duplicate (Country, Year) rows, and turned free-text
    # indicator names into chart field names.
    heat_data = (world_bank.loc[keep, ['Country', 'Region', 'Year', 'Value']]
                 .dropna(subset=['Value']))

    # 2D histogram heatmap
    heat_map = alt.Chart(heat_data).mark_rect().encode(
        alt.X('Year:Q').bin(maxbins=62),
        alt.Y('Value:Q', title=y_title).bin(maxbins=40),
        alt.Color('count():Q', title='Count')
    ).properties(
        title=f'{series}'
    )
    heat_map.display()

# Heatmap of adult literacy rates by year
heat_map_series('Literacy rate, adult total (% of people ages 15 and above)')

4. Bars / Stacked

In [106]:
def top_bottom_bars(series, year, n=10):
    """Return a bar chart of the ``n`` highest and ``n`` lowest countries.

    Parameters
    ----------
    series : str
        Indicator name, matched exactly against the 'Series' column.
    year : int
        Year to chart.
    n : int, optional
        How many countries to take from each end of the ranking (default 10).

    Returns
    -------
    altair.Chart
        Horizontal bars sorted by value; positive values are green and the
        rest purple. The chart's ``data`` attribute holds the plotted rows.
    """
    world_bank = world_bank_complete()

    # Filter data based on the provided series and year, without aggregates
    selected = world_bank[(world_bank['Series'] == series) &
                          (world_bank['Region'] != 'Aggregated data') &
                          (world_bank['Year'] == year)]

    # Drop any missing values in 'Value' to avoid issues in sorting
    selected = selected.dropna(subset=['Value'])

    top_n = selected.nlargest(n, 'Value')
    bottom_n = selected.nsmallest(n, 'Value')
    top_bottom = pd.concat([top_n, bottom_n])

    # With fewer than 2*n countries the two selections overlap. A row that
    # appears twice would be stacked by the bar mark and drawn at double
    # its value, so keep each original row only once.
    top_bottom = top_bottom[~top_bottom.index.duplicated(keep='first')]

    bar_chart = alt.Chart(top_bottom).mark_bar().encode(
        x=alt.X('Value:Q', title=f'{series} Value'),
        y=alt.Y('Country:N', sort='-x', title='Country'),
        color=alt.condition(
            alt.datum['Value'] > 0,
            alt.value('#166417'),
            alt.value('#742183')),

        tooltip=['Country', 'Value']
    ).properties(
        title=f'{year}'
    )

    return bar_chart

In [107]:
def bars_twoyears(series, year):
    """Display top/bottom bar charts for ``year - 1`` and ``year`` on a shared x scale.

    Parameters
    ----------
    series : str
        Indicator name passed on to ``top_bottom_bars``.
    year : int
        The later of the two years; the earlier one is ``year - 1``.
    """
    initial_year = top_bottom_bars(series, year)
    year_before = top_bottom_bars(series, year - 1)

    # Pool both years' values. pandas skips NaN, whereas Python's min/max
    # would return NaN whenever one of the two years has no data.
    all_values = pd.concat([initial_year.data['Value'], year_before.data['Value']])

    if all_values.notna().any():
        # Always include zero so every bar is drawn from a true baseline
        lower = min(float(all_values.min()), 0.0)
        upper = max(float(all_values.max()), 0.0)
        shared_scale = alt.Scale(domain=[lower, upper])
    else:
        # No data in either year: leave the domain to Altair
        shared_scale = alt.Scale()

    # Re-encoding x replaces the whole channel, so repeat the axis title
    # that top_bottom_bars set.
    shared_x = alt.X('Value:Q', title=f'{series} Value', scale=shared_scale)
    initial_year = initial_year.encode(x=shared_x)
    year_before = year_before.encode(x=shared_x)

    # Concatenate both charts, earlier year on top
    final_chart = alt.vconcat(
        year_before,
        initial_year
    ).properties(title=f'{series}: Top 10 and Bottom 10 Countries')

    final_chart.display()

# Net migration in 2023 versus the year before (bars_twoyears charts year and year - 1)
bars_twoyears('Net migration', 2023)

5. Lines

6. Area

In [None]:
def area_pop(series1, series2):
    """Display a normalized stacked area chart of two series over time.

    Values are summed over all non-aggregated countries for each year and
    series, then shown as each series' share of the two-series total.

    Parameters
    ----------
    series1, series2 : str
        Indicator names, matched exactly against the 'Series' column.
    """
    world_bank = world_bank_complete()

    # Data adjusted for chart: the two series, no aggregates, no missing values
    selected = world_bank[(world_bank['Series'].isin([series1, series2])) &
                          (world_bank['Region'] != 'Aggregated data')]
    selected = selected.dropna(subset=['Value'])
    totals = selected.groupby(['Year', 'Series'], as_index=False)['Value'].sum()

    # A bare integer in a temporal field is read as a timestamp in
    # milliseconds, which would place every year within seconds of 1970.
    # Convert to real dates (1 January of each year).
    totals['Year'] = pd.to_datetime(totals['Year'].astype(str), format='%Y')

    # Area chart
    area = alt.Chart(totals).mark_area().encode(
            x=alt.X("Year:T", title='Year'),
            # stack="normalize" yields fractions in [0, 1]; format them as
            # percentages to match the axis title.
            y=alt.Y("Value:Q", stack="normalize", axis=alt.Axis(format='%'),
                    title='Percentage of total population'),
            color="Series:O"
        ).properties(title=f'{series1} vs {series2}: Worldwide')
    area.display()

# Rural versus urban population as shares of their combined total
area_pop('Rural population','Urban population')

7. Scatter plot

In [114]:
def _pair_latest_common_year(observations, series1, series2):
    """Return one row per country: the latest year in which both series have a value.

    The result has columns Country, Region, Year, x_value (series1) and
    y_value (series2).
    """
    keys = ['Country', 'Region', 'Year']
    x_side = (observations.loc[observations['Series'] == series1, keys + ['Value']]
              .rename(columns={'Value': 'x_value'}))
    y_side = (observations.loc[observations['Series'] == series2, keys + ['Value']]
              .rename(columns={'Value': 'y_value'}))

    # Inner join keeps only country-years where BOTH series were observed
    paired = x_side.merge(y_side, on=keys, how='inner')
    return (paired.sort_values(['Country', 'Year'])
                  .drop_duplicates(subset='Country', keep='last'))


def scatter_plot(series1, series2, title, axis1, axis2, max_value=100):
    """Display a scatter plot comparing two series country by country.

    Each point is one country at the most recent year for which both series
    have a value. A faint diagonal marks where the two series are equal.

    Parameters
    ----------
    series1, series2 : str
        Indicator names plotted on the x and y axis respectively.
    title : str
        Chart title.
    axis1, axis2 : str
        Titles for the x and y axis.
    max_value : float, optional
        Upper limit of both axes and of the diagonal (default 100).
    """
    world_bank = world_bank_complete()

    # Keep the two series, excluding aggregated observations and missing values
    filtered_df = world_bank[(world_bank['Series'].isin([series1, series2])) &
                             (world_bank['Region'] != 'Aggregated data')]
    filtered_df = filtered_df.dropna(subset=['Value'])

    # Picking the latest year per series and then pivoting on Year split a
    # country into two half-empty rows whenever the two series ended in
    # different years, so it silently vanished from the plot. Pairing on the
    # latest common year keeps it. The generic x_value/y_value columns also
    # avoid using free-text indicator names as chart field names.
    plot_df = _pair_latest_common_year(filtered_df, series1, series2)

    # Scatter plot with guide line
    scatter = alt.Chart(plot_df).mark_circle(size=60).encode(
        alt.X('x_value:Q', title=axis1, scale=alt.Scale(domain=[0, max_value])),
        alt.Y('y_value:Q', title=axis2, scale=alt.Scale(domain=[0, max_value])),
        alt.Color('Region:N'),
        tooltip=[
            alt.Tooltip('Country:N'),
            alt.Tooltip('Region:N'),
            alt.Tooltip('Year:O'),
            alt.Tooltip('x_value:Q', title=series1),
            alt.Tooltip('y_value:Q', title=series2)
        ]
    )

    diagonal_line = alt.Chart(pd.DataFrame({'x': [0, max_value], 'y': [0, max_value]})).mark_line(
        color='black',
        opacity=0.3
    ).encode(
        x='x:Q',
        y='y:Q'
    )

    final_plot = (scatter + diagonal_line
                  ).properties(title=f'{title}')

    final_plot.display()

In [115]:
# Female (x) versus male (y) share of wage and salaried workers
scatter_plot('Wage and salaried workers, female (% of female employment) (modeled ILO estimate)', 
             'Wage and salaried workers, male (% of male employment) (modeled ILO estimate)',
             'Wage and salaried workers by gender, modeled ILO estimate',
             '% Female', '% Male')

In [116]:
# Female (x) versus male (y) share of youth not in education, employment or training.
# NOTE(review): the male series name contains a double space before
# "(modeled ILO estimate)" — presumably this matches the source data; confirm.
scatter_plot('Share of youth not in education, employment or training, female (% of female youth population) (modeled ILO estimate)', 
             'Share of youth not in education, employment or training, male (% of male youth population)  (modeled ILO estimate)',
             'Youth not in education, employment or training by gender, modeled ILO estimate',
             '% Female', '% Male')

8. Trends

9. Population Pyramid