# Unveiling Global Trends: Socioeconomic Patterns Across Nations

In [2]:
import pandas as pd
import dask.dataframe as dd
import altair as alt
# Switch Altair to its "vegafusion" data transformer for every chart built below
# (presumably so the large frames are not rejected by the default row limit — TODO confirm).
alt.data_transformers.enable("vegafusion")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



DataTransformerRegistry.enable('vegafusion')

### 1. Data frame



Source: Steven Ruggles, Lara Cleveland, Rodrigo Lovaton, Sula Sarkar, Matthew Sobek, Derek Burk, Dan Ehrlich, Quinn Heimann, Jane Lee. Integrated Public Use Microdata Series, International: Version 7.5 [dataset]. Minneapolis, MN: IPUMS, 2024.
https://doi.org/10.18128/D020.V7.5

In [46]:
def ipums_df():
    """Load the IPUMS extract lazily and keep only the columns the charts use.

    Returns:
        A Dask DataFrame read from ``ipumsi_00007.csv`` with every column of the
        file except the ones named in ``unused_columns``.
    """
    # Columns present in the extract that none of the visualizations need.
    unused_columns = {
        'SAMPLE', 'SERIAL', 'HHWT', 'FORMTYPE', 'PERNUM', 'PERWT', 'MARSTD',
        'EDATTAIND', 'EMPSTATD', 'BPLCOUNTRY', 'CITIZEN', 'INCTOT', 'OCC',
    }

    raw = dd.read_csv('ipumsi_00007.csv', assume_missing=True)

    # Preserve the file's column order while leaving out the unused ones.
    kept_columns = [name for name in raw.columns if name not in unused_columns]
    return raw[kept_columns]

In [5]:
def add_countries_names(df):
    """Attach the columns of ``countries.csv`` to ``df`` by matching on ``COUNTRY``.

    Args:
        df: pandas DataFrame with a ``COUNTRY`` column of numeric country codes.

    Returns:
        A new DataFrame: ``df`` with ``COUNTRY`` cast to float, left-joined with
        the lookup table read from ``countries.csv`` (the charts in this notebook
        read ``COUNTRY_NAME`` from it). Rows whose code has no match keep missing
        values in the lookup columns. The frame passed in is not modified.
    """
    countries = pd.read_csv('countries.csv')

    # Cast both keys to float so integer and float codes compare equal in the merge.
    # assign() builds new frames, so the caller's DataFrame keeps its original dtype.
    keyed = df.assign(COUNTRY=df['COUNTRY'].astype(float))
    countries = countries.assign(COUNTRY=countries['COUNTRY'].astype(float))

    return keyed.merge(countries, on='COUNTRY', how='left')

In [6]:
def add_decade(df):
    """Return ``df`` with an integer ``YEAR`` and a derived ``DECADE`` column.

    Args:
        df: pandas DataFrame with a numeric ``YEAR`` column. ``astype(int)`` fails
            if the column holds missing values.

    Returns:
        A new DataFrame with ``YEAR`` cast to int and ``DECADE`` appended as the
        last column. The frame passed in is not modified.
    """
    year = df['YEAR'].astype(int)
    # Floor each year to the first year of its decade, e.g. 1997 -> 1990.
    return df.assign(YEAR=year, DECADE=(year // 10) * 10)

In [7]:
def add_age_groups(df):
    """Return ``df`` with an ``AGE_GROUP`` column that buckets ``AGE``.

    Buckets are left-closed and match their labels exactly:
    0-17, 18-25, 26-40, 41-64 and 65+.

    Args:
        df: pandas DataFrame with a numeric ``AGE`` column.

    Returns:
        A new DataFrame with a categorical ``AGE_GROUP`` column appended. Ages
        outside the buckets (negative, missing, or the unknown code 999) get a
        missing ``AGE_GROUP``. The frame passed in is not modified.
    """
    # 999 is the value other cells of this notebook filter out as an unknown age.
    # Using it as the exclusive upper edge keeps every real age (including 100)
    # in the top bucket while leaving 999 itself unlabeled.
    unknown_age = 999

    # With right=False each bucket is [lower, upper), so the edges are the first
    # age of each label: [0, 18) -> '0-17', [18, 26) -> '18-25', and so on.
    bins = [0, 18, 26, 41, 65, unknown_age]
    labels = ['0-17', '18-25', '26-40', '41-64', '65+']

    age_group = pd.cut(df['AGE'], bins=bins, labels=labels, right=False)
    return df.assign(AGE_GROUP=age_group)

**Variables Available**

COUNTRY (Country)

YEAR (Year)

RESIDENT (Residence status: de facto, de jure)

AGE (Age)

SEX (Sex)

MARST (Marital status [general version] - 0 NIU 1 single 2 married 3 separated 4 widowed 9 unknown)

BIRTHYR (Year of birth)

BIRTHMO (Month of birth)

CHBORN (Children ever born)

CHSURV (Children surviving)

CHBORNF (Number of female children ever born)

CHBORNM (Number of male children ever born)

NATIVITY (Nativity status - 0 NIU 1 Native-born 2 Foreign-born 9 Unknown)

SCHOOL (School attendance - 0 NIU 1 yes 2 no 3 no, attended in the past 4 no, never attended 9 unknown)

LIT (Literacy - 0 NIU 1 illiterate 2 literate 9 unknown)

EDATTAIN (Educational attainment, international recode [general version] 0 NIU 1 Less than primary completed 2 Primary completed 3 Secondary completed 4 University completed 9 Unknown)

YRSCHOOL (Years of schooling)

EMPSTAT (Activity status (employment status) [general version] - 0 NIU 1 Employed 2 Unemployed 3 Inactive 9 Unknown)

LABFORCE (Labor force participation - 1 No, not in the labor force 2 Yes, in the labor force 8 Unknown 9 NIU)

### 2. Visualizations

Each visualization is built from two functions: one that prepares the data frame and one that draws the chart. The data-preparation functions reuse the helpers defined above to load the full data frame, run the calculations and, where needed, add country names.

#### Education and Labor Force by Country and Gender after 2000

In [9]:
def education_labor():
    """Count labor-force members by country, sex, education level and year.

    Only records from 2000 onward, with a known education level and with
    ``LABFORCE == 2`` (in the labor force) are counted.

    Returns:
        pandas DataFrame with one row per (COUNTRY, SEX, EDATTAIN, YEAR) holding
        ``Count``, the columns added by ``add_countries_names``, a ``SEX_LABEL``
        column, and ``EDATTAIN`` codes replaced by their text labels.
    """
    sex_names = {1: 'Male', 2: 'Female', 9: 'Unknown'}
    education_names = {
        1: 'Less than primary completed',
        2: 'Primary completed',
        3: 'Secondary completed',
        4: 'University completed',
    }

    records = ipums_df()

    # One combined mask: known education (not 0 / 9), year 2000 or later,
    # and in the labor force.
    keep = (
        (records['EDATTAIN'] != 0)
        & (records['EDATTAIN'] != 9)
        & (records['YEAR'] > 1999)
        & (records['LABFORCE'] == 2)
    )
    records = records[keep]

    # Count rows per group lazily, then materialize the small result.
    counts = (
        records.groupby(['COUNTRY', 'SEX', 'EDATTAIN', 'YEAR'])
        .size()
        .to_frame(name='Count')
        .reset_index()
        .compute()
    )

    labeled = add_countries_names(counts)
    labeled['SEX_LABEL'] = labeled['SEX'].map(sex_names)
    labeled['EDATTAIN'] = labeled['EDATTAIN'].replace(education_names)
    return labeled


In [10]:
def chart_educ_labor():
    """Build the normalized bar chart of labor-force composition by sex.

    One bar per country, stacked by sex and normalized to 100%, with a
    horizontal rule at 50%; one facet row per education level.

    Returns:
        The faceted Altair chart.
    """
    data = education_labor()
    # Constant column read by the rule layer to draw the 50% reference line.
    data['Distribution'] = 0.5

    half_line = alt.Chart().mark_rule().encode(y='Distribution')

    stacked_bars = alt.Chart(data).mark_bar().encode(
        x=alt.X('COUNTRY_NAME:N', title='Country'),
        y=alt.Y('Count:Q', stack="normalize", title='Percentage'),
        color=alt.Color('SEX_LABEL:N', title='Sex'),
        tooltip=['COUNTRY_NAME', 'SEX_LABEL', 'Count:Q', 'EDATTAIN'],
    )

    # Both layers share the same frame, supplied once at the layer level.
    layered = alt.layer(stacked_bars, half_line, data=data)
    by_education = layered.facet(
        row=alt.Row('EDATTAIN:N', title='Educational Attainment')
    )
    return by_education.properties(
        title='Education and Labor Force by Country and Gender after 2000'
    )

In [11]:
# The returned chart is the last expression, so the cell renders it as its output.
chart_educ_labor()

#### Age Distribution of Marital Status by Year

In [125]:
def age_martial_status():
    """Compute the share of each marital status within every decade and age group.

    Returns:
        pandas DataFrame with columns DECADE, AGE_GROUP, MARST (text label),
        Count, TotalCount and Percentage, where Percentage is Count as a
        percentage of the TotalCount of its (DECADE, AGE_GROUP).
    """
    status_names = {
        1: 'Single',
        2: 'Married',
        3: 'Separated',
        4: 'Widowed',
    }

    people = ipums_df()

    # Fold code 9 into NIU (0), drop NIU, then swap the codes for text labels.
    people = people.assign(MARST=people['MARST'].replace(9, 0))
    people = people[people['MARST'] != 0]
    people = people.assign(MARST=people['MARST'].map(status_names))

    # Count per year and single age lazily, then materialize the small result.
    yearly = (
        people.groupby(['YEAR', 'AGE', 'MARST'])
        .size()
        .to_frame(name="Count_year")
        .reset_index()
        .compute()
    )
    yearly = add_age_groups(add_decade(yearly))

    # Numerator: people per decade, age group and marital status.
    per_status = (
        yearly.groupby(['DECADE', 'AGE_GROUP', 'MARST'])['Count_year']
        .sum()
        .rename('Count')
        .reset_index()
    )
    # Denominator: everyone in the decade and age group.
    per_group = (
        yearly.groupby(['DECADE', 'AGE_GROUP'])['Count_year']
        .sum()
        .rename('TotalCount')
        .reset_index()
    )

    shares = per_status.merge(per_group, on=['DECADE', 'AGE_GROUP'])
    shares['Percentage'] = (shares['Count'] / shares['TotalCount']) * 100
    return shares

In [126]:
def age_martials_hm():
    """Build the heat map of marital-status shares by age group, one panel per decade.

    Returns:
        The Altair chart. It is returned instead of being displayed with
        ``.show()`` so that this builder behaves like the other chart functions
        in the notebook: the calling cell renders it as its output, and the
        caller can also save or compose it.
    """
    df = age_martial_status()

    heatmap = alt.Chart(df).mark_rect().encode(
        x='AGE_GROUP:O',
        y='MARST:N',
        color=alt.Color('Percentage:Q'),
        facet='DECADE:N'
    ).properties(
        # Size of each decade panel, not of the whole figure.
        width=150,
        height=300
    )
    return heatmap

In [127]:
# Draw the marital-status heat map.
age_martials_hm()

#### Children Born vs. Children Surviving by Educational Attainment and Residence Status

In [34]:
def children_education():
    """Average number of children ever born by education level, country and year.

    Returns:
        pandas DataFrame with one row per (EDATTAIN, COUNTRY, YEAR) holding
        ``avg_children_born``, the columns added by ``add_countries_names`` and
        a ``DECADE`` column. ``EDATTAIN`` holds text labels, and rows from the
        2020 decade are removed.
    """
    edattain_labels = {
        1: 'Less than primary completed',
        2: 'Primary completed',
        3: 'Secondary completed',
        4: 'University completed'
    }

    ipums = ipums_df()

    # Drop NIU (0) and unknown (9) education codes.
    ipums = ipums[ipums['EDATTAIN'] != 0]
    ipums = ipums[ipums['EDATTAIN'] != 9]

    # CHBORN carries high sentinel codes for "unknown" and "not in universe"
    # (98 / 99 in the IPUMS-International codebook — confirm against this extract).
    # Averaging them as if they were child counts would inflate every mean, so
    # only real counts are kept. Rows with a missing CHBORN are dropped as well.
    ipums = ipums[ipums['CHBORN'] < 98]

    df = ipums.groupby(['EDATTAIN', 'COUNTRY', 'YEAR']).agg(
        avg_children_born=('CHBORN', 'mean')
    ).reset_index()

    df['EDATTAIN'] = df['EDATTAIN'].replace(edattain_labels)
    df = add_countries_names(df.compute())
    df = add_decade(df)
    # The 2020 decade is left out of the chart.
    df = df[df['DECADE'] != 2020]

    return df

In [35]:
def scatter_children_edu():
    """Build the scatter plot of average children born against education level.

    One point per row of ``children_education()``, coloured by country, with
    one facet column per decade.

    Returns:
        The faceted Altair chart.
    """
    data = children_education()

    points = alt.Chart(data).mark_circle(size=60).encode(
        x=alt.X('EDATTAIN:N'),
        y=alt.Y('avg_children_born:Q', title='Average Children Born'),
        color=alt.Color('COUNTRY_NAME:N', title='Country'),
        tooltip=['COUNTRY_NAME', 'EDATTAIN', 'avg_children_born', 'DECADE'],
    )

    by_decade = points.facet(column=alt.Column('DECADE:N', title='Decade'))
    return by_decade.properties(title='Average children born by education level')


In [36]:
# The returned chart is the last expression, so the cell renders it as its output.
scatter_children_edu()

#### Employment Status by Age and School Attendance

In [1]:
def empl_age_school():
    """Count people by age, school attendance and employment status.

    Returns:
        pandas DataFrame with columns AGE, SCHOOL, EMPSTAT and Count, where
        SCHOOL and EMPSTAT hold text labels instead of numeric codes.
    """
    group_columns = ['AGE', 'SCHOOL', 'EMPSTAT']
    empstat_labels = {
        1: 'Employed',
        2: 'Unemployed',
        3: 'Inactive'
    }
    school_labels = {
        1: 'Yes',
        2: 'No',
        3: 'No, attended in the past',
        4: 'No, never attended'
    }

    ipums = ipums_df()

    # Drop NIU (0) and unknown (9) codes, plus the unknown-age code 999.
    ipums = ipums[ipums['SCHOOL'] != 0]
    ipums = ipums[ipums['SCHOOL'] != 9]
    ipums = ipums[ipums['EMPSTAT'] != 0]
    ipums = ipums[ipums['EMPSTAT'] != 9]
    ipums = ipums[ipums['AGE'] != 999.0]

    # Only require values in the columns this count uses. A bare dropna() would
    # also discard every row with a blank in any unrelated column of the extract.
    ipums = ipums.dropna(subset=group_columns)

    empl_age_school = ipums.groupby(group_columns).size()
    empl_age_school = empl_age_school.to_frame(name="Count").reset_index()

    # Replace the numeric codes with their text labels.
    empl_age_school['EMPSTAT'] = empl_age_school['EMPSTAT'].map(empstat_labels)
    empl_age_school['SCHOOL'] = empl_age_school['SCHOOL'].map(school_labels)

    df = empl_age_school.compute()

    return df

In [26]:
def line_empl_age_school():
    """Build two stacked line charts: school attendance and employment status by age.

    The input frame has one row per (AGE, SCHOOL, EMPSTAT). Each chart looks at
    only one of the two categories, so the counts are summed over the other one
    before being normalized; otherwise every line would get several points per age.

    Returns:
        The vertically concatenated Altair chart with a shared y scale.
    """
    df = empl_age_school()

    def share_by_age(group_field, channel, title):
        # One line per category of group_field, drawn through `channel`.
        # Every non-aggregated field in the encoding becomes a group-by key, so
        # the tooltip must use the same aggregate as y instead of the raw Count.
        total = 'sum(Count):Q'
        category = f'{group_field}:N'
        return alt.Chart(df).mark_line(interpolate='monotone').encode(
            x='AGE:Q',
            y=alt.Y(total, stack="normalize", title='Percentage'),
            tooltip=['AGE:Q', alt.Tooltip(total, title='Count'), category],
            **{channel: category},
        ).properties(
            width=600,
            height=400,
            title=title
        )

    school = share_by_age('SCHOOL', 'strokeDash', 'School Attendance by Age')
    employment = share_by_age('EMPSTAT', 'color', 'Employment Status by Age')

    # Stack the two charts vertically on a shared y scale.
    combined_chart = alt.vconcat(school, employment).resolve_scale(
        y='shared'
    )

    return combined_chart


In [27]:
# The returned chart is the last expression, so the cell renders it as its output.
line_empl_age_school()

#### Nativity vs. Literacy Rates by Year and Country

In [54]:
def literacy_rate():
    """Count literate and illiterate people by year and country.

    Returns:
        pandas DataFrame with one row per (YEAR, COUNTRY, LIT) holding ``Count``,
        the columns added by ``add_countries_names``, a ``LIT_LABEL`` column and
        a ``DECADE`` column.
    """
    lit_labels = {
        1: 'Illiterate',
        2: 'Literate',
    }

    ipums = ipums_df()

    # Keep only known literacy codes and drop the unknown-age code 999.
    ipums = ipums[ipums['LIT'].isin([1, 2])]
    ipums = ipums[ipums['AGE'] != 999.0]

    # Only require values in the columns this function uses. A bare dropna()
    # would also discard every row with a blank in any unrelated column, which
    # removes whole countries and years from the result.
    ipums = ipums.dropna(subset=['YEAR', 'COUNTRY', 'LIT', 'AGE'])

    # Grouping and counting the data
    literacy_df = ipums.groupby(['YEAR', 'COUNTRY', 'LIT']).size()
    literacy_df = literacy_df.to_frame(name="Count").reset_index()

    # Add country names, literacy labels and decades to the small computed result.
    df = add_countries_names(literacy_df.compute())
    df['LIT_LABEL'] = df['LIT'].map(lit_labels)
    df = add_decade(df)

    return df


In [55]:
def literacy_chart():
    """Build the normalized area chart of literate vs. illiterate share by decade.

    The input frame has one row per (YEAR, COUNTRY, LIT), so counts are summed
    per decade and literacy label before being normalized. The two series are
    told apart by colour, because a dash pattern does not change an area's fill.

    Returns:
        The Altair chart.
    """
    df = literacy_rate()

    # Every non-aggregated field in the encoding becomes a group-by key, so the
    # tooltip must use the same aggregate as y instead of the raw Count.
    total = 'sum(Count):Q'

    chart = alt.Chart(df).mark_area().encode(
        x=alt.X('DECADE:O', title='Decade'),
        y=alt.Y(total, stack="normalize", title='Percentage'),
        color=alt.Color('LIT_LABEL:N', title='Literacy'),
        tooltip=['DECADE:O', alt.Tooltip(total, title='Count'), 'LIT_LABEL:N']
    ).properties(
        width=300,
        height=200,
        title='Literacy Rate by Decade'
    )

    return chart


In [None]:
# FIXME: not working correctly — the chart shows fewer countries and fewer decades than expected.
literacy_chart()