In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import altair as alt
from vega_datasets import data

from sklearn.preprocessing import MinMaxScaler

# Potential Drivers of Gender Inequality Comparison Graphs

In [2]:
# Read the prepared grade 4 and grade 8 tables from their CSV files
df_4, df_8 = (
    pd.read_csv(csv_path)
    for csv_path in ('grade_4_final.csv', 'grade_8_final.csv')
)

### Scale target variables for visualizing

In [3]:
def scale_values(df):
    """Drop rows without country indicators and add 0-100 scaled averages.

    Rows whose 'Share of seats in parliament held by women' or
    'Government expenditure on education' equals the string 'No Information'
    are removed and both columns are converted to float. Two columns are
    then added:

    * 'average_interest_scaled' - mean of the min-max scaled 'math_interest'
      and 'science_interest' columns
    * 'average_score_scaled' - mean of the min-max scaled 'math' and
      'science' columns

    Each source column is scaled on its own to the range 0-100.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the six columns named above.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df`` with the converted and added columns.
    """
    # Keep only rows where both country indicators are available
    has_indicators = (
        (df['Share of seats in parliament held by women'] != 'No Information')
        & (df['Government expenditure on education'] != 'No Information')
    )
    # Explicit copy: the column assignments below must write to an independent
    # frame, not to a slice of the caller's frame (chained assignment).
    df = df[has_indicators].copy()

    # Convert variables of interest to float
    df['Share of seats in parliament held by women'] = df['Share of seats in parliament held by women'].astype(float)
    df['Government expenditure on education'] = df['Government expenditure on education'].astype(float)

    # Min max scaler (0-100 so the values are comparable with share of seats)
    scaler = MinMaxScaler(feature_range=(0, 100))

    # Scale the two interest columns and average them row by row
    scaled_interest = scaler.fit_transform(df[['math_interest', 'science_interest']])
    df['average_interest_scaled'] = (scaled_interest[:, 0] + scaled_interest[:, 1]) / 2

    # Scale the two score columns (the scaler is refitted) and average them
    scaled_scores = scaler.fit_transform(df[['math', 'science']])
    df['average_score_scaled'] = (scaled_scores[:, 0] + scaled_scores[:, 1]) / 2

    return df

In [4]:
# Filter and scale the grade 4 table with scale_values; df_4 is rebound to the result
df_4 = scale_values(df_4)

In [5]:
# Filter and scale the grade 8 table with scale_values; df_8 is rebound to the result
df_8 = scale_values(df_8)

In [6]:
def create_vis_df(df):
    """Build the long-form, per-country frame used by the comparison charts.

    Steps:

    1. Average every selected column per ('Adolescent birth rate', 'Country',
       'sex of student').
    2. Within each country, take the sex-1 mean minus the sex-2 mean of
       'average_interest_scaled' and 'average_score_scaled'.
       (Which sex each code denotes is not visible here - TODO confirm
       against the data codebook.)
    3. Keep one row per country (the 'sex of student' == 2 row, which holds
       the difference) and melt the three plotted measures into long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``scale_values``; must contain 'Country',
        'Gender Inequality Index', 'sex of student',
        'average_interest_scaled', 'average_score_scaled',
        'Share of seats in parliament held by women',
        'Government expenditure on education' and 'Adolescent birth rate'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Country', 'Gender Inequality Index',
        'Government expenditure on education', 'Adolescent birth rate',
        'type' (name of the measure) and 'count' (its value).
    """
    country_keys = ['Adolescent birth rate', 'Country']
    scaled_cols = ['average_interest_scaled', 'average_score_scaled']

    # Choose columns to use
    vis_df = df[['Country', 'Gender Inequality Index', 'sex of student', 'average_interest_scaled',
                 'average_score_scaled', 'Share of seats in parliament held by women',
                 'Government expenditure on education', 'Adolescent birth rate']]

    # Group by country and sex of student (rows come back sorted by the keys,
    # so within a country the sex-1 row precedes the sex-2 row)
    vis_df = vis_df.groupby(country_keys + ['sex of student']).mean().reset_index()

    # Calculate the difference inside each country only, so a country with a
    # single sex present yields NaN instead of a difference against the
    # neighbouring country's row. diff() is row minus previous row, hence * -1.
    gap = vis_df.groupby(country_keys)[scaled_cols].diff() * (-1)

    # Adding diff columns to the vis df (both taken from the differences)
    vis_df['interest_diff'] = gap['average_interest_scaled']
    vis_df['score_diff'] = gap['average_score_scaled']

    # Filter to the rows that contain the diff results
    vis_df = vis_df[vis_df['sex of student'] == 2]

    # Drop original columns
    vis_df = vis_df.drop(columns=['average_interest_scaled', 'average_score_scaled', 'sex of student'])

    # Rename columns to the facet titles shown on the chart
    vis_df = vis_df.rename(columns={'Share of seats in parliament held by women': 'Share of Seats in Parliament Held by Women',
                                    'interest_diff': 'Difference in Math & Science Interest Between Boys and Girls',
                                    'score_diff': 'Difference in Math & Science Ability Between Boys and Girls'})

    # Create long-form df: one row per (country, measure)
    vis_df = pd.melt(
        vis_df,
        id_vars=['Country', 'Gender Inequality Index',
                 'Government expenditure on education', 'Adolescent birth rate'],
        value_vars=['Share of Seats in Parliament Held by Women',
                    'Difference in Math & Science Interest Between Boys and Girls',
                    'Difference in Math & Science Ability Between Boys and Girls'],
        var_name='type', value_name='count')

    return vis_df

In [7]:
# Long-form per-country frame for the grade 4 charts
vis_df_4 = create_vis_df(df_4)

In [8]:
# Long-form per-country frame for the grade 8 charts
vis_df_8 = create_vis_df(df_8)

In [9]:
def make_com_chart(df, grade):
    """Build a row of scatter panels, one per value of the 'type' column.

    Each circle is one row of ``df``: x is 'Gender Inequality Index', y is
    'count', colour encodes 'Adolescent birth rate' and size encodes
    'Government expenditure on education'. Hovering shows 'Country'.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-form frame as returned by ``create_vis_df``.
    grade : int or str
        Grade label; used only in the y-axis title ('Grade <grade>').

    Returns
    -------
    altair.FacetChart
        The faceted chart (350x350 panels, faceted by column on 'type').
    """
    com_chart = alt.Chart(df).mark_circle().encode(
        x = alt.X('Gender Inequality Index', scale=alt.Scale(zero=False),
                   axis=alt.Axis(labels=True, ticks=False, domain=False, labelPadding=5, tickCount=5)),
        y = alt.Y('count', scale=alt.Scale(zero=False),
                  title=f'Grade {grade}', axis=alt.Axis(labels=True, ticks=False, domain=False,
                  tickCount=5, labelPadding=10)),
        color = alt.Color('Adolescent birth rate:Q', scale=alt.Scale(scheme='bluepurple'),
                         title='Adolescent Birth Rate'),
        tooltip = 'Country',
        size = alt.Size('Government expenditure on education',
                       scale=alt.Scale(range=[0, 500]),
                       title='Education Expenditure',
                       # Hex colours need the leading '#' to be valid colour strings
                       legend=alt.Legend(symbolFillColor='#7A7A7A')),
    ).properties(
            width=350,
            height=350

    ).facet(column=alt.Column('type:N',
                              title=None,
                              header=alt.Header(labelFontWeight='bold',
                                                labelFontSize=12))

    )

    return com_chart

In [10]:
# Faceted scatter row for grade 4
g4_graph = make_com_chart(vis_df_4, 4)

In [11]:
# Faceted scatter row for grade 8
g8_graph = make_com_chart(vis_df_8, 8)

In [12]:
# Stack the grade 4 row above the grade 8 row, then apply the shared title,
# legend and axis styling to the combined figure
(
    (g4_graph & g8_graph)
    .properties(
        title={
            'text': ['Potential Drivers of Gender Inequality Comparison Across Countries'],
            'subtitle': ['By Adolescent Birth Rate and Education Expenditure'],
        }
    )
    .configure_title(
        fontSize=30,
        fontWeight='bold',
        color='#5B2C6F',
        anchor='middle',
        subtitlePadding=3,
        subtitleColor='#8EA9DB',
        subtitleFontWeight='bolder',
        subtitleFontSize=25,
        offset=5,
    )
    .configure_legend(
        orient='top',
        columnPadding=10,
        offset=10,
        padding=20,
        symbolSize=350,
        labelFontSize=12,
        titleFontSize=15,
    )
    .configure_axis(
        labelFontSize=15,
        titleFontSize=17,
        titleFontWeight='bolder',
        titlePadding=10,
    )
)

# Parallel Coordinates Plot of Women Empowerment Indicators

In [2]:
# Load the dataset; the strings '..' and '—' are read as missing values
hdr_df_new = pd.read_csv('hdr_df_new.csv', na_values=['..', '—'])
# Remove the 'index' column stored in the CSV
hdr_df_new = hdr_df_new.drop(columns=['index'])

In [14]:
# Keep the country label, the GDI group and the indicators to be plotted
hdr_df_both = hdr_df_new.loc[:, [
    'Country',
    'Gender Inequality Index',
    'HDI Female',
    'Mean years of schooling Female',
    'Share of female employment in nonagriculture',
    'Maternal mortality ratio',
    'GDI Group',
    'Labour force participation rate Female',
    'Share at least some secondary education Female',
    'Share of seats held by women in parliament',
    'Share of female in STEM tertiary education programs',
    'Share of female STEM tertiary education graduates',
    'Share of female internet users',
    'Share of violence against women ever experienced by intimate partner',
    'Share of violence against women ever experienced by nonintimate partner',
]]

In [15]:
# Keep only the rows that have a value in every selected column
hdr_df_both = hdr_df_both.dropna(axis=0, how='any')

In [16]:
# Order the rows from lowest to highest Gender Inequality Index
hdr_df_both = hdr_df_both.sort_values('Gender Inequality Index', ascending=True)

In [17]:
# Every column except the two label columns is treated as a numerical indicator
value_cols = [
    name
    for name in hdr_df_both.columns
    if name not in ('Country', 'GDI Group')
]

In [18]:
# Cast every indicator column to float in one call
hdr_df_both = hdr_df_both.astype({name: float for name in value_cols})

### Scale the df to create graph

In [19]:
# Log transform columns with low variability (natural log, applied element-wise).
# NOTE: each column is overwritten, so re-running this cell applies the log again.
for log_col in (
    'Maternal mortality ratio',
    'Share of violence against women ever experienced by intimate partner',
    'Share of violence against women ever experienced by nonintimate partner',
):
    hdr_df_both[log_col] = hdr_df_both[log_col].map(np.log)

In [22]:
# Min-max scale every indicator column to the range 0-100
scaler = MinMaxScaler(feature_range=(0, 100))

# Fit on the indicator columns and transform them in one step
cols_to_scale = hdr_df_both[value_cols]
scaled_cols = scaler.fit_transform(cols_to_scale)

In [23]:
# Create new df from the scaled array, labelled with the indicator names
# (it gets a fresh 0..n-1 index; row order is that of hdr_df_both)
scaled_both_df = pd.DataFrame(data=scaled_cols, columns=value_cols)

In [24]:
# Re-attach the label columns by position; .tolist() discards the index of
# hdr_df_both so the values are not re-aligned by label
for label_col in ('Country', 'GDI Group'):
    scaled_both_df[label_col] = hdr_df_both[label_col].tolist()

In [25]:
# Regroup from 5 to 3 groups: 1 -> 1, 2 and 3 -> 2, 4 and 5 -> 3.
# The mapping reads the untouched groups in hdr_df_both (same row order as
# scaled_both_df) instead of the column it overwrites, so re-running this cell
# gives the same result rather than collapsing the new group 3 into group 2.
scaled_both_df['GDI Group'] = hdr_df_both['GDI Group'].map({1: 1, 2: 2, 3: 2, 4: 3, 5: 3}).tolist()

In [26]:
# Rename the indicator columns to the shorter labels shown on the chart axis.
# (The former reversed entry 'Female at least some secondary education' ->
# 'Share at least some secondary education Female' matched no column and was
# dropped; the secondary-education rename is the last entry below.)
scaled_both_df = scaled_both_df.rename(columns={
    'Share of female employment in nonagriculture': 'Female employment in nonagriculture',
    'Mean years of schooling Female': 'Female mean years of schooling',
    'Labour force participation rate Female': 'Female labour force participation rate',
    'Share of female in STEM tertiary education programs': 'Female in STEM tertiary education programs',
    'Share of female STEM tertiary education graduates': 'Female STEM tertiary education graduates',
    'Share of violence against women ever experienced by intimate partner': 'Women experienced violence by intimate partner',
    'Share of violence against women ever experienced by nonintimate partner': 'Women experienced violence by nonintimate partner',
    'HDI Female': 'Human Development Index of Female',
    'Share at least some secondary education Female': 'Female with at least some secondary education',
})

In [40]:
# Parallel-coordinates plot: each row of scaled_both_df becomes one line that
# runs across the folded indicator columns, coloured by GDI Group.
alt.Chart(scaled_both_df).transform_window(
    # Running row count; used below as the per-line detail key
    index='count()'

).transform_fold(
    # Indicator columns turned into (key, value) pairs
    ['Human Development Index of Female',
       'Female mean years of schooling', 'Female employment in nonagriculture',
       'Maternal mortality ratio', 'Female labour force participation rate',
       'Female with at least some secondary education',
       'Share of seats held by women in parliament',
       'Female in STEM tertiary education programs',
       'Female STEM tertiary education graduates',
       'Share of female internet users',
       'Women experienced violence by intimate partner',
       'Women experienced violence by nonintimate partner']

).mark_line(size=2.5).encode(
    x=alt.X('key:N', axis=alt.Axis(ticks=False, domain=False, labelPadding=10, title=None)),
    # Values are scaled to 0-100; the wider domain leaves a margin above and below
    y=alt.Y('value:Q', axis=alt.Axis(ticks=False, domain=False, labels=False, title=None),
           scale=alt.Scale(domain=[-10, 110])),
    color=alt.Color('GDI Group:N', scale=alt.Scale(range=['#6495ED', '#FF7F50', '#228B32'])),
    detail='index:N',
    opacity=alt.value(0.5)

).properties(width=800

).configure_axis(labelLimit=320,
                 labelFontSize=12,
                 grid=False,
                 labelFontWeight='bolder'

).properties(title={'text': ["Women's Empowerment Indicators Across Countries"],
           'subtitle': ['By Gender Development Group'],
          }).configure_title(
        fontSize=32,
        fontWeight='bold',
        color='#FF7F50',
        anchor='middle',
        subtitlePadding=3,
        # Leading '#' added: a bare '228B32' is not a valid colour string
        subtitleColor='#228B32',
        subtitleFontWeight='bolder',
            subtitleFontSize=22,
            offset=5

        ).configure_legend(orient='top',
                          columnPadding=10,
                          offset=10,
                          padding=10,
                          symbolType='stroke',
                          symbolSize=750,
                          symbolStrokeWidth=7,
                          labelFontSize=12,
                          titleFontSize=15

        )

# PCA Factor Loading

In [70]:
# Read the two PCA factor-loading tables (grade 4 file, then grade 8 file)
load_4, load_8 = (
    pd.read_csv(csv_path)
    for csv_path in ('f_loading_4_mat_int.csv', 'f_loading_8_sci_int.csv')
)

In [72]:
# Order both tables from the largest to the smallest value of 'loading'
load_4 = load_4.sort_values(by='loading', ascending=False)
load_8 = load_8.sort_values(by='loading', ascending=False)

In [74]:
# Replace the raw variable codes in load_4['variable'] with readable labels.
# Raw strings keep the backslashes literal; sequences such as '\A' or '\E' in
# a normal string are invalid escapes and trigger a warning.
# Values of 'variable' that are not keys below become NaN (Series.map).
load_4['variable'] = load_4['variable'].map({r'MAT\AGREE\ENJOY LEARNING MATHEMATICS': 'Enjoy Learning Math',
                                             r'MAT\AGREE\WISH HAVE NOT TO STUDY MATH': 'Wish to Study Math',
                                             r'MAT\AGREE\MATH IS BORING': 'Math is not Boring',
                                             r'MAT\AGREE\LEARN INTERESTING THINGS': 'Learn Interesting Things',
                                             r'MAT\AGREE\LIKE MATHEMATICS': 'Like Math',
                                             r'MAT\AGREE\SCHOOLWORK INVOLVES NUMBERS': 'Like Schoolwork Involves Numbers',
                                             r'MAT\AGREE\LIKE MATH PROBLEMS': 'Like Math Problems',
                                             r'MAT\AGREE\LOOK FORWARD TO MATH LESSONS': 'Look Forward to Math Lessons',
                                             r'MAT\AGREE\MATH FAVORITE SUBJECT': 'Math is Favorite Subject'})

In [75]:
# Replace the raw variable codes in load_8['variable'] with readable labels.
# Raw strings keep the backslashes literal; sequences such as '\A' or '\E' in
# a normal string are invalid escapes and trigger a warning.
# Values of 'variable' that are not keys below become NaN (Series.map).
load_8['variable'] = load_8['variable'].map({r'SCI\AGREE\ENJOY LEARNING SCIENCE': 'Enjoy Learning Science',
                                             r'SCI\AGREE\WISH HAVE NOT TO STUDY SCIENCE': 'Wish to Study Science',
                                             r'SCI\AGREE\SCIENCE IS BORING': 'Science is not Boring',
                                             r'SCI\AGREE\LEARN INTERESTING THINGS': 'Learn Interesting Things',
                                             r'SCI\AGREE\LIKE SCIENCE': 'Like Science',
                                             r'SCI\AGREE\HOW THINGS WORK': 'Like to Know How Things Work',
                                             r'SCI\AGREE\LIKE SCIENCE EXPERIMENTS': 'Like Science Experiment',
                                             r'SCI\AGREE\LOOK FORWARD TO LEARN SCI': 'Look Forward to Learning Science',
                                             r'SCI\AGREE\FAVORITE SUBJECT': 'Science is Favorite Subject'})

In [76]:
def creat_load_chart(df, color, title):
    """Build a horizontal bar chart of factor loadings with value labels.

    One bar is drawn per row of ``df``; bars appear in the row order of
    ``df`` (pre-sort the frame to control the order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'variable' (bar label) and 'loading'
        (bar length).
    color : str
        Colour string used for the bars and for the axis title.
    title : str
        Title of the y axis.

    Returns
    -------
    altair.LayerChart
        Bars plus text labels, 400x300, with view, axis and scale
        configuration applied.
    """
    graph_load = alt.Chart(df).mark_bar(size=27, cornerRadiusEnd=20, color=color).encode(
        # sort=None keeps the data order; the default would sort the labels
        # alphabetically and discard any ordering by loading done by the caller
        y=alt.Y('variable:N', sort=None, axis=alt.Axis(title=title, labels=True, ticks=False)),
        x=alt.X('loading:Q', axis=alt.Axis(title=None, labels=False, ticks=False))
    )

    # Same encodings as the bars, drawn as text placed 15px past the bar end
    text_load = graph_load.mark_text(
    align='left',
    baseline='middle',
    dx=15,
    fontWeight='bolder',
    fontSize=12
    ).encode(
        text=alt.Text('loading:Q', format='.3r')
    )

    com_load = (graph_load + text_load).configure_view(
        strokeWidth=0

    ).configure_axis(domain=False,
                     labelPadding=10,
                     labelLimit=320,
                     labelFontSize=12,
                     grid=False,
                     labelFontWeight='bolder',
                     titlePadding=30,
                     titleFontSize=13,
                     titleFontWeight='bolder',
                     titleColor=color

    ).configure_scale(
        bandPaddingInner=0.5

    ).properties(width=400,
                 height=300

    )

    return com_load

In [77]:
# Grade 4 math-interest loadings; hex colours need the leading '#' to be valid
loading_4 = creat_load_chart(load_4, '#FF7F50', 'Math Interest Grade 4')

In [78]:
# Grade 8 science-interest loadings; hex colours need the leading '#' to be valid
loading_8 = creat_load_chart(load_8, '#228B32', 'Science Interest Grade 8')

In [79]:
# Show the grade 4 chart with the shared figure title and subtitle
loading_4.properties(title={'text': ['Factor Loading Comparison'],
           'subtitle': ['Of PCA Resulting Variables'],
          }).configure_title(
        fontSize=32,
        fontWeight='bold',
        color='#6495ED',
        anchor='middle',
        subtitlePadding=3,
        # Leading '#' added: a bare '5B2C6F' is not a valid colour string
        subtitleColor='#5B2C6F',
        subtitleFontWeight='bolder',
            subtitleFontSize=22,
            offset=5

        )

In [80]:
# Display the grade 8 chart (no figure title is added here)
loading_8

# Choropleth Map of Female Human Development Index

In [246]:
# Read the country-code table; its 'ISO3166-1-numeric' column is matched
# against the ids of Altair's world map further below
country_codes = pd.read_csv('country-codes_csv.csv')

In [247]:
# Call the country-name column 'Country', as in the HDR dataframe
country_codes = country_codes.rename(columns={'CLDR display name': 'Country'})

In [249]:
# Collect the country names used in the HDR dataframe
name_list = list(hdr_df_new['Country'])

In [250]:
# Collect the country names used in the country-code dataframe
name_list_w_code = list(country_codes['Country'])

In [251]:
# Names that appear in the HDR dataframe but not in the country-code dataframe
# (sorted unique values)
np.setdiff1d(name_list, name_list_w_code)

array(['Antigua and Barbuda', 'Arab States',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Brunei Darussalam', 'Cabo Verde', 'Congo',
       'Congo (Democratic Republic of the)', "Côte d'Ivoire",
       'Developing countries', 'East Asia and the Pacific',
       'Eswatini (Kingdom of)', 'Europe and Central Asia',
       'High human development', 'Hong Kong, China (SAR)',
       'Iran (Islamic Republic of)',
       "Korea (Democratic People's Rep. of)", 'Korea (Republic of)',
       "Lao People's Democratic Republic",
       'Latin America and the Caribbean', 'Least developed countries',
       'Low human development', 'Medium human development',
       'Micronesia (Federated States of)', 'Moldova (Republic of)',
       'Organisation for Economic Co-operation and Development',
       'Other countries or territories', 'Palestine, State of',
       'Russian Federation', 'Saint Kitts and Nevis', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'Sao T

In [252]:
# Names that appear in the country-code dataframe but not in the HDR dataframe
# (sorted unique values)
np.setdiff1d(name_list_w_code, name_list)

array(['American Samoa', 'Anguilla', 'Antarctica', 'Antigua & Barbuda',
       'Aruba', 'Bermuda', 'Bolivia', 'Bosnia', 'Bouvet Island',
       'British Indian Ocean Territory', 'British Virgin Islands',
       'Brunei', 'Cape Verde', 'Caribbean Netherlands', 'Cayman Islands',
       'Christmas Island', 'Cocos (Keeling) Islands',
       'Congo - Brazzaville', 'Congo - Kinshasa', 'Cook Islands',
       'Curaçao', 'Côte d’Ivoire', 'Eswatini', 'Falkland Islands',
       'Faroe Islands', 'French Guiana', 'French Polynesia',
       'French Southern Territories', 'Gibraltar', 'Greenland',
       'Guadeloupe', 'Guam', 'Guernsey', 'Heard & McDonald Islands',
       'Hong Kong', 'Iran', 'Isle of Man', 'Jersey', 'Laos', 'Macau',
       'Martinique', 'Mayotte', 'Micronesia', 'Moldova', 'Montserrat',
       'New Caledonia', 'Niue', 'Norfolk Island', 'North Korea',
       'Northern Mariana Islands', 'Palestine', 'Pitcairn Islands',
       'Puerto Rico', 'Russia', 'Réunion', 'Sint Maarten',
       '

In [None]:
# Map each country name used in the country-code dataframe (key) to the name
# used for the same country in the HDR dataframe (value), so that the merge on
# 'Country' finds a match. Keeping each pair on one line avoids the two
# parallel lists drifting apart: the Antigua and Bolivia pairs used to be
# listed in the opposite direction and therefore never matched.
country_name_map = {
    'Antigua & Barbuda': 'Antigua and Barbuda',
    'Bolivia': 'Bolivia (Plurinational State of)',
    'Bosnia': 'Bosnia and Herzegovina',
    'Brunei': 'Brunei Darussalam',
    'Eswatini': 'Eswatini (Kingdom of)',
    'Hong Kong': 'Hong Kong, China (SAR)',
    'Iran': 'Iran (Islamic Republic of)',
    'South Korea': 'Korea (Republic of)',
    'Laos': "Lao People's Democratic Republic",
    'Micronesia': 'Micronesia (Federated States of)',
    'Moldova': 'Moldova (Republic of)',
    'Palestine': 'Palestine, State of',
    'Russia': 'Russian Federation',
    'St. Kitts & Nevis': 'Saint Kitts and Nevis',
    'St. Lucia': 'Saint Lucia',
    'St. Vincent & Grenadines': 'Saint Vincent and the Grenadines',
    'São Tomé & Príncipe': 'Sao Tome and Principe',
    'Syria': 'Syrian Arab Republic',
    'Tanzania': 'Tanzania (United Republic of)',
    'Trinidad & Tobago': 'Trinidad and Tobago',
    'UK': 'United Kingdom',
    'US': 'United States',
    'Venezuela': 'Venezuela (Bolivarian Republic of)',
    'Vietnam': 'Viet Nam',
    'Côte d’Ivoire': "Côte d'Ivoire",
    # Further mismatches visible in the two name comparisons above
    'Cape Verde': 'Cabo Verde',
    'Congo - Brazzaville': 'Congo',
    'Congo - Kinshasa': 'Congo (Democratic Republic of the)',
    'North Korea': "Korea (Democratic People's Rep. of)",
}

# Parallel lists kept under their original names for any later use
to_replace = list(country_name_map)
replace_with = list(country_name_map.values())

# Replace country names in the country-code dataframe
country_codes['Country'] = country_codes['Country'].replace(country_name_map)

In [257]:
# Attach the country-code columns to the HDR rows; this is an inner join, so
# rows whose 'Country' has no match in country_codes are dropped.
# NOTE: hdr_df_new is overwritten, so re-running this cell merges a second time.
hdr_df_new = pd.merge(hdr_df_new, country_codes, how='inner', on='Country')

In [258]:
# Reference the 'countries' feature of the world_110m TopoJSON from vega_datasets
# (only the URL is passed on, so the shapes are fetched when the chart renders)
source = alt.topo_feature(data.world_110m.url, 'countries')

In [259]:
# Base layer: every country shape drawn with a white fill and light-gray outline
background = alt.Chart(source).mark_geoshape(fill='white', stroke='lightgray')

In [260]:
# Data layer: look up 'HDI Female' for each country shape by matching the
# shape's id with the 'ISO3166-1-numeric' column, then colour the shape by it
hdi_lookup = alt.LookupData(hdr_df_new, 'ISO3166-1-numeric', ['HDI Female'])
hdi_colour = alt.Color(
    'HDI Female:Q',
    scale=alt.Scale(scheme='bluepurple'),
    legend=alt.Legend(title='Female Human Development Index'),
)
foreground = (
    alt.Chart(source)
    .transform_lookup(lookup='id', from_=hdi_lookup)
    .mark_geoshape(stroke='black', strokeWidth=0.2)
    .encode(color=hdi_colour)
)

In [261]:
# Layer the coloured countries over the blank base map, project it, and apply
# the legend and title styling
final_map = (
    (background + foreground)
    .configure_view(strokeWidth=0)
    .properties(width=1000, height=600)
    .project('naturalEarth1')
    .configure_legend(
        orient='top',
        columnPadding=10,
        offset=10,
        padding=20,
        symbolSize=350,
        labelFontSize=10,
        titleFontSize=12,
        titleLimit=300,
    )
    .properties(
        title={
            'text': ['Female Human Development Index'],
            'subtitle': ['World Comparison'],
        }
    )
    .configure_title(
        fontSize=30,
        fontWeight='bold',
        color='#5B2C6F',
        anchor='middle',
        subtitlePadding=3,
        subtitleColor='#8EA9DB',
        subtitleFontWeight='bolder',
        subtitleFontSize=20,
        offset=5,
    )
)

In [262]:
# Display the finished choropleth map
final_map