# Data Visualisation of various metrics of Countries    


Imports


In [48]:
import pandas as pd
import numpy as np
import altair as alt
from altair import datum
import geopandas as gpd
from vega_datasets import data

Importing the csv as dataframe

In [49]:
# Load the data
df = pd.read_excel('world-data-2023_new.xlsx')

# print column names
print(df.columns)

Index(['Country', 'id', 'Abbreviation', 'Latitude', 'Longitude',
       'Density\n(P/Km2)', 'Agricultural Land( %)', 'Land Area(Km2)',
       'Armed Forces size', 'Birth Rate', 'Calling Code', 'Capital/Major City',
       'Co2-Emissions', 'CPI', 'CPI Change (%)', 'Currency-Code',
       'Fertility Rate', 'Forested Area (%)', 'Gasoline Price', 'GDP',
       'Gross primary education enrollment (%)',
       'Gross tertiary education enrollment (%)', 'Infant mortality',
       'Largest city', 'Life expectancy', 'Maternal mortality ratio',
       'Minimum wage', 'Official language', 'Out of pocket health expenditure',
       'Physicians per thousand', 'Population',
       'Population: Labor force participation (%)', 'Tax revenue (%)',
       'Total tax rate', 'Unemployment rate', 'Urban_population'],
      dtype='object')


In [50]:
#  print column names with corresponding null values
print(df.isnull().sum())

Country                                       0
id                                            0
Abbreviation                                  7
Latitude                                      1
Longitude                                     1
Density\n(P/Km2)                              0
Agricultural Land( %)                         7
Land Area(Km2)                                1
Armed Forces size                            24
Birth Rate                                    6
Calling Code                                  1
Capital/Major City                            3
Co2-Emissions                                 7
CPI                                          17
CPI Change (%)                               16
Currency-Code                                15
Fertility Rate                                7
Forested Area (%)                             7
Gasoline Price                               20
GDP                                           2
Gross primary education enrollment (%)  

Updating the dataset by adding continent and sub region columns, replacing missing values for the 2 letter codes according to ISO 3166 standard using country IDs

In [51]:
# loading the new dataset as a dataframe https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv
iso_data = pd.read_csv('ISO-3166.csv')

# selecting required columns
iso_data = iso_data[['country-code', 'region', 'sub-region', 'alpha-2', 'alpha-3']] 

# merging the two dataframes
df = pd.merge(df, iso_data, left_on='id', right_on='country-code', how='left')  

viewing the updated dataset

In [52]:
df.head()

Unnamed: 0,Country,id,Abbreviation,Latitude,Longitude,Density\n(P/Km2),Agricultural Land( %),Land Area(Km2),Armed Forces size,Birth Rate,...,Population: Labor force participation (%),Tax revenue (%),Total tax rate,Unemployment rate,Urban_population,country-code,region,sub-region,alpha-2,alpha-3
0,Afghanistan,4,AF,33.93911,67.709953,60,0.581,652230.0,323000.0,32.49,...,0.489,0.093,0.714,0.1112,9797273.0,4,Asia,Southern Asia,AF,AFG
1,Albania,8,AL,41.153332,20.168331,105,0.431,28748.0,9000.0,11.78,...,0.557,0.186,0.366,0.1233,1747593.0,8,Europe,Southern Europe,AL,ALB
2,Algeria,12,DZ,28.033886,1.659626,18,0.174,2381741.0,317000.0,24.28,...,0.412,0.372,0.661,0.117,31510100.0,12,Africa,Northern Africa,DZ,DZA
3,Andorra,20,AD,42.506285,1.521801,164,0.4,468.0,,7.2,...,,,,,67873.0,20,Europe,Southern Europe,AD,AND
4,Angola,24,AO,-11.202692,17.873887,26,0.475,1246700.0,117000.0,40.73,...,0.775,0.092,0.491,0.0689,21061025.0,24,Africa,Sub-Saharan Africa,AO,AGO


dropping repeated columns

In [53]:
df.drop(columns=['id', 'Abbreviation'], inplace=True)
df.columns

Index(['Country', 'Latitude', 'Longitude', 'Density\n(P/Km2)',
       'Agricultural Land( %)', 'Land Area(Km2)', 'Armed Forces size',
       'Birth Rate', 'Calling Code', 'Capital/Major City', 'Co2-Emissions',
       'CPI', 'CPI Change (%)', 'Currency-Code', 'Fertility Rate',
       'Forested Area (%)', 'Gasoline Price', 'GDP',
       'Gross primary education enrollment (%)',
       'Gross tertiary education enrollment (%)', 'Infant mortality',
       'Largest city', 'Life expectancy', 'Maternal mortality ratio',
       'Minimum wage', 'Official language', 'Out of pocket health expenditure',
       'Physicians per thousand', 'Population',
       'Population: Labor force participation (%)', 'Tax revenue (%)',
       'Total tax rate', 'Unemployment rate', 'Urban_population',
       'country-code', 'region', 'sub-region', 'alpha-2', 'alpha-3'],
      dtype='object')

checking if any added columns have missing values

In [54]:
# checking if any added columns have missing values
df[['alpha-2', 'alpha-3', 'region', 'sub-region']].isnull().sum()

alpha-2       1
alpha-3       0
region        0
sub-region    0
dtype: int64

In [55]:
# printing out the country code and name for the country with missing alpha-2 code
print(df[df['alpha-2'].isnull()][['Country', 'country-code', 'alpha-3']])

     Country  country-code alpha-3
119  Namibia           516     NAM


In [56]:
#  adding the missing alpha-2 code from https://www.iban.com/country-codes
df.loc[df['country-code'] == 516, 'alpha-2'] = 'NA'

#  checking if the missing alpha-2 code has been added
print(df[df['country-code'] == 516][['Country', 'country-code', 'alpha-2']])

     Country  country-code alpha-2
119  Namibia           516      NA


checking data types of columns

In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 39 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Country                                    195 non-null    object 
 1   Latitude                                   194 non-null    float64
 2   Longitude                                  194 non-null    float64
 3   Density
(P/Km2)                            195 non-null    int64  
 4   Agricultural Land( %)                      188 non-null    float64
 5   Land Area(Km2)                             194 non-null    float64
 6   Armed Forces size                          171 non-null    float64
 7   Birth Rate                                 189 non-null    float64
 8   Calling Code                               194 non-null    float64
 9   Capital/Major City                         192 non-null    object 
 10  Co2-Emissions             

transforming some percentage and currency datatypes to make them compatible with vega altair

In [58]:
# multiplying all the columns with '%' into 100 to get the actual percentage value and converting the columns to float
percentage_columns = [col for col in df.columns if '%' in col]
for col in percentage_columns:
    df[col] = df[col] * 100
    df[col] = df[col].astype(float)

In [59]:
df[percentage_columns].head()

Unnamed: 0,Agricultural Land( %),CPI Change (%),Forested Area (%),Gross primary education enrollment (%),Gross tertiary education enrollment (%),Population: Labor force participation (%),Tax revenue (%)
0,58.1,2.3,2.1,104.0,9.7,48.9,9.3
1,43.1,1.4,28.1,107.0,55.0,55.7,18.6
2,17.4,2.0,0.8,109.9,51.4,41.2,37.2
3,40.0,,34.0,106.4,,,
4,47.5,17.1,46.3,113.5,9.3,77.5,9.2


In [60]:
# changing the columns with currency signs to float
currency_columns = ['GDP', 'Minimum wage', 'Gasoline Price']

# removing the currency signs and saving it as a float value
for col in currency_columns:
    df[col] = df[col].str.replace('$', '').str.replace(',', '').astype(float)



In [61]:
df[currency_columns].head()

Unnamed: 0,GDP,Minimum wage,Gasoline Price
0,19101350000.0,0.43,0.7
1,15278080000.0,1.12,1.36
2,169988200000.0,0.95,0.28
3,3154058000.0,6.63,1.51
4,94635420000.0,0.71,0.97


checking if data types have been updated

In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 39 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Country                                    195 non-null    object 
 1   Latitude                                   194 non-null    float64
 2   Longitude                                  194 non-null    float64
 3   Density
(P/Km2)                            195 non-null    int64  
 4   Agricultural Land( %)                      188 non-null    float64
 5   Land Area(Km2)                             194 non-null    float64
 6   Armed Forces size                          171 non-null    float64
 7   Birth Rate                                 189 non-null    float64
 8   Calling Code                               194 non-null    float64
 9   Capital/Major City                         192 non-null    object 
 10  Co2-Emissions             

Adding new features to the dataset using the existing features

- GDP per capita
- CO2 emissions per capita

Calculating GDP per Capita:

In [63]:

# Calculate GDP per Capita
df['GDP per Capita'] = df['GDP'] / df['Population']


# Display the updated dataframe
df['GDP per Capita'].head()


0      502.115487
1     5352.857411
2     3948.343279
3    40886.391162
4     2973.591160
Name: GDP per Capita, dtype: float64

calculating CO2 emissions per capita

In [64]:
# Calculate CO2 emissions per capita
df['CO2 Emissions per Capita'] = (df['Co2-Emissions'] * 1000) / df['Population']

# Display the updated dataframe
df['CO2 Emissions per Capita'].head()

0    0.227960
1    1.589242
2    3.484213
3    6.079697
4    1.090108
Name: CO2 Emissions per Capita, dtype: float64

printing the countries with missing values for GDP per capita and CO2 emissions per capita

In [65]:
# Countries with missing GDP per Capita
missing_gdp_per_capita = df[df['GDP per Capita'].isnull()]['Country']
print("Countries with missing GDP per Capita:")
print(missing_gdp_per_capita)

# Countries with missing CO2 Emissions per Capita
missing_co2_per_capita = df[df['CO2 Emissions per Capita'].isnull()]['Country']
print("\nCountries with missing CO2 Emissions per Capita:")
print(missing_co2_per_capita)

Countries with missing GDP per Capita:
73                       Vatican City
133    Palestinian National Authority
Name: Country, dtype: object

Countries with missing CO2 Emissions per Capita:
56                           Eswatini
73                       Vatican City
113                            Monaco
120                             Nauru
128                   North Macedonia
133    Palestinian National Authority
149                        San Marino
Name: Country, dtype: object


We will try to impute missing values from reliable sources if data is available

In [66]:
# Adding the missing CO2 emissions data for countries
df.loc[df['Country'] == 'Eswatini', 'CO2 Emissions per Capita'] = 0.9
df.loc[df['Country'] == 'Nauru', 'CO2 Emissions per Capita'] = 4.8
df.loc[df['Country'] == 'North Macedonia', 'CO2 Emissions per Capita'] = 4.1
df.loc[df['Country'] == 'Palestinian National Authority', 'CO2 Emissions per Capita'] = 0.6

Processing features

In [67]:
df['Unemployment rate'] = df['Unemployment rate']*100

converting all float point values to 3 decimal points

In [69]:
df = df.round(3)

converting all percentages greater than 100 to 100

In [71]:
# Convert all percentage values greater than 100 to 100
for col in percentage_columns:
    df[col] = df[col].apply(lambda x: 100 if x > 100 else x)

# Display the updated dataframe
df[percentage_columns].head()

Unnamed: 0,Agricultural Land( %),CPI Change (%),Forested Area (%),Gross primary education enrollment (%),Gross tertiary education enrollment (%),Population: Labor force participation (%),Tax revenue (%)
0,58.1,2.3,2.1,100.0,9.7,48.9,9.3
1,43.1,1.4,28.1,100.0,55.0,55.7,18.6
2,17.4,2.0,0.8,100.0,51.4,41.2,37.2
3,40.0,,34.0,100.0,,,
4,47.5,17.1,46.3,100.0,9.3,77.5,9.2


In [72]:
df.to_csv('world-data-2023_cleaned.csv', index=False)

Missing values for GDP per capita were not found for the countries and Missing data for CO2 emissions per capita was found for some of the countries.

We will now move onto plotting the data using Altair. 

We have selected the following metrics to plot:
- GDP per capita
- CO2 emissions per capita
- Population
- Life expectancy
- Continent
- Life Expectancy
- Infant Mortality
- Fertility Rate
- Physicians per Thousand
- Maternal Mortality Ratio


We will create the following graphs:

1. Choropleth World Map:
   - Attributes: GDP per Capita.
   - Description: A world map where countries are color-coded based on GDP per capita.
   - Interaction:
      - Hover Tooltips: Display country name, GDP per capita, life expectancy, and CO₂ emissions per capita.
      - Click to Select: Clicking on a country highlights it in all other views.
      - Zoom and Pan: Users can zoom into regions of interest.
      - Purpose: Provides a global overview of economic well-being, facilitating comparison between countries and regions.

2. Scatter Plot: GDP per Capita vs. Life Expectancy
   - Attributes:
      - X-axis: GDP per Capita.
      - Y-axis: Life Expectancy.
   - Color Encoding: Continent.
   - Size Encoding: Population.
   - Description: Each point represents a country, showing the relationship between economic status and health outcomes.
   - Interaction:
      - Brushing: Selecting points to highlight corresponding countries on the map and other plots.
      - Hover Tooltips: Display detailed information about the country.
      - Purpose: Allows users to explore correlations and patterns between wealth and health.

3. Interactive Box Plot: CO₂ Emissions per Capita
   - Attributes: CO₂ Emissions per Capita.
   - Description: Summarizes the distribution of CO₂ emissions per capita across countries.
   - Interaction:
      - Selection: Users can select quartiles or specific ranges to highlight countries in other views.
      - Hover Tooltips: Provide statistical summaries (median, quartiles, outliers).
      - Purpose: Highlights environmental impact without overcrowding the visualization, enabling users to identify countries with unusually high or low emissions.

4. Parallel Coordinates Plot: Health Indicators
   - Attributes:
      - Life Expectancy
      - Infant Mortality
      - Fertility Rate
      - Physicians per Thousand
      - Maternal Mortality Ratio
   - Description: Displays multiple health metrics for each country, with each line representing a country.
   - Interaction:
      - Brushing: Users can select ranges on any axis to filter and highlight countries.
      - Linking: Selected countries are highlighted in all other views.
      - Hover Tooltips: Provide detailed information on each country's health metrics.
      - Purpose: Enables multivariate analysis of health indicators, revealing patterns and outliers.

1. Choropleth World Map

In [21]:
continent_color_scale = alt.Scale(
    domain=['Asia', 'Europe', 'Africa', 'Americas', 'Oceania'],
    range=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']
)


country_selection = alt.selection_point(
    fields=['country-code'], 
    toggle=True, 
    empty='all'
)

continent_selection = alt.selection_point(
    fields=['region'],
    empty='all'
)

# Create interactive legend chart
legend = alt.Chart(df).transform_aggregate(
    groupby=['region']
).mark_point(filled=True, size=100).encode(
    y=alt.Y('region:N', axis=alt.Axis(title='Continent')),
    color=alt.Color('region:N', scale=continent_color_scale, legend=None),
    opacity=alt.condition(continent_selection, alt.value(1), alt.value(0.2))
).add_params(
    continent_selection
)




world_map = alt.topo_feature(data.world_110m.url, 'countries')

color_scheme = 'viridis'

projection_list = ['mercator', 'equalEarth', 'orthographic']
projection_select = alt.binding_select(options=projection_list, name='Projection:')
projection_param = alt.param(value='mercator', bind=projection_select)


choropleth = alt.Chart(world_map).mark_geoshape().encode(
    color=alt.Color('GDP per Capita:Q',
                 scale=alt.Scale(scheme=color_scheme),
                 title='GDP per Capita'),
    opacity = alt.condition(
        alt.datum.region == continent_selection.region | alt.datum['country-code'] == country_selection['country-code'],
        alt.value(1),
        alt.value(0.2)
    ),
    tooltip=[
        alt.Tooltip('Country:N', title='Country'),
        alt.Tooltip('GDP per Capita:Q', title='GDP per Capita'),
        alt.Tooltip('Life expectancy:Q', title='Life Expectancy'),
        alt.Tooltip('CO2 Emissions per Capita:Q', title='CO₂ Emissions per Capita')
    ]
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(
        df,
        'country-code',
        ['Country', 'GDP per Capita', 'Life expectancy', 'CO2 Emissions per Capita', 'country-code']
    )
).project(
    type=projection_param
).add_params(
    projection_param, country_selection, continent_selection
).properties(
    width=1280,
    height=540
)



2. Scatter Plot: GDP per Capita vs. Life Expectancy 

removed lichenstein and luxembourg as they were outliers

In [22]:
df_filtered = df[~df['Country'].isin(['Luxembourg', 'Liechtenstein'])]

scatter_plot = alt.Chart(df_filtered).mark_circle().encode(
    x=alt.X('GDP per Capita:Q', title='GDP per Capita'),
    y=alt.Y('Life expectancy:Q', title='Life Expectancy', scale=alt.Scale(domain=[df_filtered['Life expectancy'].min() - 5, df_filtered['Life expectancy'].max() + 5])),
    color=alt.Color('region:N', scale=continent_color_scale, legend=None),
    opacity=alt.condition(
        alt.datum.region == continent_selection.region | alt.datum['country-code'] == country_selection['country-code'],
        alt.value(1),
        alt.value(0.2)
    ),
    size=alt.Size('Population:Q', legend=None, scale=alt.Scale(range=[70, 1500]), title='Population'),
    tooltip=[
        alt.Tooltip('Country:N', title='Country'),
        alt.Tooltip('GDP per Capita:Q', title='GDP per Capita'),
        alt.Tooltip('Life expectancy:Q', title='Life Expectancy'),
        alt.Tooltip('Population:Q', title='Population'),
        alt.Tooltip('region:N', title='Continent')
    ]
).add_params(
    country_selection, continent_selection
).properties(
    width=960,
    height=540
)

co2 emissions chart: removing qatar and trinidad and tobago as they were outliers

In [23]:
# base = alt.Chart(df)


# box_selection = alt.selection_interval(
#     encodings=['y'],
#     name='box_selection',
#     empty='none'
# )

# box_plot = base.mark_boxplot(size=50, extent='min-max').encode(
#     y=alt.Y('CO2 Emissions per Capita:Q', title='CO₂ Emissions per Capita'),
#     color=alt.condition(
#         box_selection,
#         alt.value('steelblue'),
#         alt.value('lightgray')
#     ),
#     tooltip=[
#         alt.Tooltip('count():Q', title='Count'),
#         alt.Tooltip('min(CO2 Emissions per Capita):Q', title='Min'),
#         alt.Tooltip('max(CO2 Emissions per Capita):Q', title='Max')
#     ]
# ).add_params(box_selection).properties(width=640, height=540)

# box_plot

In [24]:

def get_surrounding_countries(selected_countries, df, n_above=9, n_below=10):
    if not selected_countries:

        return df.nlargest(30, 'CO2 Emissions per Capita')
    
    result_dfs = []
    for country in selected_countries:
        country_value = df[df['Country'] == country]['CO2 Emissions per Capita'].iloc[0]
        above = df[df['CO2 Emissions per Capita'] > country_value].nsmallest(n_above, 'CO2 Emissions per Capita')
        below = df[df['CO2 Emissions per Capita'] < country_value].nlargest(n_below, 'CO2 Emissions per Capita')
        selected = df[df['Country'] == country]
        result_dfs.extend([above, selected, below])
    
    return pd.concat(result_dfs).drop_duplicates()


bar_data = alt.Chart(df).transform_window(
    rank='rank(CO2 Emissions per Capita)',
    sort=[alt.SortField('CO2 Emissions per Capita', order='descending')]
).transform_filter(
    alt.datum.rank <= 30
)


emissions_bar = bar_data.mark_bar().encode(
    y=alt.Y('Country:N',
            sort='-x',
            axis=alt.Axis(title='Country')),
    x=alt.X('CO2 Emissions per Capita:Q',
            axis=alt.Axis(title='CO₂ Emissions per Capita')),
    color=alt.Color('region:N', scale=continent_color_scale, legend=None),
    opacity=alt.condition(
        alt.datum.region == continent_selection.region | alt.datum['country-code'] == country_selection['country-code'],
        alt.value(1),
        alt.value(0.2)
    ),
    tooltip=[
        alt.Tooltip('Country:N'),
        alt.Tooltip('CO2 Emissions per Capita:Q', title='CO₂ Emissions'),
        alt.Tooltip('region:N', title='Continent')
    ]
).add_params(
    country_selection,
    continent_selection
).properties(
    width=640,
    height=540
)


In [25]:
health_indicators = [
    'Life expectancy',
    'Infant mortality',
    'Fertility Rate',
    'Physicians per thousand',
    'Maternal mortality ratio'
]

df_health = df.dropna(subset=health_indicators)

parallel_data = df_health.melt(
    id_vars=['Country', 'country-code', 'region'],
    value_vars=health_indicators,
    var_name='Indicator',
    value_name='Value'
)

parallel_plot = alt.Chart(parallel_data).transform_joinaggregate(
    min_value='min(Value)',
    max_value='max(Value)',
    groupby=['Indicator']
).transform_calculate(
    minmax_value='(datum.Value - datum.min_value) / (datum.max_value - datum.min_value)'
).mark_line(opacity=0.5).encode(
    x=alt.X('Indicator:N', title='Health Indicators'),
    y=alt.Y('minmax_value:Q', scale=alt.Scale(zero=False)),
    color=alt.Color('region:N', scale=continent_color_scale, legend=None),
    opacity=alt.condition(
        alt.datum.region == continent_selection.region | alt.datum['country-code'] == country_selection['country-code'],
        alt.value(1),
        alt.value(0.2)
    ),
    detail='country-code:N',
    tooltip=[
        alt.Tooltip('Country:N'),
        alt.Tooltip('Indicator:N'),
        alt.Tooltip('Value:Q')
    ]
).add_params(
    country_selection, continent_selection
).properties(
    width=960,
    height=540
)

In [26]:

top_row = alt.hconcat(
    choropleth.properties(width=1280, height=540),
)


bottom_row = alt.hconcat(
    scatter_plot.properties(width=540, height=540),    
    parallel_plot.properties(width=740, height=540),
)

left_column = alt.vconcat(
    top_row,
    bottom_row
)

right_column = alt.vconcat(
    emissions_bar.properties(width=640, height=1000),
    legend.properties(width=100, height=80)
)

dashboard = alt.hconcat(
    left_column,
    right_column
)

dashboard