# Exploratory Data Analysis

We have data on happiness, democracy, and __. Let's first take a look at all the data and make something interesting out of some horrendous datasets.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
import altair as alt
from altair_data_server import data_server
import os 
import re

# Happiness Data

## Useful Links
* [geoJSON file downloads](https://gadm.org/download_country.html)

In [2]:
path = "Datasets/World_Happiness_Reports"
# The pattern looks for a sequence of exactly four digits
pattern = r"\d{4}"

country_sets = []
dataframes = []

# loop through all the happiness.csv files for each year and gather the happiness scores
for filename in os.scandir(path):
    print("Working on: " + filename.path)

    # the current csv file we're working on 
    curr_df = pd.read_csv(filename)
    
    # some countries had an asterisk next to their name, so remove those
    curr_df['Country'] = curr_df['Country'].str.replace('*', '', regex=True)

    # there are differnt number of countries in different files, so we want to collect all the countries 
    country_set = set(curr_df['Country'].tolist())
    country_sets.append(country_set)

    # add "_year" after the Happiness Score for columns
    match = re.search(pattern, filename.path)
    year = match.group(0) if match else None 
    curr_df.rename(columns={"Happiness Score":"Happiness Score_" + year}, inplace=True)
    dataframes.append(curr_df)
    
    print(f"\tColumns: {curr_df.columns}")
# use sets to just get the unique countries
common_countries = set.union(*country_sets)


Working on: Datasets/World_Happiness_Reports/2020.csv
	Columns: Index(['Country', 'Regional indicator', 'Happiness Score_2020',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to make life choices', 'Generosity',
       'Perceptions of corruption', 'Ladder score in Dystopia',
       'Explained by: Log GDP per capita', 'Explained by: Social support',
       'Explained by: Healthy life expectancy',
       'Explained by: Freedom to make life choices',
       'Explained by: Generosity', 'Explained by: Perceptions of corruption',
       'Dystopia + residual'],
      dtype='object')
Working on: Datasets/World_Happiness_Reports/2021.csv
	Columns: Index(['Country', 'Regional indicator', 'Happiness Score_2021',
       'Standard error of ladder score', 'upperwhisker', 'lowerwhisker',
       'Logged GDP per capita', 'Social support', 'Healthy life expectancy',
       'Freedom to m

In [3]:
# Filter each dataset to only include rows with countries in the common countries set
filtered_dataframes = []
for df in dataframes:
    filtered_df = df[df["Country"].isin(common_countries)]
    filtered_dataframes.append(filtered_df)

# create the happiness dataframe
pattern = r"Happiness Score"
happiness_df = filtered_dataframes[0][['Country']]
for i, df in enumerate(filtered_dataframes):
    # Extract the country and happiness score columns
    filtered_columns = df.filter(regex=pattern)

    country_happiness_df = pd.concat([df['Country'], filtered_columns], axis=1)

    # Merge the result DataFrame with the country_happiness_df on the 'Country' column
    happiness_df = happiness_df.merge(country_happiness_df, on='Country', how='right')


# Split the DataFrame into the first column and the remaining columns
first_column = happiness_df.iloc[:, 0:1]
remaining_columns = happiness_df.iloc[:, 1:]

# Sort the remaining columns
sorted_remaining_columns = remaining_columns.sort_index(axis=1)

# Concatenate the first column with the sorted remaining columns
happiness_df = pd.concat([first_column, sorted_remaining_columns], axis=1)

happiness_df['Happiness Score_2022'] = happiness_df['Happiness Score_2022'].str.replace(',','.')
happiness_df['Happiness Score_2022'] = pd.to_numeric(happiness_df['Happiness Score_2022'], errors ='ignore').astype(pd.Float64Dtype())


In [4]:
happiness_df

Unnamed: 0,Country,Happiness Score_2015,Happiness Score_2016,Happiness Score_2017,Happiness Score_2018,Happiness Score_2019,Happiness Score_2020,Happiness Score_2021,Happiness Score_2022
0,Norway,7.522,7.498,7.537,7.594,7.554,7.4880,7.392,7.365
1,Denmark,7.527,7.526,7.522,7.555,7.600,7.6456,7.620,7.636
2,Iceland,7.561,7.501,7.504,7.495,7.494,7.5045,7.554,7.557
3,Switzerland,7.587,7.509,7.494,7.487,7.480,7.5599,7.571,7.512
4,Finland,7.406,7.413,7.469,7.632,7.769,7.8087,7.842,7.821
...,...,...,...,...,...,...,...,...,...
150,Rwanda,3.465,3.515,3.471,3.408,3.334,3.3123,3.415,3.268
151,Syria,3.006,3.069,3.462,3.462,3.462,,,
152,Tanzania,3.781,3.666,3.349,3.303,3.231,3.4762,3.623,3.702
153,Burundi,2.905,2.905,2.905,2.905,3.775,,,


In [5]:
# Let's make a choropleth map, so we need to upload geojson files
countries_filepath = 'Datasets/countries.geojson'
countries_gdf = gpd.read_file(countries_filepath)
countries_gdf = countries_gdf.rename(columns={'ADMIN':'Country'})

In [6]:
# Merge happiness data so we can look at the different levels of happiness in different countries across different years
happiness_countries_merged_df = countries_gdf.merge(happiness_df, on='Country', how = 'right')
happiness_countries_merged_df

Unnamed: 0,Country,ISO_A3,ISO_A2,geometry,Happiness Score_2015,Happiness Score_2016,Happiness Score_2017,Happiness Score_2018,Happiness Score_2019,Happiness Score_2020,Happiness Score_2021,Happiness Score_2022
0,Norway,NOR,NO,"MULTIPOLYGON (((3.38258 -54.44931, 3.36451 -54...",7.522,7.498,7.537,7.594,7.554,7.4880,7.392,7.365
1,Denmark,DNK,DK,"MULTIPOLYGON (((11.25603 54.95458, 11.30348 54...",7.527,7.526,7.522,7.555,7.600,7.6456,7.620,7.636
2,Iceland,ISL,IS,"MULTIPOLYGON (((-20.26964 63.40913, -20.27953 ...",7.561,7.501,7.504,7.495,7.494,7.5045,7.554,7.557
3,Switzerland,CHE,CH,"MULTIPOLYGON (((8.61744 47.75732, 8.62984 47.7...",7.587,7.509,7.494,7.487,7.480,7.5599,7.571,7.512
4,Finland,FIN,FI,"MULTIPOLYGON (((22.44370 59.85147, 22.42774 59...",7.406,7.413,7.469,7.632,7.769,7.8087,7.842,7.821
...,...,...,...,...,...,...,...,...,...,...,...,...
150,Rwanda,RWA,RW,"MULTIPOLYGON (((30.47179 -1.06684, 30.46386 -1...",3.465,3.515,3.471,3.408,3.334,3.3123,3.415,3.268
151,Syria,SYR,SY,"MULTIPOLYGON (((42.23683 37.28630, 42.26722 37...",3.006,3.069,3.462,3.462,3.462,,,
152,Tanzania,,,,3.781,3.666,3.349,3.303,3.231,3.4762,3.623,3.702
153,Burundi,BDI,BI,"MULTIPOLYGON (((30.41507 -2.31309, 30.41848 -2...",2.905,2.905,2.905,2.905,3.775,,,


In [7]:
# Congo was originally broken into "Congo (Brazzaville)" and "Congo (Kinshasa)", I couldn't find a geojson file for them, but I could for Congo, so I just took the average. 
congo_rows = happiness_countries_merged_df[happiness_countries_merged_df['Country'].isin(['Congo (Brazzaville)', 'Congo (Kinshasa)'])]
avg_row = congo_rows.mean(numeric_only=True)

# Create a new row with the average values
new_row = congo_rows.iloc[0].copy()
new_row[new_row.index.str.startswith('Happiness Score_')] = avg_row

# Set the new country name
new_row['Country'] = 'Congo'

# Remove the original rows from the DataFrame
happiness_countries_merged_df = happiness_countries_merged_df[~happiness_countries_merged_df['Country'].isin(['Congo (Brazzaville)', 'Congo (Kinshasa)'])]

# Add the new row with the average values to the DataFrame
happiness_countries_merged_df = happiness_countries_merged_df.append(new_row, ignore_index=True)
happiness_countries_merged_df

  happiness_countries_merged_df = happiness_countries_merged_df.append(new_row, ignore_index=True)


Unnamed: 0,Country,ISO_A3,ISO_A2,geometry,Happiness Score_2015,Happiness Score_2016,Happiness Score_2017,Happiness Score_2018,Happiness Score_2019,Happiness Score_2020,Happiness Score_2021,Happiness Score_2022
0,Norway,NOR,NO,"MULTIPOLYGON (((3.38258 -54.44931, 3.36451 -54...",7.522,7.498,7.5370,7.594,7.554,7.4880,7.392,7.365
1,Denmark,DNK,DK,"MULTIPOLYGON (((11.25603 54.95458, 11.30348 54...",7.527,7.526,7.5220,7.555,7.600,7.6456,7.620,7.636
2,Iceland,ISL,IS,"MULTIPOLYGON (((-20.26964 63.40913, -20.27953 ...",7.561,7.501,7.5040,7.495,7.494,7.5045,7.554,7.557
3,Switzerland,CHE,CH,"MULTIPOLYGON (((8.61744 47.75732, 8.62984 47.7...",7.587,7.509,7.4940,7.487,7.480,7.5599,7.571,7.512
4,Finland,FIN,FI,"MULTIPOLYGON (((22.44370 59.85147, 22.42774 59...",7.406,7.413,7.4690,7.632,7.769,7.8087,7.842,7.821
...,...,...,...,...,...,...,...,...,...,...,...,...
149,Syria,SYR,SY,"MULTIPOLYGON (((42.23683 37.28630, 42.26722 37...",3.006,3.069,3.4620,3.462,3.462,,,
150,Tanzania,,,,3.781,3.666,3.3490,3.303,3.231,3.4762,3.623,3.702
151,Burundi,BDI,BI,"MULTIPOLYGON (((30.41507 -2.31309, 30.41848 -2...",2.905,2.905,2.9050,2.905,3.775,,,
152,Central African Republic,CAF,CF,"MULTIPOLYGON (((22.55576 10.97897, 22.57705 10...",,,2.6930,,,,,


In [8]:
# Hong Kong's original name was too long so I changed it to "Hong Kong"
happiness_countries_merged_df.at[70,'Country'] = 'Hong Kong'

In [9]:
"""# For some reason the happiness score in 2022 was in thousands, so need to divide by 1000

happiness_countries_merged_df['Happiness Score_2022'] = happiness_countries_merged_df['Happiness Score_2022'].str.replace(',','')
happiness_countries_merged_df['Happiness Score_2022'] = pd.to_numeric(happiness_countries_merged_df['Happiness Score_2022'], errors ='ignore').astype(pd.Float64Dtype())

non_na_indices = happiness_countries_merged_df['Happiness Score_2022'].notna()
happiness_countries_merged_df.loc[non_na_indices, 'Happiness Score_2022'] = happiness_countries_merged_df.loc[non_na_indices, 'Happiness Score_2022'] / 1000
"""

"# For some reason the happiness score in 2022 was in thousands, so need to divide by 1000\n\nhappiness_countries_merged_df['Happiness Score_2022'] = happiness_countries_merged_df['Happiness Score_2022'].str.replace(',','')\nhappiness_countries_merged_df['Happiness Score_2022'] = pd.to_numeric(happiness_countries_merged_df['Happiness Score_2022'], errors ='ignore').astype(pd.Float64Dtype())\n\nnon_na_indices = happiness_countries_merged_df['Happiness Score_2022'].notna()\nhappiness_countries_merged_df.loc[non_na_indices, 'Happiness Score_2022'] = happiness_countries_merged_df.loc[non_na_indices, 'Happiness Score_2022'] / 1000\n"

In [10]:
happiness_countries_merged_df

Unnamed: 0,Country,ISO_A3,ISO_A2,geometry,Happiness Score_2015,Happiness Score_2016,Happiness Score_2017,Happiness Score_2018,Happiness Score_2019,Happiness Score_2020,Happiness Score_2021,Happiness Score_2022
0,Norway,NOR,NO,"MULTIPOLYGON (((3.38258 -54.44931, 3.36451 -54...",7.522,7.498,7.5370,7.594,7.554,7.4880,7.392,7.365
1,Denmark,DNK,DK,"MULTIPOLYGON (((11.25603 54.95458, 11.30348 54...",7.527,7.526,7.5220,7.555,7.600,7.6456,7.620,7.636
2,Iceland,ISL,IS,"MULTIPOLYGON (((-20.26964 63.40913, -20.27953 ...",7.561,7.501,7.5040,7.495,7.494,7.5045,7.554,7.557
3,Switzerland,CHE,CH,"MULTIPOLYGON (((8.61744 47.75732, 8.62984 47.7...",7.587,7.509,7.4940,7.487,7.480,7.5599,7.571,7.512
4,Finland,FIN,FI,"MULTIPOLYGON (((22.44370 59.85147, 22.42774 59...",7.406,7.413,7.4690,7.632,7.769,7.8087,7.842,7.821
...,...,...,...,...,...,...,...,...,...,...,...,...
149,Syria,SYR,SY,"MULTIPOLYGON (((42.23683 37.28630, 42.26722 37...",3.006,3.069,3.4620,3.462,3.462,,,
150,Tanzania,,,,3.781,3.666,3.3490,3.303,3.231,3.4762,3.623,3.702
151,Burundi,BDI,BI,"MULTIPOLYGON (((30.41507 -2.31309, 30.41848 -2...",2.905,2.905,2.9050,2.905,3.775,,,
152,Central African Republic,CAF,CF,"MULTIPOLYGON (((22.55576 10.97897, 22.57705 10...",,,2.6930,,,,,


At this point there are still a few countries that have happiness scores but aren't on the map, so we're going to find their geojson files and add them to the happiness dataframe. 

In [11]:
countries_missing_geometry_df = happiness_countries_merged_df[happiness_countries_merged_df['geometry']==None]
countries_missing_geometry_df

Unnamed: 0,Country,ISO_A3,ISO_A2,geometry,Happiness Score_2015,Happiness Score_2016,Happiness Score_2017,Happiness Score_2018,Happiness Score_2019,Happiness Score_2020,Happiness Score_2021,Happiness Score_2022
32,Taiwan Province of China,,,,,,6.422,,,,,
60,North Cyprus,,,,5.695,5.771,5.81,,,,,
70,Hong Kong,,,,,,5.472,,,,,
72,Serbia,,,,5.123,5.177,5.395,5.398,5.603,5.7782,6.078,6.178
102,Palestinian Territories,,,,4.715,4.754,4.775,4.743,4.696,4.5528,4.517,4.483
150,Tanzania,,,,3.781,3.666,3.349,3.303,3.231,3.4762,3.623,3.702
153,Congo,,,,4.253,4.254,4.2855,4.402,4.615,,,


In [12]:
# Downloaded the geojson files for the countries that countries.geojson didn't have and add them into countries_missing_geometry_df

missing_countries_path = 'Datasets/OG_Missing_Countries/'
for _, row in countries_missing_geometry_df.iterrows():
    country = row['Country']
    country_path = country.replace(' ', '_')
    # Read the corresponding geoJSON flie
    
    country_geojson = os.path.join(missing_countries_path,f'{country_path}.geojson')

    if os.path.exists(country_geojson):
        country_gdf = gpd.read_file(country_geojson)
        multi_poly = country_gdf.unary_union
        new_gdf = gpd.GeoDataFrame(geometry=[multi_poly], crs=country_gdf.crs)
        country_geometry = new_gdf['geometry'].iloc[0]
        happiness_countries_merged_df.loc[happiness_countries_merged_df['Country'] == country, 'geometry'] = country_geometry
    else:
        country_geojson = os.path.join(missing_countries_path,f'{country_path}.json')
        country_gdf = gpd.read_file(country_geojson)
        multi_poly = country_gdf.unary_union
        new_gdf = gpd.GeoDataFrame(geometry=[multi_poly], crs=country_gdf.crs)
        country_geometry = new_gdf['geometry'].iloc[0]
        happiness_countries_merged_df.loc[happiness_countries_merged_df['Country'] == country, 'geometry'] = country_geometry


In [13]:
# Convert the GeoDataFrame to a long format DataFrame
happiness_long_df = happiness_countries_merged_df.melt(
    id_vars=['Country', 'ISO_A3', 'ISO_A2', 'geometry'],
    value_vars=['Happiness Score_2015', 'Happiness Score_2016', 'Happiness Score_2017', 'Happiness Score_2018',
                'Happiness Score_2019', 'Happiness Score_2020', 'Happiness Score_2021', 'Happiness Score_2022'],
    var_name='Year',
    value_name='Happiness Score'
)

# Extract the year from the Year column
happiness_long_df['Year'] = happiness_long_df['Year'].str.extract('(\d+)').astype(int)

# Use the data_server provider to allow Altair to work with the GeoDataFrame directly
alt.data_transformers.register('data_server', data_server)
alt.data_transformers.enable('data_server')

# Define the base chart
base_chart = alt.Chart(happiness_long_df).mark_geoshape(
    stroke='black',
    strokeWidth=0.5
).encode(
    tooltip=[
        alt.Tooltip('Country:N', title='Country'),
    ]
).project('equirectangular')

# Create a list of years for which you have happiness scores
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Create a slider for selecting the year
year_slider = alt.binding_range(min=years[0], max=years[-1], step=1)
slider_selection = alt.selection_single(bind=year_slider, fields=['Year'], name='Select', init={'Year': years[0]})

# Define the choropleth map using the year slider
map_chart = base_chart.add_selection(
    slider_selection
).transform_filter(
    slider_selection
).encode(
    color=alt.Color('Happiness Score:Q', title='Happiness Score', scale=alt.Scale(scheme='viridis')),
    tooltip=[
        alt.Tooltip('Country:N', title='Country'),
        alt.Tooltip('Happiness Score:Q', title='Happiness Score')
    ]
).properties(
    title="Happiness Score by Country",
    width = 800, 
    height = 600
)

# Display the map
map_chart


### Reflection
Brighter colors mean the country is higher on the happy index than countries shaded in darker colors. In accordance with our hypothesis, the colors of the map get darker between 2018 and 2019 due to COVID and as the world started to recover from COVID, the map got brighter. This is what we expected. Moving forward, I think we should consider using Finland, Sweden, and Norway as standards of high happiness. I don't know whether we should use Syria or Afghanistan as standards of lower happiness, though. It might also be interesting to do a directo comparison of Russia (5.468)/China (5.191) vs. Canada (7.278)/USA (6.892). I don't really know if these numbers are different enough that a comparison might yield something significant. 

# Happiness.1 - Countries that bounced back

In [14]:
# Calculate the averages for the new columns
happiness_df.dropna(inplace=True)

pre_covid_happiness = happiness_df[['Happiness Score_2015', 'Happiness Score_2016', 'Happiness Score_2017', 'Happiness Score_2018', 'Happiness Score_2019']].mean(axis=1)
happiness_during_covid = happiness_df[['Happiness Score_2020', 'Happiness Score_2021']].mean(axis=1)
post_covid_happiness = happiness_df['Happiness Score_2022']

# Create the new dataframe with the calculated columns
happiness_periods = pd.DataFrame({
    'Country' : happiness_df['Country'],
    'Pre-Covid Happiness': pre_covid_happiness,
    'Happiness During Covid': happiness_during_covid,
    'Post-Covid Happiness': post_covid_happiness
})

happiness_periods

Unnamed: 0,Country,Pre-Covid Happiness,Happiness During Covid,Post-Covid Happiness
0,Norway,7.5410,7.44000,7.365
1,Denmark,7.5460,7.63280,7.636
2,Iceland,7.5110,7.52925,7.557
3,Switzerland,7.5114,7.56545,7.512
4,Finland,7.5378,7.82535,7.821
...,...,...,...,...
147,Liberia,3.8392,4.59145,5.122
148,Guinea,3.8536,4.96665,4.891
149,Togo,3.5442,4.14710,4.112
150,Rwanda,3.4386,3.36365,3.268


## Democracy Index

In [14]:
import pandas as pd
import geopandas as gpd
import altair as alt
from altair_data_server import data_server


In [15]:
# import and fix democracy data
democracy_path = 'Datasets/democracy.csv'
democracy_df = pd.read_csv(democracy_path)
democracy_df = democracy_df[democracy_df['Year'] >= 2015]
democracy_df = democracy_df.pivot(index='Entity', columns='Year', values='democracy_eiu')
democracy_df.columns = [f'democracy_score_{col}' for col in democracy_df.columns]
democracy_df.reset_index(inplace=True)

# We don't need the scores of the continents and the world 
remove = [
    'Africa',
    'North America',
    'World',
    'South America',
    'Europe',
    'Asia',
    'Oceania'
]

democracy_df = democracy_df[~democracy_df['Entity'].isin(remove)]
democracy_countries = democracy_df['Entity'].unique()
democracy_df

Unnamed: 0,Entity,democracy_score_2015,democracy_score_2016,democracy_score_2017,democracy_score_2018,democracy_score_2019,democracy_score_2020,democracy_score_2021,democracy_score_2022
0,Afghanistan,2.77,2.55,2.55,2.97,2.85,2.85,0.32,0.32
2,Albania,5.91,5.91,5.98,5.98,5.89,6.08,6.11,6.41
3,Algeria,3.95,3.56,3.56,3.50,4.01,3.77,3.77,3.66
4,Angola,3.35,3.40,3.62,3.62,3.72,3.66,3.37,3.96
5,Argentina,7.02,6.96,6.96,7.02,7.02,6.95,6.81,6.85
...,...,...,...,...,...,...,...,...,...
168,Venezuela,5.00,4.68,3.87,3.16,2.88,2.76,2.11,2.23
169,Vietnam,3.53,3.38,3.08,3.08,3.08,2.94,2.94,2.73
171,Yemen,2.24,2.07,2.07,1.95,1.95,1.95,1.95,1.95
172,Zambia,6.28,5.99,5.68,5.61,5.09,4.86,5.72,5.80


In [16]:
# import and fix countries geojson data
countries_gpd = gpd.read_file('Datasets/countries.geojson')
countries_gpd = countries_gpd.rename(columns={'ADMIN':'Entity'})
countries = countries_gpd['Entity'].unique()
countries_gpd

Unnamed: 0,Entity,ISO_A3,ISO_A2,geometry
0,Aruba,ABW,AW,"MULTIPOLYGON (((-69.99694 12.57758, -69.93639 ..."
1,Afghanistan,AFG,AF,"MULTIPOLYGON (((71.04980 38.40866, 71.05714 38..."
2,Angola,AGO,AO,"MULTIPOLYGON (((11.73752 -16.69258, 11.73851 -..."
3,Anguilla,AIA,AI,"MULTIPOLYGON (((-63.03767 18.21296, -63.09952 ..."
4,Albania,ALB,AL,"MULTIPOLYGON (((19.74777 42.57890, 19.74601 42..."
...,...,...,...,...
250,Samoa,WSM,WS,"MULTIPOLYGON (((-171.57002 -13.93816, -171.564..."
251,Yemen,YEM,YE,"MULTIPOLYGON (((53.30824 12.11839, 53.31027 12..."
252,South Africa,ZAF,ZA,"MULTIPOLYGON (((37.86378 -46.94085, 37.83644 -..."
253,Zambia,ZMB,ZM,"MULTIPOLYGON (((31.11984 -8.61663, 31.14102 -8..."


In [17]:
# all the countries that are in countries but not democracy_countries
diff1 = list(set(democracy_countries) - set(countries))
diff1

['North Macedonia',
 "Cote d'Ivoire",
 'Czechia',
 'Democratic Republic of Congo',
 'Guinea-Bissau',
 'Congo',
 'Eswatini',
 'Hong Kong',
 'Serbia',
 'Tanzania',
 'Timor']

In [18]:
# the two dataframes have the same country under different names, so reconcile that
mapping_dict = {
    "United Republic of Tanzania" : 'Tanzania',
    'Czech Republic' : 'Czechia',
    'Swaziland' : 'Eswatini',
    'Democratic Republic of the Congo' : 'Democratic Republic of Congo',
    'Republic of Congo' : 'Congo',
    'Hong Kong S.A.R.' : 'Hong Kong',
    'Guinea Bissau' : 'Guinea-Bissau',
    'Macedonia' : 'North Macedonia',
    'East Timor' : 'Timor',
    'Ivory Coast' : "Cote d'Ivoire",
    'Oceania' : "Australia",
    'Republic of Serbia' : 'Serbia'
}

corrected_list = [mapping_dict.get(country, country) for country in countries]


In [19]:
# all the countries that are in corrected list but not in democracy_countries
diff2 = list(set(democracy_countries) - set(corrected_list))
print(diff2)


[]


In [20]:
countries_gpd['Entity'] = countries_gpd['Entity'].replace(mapping_dict)

In [21]:
democracy_merged = countries_gpd.merge(democracy_df, on='Entity', how='right')
democracy_merged

Unnamed: 0,Entity,ISO_A3,ISO_A2,geometry,democracy_score_2015,democracy_score_2016,democracy_score_2017,democracy_score_2018,democracy_score_2019,democracy_score_2020,democracy_score_2021,democracy_score_2022
0,Afghanistan,AFG,AF,"MULTIPOLYGON (((71.04980 38.40866, 71.05714 38...",2.77,2.55,2.55,2.97,2.85,2.85,0.32,0.32
1,Albania,ALB,AL,"MULTIPOLYGON (((19.74777 42.57890, 19.74601 42...",5.91,5.91,5.98,5.98,5.89,6.08,6.11,6.41
2,Algeria,DZA,DZ,"MULTIPOLYGON (((8.60251 36.93951, 8.60566 36.9...",3.95,3.56,3.56,3.50,4.01,3.77,3.77,3.66
3,Angola,AGO,AO,"MULTIPOLYGON (((11.73752 -16.69258, 11.73851 -...",3.35,3.40,3.62,3.62,3.72,3.66,3.37,3.96
4,Argentina,ARG,AR,"MULTIPOLYGON (((-68.65412 -54.88624, -68.65414...",7.02,6.96,6.96,7.02,7.02,6.95,6.81,6.85
...,...,...,...,...,...,...,...,...,...,...,...,...
162,Venezuela,VEN,VE,"MULTIPOLYGON (((-61.39027 8.58058, -61.36986 8...",5.00,4.68,3.87,3.16,2.88,2.76,2.11,2.23
163,Vietnam,VNM,VN,"MULTIPOLYGON (((106.66871 8.75349, 106.66586 8...",3.53,3.38,3.08,3.08,3.08,2.94,2.94,2.73
164,Yemen,YEM,YE,"MULTIPOLYGON (((53.30824 12.11839, 53.31027 12...",2.24,2.07,2.07,1.95,1.95,1.95,1.95,1.95
165,Zambia,ZMB,ZM,"MULTIPOLYGON (((31.11984 -8.61663, 31.14102 -8...",6.28,5.99,5.68,5.61,5.09,4.86,5.72,5.80


In [22]:
# Convert the GeoDataFrame to a long format DataFrame
democracy_long_df = democracy_merged.melt(
    id_vars=['Entity', 'ISO_A3', 'ISO_A2', 'geometry'],
    value_vars=['democracy_score_2015', 'democracy_score_2016',
                'democracy_score_2017', 'democracy_score_2018','democracy_score_2019', 'democracy_score_2020', 'democracy_score_2021','democracy_score_2022'],
    var_name='Year',
    value_name='Democracy Score'
)

# Extract the year from the Year column
democracy_long_df['Year'] = democracy_long_df['Year'].str.extract('(\d+)').astype(int)

# Use the data_server provider to allow Altair to work with the GeoDataFrame directly
alt.data_transformers.register('data_server', data_server)
alt.data_transformers.enable('data_server')

# Define the base chart
base_chart = alt.Chart(democracy_long_df).mark_geoshape(
    stroke='black',
    strokeWidth=0.5
).encode(
    tooltip=[
        alt.Tooltip('Entity:N', title='Country'),
    ]
).project('equirectangular')

# Create a list of years for which you have happiness scores
years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]

# Create a slider for selecting the year
year_slider = alt.binding_range(min=years[0], max=years[-1], step=1)
slider_selection = alt.selection_single(bind=year_slider, fields=['Year'], name='Select', init={'Year': years[0]})

# Define the choropleth map using the year slider
map_chart = base_chart.add_selection(
    slider_selection
).transform_filter(
    slider_selection
).encode(
    color=alt.Color('Democracy Score:Q', title='Democracy Score', scale=alt.Scale(scheme='viridis')),
    tooltip=[
        alt.Tooltip('Entity:N', title='Country'),
        alt.Tooltip('Democracy Score:Q', title='Democracy Score')
    ]
).properties(
    title="Democracy Score by Country",
    width = 800, 
    height = 600
)

# Display the map
map_chart
