In [1]:
from pathlib import Path

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.impute import SimpleImputer


In [2]:
# Load the life-expectancy CSV (path is relative to the notebook's working
# directory) into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="Life Expectancy Data.csv")


In [3]:
# Print a summary of the frame: row count, column names, non-null counts and
# dtypes. Used here to spot which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 22 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country                          2938 non-null   object 
 1   Year                             2938 non-null   int64  
 2   Status                           2938 non-null   object 
 3   Life expectancy                  2928 non-null   float64
 4   Adult Mortality                  2928 non-null   float64
 5   infant deaths                    2938 non-null   int64  
 6   Alcohol                          2744 non-null   float64
 7   percentage expenditure           2938 non-null   float64
 8   Hepatitis B                      2385 non-null   float64
 9   Measles                          2938 non-null   int64  
 10   BMI                             2904 non-null   float64
 11  under-five deaths                2938 non-null   int64  
 12  Polio               

In [4]:
# Number of missing entries per column (displayed as the cell result).
df.isnull().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [5]:
# Several raw headers carry leading/trailing spaces; trim them so columns can
# be addressed by their clean names. Whitespace inside a name is left as-is.
df = df.rename(columns=str.strip)

In [6]:
# Replace the missing values in each listed column with that column's mean.
#
# The columns are the ones reported with missing entries by df.isna().sum().
# Note that 'thinness  1-19 years' really does contain two consecutive spaces.
#
# NOTE(review): the mean is taken over the whole column, i.e. across all
# countries and years, and 'Life expectancy' itself is imputed as well.
# Confirm this is acceptable for the downstream analysis.
columns_to_impute = [
    'Life expectancy',
    'Adult Mortality',
    'Alcohol',
    'Hepatitis B',
    'BMI',
    'Polio',
    'Total expenditure',
    'Diphtheria',
    'GDP',
    'Population',
    'thinness  1-19 years',
    'thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
]

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
for column in columns_to_impute:
    # fit_transform expects a 2-D input and returns an (n_rows, 1) array;
    # flatten it so a plain 1-D column is assigned back.
    df[column] = imputer.fit_transform(df[[column]]).ravel()


In [7]:
# Display the column labels to confirm the names after whitespace stripping.
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles', 'BMI', 'under-five deaths', 'Polio', 'Total expenditure',
       'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 'thinness  1-19 years',
       'thinness 5-9 years', 'Income composition of resources', 'Schooling'],
      dtype='object')

In [8]:

# Animated world map of life expectancy with one animation frame per year.
#
# The rows are sorted by Year first: Plotly Express builds animation frames in
# the order the frame values first appear in the data, so an unsorted frame can
# make the "over time" animation play out of chronological order.
fig = px.choropleth(
    df.sort_values("Year"),
    locations="Country",
    locationmode="country names",
    color="Life expectancy",
    animation_frame="Year",
    title="Life Expectancy Over Time",
    color_continuous_scale="Viridis",
    width=900,
    height=600,
    hover_name="Country",
    hover_data={"GDP": True, "Life expectancy": True}
)

# write_html does not create missing folders, so make sure the output
# directory exists before exporting (otherwise a fresh checkout fails here).
map_html_path = Path("visualizations/life_expectancy_map.html")
map_html_path.parent.mkdir(parents=True, exist_ok=True)
fig.write_html(str(map_html_path))

fig.show()

In [9]:
# Animated world map restricted to country-years with high life expectancy.
# (plotly.express is already imported as px in the notebook's import cell.)

# Rows with a life expectancy strictly above this value (in years) are kept.
HIGH_LIFE_EXPECTANCY_THRESHOLD = 75

# Sort by Year so the animation frames are created in chronological order;
# Plotly Express orders frames by first appearance in the data.
df_high_life = df[
    df["Life expectancy"] > HIGH_LIFE_EXPECTANCY_THRESHOLD
].sort_values("Year")

fig = px.choropleth(
    df_high_life,
    locations="Country",
    locationmode="country names",
    color="Life expectancy",
    animation_frame="Year",
    # The title states the filter so this map is not mistaken for the
    # unfiltered one.
    title=f"Countries with Life Expectancy Above {HIGH_LIFE_EXPECTANCY_THRESHOLD} Over Time",
    color_continuous_scale="Viridis",
    width=900,
    height=600,
    hover_name="Country",
    hover_data={"GDP": True, "Life expectancy": True}
)

fig.show()


In [10]:
# Calculate the correlation between years of schooling and life expectancy.
# NOTE(review): both columns had their missing values replaced by the column
# mean in the imputation cell, which can bias this coefficient.
correlation = df['Schooling'].corr(df['Life expectancy'])
print(f"Correlation between Years of Schooling and Life Expectancy: {correlation}")

# Slider for the GDP threshold. Points are kept when GDP < slider value, so the
# slider starts at its maximum: every point is visible until the user lowers
# it. (Starting at 0 would filter out every row and render an empty chart.)
GDP_SLIDER_MIN = 1
GDP_SLIDER_MAX = 120000
gdp_cutoff = alt.param(value=GDP_SLIDER_MAX,
                       bind=alt.binding_range(min=GDP_SLIDER_MIN,
                                              max=GDP_SLIDER_MAX,
                                              step=100,
                                              name='Max GDP: '))

# Slider for year selection, spanning the years present in the data and
# starting at the earliest one. Cast to plain int so the values serialise
# cleanly into the chart specification.
first_year = int(df['Year'].min())
last_year = int(df['Year'].max())
year_slider = alt.binding_range(min=first_year, max=last_year, step=1, name='Year: ')
year_selection = alt.param(value=first_year, bind=year_slider)

# Point (click) selection keyed on Country. Projecting onto the Country field
# lets the bar chart below filter its own rows by the selected country.
# empty=False means nothing counts as selected until the user clicks a point.
selection = alt.selection_point(fields=['Country'], empty=False)

# Scatter plot of schooling vs. life expectancy for the chosen year, limited
# to countries below the GDP threshold. The axis domains are fixed to the
# full data range so the axes do not jump when the sliders move.
scatter_chart = alt.Chart(df).mark_square().add_params(
    gdp_cutoff,
    year_selection
).add_params(
    selection
).transform_filter(
    (alt.datum.GDP < gdp_cutoff) & (alt.datum.Year == year_selection)
).encode(
    x=alt.X('Schooling:Q', scale=alt.Scale(domain=[df['Schooling'].min(), df['Schooling'].max()]), title='Schooling (Years)'),
    y=alt.Y('Life expectancy:Q', scale=alt.Scale(domain=[df['Life expectancy'].min(), df['Life expectancy'].max()]), title='Life Expectancy (Years)'),
    tooltip=['Country', 'Schooling', 'Life expectancy', 'GDP'],
    # Highlight the selected country: fully opaque and coloured by country;
    # everything else stays light blue and slightly transparent.
    opacity=alt.condition(
        selection, alt.value(1), alt.value(0.75)
    ),
    color=alt.condition(
        selection, 'Country:N', alt.value('lightblue')
    )
).properties(
    title="Relationship Between Schooling, Life Expectancy, and GDP",
    width=400,
    height=400
)

# Bar chart filtered by the scatter plot selection.
# NOTE(review): this chart is not filtered by the year slider, so sum(GDP)
# adds up the GDP values of every year in df for the selected country.
bar_chart = alt.Chart(df).mark_bar().transform_filter(
    selection
).encode(
    x=alt.X('sum(GDP):Q', title='Total GDP'),
    y=alt.Y('Country:N', sort='-x', title='Country', axis=alt.Axis(labelAngle=-90)),
    color=alt.Color('Country:N', legend=None)
).properties(
    title="GDP of Selected Countries",
    width=475,
    height=400
)

# Combine the charts side by side
dashboard = scatter_chart | bar_chart

# Save the dashboard as an HTML file, creating the output folder first so the
# export also works when the folder does not exist yet.
dashboard_html_path = Path("visualizations/gdp_vs_schooling_dashboard.html")
dashboard_html_path.parent.mkdir(parents=True, exist_ok=True)
dashboard.save(str(dashboard_html_path))

dashboard

Correlation between Years of Schooling and Life Expectancy: 0.7150663398620062
