# Session 05

In [1]:
import pandas as pd
import plotly.express as px

# Lift pandas' row-display limit so displayed frames are never truncated.
pd.set_option("display.max_rows", None)

# Location of the World Development Indicators CSV read below.
WDI_CSV_URL = "https://raw.githubusercontent.com/wcj365/python-stats-dataviz/refs/heads/master/fall2024/data/World_Development_Indicators_(WDI).csv"
df = pd.read_csv(WDI_CSV_URL)

# (rows, columns) of the freshly loaded frame.
df.shape

(4123, 9)

In [3]:
# Number of missing values in each column of the full dataset.
df.isnull().sum()

Unnamed: 0,0
Year,0
Country,0
GDP per capita (current US$),161
"Life expectancy at birth, total (years)",346
"Population, total",0
Country Code,0
Region,0
Income Group,0
Lending Type,0


In [4]:
# Show five random rows, limited to the year, country and the two indicator columns.
preview_columns = [
    "Year",
    "Country",
    "GDP per capita (current US$)",
    "Life expectancy at birth, total (years)",
]
df[preview_columns].sample(n=5)

Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)"
2118,2013,Lesotho,1141.360923,49.002
3532,2021,Sudan,749.706787,65.267
3022,2005,Qatar,52468.445648,76.064
1523,2007,Guatemala,2490.747828,70.1
2428,2019,Mexico,10434.578365,74.202


In [5]:
# Keep every row whose Year is different from 2022.
# NOTE(review): despite its name, df_2022 holds all years *except* 2022 —
# confirm the name matches the intent.
df_2022 = df.loc[df["Year"].ne(2022)]
df_2022.shape

(3906, 9)

In [6]:
# Missing values per column after the Year filter.
df_2022.isnull().sum()

Unnamed: 0,0
Year,0
Country,0
GDP per capita (current US$),140
"Life expectancy at birth, total (years)",129
"Population, total",0
Country Code,0
Region,0
Income Group,0
Lending Type,0


In [7]:
# Discard every row that has a missing value in any column,
# then confirm that no missing values remain.
df_2022 = df_2022.dropna(axis="index", how="any")
df_2022.isnull().sum()

Unnamed: 0,0
Year,0
Country,0
GDP per capita (current US$),0
"Life expectancy at birth, total (years)",0
"Population, total",0
Country Code,0
Region,0
Income Group,0
Lending Type,0


In [8]:
# (rows, columns) remaining in df_2022 after the rows with missing values were dropped.
df_2022.shape

(3641, 9)

In [9]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep="first").sum()

0

In [10]:
# List the duplicated rows themselves (empty when there are none).
df.loc[df.duplicated()]

Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type


In [11]:
# Drop fully duplicated rows, keeping the first occurrence of each.
df_cleaned = df.drop_duplicates(keep="first")

# Report the resulting size and print five random rows as a spot check.
print("Cleaned shape: {}".format(df_cleaned.shape))
print(df_cleaned.sample(n=5))


Cleaned shape: (4123, 9)
      Year     Country  GDP per capita (current US$)  \
1868  2010       Japan                  44968.156235   
1666  2017     Hungary                  14621.239596   
2218  2018  Luxembourg                 116786.511655   
4098  2017      Zambia                   1495.752138   
2069  2021     Lao PDR                   2535.623434   

      Life expectancy at birth, total (years)  Population, total Country Code  \
1868                                82.842683        128070000.0          JPN   
1666                                75.817073          9787966.0          HUN   
2218                                82.295122           607950.0          LUX   
4098                                62.120000         17298054.0          ZMB   
2069                                68.061000          7425057.0          LAO   

                     Region         Income Group    Lending Type  
1868    East Asia & Pacific          High income  Not classified  
1666  Europe & Ce

In [12]:
# Year used for the single-year subset built here; later cells reuse it in chart titles.
year_of_interest = 2019

# Restrict the full dataset to the rows of that year.
df_2019 = df.loc[df["Year"].eq(year_of_interest)]
print(df_2019.shape)
df_2019.sample(n=5)


(217, 9)


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
1649,2019,"Hong Kong SAR, China",48359.001195,85.180488,7507900.0,HKG,East Asia & Pacific,High income,Not classified
2998,2019,Portugal,23330.817289,81.67561,10286263.0,PRT,Europe & Central Asia,High income,Not classified
1288,2019,Finland,48629.858228,81.982927,5521606.0,FIN,Europe & Central Asia,High income,Not classified
889,2019,Costa Rica,12669.341155,79.427,5084532.0,CRI,Latin America & Caribbean,Upper middle income,IBRD
1516,2019,Guam,37752.633077,77.717,168624.0,GUM,East Asia & Pacific,High income,Not classified


In [13]:
# Missing values per column within the single-year subset.
df_2019.isnull().sum()

Unnamed: 0,0
Year,0
Country,0
GDP per capita (current US$),6
"Life expectancy at birth, total (years)",8
"Population, total",0
Country Code,0
Region,0
Income Group,0
Lending Type,0


In [14]:
# Only the two indicator columns need to be complete; rows missing either one are removed.
required_columns = [
    "GDP per capita (current US$)",
    "Life expectancy at birth, total (years)",
]
df_2019 = df_2019.dropna(subset=required_columns)
print(df_2019.shape)
df_2019.sample(n=5)

(203, 9)


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
775,2019,Chile,14632.690308,80.326,19039485.0,CHL,Latin America & Caribbean,High income,IBRD
471,2019,Bosnia and Herzegovina,6094.724823,77.241,3360711.0,BIH,Europe & Central Asia,Upper middle income,IBRD
243,2019,Azerbaijan,4805.753718,73.102,10024283.0,AZE,Europe & Central Asia,Upper middle income,IBRD
4119,2019,Zimbabwe,1421.868596,61.292,15354608.0,ZWE,Sub-Saharan Africa,Lower middle income,Blend
2276,2019,Malawi,584.362867,64.119,18867337.0,MWI,Sub-Saharan Africa,Low income,IDA


In [15]:

# Bubble chart for the selected year: GDP per capita on x, life expectancy on y,
# one colour per country, marker size driven by total population.
gdp_column = "GDP per capita (current US$)"
life_column = "Life expectancy at birth, total (years)"

# Friendlier axis titles for the two indicator columns.
axis_labels = {
    gdp_column: "GDP per Capita (Current US$)",
    life_column: "Life Expectancy (Years)",
}

fig = px.scatter(
    df_2019,
    x=gdp_column,
    y=life_column,
    size="Population, total",
    color="Country",
    labels=axis_labels,
    title=f"Relationship between GDP per Capita and Life Expectancy in {year_of_interest}",
)

fig.show()

## Compare China, US, India, Russia

In [16]:
# Narrow the single-year subset to four countries, selected by country code.
focus_codes = ["USA", "CHN", "IND", "RUS"]
df_4_countries = df_2019.loc[df_2019["Country Code"].isin(focus_codes)]
print(df_4_countries.shape)
df_4_countries

(4, 9)


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
794,2019,China,10143.860221,77.968,1407745000.0,CHN,East Asia & Pacific,Upper middle income,IBRD
1706,2019,India,2050.1638,70.91,1383112000.0,IND,South Asia,Lower middle income,IBRD
3074,2019,Russian Federation,11536.258789,73.083902,144406300.0,RUS,Europe & Central Asia,Upper middle income,IBRD
3929,2019,United States,65120.394663,78.787805,328330000.0,USA,North America,High income,Not classified


In [17]:

# Bubble chart for the four selected countries, with each bubble labelled by
# its country name instead of relying on a legend.
gdp_column = "GDP per capita (current US$)"
life_column = "Life expectancy at birth, total (years)"

# Friendlier axis titles for the two indicator columns.
axis_labels = {
    gdp_column: "GDP per Capita (Current US$)",
    life_column: "Life Expectancy (Years)",
}

fig = px.scatter(
    df_4_countries,
    x=gdp_column,
    y=life_column,
    size="Population, total",
    color="Country",
    text="Country",
    labels=axis_labels,
    title=f"Relationship between GDP per Capita and Life Expectancy in {year_of_interest}",
    width=1000,
    height=600,
)

# The text labels already identify each bubble, so the legend is hidden.
fig.update_layout(showlegend=False)
# Place each label above its bubble.
fig.update_traces(textposition='top center')

fig.show()

In [18]:
# All available years for the same four country codes, taken from the full dataset.
quad_codes = ["USA", "CHN", "IND", "RUS"]
df_quad = df.loc[df["Country Code"].isin(quad_codes)]
print(df_quad.shape)
df_quad.sample(n=5)

(76, 9)


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
3077,2022,Russian Federation,15270.706055,,144236900.0,RUS,Europe & Central Asia,Upper middle income,IBRD
787,2012,China,6300.58218,76.192,1354190000.0,CHN,East Asia & Pacific,Upper middle income,IBRD
1697,2010,India,1350.63447,66.909,1240614000.0,IND,South Asia,Lower middle income,IBRD
784,2009,China,3832.227457,75.343,1331260000.0,CHN,East Asia & Pacific,Upper middle income,IBRD
1708,2021,India,2238.127142,67.24,1407564000.0,IND,South Asia,Lower middle income,IBRD


In [19]:
# Line chart of GDP per capita by year, one line per country in df_quad.
# A title and a readable y-axis label are set so the figure can be understood
# on its own; the label text matches the one used by the scatter plots in
# this notebook.
fig = px.line(
    df_quad,
    x="Year",
    y="GDP per capita (current US$)",
    color="Country",
    title="GDP per Capita over Time by Country",
    labels={"GDP per capita (current US$)": "GDP per Capita (Current US$)"},
)

fig.show()

In [20]:
# Total population for every (Year, Region) pair, with the grouping keys kept
# as ordinary columns rather than as an index.
df_group = df.groupby(["Year", "Region"], as_index=False)["Population, total"].sum()
df_group

Unnamed: 0,Year,Region,"Population, total"
0,2004,East Asia & Pacific,2094011000.0
1,2004,Europe & Central Asia,870270800.0
2,2004,Latin America & Caribbean,549897600.0
3,2004,Middle East & North Africa,347044900.0
4,2004,North America,324809700.0
5,2004,South Asia,1515703000.0
6,2004,Sub-Saharan Africa,746546800.0
7,2005,East Asia & Pacific,2109795000.0
8,2005,Europe & Central Asia,872913700.0
9,2005,Latin America & Caribbean,556739500.0


# Homework #5
Pick a few countries that interest you (for example, the G7, BRICS, or the Nordic countries).
- 1 Create a line chart showing the trend of life expectancy over time to compare those countries.
- 2 Pick a year from the data for those countries and create a scatter plot comparing their GDP per capita and life expectancy. The size of each dot represents the population. This type of scatter plot is called a bubble chart.
- 3 Bonus - for each year, compute the average GDP per capita, the average life expectancy, and the total population of each of the seven regions (aggregation), then repeat 1 and 2 using the aggregated data for all regions.

# Countries Chosen: Colombia, Chile, Argentina, Paraguay, Uruguay

In [70]:
# Every available year for the five chosen countries, matched by country name.
chosen_countries = ["Colombia", "Argentina", "Uruguay", "Paraguay", "Chile"]
df_south_america5 = df.loc[df['Country'].isin(chosen_countries)]
df_south_america5



Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
133,2004,Argentina,4258.160261,74.855,38668796.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
134,2005,Argentina,5086.627761,75.139,39070501.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
135,2006,Argentina,5890.978002,75.433,39476851.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
136,2007,Argentina,7210.595548,75.006,39876111.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
137,2008,Argentina,8977.506851,75.641,40273769.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
138,2009,Argentina,8184.389889,75.936,40684338.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
139,2010,Argentina,10385.964432,75.721,40788453.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
140,2011,Argentina,12848.739151,76.124,41261490.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
141,2012,Argentina,13082.664326,76.467,41733271.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
142,2013,Argentina,13080.254732,76.491,42202935.0,ARG,Latin America & Caribbean,Upper middle income,IBRD


In [72]:
# Homework part 1: life expectancy by year, one line per chosen country.
fig = px.line(
    df_south_america5,
    x='Year',
    y='Life expectancy at birth, total (years)',
    color='Country',
    title='Life Expectancy',
)
fig.show()

In [73]:
# Homework part 2: keep only the 2021 rows of the five chosen countries.
df_south_america2021 = df_south_america5.loc[df_south_america5['Year'].eq(2021)]
df_south_america2021


Unnamed: 0,Year,Country,GDP per capita (current US$),"Life expectancy at birth, total (years)","Population, total",Country Code,Region,Income Group,Lending Type
150,2021,Argentina,10650.86046,75.39,45808747.0,ARG,Latin America & Caribbean,Upper middle income,IBRD
777,2021,Chile,16240.607776,78.944,19493184.0,CHL,Latin America & Caribbean,High income,IBRD
815,2021,Colombia,6182.707099,72.83,51516562.0,COL,Latin America & Caribbean,Upper middle income,IBRD
2924,2021,Paraguay,5959.441794,70.262,6703799.0,PRY,Latin America & Caribbean,Upper middle income,IBRD
3950,2021,Uruguay,17923.995333,75.436,3426260.0,URY,Latin America & Caribbean,High income,IBRD


In [79]:
# Bubble chart: GDP per capita on x, life expectancy on y, bubble size from
# total population, one colour per country.
# The year(s) shown in the title are read from the plotted data, so the title
# stays correct if the year filter that built df_south_america2021 changes.
years_shown = ", ".join(
    str(year) for year in sorted(df_south_america2021['Year'].unique())
)

fig = px.scatter(
    df_south_america2021,
    x='GDP per capita (current US$)',
    y='Life expectancy at birth, total (years)',
    size='Population, total',
    color='Country',
    title=f"GDP per Capita vs. Life Expectancy ({years_shown})",
    # Same readable axis titles as the other scatter plots in this notebook.
    labels={
        'GDP per capita (current US$)': "GDP per Capita (Current US$)",
        'Life expectancy at birth, total (years)': "Life Expectancy (Years)",
    },
)
fig.show()