In [179]:
import pandas as pd

In [180]:
# Read the three raw CSV inputs in one pass; the unpacking order matches the
# order of the paths below.
avg_global_gini_df, uk_yearly_gdp_df, us_yearly_gdp_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/AverageGlobalGini.csv",
        "Data/uk_yearly_gdpcapita.csv",
        "Data/usa_yearly_gdpcapita.csv",
    )
]

Correcting column names

In [181]:
def _promote_first_row_to_header(df):
    """Return a copy of ``df`` whose first row has become the column labels.

    The first row is used as the new header and removed from the data.

    Parameters:
        df: DataFrame whose first row (``df.iloc[0]``) holds the real column names.

    Returns:
        A new, independent DataFrame containing rows 1..n of ``df`` with the
        promoted labels. ``df`` itself is left untouched.
    """
    # `.copy()` detaches the result from `df`; assigning columns to a bare
    # `iloc` slice later on can raise SettingWithCopyWarning on pandas
    # versions that run without copy-on-write.
    promoted = df.iloc[1:].copy()
    # `.tolist()` passes plain labels, so the columns axis does not inherit
    # the row's index label (0) as its name.
    promoted.columns = df.iloc[0].tolist()
    return promoted


uk_yearly_gdp_df = _promote_first_row_to_header(uk_yearly_gdp_df)
us_yearly_gdp_df = _promote_first_row_to_header(us_yearly_gdp_df)

Converting the Year column to the same datatype (integer) in every dataframe so that they can be merged on it

In [182]:
# Give the merge key the same integer dtype in all three frames.
for frame in (avg_global_gini_df, uk_yearly_gdp_df, us_yearly_gdp_df):
    frame["Year"] = frame["Year"].astype(int)

Removing unnecessary columns and renaming columns so they make sense in the context of the merged dataframe

In [183]:
# Each frame is rebuilt rather than mutated in place, and a missing
# "Unnamed: 0" column is tolerated, so this cell can be re-run without a
# KeyError and without modifying frames in place.
avg_global_gini_df = (
    avg_global_gini_df
    # "Unnamed: 0" is the name pandas assigns to an unlabeled first CSV column.
    .drop(columns=["Unnamed: 0"], errors="ignore")
    .rename(columns={'Value': 'Average Global Gini'})
)
# Both GDP files share the same growth-rate label; prefix it per country so
# the two columns stay distinguishable once the frames are merged.
uk_yearly_gdp_df = uk_yearly_gdp_df.rename(
    columns={'Annual Growth Rate (%)': 'UK GDP Growth Rate'}
)
us_yearly_gdp_df = us_yearly_gdp_df.rename(
    columns={'Annual Growth Rate (%)': 'US GDP Growth Rate'}
)

In [184]:
# Outer-join the Gini series with both GDP tables on Year, keeping years that
# appear in only some of the inputs.
interactive_df = (
    avg_global_gini_df
    .merge(uk_yearly_gdp_df, on='Year', how='outer')
    .merge(us_yearly_gdp_df, on='Year', how='outer')
)
interactive_df

Unnamed: 0,Year,Average Global Gini,UK GDP Per Capita (US $),UK GDP Growth Rate,USA GDP Per Capita (US $),US GDP Growth Rate
0,1963,37.600000,"$1,613",5.75%,"$3,375",4.03%
1,1964,38.100000,"$1,748",8.36%,"$3,574",5.91%
2,1965,37.500000,"$1,874",7.17%,"$3,828",7.10%
3,1966,37.800000,"$1,987",6.04%,"$4,146",8.33%
4,1967,36.900000,"$2,059",3.63%,"$4,336",4.59%
...,...,...,...,...,...,...
58,2021,40.513636,"$46,586",15.54%,"$70,219",10.53%
59,2022,39.257143,"$45,850",-1.58%,"$76,399",8.80%
60,1962,,"$1,526",3.63%,"$3,244",5.78%
61,1961,,"$1,472",5.35%,"$3,067",1.98%


Final clean of the data: removing dollar signs, percent signs and commas, then converting each column to the correct numeric datatype

In [190]:
# Strip currency/percent formatting from the text columns, then make every
# column numeric.
for column in interactive_df.columns:
    values = interactive_df[column]
    # Only text columns can carry '$', ',' or '%'. The check covers both the
    # classic object dtype and pandas' dedicated string dtype.
    if pd.api.types.is_object_dtype(values.dtype) or pd.api.types.is_string_dtype(values.dtype):
        stripped = values.str.replace('[$,%]', '', regex=True)
        # Coerce before casting: a cell that still is not a number becomes NaN
        # instead of aborting the whole cell. The final cast keeps these
        # columns float64 even when every value is integral.
        interactive_df[column] = pd.to_numeric(stripped, errors='coerce').astype(float)
    else:
        interactive_df[column] = pd.to_numeric(values, errors='coerce')

interactive_df

Unnamed: 0,Year,Average Global Gini,UK GDP Per Capita (US $),UK GDP Growth Rate,USA GDP Per Capita (US $),US GDP Growth Rate
0,1963,37.600000,1613.0,5.75,3375.0,4.03
1,1964,38.100000,1748.0,8.36,3574.0,5.91
2,1965,37.500000,1874.0,7.17,3828.0,7.10
3,1966,37.800000,1987.0,6.04,4146.0,8.33
4,1967,36.900000,2059.0,3.63,4336.0,4.59
...,...,...,...,...,...,...
58,2021,40.513636,46586.0,15.54,70219.0,10.53
59,2022,39.257143,45850.0,-1.58,76399.0,8.80
60,1962,,1526.0,3.63,3244.0,5.78
61,1961,,1472.0,5.35,3067.0,1.98


In [187]:
import altair as alt

Generic function for creating a chart based on two columns

In [188]:
def plot_chart(df: "pd.DataFrame", col_x: str, col_y: str) -> "alt.Chart":
    """Build a circle-mark chart of ``col_y`` against ``col_x``.

    Parameters:
        df: Data to plot; it is passed straight to ``alt.Chart``.
        col_x: Column (Altair encoding shorthand) used for the x channel.
        col_y: Column (Altair encoding shorthand) used for the y channel.

    Returns:
        The Altair chart object. The tooltip lists ``'Year'`` plus both
        plotted columns, so ``df`` is expected to have a ``'Year'`` column.
    """
    # Build the tooltip without repeats: when 'Year' is itself one of the
    # plotted columns it would otherwise be listed twice.
    tooltip_fields = []
    for field in ('Year', col_x, col_y):
        if field not in tooltip_fields:
            tooltip_fields.append(field)

    chart = alt.Chart(df).mark_circle().encode(
        x=col_x,
        y=col_y,
        tooltip=tooltip_fields,
    )

    return chart

In [193]:
# USA GDP per capita plotted against the year.
plot_chart(
    interactive_df,
    col_x='Year',
    col_y='USA GDP Per Capita (US $)',
)