In [12]:
import pandas as pd

In [13]:
# Load the raw CSV inputs; all paths are relative to the notebook's directory.

# UK series
uk_yearly_gdp_df = pd.read_csv("../Data/uk_yearly_gdpcapita.csv")
uk_voter_turnout_df = pd.read_csv("../Data/voter_turnout_data.csv")

# US series
us_yearly_gdp_df = pd.read_csv("../Data/usa_yearly_gdpcapita.csv")
us_poverty_df = pd.read_csv("../Data/poverty.csv")

# Global series
avg_global_gini_df = pd.read_csv("../Data/AverageGlobalGini.csv")

Correcting column names

In [14]:
# Promote the first data row of each GDP frame to the header (the code treats
# that row as the real column names), then drop it from the data.
# NOTE: this cell is not idempotent - running it a second time would promote
# the next data row to the header. Re-run the loading cell first.
uk_yearly_gdp_df.columns = uk_yearly_gdp_df.iloc[0]
us_yearly_gdp_df.columns = us_yearly_gdp_df.iloc[0]
# .copy() detaches each result from the frame it was sliced from, so the
# column assignments in later cells operate on an independent frame instead of
# a slice (which can raise SettingWithCopyWarning).
uk_yearly_gdp_df = uk_yearly_gdp_df.iloc[1:].copy()
us_yearly_gdp_df = us_yearly_gdp_df.iloc[1:].copy()

Correcting column datatypes to be the same in order to merge

In [15]:
# Cast the 'Year' merge key to int in the Gini frame and both GDP frames so
# their key columns share one dtype.
for frame in (avg_global_gini_df, uk_yearly_gdp_df, us_yearly_gdp_df):
    frame["Year"] = frame["Year"].astype(int)

Removing unnecessary columns and renaming columns so they make sense in context of merged dataframe

In [16]:
# Drop the 'Unnamed: 0' column (presumably an index column saved into the CSV)
# and give the remaining columns names that stay unambiguous after the merge.
# errors="ignore" and plain reassignment (no inplace=True) keep this cell safe
# to re-run: the original `del` raised KeyError on a second execution.
avg_global_gini_df = (
    avg_global_gini_df
    .drop(columns="Unnamed: 0", errors="ignore")
    .rename(columns={'GlobalGini': 'Average Global Gini'})
)
us_poverty_df = us_poverty_df.rename(columns={
    'Persons Below Poverty': 'Persons Below Poverty (US)',
    'Percent Below Poverty': 'Percent Below Poverty (US)',
})
# Both GDP frames use the same source column name, so tag each with its country.
uk_yearly_gdp_df = uk_yearly_gdp_df.rename(columns={'Annual Growth Rate (%)': 'UK GDP Growth Rate'})
us_yearly_gdp_df = us_yearly_gdp_df.rename(columns={'Annual Growth Rate (%)': 'US GDP Growth Rate'})

In [17]:
# Outer-join every dataset on 'Year' so that a year present in any one source
# is kept; values missing from the other sources become NaN.
interactive_df = (
    avg_global_gini_df
    .merge(uk_yearly_gdp_df, on='Year', how='outer')
    .merge(us_yearly_gdp_df, on='Year', how='outer')
    .merge(us_poverty_df, on='Year', how='outer')
    .merge(uk_voter_turnout_df, on='Year', how='outer')
)
interactive_df

Unnamed: 0,Year,Average Global Gini,UK GDP Per Capita (US $),UK GDP Growth Rate,USA GDP Per Capita (US $),US GDP Growth Rate,Persons Below Poverty (US),Percent Below Poverty (US),Voter Turnout in UK
0,1963,37.6,"$1,613",5.75%,"$3,375",4.03%,,,
1,1964,38.1,"$1,748",8.36%,"$3,574",5.91%,,,77.1
2,1965,37.5,"$1,874",7.17%,"$3,828",7.10%,,,
3,1966,37.8,"$1,987",6.04%,"$4,146",8.33%,,,75.8
4,1967,36.9,"$2,059",3.63%,"$4,336",4.59%,,,
...,...,...,...,...,...,...,...,...,...
64,1945,,,,,,,,72.8
65,1950,,,,,,,,83.9
66,1951,,,,,,,,82.6
67,1955,,,,,,,,76.8


Final clean of the data: removing dollar signs, percent signs and commas, then converting each column to a numeric datatype

In [18]:
# Strip currency/percent formatting from text columns and convert every column
# to a numeric dtype. Anything that still cannot be parsed becomes NaN.
for column in interactive_df.columns:
    values = interactive_df[column]
    # Only text columns can carry '$', ',' or '%'. Both the object dtype and the
    # dedicated string dtypes are checked so formatted text is never handed
    # straight to to_numeric (which would silently coerce it to NaN).
    is_text = (
        pd.api.types.is_object_dtype(values)
        or pd.api.types.is_string_dtype(values)
    )
    if is_text:
        stripped = values.str.replace('[$,%]', '', regex=True)
        # Coerce first, then cast: a leftover non-numeric or empty string turns
        # into NaN instead of raising, and the column is float either way.
        interactive_df[column] = pd.to_numeric(stripped, errors='coerce').astype(float)
    else:
        interactive_df[column] = pd.to_numeric(values, errors='coerce')

interactive_df

Unnamed: 0,Year,Average Global Gini,UK GDP Per Capita (US $),UK GDP Growth Rate,USA GDP Per Capita (US $),US GDP Growth Rate,Persons Below Poverty (US),Percent Below Poverty (US),Voter Turnout in UK
0,1963,37.6,1613.0,5.75,3375.0,4.03,,,
1,1964,38.1,1748.0,8.36,3574.0,5.91,,,77.1
2,1965,37.5,1874.0,7.17,3828.0,7.10,,,
3,1966,37.8,1987.0,6.04,4146.0,8.33,,,75.8
4,1967,36.9,2059.0,3.63,4336.0,4.59,,,
...,...,...,...,...,...,...,...,...,...
64,1945,,,,,,,,72.8
65,1950,,,,,,,,83.9
66,1951,,,,,,,,82.6
67,1955,,,,,,,,76.8


In [19]:
import altair as alt

Generic function for creating a chart based on two columns

In [20]:
def plot_chart(df,col_x,col_y):
    """Build a circle-mark (scatter) chart of one column against another.

    Args:
        df: Data handed unchanged to ``alt.Chart``.
        col_x: Field (Altair shorthand) encoded on the x axis.
        col_y: Field (Altair shorthand) encoded on the y axis.

    Returns:
        The Altair chart. Its tooltip lists 'Year', ``col_x`` and ``col_y``,
        without repeats; 'Year' is left out when ``df`` exposes a ``columns``
        attribute that does not contain it.
    """
    candidates = ['Year', col_x, col_y]

    # Only drop 'Year' when we can see that the data has no such column; for
    # data without a ``columns`` attribute the original behaviour is kept.
    columns = getattr(df, 'columns', None)
    if columns is not None and 'Year' not in columns:
        candidates = candidates[1:]

    # De-duplicate while preserving order, e.g. when col_x is itself 'Year'.
    tooltip_fields = []
    for field in candidates:
        if field not in tooltip_fields:
            tooltip_fields.append(field)

    chart = alt.Chart(df).mark_circle().encode(
        x=col_x,
        y=col_y,
        tooltip=tooltip_fields
    )

    return chart

In [21]:
# Scatter of USA GDP per capita against year.
plot_chart(interactive_df, col_x='Year', col_y='USA GDP Per Capita (US $)')

In [22]:
# Write the merged, cleaned frame to CSV.
# NOTE(review): to_csv writes the row index by default, which reads back as an
# 'Unnamed: 0' column; pass index=False if consumers of this file do not need
# it - confirm against whatever reads InteractiveData.csv before changing.
interactive_df.to_csv("../Data/InteractiveData.csv")