In [None]:
#Questions 3 and 4

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from fuzzywuzzy import fuzz
import plotly.express as px
import warnings; warnings.simplefilter('ignore')

In [None]:
# Question 5(1)

# Read the raw GDP-per-capita file. skipfooter drops the last two lines of the
# file and is only supported by the python parser engine.
gdp_df = pd.read_csv(
    '../data/gdp_per_capita.csv',
    engine = 'python',
    skipfooter = 2,
)
# Show the first ten rows.
gdp_df.iloc[:10]

In [None]:
# Question 5(2)

# Show the last five rows.
gdp_df.iloc[-5:]

In [None]:
# Question 6

# Remove the footnotes column, then relabel the three remaining columns
# (the new labels are applied by position).
gdp_df = gdp_df.drop("Value Footnotes", axis = 1)
gdp_df = gdp_df.set_axis(['Country', 'Year', 'GDP_Per_Capita'], axis = 1)

In [None]:
# Question 7

# Print the column dtypes and non-null counts of the cleaned GDP frame.
gdp_df.info()

In [None]:
# Question 8

# Number of observations per year, listed in chronological order.
observations_per_year = gdp_df['Year'].value_counts(sort = False)
observations_per_year.sort_index()

The data only cover years from 1990 onward. More recent years have more observations, possibly because reporting has improved over time.

In [None]:
# Question 9(1)

# Number of distinct values in the Country column (a missing value, if any,
# counts as one distinct value).
gdp_df['Country'].nunique(dropna = False)

In [None]:
# Question 9(2)

# Number of rows (yearly observations) per country, most frequent first.
gdp_df['Country'].value_counts()

The least represented countries have very small populations and thus fewer workable samples. Those countries are also relatively remote.

In [None]:
# Question 10

# Keep only the rows recorded for the year 2014.
is_2014 = gdp_df['Year'].eq(2014)
gdp_2014 = gdp_df.loc[is_2014]
gdp_2014

In [None]:
# Question 11

# Summary statistics (count, mean, std, quartiles, min/max) for the 2014 subset.
gdp_2014.describe()

In [None]:
# Question 12

# Histogram of GDP per capita for 2014 only. Plotting every year of gdp_df
# would count each country once per reported year, whereas the surrounding
# analysis (Questions 10-13) describes the per-country 2014 distribution.
plt.hist(gdp_2014['GDP_Per_Capita'], bins = 20)
plt.xlabel('GDP per capita (2014)')
plt.ylabel('Number of countries');

Extremely right-skewed. The vast majority of the countries represented are poor, with outlier ("developed") countries skewing the data. The median is probably more representative than the mean with respect to typical country GDP per capita.

In [None]:
# Question 13(1)

# The five 2014 rows with the highest GDP per capita.
gdp_2014.nlargest(5, columns = 'GDP_Per_Capita')

In [None]:
# Question 13(2)

# The five 2014 rows with the lowest GDP per capita.
gdp_2014.nsmallest(5, columns = 'GDP_Per_Capita')

In [None]:
# Question 14

# Reshape the 1990 and 2017 observations to one row per country with one
# column per year; countries missing either year are dropped.
endpoint_rows = gdp_df.loc[gdp_df['Year'].isin([1990, 2017])]
gdp_pivoted = endpoint_rows.pivot_table(index = ['Country'], columns = ['Year'])
gdp_pivoted = gdp_pivoted.dropna()

In [None]:
# Question 15

# Percent change in GDP per capita from 1990 to 2017, relative to 1990.
gdp_1990 = gdp_pivoted[('GDP_Per_Capita', 1990)]
gdp_2017 = gdp_pivoted[('GDP_Per_Capita', 2017)]
gdp_pivoted['Percent_Change'] = 100 * (gdp_2017 - gdp_1990) / gdp_1990

In [None]:
# Question 16

# Number of countries whose GDP per capita fell between 1990 and 2017.
# Summing the boolean mask counts the matching rows directly; going through
# DataFrame.value_counts() would silently skip any row containing a NaN.
experienced_decline = gdp_pivoted['Percent_Change'] < 0
experienced_decline.sum()

In [None]:
# Question 17 (includes bonus)

# Rank countries by percent change, largest increase first, and pick the top two.
gdp_pivoted_sorted = gdp_pivoted.sort_values('Percent_Change', ascending = False)
top_change = str(gdp_pivoted_sorted.index[0])
second_top_change = str(gdp_pivoted_sorted.index[1])

# Yearly GDP per capita for each of the two countries, for years before 2018.
before_2018 = gdp_df['Year'] < 2018
top_change_values = gdp_df.loc[(gdp_df['Country'] == top_change) & before_2018, ['Year','GDP_Per_Capita']]
second_top_change_values = gdp_df.loc[(gdp_df['Country'] == second_top_change) & before_2018, ['Year','GDP_Per_Capita']]

# Label each line with the computed country name so the legend always matches
# the data, instead of relying on hard-coded names.
plt.plot(top_change_values['Year'], top_change_values['GDP_Per_Capita'], color = 'red', label = top_change)
plt.plot(second_top_change_values['Year'], second_top_change_values['GDP_Per_Capita'], color = 'blue', label = second_top_change)
plt.xlabel('Year')
plt.ylabel('GDP per capita')
plt.legend();

Equatorial Guinea's GDP per capita rose sharply starting around 1995 but has declined sharply since the Great Recession (~2007). While China's GDP per capita is lower overall, its growth has been steadier and more consistent over time.

In [None]:
# Question 18

# Load the continents table (presumably one row per country with its
# continent -- confirm against the CSV); it is merged into gdp_df next.
continents = pd.read_csv('../data/continents.csv')

In [None]:
# Question 19

# Inner-join the continents table onto the GDP data using the columns the two
# frames have in common.
gdp_df = gdp_df.merge(continents)

In [None]:
# Question 20

# One row per country, then count the countries in each continent and draw the
# counts as a bar chart.
no_dupe_countries = gdp_df.drop_duplicates(subset = 'Country')
countries_per_continent = no_dupe_countries.groupby('Continent')['Country'].count()
countries_per_continent.plot.bar(colormap = "Accent");

In [None]:
# Question 21

# Box plot of 2014 GDP per capita by continent.
# Filter to 2014 once and hand seaborn a single frame: passing the full
# Continent column as x next to a 2014-only y Series only lines up through
# implicit index alignment, which is easy to break.
gdp_2014_by_continent = gdp_df.loc[gdp_df['Year'] == 2014]
fig, ax = plt.subplots(figsize = (15, 12))
sns.boxplot(data = gdp_2014_by_continent,
            x = 'Continent',
            y = 'GDP_Per_Capita',
            # Keep the continent order of the full data set (order of first appearance).
            order = list(gdp_df['Continent'].dropna().unique()),
            palette = 'Pastel2',
            ax = ax);

The 2014 GDPs per capita for countries in Europe were generally the highest among the continents (highest median and IQR). Those GDPs per capita were also roughly symmetrically distributed (a box plot alone cannot establish normality). The 2014 GDPs per capita for countries in Asia and North America had the greatest amount of variability, with a number of outliers on the high end. The 2014 GDPs per capita for countries in Africa were the lowest (lowest median and IQR) and had the least amount of variability.

In [None]:
# Question 22

# Read the raw internet-use file. Malformed rows are skipped rather than
# raising, and the last 181 lines of the file are dropped (presumably footnote
# rows -- confirm against the raw CSV). skipfooter needs the python engine.
internet_df = pd.read_csv(
    '../data/internet_use.csv',
    engine = 'python',
    skipfooter = 181,
    on_bad_lines = 'skip',
)
internet_df

In [None]:
# Question 23

# Remove the footnotes column, then relabel the three remaining columns
# (the new labels are applied by position).
internet_df = internet_df.drop("Value Footnotes", axis = 1)
internet_df = internet_df.set_axis(['Country', 'Year', 'Internet_Use_Pct'], axis = 1)

In [None]:
# Question 24

# astype returns a new Series rather than converting in place, so the result
# must be assigned back for the column dtype to actually change.
internet_df['Internet_Use_Pct'] = internet_df['Internet_Use_Pct'].astype(float)
internet_df.info()

In [None]:
# Question 25(1)

# Rows with a non-zero internet-use percentage, ordered by year; the first row
# then holds the earliest such year.
nonzero_use = internet_df.loc[internet_df['Internet_Use_Pct'].ne(0)]
used_years = nonzero_use.sort_values('Year').reset_index(drop = True)
used_years['Year'].iloc[0]

In [None]:
# Question 25(2)

# Strip plot of internet-use percentage for every year with non-zero usage.
sns.stripplot(x = used_years['Year'], y = used_years['Internet_Use_Pct'])
# Widen the current figure so the per-year columns do not overlap.
fig = plt.gcf()
fig.set_size_inches(20, 6)

In [None]:
# Questions 26 and 27 (bonus included)

# Clean-up function definition

def Country_Swap(target, acquirer):
    """Harmonize country names in two dataframes so they merge on 'Country'.

    Works in two passes:

    1. A fixed list of literal substring replacements (e.g. 'Republic' ->
       'Rep.') is applied to the 'Country' column of both dataframes.
    2. Each name in ``target`` that still has no exact counterpart in
       ``acquirer`` is compared with every ``acquirer`` name using three
       fuzzywuzzy scorers; the first sufficiently similar ``acquirer`` name
       replaces it in ``target``.

    Parameters
    ----------
    target : pandas.DataFrame
        Frame with a 'Country' column whose values are rewritten to match
        the spellings used in ``acquirer``.
    acquirer : pandas.DataFrame
        Frame with a 'Country' column that provides the reference spellings.

    Returns
    -------
    tuple
        ``(target, acquirer)``: the same objects that were passed in, with
        their 'Country' columns reassigned.
    """
    # (old, new) pairs, applied in this order. Order matters for some entries:
    # 'Republic' -> 'Rep.' runs before 'Syrian Arab Rep.' -> 'Syria', so
    # 'Syrian Arab Republic' also ends up as 'Syria'.
    swaps = [
        ('Korea (Rep. of)', 'Korea'),
        ('Democratic', 'Dem.'),
        ('Republic', 'Rep.'),
        ('Czechia', 'Czech Rep.'),
        ('Slovakia', 'Slovak Rep.'),
        ('West Bank and Gaza', 'Palestine'),
        ('Kyrgyzstan', 'Kyrgyz Rep.'),
        ('T.F.Y.R. Macedonia', 'North Macedonia'),
        ('Brunei Darussalam', 'Brunei'),
        ('Syrian Arab Rep.', 'Syria'),
    ]
    for old, new in swaps:
        # regex=False: several patterns contain regex metacharacters
        # ('(', ')', '.'); treated as a regex, 'Korea (Rep. of)' would never
        # match its own literal text.
        acquirer['Country'] = acquirer['Country'].str.replace(old, new, regex = False)
        target['Country'] = target['Country'].str.replace(old, new, regex = False)

    # Names that are never fuzzy-matched, in either role (presumably because
    # they are easily confused with another country -- e.g. Australia/Austria).
    banned_list = ['Australia', 'Austria', 'Iceland', 'Ireland', 'Congo', 'China']
    target_list = list(target['Country'].unique())
    acquirer_list = list(acquirer['Country'].unique())
    acquirer_names = set(acquirer_list)

    for name in target_list:
        # Exact matches need no work; banned names are left untouched.
        if name in acquirer_names or name in banned_list:
            continue
        for candidate in acquirer_list:
            if candidate in banned_list:
                continue
            is_match = (
                fuzz.ratio(name, candidate) >= 80
                or fuzz.partial_ratio(name, candidate) >= 90
                or fuzz.token_set_ratio(name, candidate) >= 90
            )
            if is_match:
                target['Country'] = target['Country'].replace(to_replace = name, value = candidate)
                # Once replaced, `name` no longer occurs in target, so any
                # further candidate would be a no-op: stop at the first match.
                break
    return target, acquirer

In [None]:
# Performing the clean-up and merging

# Align the country spellings of the two frames, then inner-join them on the
# columns they share.
internet_df, gdp_df = Country_Swap(target = internet_df, acquirer = gdp_df)
gdp_and_internet_use = gdp_df.merge(internet_df, how = 'inner')

In [None]:
# Question 28

# 2014 rows of the merged data. Take an explicit copy: a later cell adds a
# column to this frame, and writing to a filtered slice of
# gdp_and_internet_use is a chained assignment (SettingWithCopyWarning).
gdp_and_internet_use_2014 = gdp_and_internet_use.loc[gdp_and_internet_use['Year'] == 2014].copy()

# Number of countries with more than 90% internet use in 2014.
over_90_pct = gdp_and_internet_use_2014['Internet_Use_Pct'] > 90
gdp_and_internet_use_2014.loc[over_90_pct, 'Country'].count()

In [None]:
# Question 29(1)

# Index labels of the three 2014 rows with the highest GDP per capita, and the
# country names found at those labels.
criteria_one = gdp_and_internet_use_2014['GDP_Per_Capita'].nlargest(3).index
three_largest = gdp_and_internet_use_2014.loc[criteria_one, 'Country'].tolist()
three_largest

In [None]:
# Question 29(2)

# All years of internet-use data for the three highest-GDP countries.
criteria_two = gdp_and_internet_use['Country'].isin(three_largest)
slimmed_df_three_largest = gdp_and_internet_use.loc[criteria_two, ['Country', 'Year', 'Internet_Use_Pct']]

# One regression panel per country, with a shared percentage axis.
graphs = sns.FacetGrid(data = slimmed_df_three_largest, col = 'Country')
graphs.map(sns.regplot, "Year", "Internet_Use_Pct")
graphs.set(ylim = (-5, 105))
fig = plt.gcf()
fig.set_size_inches(20, 6)

There is no data for Qatar before 2000: it's unclear whether that indicates that the population in Qatar simply didn't have access to the internet or just a limitation on the data. Otherwise, the internet use percentage has increased about the same amount for each country--quickly approaching 100%.

In [None]:
# Question 30

# 2014 internet use against GDP per capita, coloured by continent.
sns.scatterplot(data = gdp_and_internet_use_2014,
                x = 'GDP_Per_Capita',
                y = 'Internet_Use_Pct',
                hue = 'Continent');

Internet use percentage quickly approaches 100% as GDP per capita for a country increases.

In [None]:
# Question 31

# Pearson correlation between 2014 GDP per capita and internet use: the
# off-diagonal entry of the 2x2 correlation matrix.
corr_matrix = np.corrcoef(gdp_and_internet_use_2014['GDP_Per_Capita'],
                          gdp_and_internet_use_2014['Internet_Use_Pct'])
corr_matrix[0, 1]

Increases in a country's GDP per capita are strongly and positively correlated with increases in the internet use percentage of that country's population.

In [None]:
# Question 32

# Add the natural log of GDP per capita. assign() builds a new frame instead
# of writing a column into what may be a filtered slice of
# gdp_and_internet_use (which would trigger SettingWithCopyWarning).
gdp_and_internet_use_2014 = gdp_and_internet_use_2014.assign(
    GDP_Per_Capita_Log = np.log(gdp_and_internet_use_2014['GDP_Per_Capita'])
)
# Pearson correlation between log GDP per capita and internet use.
np.corrcoef(gdp_and_internet_use_2014['GDP_Per_Capita_Log'],
            gdp_and_internet_use_2014['Internet_Use_Pct'])[0, 1]

The correlation is even stronger than what was shown previously (i.e., without log).

In [None]:
# Question 33

# Rows whose country name contains 'United States', ordered by year.
# NOTE(review): filter(like=...) is a substring match, so any other country
# whose name contains 'United States' would be included too -- confirm.
rows_by_country = gdp_and_internet_use.set_index('Country')
gaiu_us_only = rows_by_country.filter(like = 'United States', axis = 0).sort_values('Year')
print(gaiu_us_only)
# Pearson correlation between GDP per capita and internet use over those rows.
np.corrcoef(gaiu_us_only['GDP_Per_Capita'], gaiu_us_only['Internet_Use_Pct'])[0, 1]

No, as changes to GDP per capita in the U.S. during this time have been marginal (likely due to inflation), and increase in internet usage is more likely due to its increased availability over time (more buildout of networks, e.g.).

In [None]:
# Writing csv files to data folder.

# Persist the GDP frame (as it stands after the continent merge and the
# country-name clean-up) and the merged GDP/internet frame.
# index = False leaves the row index out of the files.
gdp_df.to_csv('../data/gdp_df_to_be_imported_and_merged.csv', index = False)
gdp_and_internet_use.to_csv('../data/gdp_and_internet_use_to_be_imported_and_merged.csv', index = False)