In [130]:
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

In [124]:
# Dataset taken from http://data.un.org/Data.aspx?d=ComTrade&f=_l1Code%3a50

# Load the printed-goods trade records and collapse them to a single USD total
# per (country, year).
printed_trade = (
	pd.read_csv('data/printed_goods_world_trade.csv')
	.drop(columns=['Weight (kg)'])
	.rename(columns={
		'Country or Area': 'country',
		'Trade (USD)': 'printed_trade_usd'
	})
	.dropna(subset=['printed_trade_usd'])
	.rename(columns=str.lower)
	# Use the same country label that the later "United States" queries expect.
	.replace(to_replace='USA', value='United States')
	# The string alias 'sum' is equivalent to passing np.sum but avoids the
	# FutureWarning newer pandas releases emit for NumPy callables.
	.pivot_table(
		index=['country', 'year'],
		values='printed_trade_usd',
		aggfunc='sum'
	)
)

printed_trade.info()

# Only the last expression of a cell is rendered, so the spot check goes last.
printed_trade.query('country == "United States"')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3068 entries, ('Albania', 1996) to ('Zimbabwe', 2011)
Data columns (total 1 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   printed_trade_usd  3068 non-null   float64
dtypes: float64(1)
memory usage: 34.9+ KB


Unnamed: 0_level_0,Unnamed: 1_level_0,printed_trade_usd
country,year,Unnamed: 2_level_1
United States,1991,5439473000.0
United States,1992,5769179000.0
United States,1993,6130243000.0
United States,1994,6245540000.0
United States,1995,6847231000.0
United States,1996,6935208000.0
United States,1997,7313102000.0
United States,1998,7602182000.0
United States,1999,7758473000.0
United States,2000,8184302000.0


In [116]:
def round_2(n: 'str | float') -> 'float | None':
	"""Parse a raw field as a float rounded to two decimal places.

	Intended as a ``converters=`` callback for ``pd.read_csv``, which passes
	each field of the column as a string.

	Parameters
	----------
	n : str or float
		Raw field value.

	Returns
	-------
	float or None
		The parsed value rounded to 2 decimals. ``None`` when the field is the
		empty string, or when it cannot be converted (in which case a
		diagnostic line is printed as well).
	"""
	if n == '':
		return None
	try:
		return float(np.format_float_positional(float(n), precision=2))
	except (TypeError, ValueError):
		# Only conversion failures are expected here; anything else should
		# surface instead of being silently turned into a missing value.
		print(f'Unable to convert {n} to float')
		return None

# Load GDP per capita. `round_2` converts every raw 'Value' field while the
# file is parsed; rows it could not convert come back empty and are dropped.
gdp = (
    pd.read_csv('data/gdp_per_capita.csv', converters={'Value': round_2})
    .drop(columns=['Value Footnotes'])
    .dropna(subset=['Value'])
    .rename(columns={'Country or Area': 'Country', 'Value': 'GDP_Per_Capita'})
    .rename(columns=str.lower)
    .astype({'year': int})
)

gdp.info()
gdp.head()

gdp.query('country == "United States"')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6868 entries, 0 to 6867
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   country         6868 non-null   object 
 1   year            6868 non-null   int64  
 2   gdp_per_capita  6868 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 214.6+ KB


Unnamed: 0,country,year,gdp_per_capita
6572,United States,2019,62530.39
6573,United States,2018,61498.37
6574,United States,2017,60062.22
6575,United States,2016,59043.21
6576,United States,2015,58509.58
6577,United States,2014,57273.13
6578,United States,2013,56269.96
6579,United States,2012,55632.93
6580,United States,2011,54806.12
6581,United States,2010,54359.13


In [117]:
def is_year_filter(obj):
	"""Return ``obj`` unchanged if it parses as a ``'%Y'`` year, else ``None``.

	Intended as a ``converters=`` callback for ``pd.read_csv`` so that rows
	whose year field is not a year (e.g. ``''`` or free text) end up as
	missing values.

	Parameters
	----------
	obj : str
		Raw field value, e.g. ``'2014'``.

	Returns
	-------
	str or None
		The original string when ``datetime.strptime(obj, '%Y')`` succeeds,
		otherwise ``None`` (this includes non-string input).
	"""
	try:
		datetime.strptime(obj, '%Y')
	except (TypeError, ValueError):
		# ValueError: text is not a year; TypeError: obj is not a string.
		return None
	return obj

def is_numeric_filter(obj):
	"""Return ``obj`` unchanged if it is a finite number, else ``None``.

	Intended as a ``converters=`` callback for ``pd.read_csv`` so that
	non-numeric fields end up as missing values.

	Parameters
	----------
	obj : str or number
		Raw field value, e.g. ``'6.39'``.

	Returns
	-------
	same type as ``obj``, or None
		The original value when ``float(obj)`` succeeds and is finite,
		otherwise ``None``.
	"""
	try:
		value = float(obj)
	except (TypeError, ValueError, OverflowError):
		return None
	# float() also accepts 'nan' and 'inf'. Passing those through would let
	# them slip past a later dropna() and reappear as NaN/inf once the column
	# is cast to float, so treat them as missing here.
	return obj if np.isfinite(value) else None

# Load internet-usage percentages. The two converters blank out any field that
# is not a year / not a number, so the dropna below discards those rows before
# the columns are cast to numeric dtypes.
internet = (
	pd.read_csv(
		'data/internet_use.csv',
		converters={'Year': is_year_filter, 'Value': is_numeric_filter},
	)
	.dropna(subset=['Year', 'Value'])
	.drop(columns=['Value Footnotes'])
	.rename(columns={'Country or Area': 'Country', 'Value': 'internet_use_pct'})
	.rename(columns=str.lower)
	.astype({'year': int, 'internet_use_pct': float})
)

internet.info()
internet.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4495 entries, 0 to 4494
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   country           4495 non-null   object 
 1   year              4495 non-null   int64  
 2   internet_use_pct  4495 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 140.5+ KB


Unnamed: 0,country,year,internet_use_pct
0,Afghanistan,2014,6.39
1,Afghanistan,2013,5.9
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.0
4,Afghanistan,2010,4.0


In [125]:
# Combine the three datasets on (country, year). Both merges use the default
# inner join, so only country-years present in all three sources remain.
# `printed_trade` carries country/year as index levels; `on=` accepts index
# level names as well as column names.
# No non-key column names overlap between the frames, so no `suffixes` are
# needed, and every column is already lowercase.
gdp_internet_printed = (
	gdp
	.merge(internet, on=['country', 'year'])
	.merge(printed_trade, on=['country', 'year'])
	.set_index(['country', 'year'])
)

gdp_internet_printed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,gdp_per_capita,internet_use_pct,printed_trade_usd
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Albania,2012,11263.85,54.655959,21113586.0
Albania,2011,11088.09,49.0,19920103.0
Albania,2010,10783.82,45.0,23086167.0
Albania,2009,10346.86,41.2,17780646.0
Albania,2008,9944.23,23.86,16706161.0


In [139]:
# Use a white figure background for the plots that follow.
plt.rcParams['figure.facecolor'] = 'white'

# Select the United States rows through the first ('country') level of the
# (country, year) index; the resulting frame is indexed by year.
usa = gdp_internet_printed.loc['United States']