# In this notebook the region information from the regions table is joined onto the currency codes table.

In [90]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import sys
from time import time
import re
import datetime as dt

%matplotlib inline

%pylab inline
pylab.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

# Constants definition
# Directory layout (relative paths).
DATA_PATH = '../data/'
PROJECT_ROOT = '../'
SRC = PROJECT_ROOT + 'src/'

# Input file names; they are appended to DATA_PATH when the files are read.
EXCHANGES = 'filt_rates.csv'
COUNTRY_CODES = 'filt_codes.csv'
REGIONS = 'regions_raw.csv'
CPI = 'filt_cpi.csv'

# Make modules under SRC importable. Guarded so that re-running this cell
# does not keep appending the same entry to sys.path.
if SRC not in sys.path:
    sys.path.append(SRC)

Populating the interactive namespace from numpy and matplotlib
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [91]:
def _read_indexed_csv(filename):
    """Read DATA_PATH + filename, using the first column as the index."""
    return pd.read_csv(DATA_PATH + filename, index_col=0)


# Load the four input tables.
codes_df = _read_indexed_csv(COUNTRY_CODES)
rates_df = _read_indexed_csv(EXCHANGES)
cpi_df = _read_indexed_csv(CPI)
regions_df = _read_indexed_csv(REGIONS)

In [92]:
# Quick look at the raw regions table: its dimensions, then the first rows.
print(regions_df.shape)
regions_df.head()

(263, 8)


Unnamed: 0_level_0,code,Region,IncomeGroup,TableName,SpecialNotes,Unnamed: 6,Unnamed: 7,Unnamed: 8
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABW,,Latin America & Caribbean,High income,Aruba,SNA data for 2000-2011 are updated from offici...,1994-1999 from UN databases. Base year has ch...,,
AFG,,South Asia,Low income,Afghanistan,Fiscal year end: March 20; reporting period fo...,,,
AGO,,Sub-Saharan Africa,Lower middle income,Angola,,,,
ALB,,Europe & Central Asia,Upper middle income,Albania,,,,
AND,,Europe & Central Asia,High income,Andorra,WB-3 code changed from ADO to AND to align wit...,,,


In [93]:
# Keep only the regions rows that have a value in `code`; that column is
# the key used to join onto the codes table further down.
has_code = regions_df['code'].notnull()
filt_regions_df = regions_df[has_code]
print(filt_regions_df.shape)
filt_regions_df.head()

(49, 8)


Unnamed: 0_level_0,code,Region,IncomeGroup,TableName,SpecialNotes,Unnamed: 6,Unnamed: 7,Unnamed: 8
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ARE,AED,Middle East & North Africa,High income,United Arab Emirates,,,,
BGR,BGN,Europe & Central Asia,Upper middle income,Bulgaria,The new reference year for chain linked series...,,,
BRA,BRL,Latin America & Caribbean,Upper middle income,Brazil,,,,
CAN,CAD,North America,High income,Canada,Fiscal year end: March 31,reporting period for national accounts data: CY.,,
CHE,CHF,Europe & Central Asia,High income,Switzerland,,,,


In [94]:
# Codes table before the join: its dimensions, then the first rows.
print(codes_df.shape)
codes_df.head()

(51, 2)


Unnamed: 0_level_0,name,country
code,Unnamed: 1_level_1,Unnamed: 2_level_1
EUR,Euro,European Union
CYP,Cyprus Pound,Cyprus
MTL,Maltese Lira,Malta
SKK,Slovak Koruna,Slovakia
CAD,Canadian Dollar,Canada


In [95]:
# Dimensions of the table loaded from EXCHANGES.
rates_df.shape

(120, 51)

In [96]:
# Dimensions of the table loaded from CPI.
cpi_df.shape

(120, 51)

### Let's join the regions

In [97]:
# Re-key the filtered regions by `code` so they line up with the index of
# codes_df. A new frame is built (no inplace) so that re-running this cell
# does not fail on a missing 'code' column.
regions_by_code = filt_regions_df.set_index('code')

# Left join: every row of codes_df is kept; codes without a match in the
# regions table get NaN in the region columns.
n_codes_before = len(codes_df)
codes_df = codes_df.join(regions_by_code, how='left')

# Duplicate codes in the regions table would multiply rows in a left join.
assert len(codes_df) == n_codes_before, 'join changed the number of codes'

codes_df.head(10)

Unnamed: 0_level_0,name,country,Region,IncomeGroup,TableName,SpecialNotes,Unnamed: 6,Unnamed: 7,Unnamed: 8
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EUR,Euro,European Union,,,,,,,
CYP,Cyprus Pound,Cyprus,Europe & Central Asia,High income,Cyprus,A simple multiplier is used to convert the nat...,,,
MTL,Maltese Lira,Malta,Middle East & North Africa,High income,Malta,A simple multiplier is used to convert the nat...,,,
SKK,Slovak Koruna,Slovakia,,,,,,,
CAD,Canadian Dollar,Canada,North America,High income,Canada,Fiscal year end: March 31,reporting period for national accounts data: CY.,,
HKD,Hong Kong Dollar,Hong Kong,East Asia & Pacific,High income,"Hong Kong SAR, China",On 1 July 1997 China resumed its exercise of s...,,,
CZK,Czech Koruna,Czech Republic,Europe & Central Asia,High income,Czech Republic,,,,
DKK,Danish Krone,Denmark,Europe & Central Asia,High income,Denmark,,,,
ISK,Iceland Krona,Iceland,Europe & Central Asia,High income,Iceland,,,,
ILS,New Israeli Shekel,Israel,Middle East & North Africa,High income,Israel,,,,


In [98]:
# Rename the joined region columns to snake_case so they match the style of
# the existing `name` / `country` columns.
codes_df = codes_df.rename(columns={'Region': 'region',
                                    'IncomeGroup': 'income_group',
                                    'TableName': 'table_name',
                                    'SpecialNotes': 'special_notes'})

In [99]:
# Check the column names after the rename.
codes_df.head()

Unnamed: 0_level_0,name,country,region,income_group,table_name,special_notes,Unnamed: 6,Unnamed: 7,Unnamed: 8
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EUR,Euro,European Union,,,,,,,
CYP,Cyprus Pound,Cyprus,Europe & Central Asia,High income,Cyprus,A simple multiplier is used to convert the nat...,,,
MTL,Maltese Lira,Malta,Middle East & North Africa,High income,Malta,A simple multiplier is used to convert the nat...,,,
SKK,Slovak Koruna,Slovakia,,,,,,,
CAD,Canadian Dollar,Canada,North America,High income,Canada,Fiscal year end: March 31,reporting period for national accounts data: CY.,,


In [100]:
# Number of codes that did not get a region from the left join.
codes_df['region'].isnull().sum()

2

In [101]:
# List the codes whose region is still missing after the join.
region_is_missing = codes_df['region'].isnull()
codes_df.loc[region_is_missing]

Unnamed: 0_level_0,name,country,region,income_group,table_name,special_notes,Unnamed: 6,Unnamed: 7,Unnamed: 8
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EUR,Euro,European Union,,,,,,,
SKK,Slovak Koruna,Slovakia,,,,,,,


In [102]:
# Distinct region names in order of first appearance. Missing values are
# dropped explicitly rather than by slicing off the first element, which
# only works while a code with no region happens to be the first row.
regions = codes_df['region'].dropna().unique()
regions

array(['Europe & Central Asia', 'Middle East & North Africa',
       'North America', 'East Asia & Pacific', 'South Asia',
       'Sub-Saharan Africa', 'Latin America & Caribbean'], dtype=object)

In [103]:
# How many distinct regions are present.
len(regions)

7

In [105]:
# EUR and SKK are the two codes the join left without a region. Fill them
# in by hand, naming the region explicitly instead of relying on its
# position in `regions`.
EUROPE_CENTRAL_ASIA = 'Europe & Central Asia'
assert EUROPE_CENTRAL_ASIA in regions, 'unexpected region label'
codes_df.loc[['EUR', 'SKK'], 'region'] = EUROPE_CENTRAL_ASIA

In [106]:
# Show only the first rows instead of dumping the whole table; they include
# EUR and SKK, the two codes whose region was filled in by hand.
codes_df.head(10)

Unnamed: 0_level_0,name,country,region,income_group,table_name,special_notes,Unnamed: 6,Unnamed: 7,Unnamed: 8
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
EUR,Euro,European Union,Europe & Central Asia,,,,,,
CYP,Cyprus Pound,Cyprus,Europe & Central Asia,High income,Cyprus,A simple multiplier is used to convert the nat...,,,
MTL,Maltese Lira,Malta,Middle East & North Africa,High income,Malta,A simple multiplier is used to convert the nat...,,,
SKK,Slovak Koruna,Slovakia,Europe & Central Asia,,,,,,
CAD,Canadian Dollar,Canada,North America,High income,Canada,Fiscal year end: March 31,reporting period for national accounts data: CY.,,
HKD,Hong Kong Dollar,Hong Kong,East Asia & Pacific,High income,"Hong Kong SAR, China",On 1 July 1997 China resumed its exercise of s...,,,
CZK,Czech Koruna,Czech Republic,Europe & Central Asia,High income,Czech Republic,,,,
DKK,Danish Krone,Denmark,Europe & Central Asia,High income,Denmark,,,,
ISK,Iceland Krona,Iceland,Europe & Central Asia,High income,Iceland,,,,
ILS,New Israeli Shekel,Israel,Middle East & North Africa,High income,Israel,,,,


In [107]:
# Save the table with all joined columns before it is trimmed below.
codes_df.to_csv(DATA_PATH + 'codes_full.csv')

In [108]:
# Trim to the three columns that are saved as the final codes table.
# Selected by name: a positional slice would silently pick other columns
# if the column order ever changed.
codes_df = codes_df[['name', 'country', 'region']]
print(codes_df.shape)
codes_df.head()

(51, 3)


Unnamed: 0_level_0,name,country,region
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
EUR,Euro,European Union,Europe & Central Asia
CYP,Cyprus Pound,Cyprus,Europe & Central Asia
MTL,Maltese Lira,Malta,Middle East & North Africa
SKK,Slovak Koruna,Slovakia,Europe & Central Asia
CAD,Canadian Dollar,Canada,North America


In [109]:
# Write the trimmed table (now including `region`) back to the codes file.
# NOTE(review): COUNTRY_CODES is the same file this notebook reads codes_df
# from, so this overwrites the notebook's own input; a later re-run starts
# from a table that already has a `region` column. Confirm this is intended
# before relying on Restart & Run All.
codes_df.to_csv(DATA_PATH + COUNTRY_CODES)

# The preprocessing is done. Let's save all the files in a new folder.

In [110]:
PREPROCESSED = 'preprocessed_data/'
preprocessed_dir = DATA_PATH + PREPROCESSED

# to_csv does not create missing directories, so make sure the target
# folder exists before writing into it.
if not os.path.isdir(preprocessed_dir):
    os.makedirs(preprocessed_dir)

# Final outputs of the preprocessing: codes (with region), CPI and rates.
codes_df.to_csv(preprocessed_dir + 'codes.csv')
cpi_df.to_csv(preprocessed_dir + 'cpi.csv')
rates_df.to_csv(preprocessed_dir + 'rates.csv')