## Merge and Tidy up World Datasets

This notebook briefly inspects and tidies up the dataset of country statistics for 2006 retrieved from the [World Bank's DataBank of World Development Indicators](https://databank.worldbank.org/reports.aspx?source=world-development-indicators#) and combines that data with a country region dataset found [here](https://github.com/lukes/ISO-3166-Countries-with-Regional-Codes/blob/master/all/all.csv). The combined dataset is written to a CSV file for insertion into the Neo4j database.

In [1]:
from pathlib import Path

import pandas as pd

# Folder (relative to the notebook) that holds the input CSVs and receives the exports.
DATA_PATH = Path("../data")

# Country/region lookup table: only the `name` and `region` columns are read.
df = pd.read_csv(
    DATA_PATH.joinpath("continent_metadata.csv"),
    usecols=["name", "region"],
)
# World Bank national statistics, read with pandas' default parsing.
dt = pd.read_csv(DATA_PATH.joinpath("world_bank_national_data.csv"))
dt.head()

Unnamed: 0,Time,Time Code,Country Name,Country Code,GDP per capita (current US$) [NY.GDP.PCAP.CD],Population density (people per sq. km of land area) [EN.POP.DNST],"Population, total [SP.POP.TOTL]","Literacy rate, adult total (% of people ages 15 and above) [SE.ADT.LITR.ZS]",Access to electricity (% of population) [EG.ELC.ACCS.ZS]
0,2006,YR2006,Afghanistan,AFG,263.733601866274,40.5272035938243,26433058,..,30.7186908721924
1,2006,YR2006,Albania,ALB,2972.74292399799,109.21704379562,2992547,..,100
2,2006,YR2006,Algeria,DZA,3478.71000237566,14.1245505386818,33641007,72.648681640625,98.869010925293
3,2006,YR2006,American Samoa,ASM,8340.52343974691,295.545,59109,..,..
4,2006,YR2006,Andorra,AND,42674.7589681092,172.329787234043,80995,..,100


In [2]:
# Maps the verbose World Bank column headers to short snake_case names.
RENAME_COLUMNS = {
    "Country Name": "name",
    "GDP per capita (current US$) [NY.GDP.PCAP.CD]": "gdp_per_capita",
    "Population density (people per sq. km of land area) [EN.POP.DNST]": "population_density",
    "Population, total [SP.POP.TOTL]": "population",
    "Literacy rate, adult total (% of people ages 15 and above) [SE.ADT.LITR.ZS]": "literacy_rate",
    "Access to electricity (% of population) [EG.ELC.ACCS.ZS]": "access_to_electricity",
}

# Rebind `dt` instead of mutating it in place, and tolerate labels that are already
# gone: re-running this cell on an already tidied frame is then a no-op rather than
# a KeyError raised by `drop`.
dt = dt.rename(columns=RENAME_COLUMNS).drop(
    columns=["Country Code", "Time", "Time Code"],
    errors="ignore",
)

In [3]:
# Number of rows in `dt`.
dt.shape[0]

217

In [4]:
# Left join on `name`: every row of `dt` is kept, and rows whose name has no
# counterpart in `df` end up with a missing `region`.
dx = dt.merge(right=df, how="left", on="name")

In [6]:
# Rows of the merged frame whose `region` is missing.
dx[dx["region"].isna()]

Unnamed: 0,name,gdp_per_capita,population_density,population,literacy_rate,access_to_electricity,region
13,"Bahamas, The",30713.7980618188,33.0701298701299,331032,..,100,
24,Bosnia and Herzegovina,3416.51237636652,73.5433984375,3765422,..,99.3872756958008,
27,British Virgin Islands,..,160.146666666667,24022,..,98.0067291259766,
39,Channel Islands,63244.925375939,..,152999,..,100,
44,Democratic Republic of the Congo,255.433043197206,24.9566820317152,56578046,..,10.2929649353027,
45,Republic of the Congo,2155.40635716309,10.9667437774524,3745143,..,34.86474609375,
47,Cote d'Ivoire,1347.98882431501,58.9777169811321,18754914,..,60.1710510253906,
50,Curacao,..,..,141239,..,100,
52,Czechia,15261.7975911138,132.542459546926,10238905,..,100,
71,"Gambia, The",662.362877179135,157.257312252964,1591444,..,38.0789451599121,


In [7]:
# Count the missing regions directly from the boolean mask instead of slicing the frame.
print(f"Number of countries that do not have an assigned region: {dx['region'].isna().sum()}")

Number of countries that do not have an assigned region: 28


In [8]:
# Write the merged table (index omitted) so the country names can be cleaned up.
# NOTE(review): a later cell reads and rewrites this same file, so re-running this
# cell would replace any edits made to it in between - confirm the intended workflow.
dx.to_csv(DATA_PATH.joinpath("world_economic_data.csv"), index=False)

In [10]:
# Load the world economic data CSV back into `df` (replacing the region lookup it
# held before) and preview it.
# NOTE(review): presumably the file was hand-edited to fix country names between the
# export and this load - confirm.
df = pd.read_csv(DATA_PATH.joinpath("world_economic_data.csv"))
df.head()

Unnamed: 0,name,gdp_per_capita,population_density,population,literacy_rate,access_to_electricity,region
0,Afghanistan,263.733601866274,40.5272035938243,26433058,..,30.7186908721924,Asia
1,Albania,2972.74292399799,109.21704379562,2992547,..,100,Europe
2,Algeria,3478.71000237566,14.1245505386818,33641007,72.648681640625,98.869010925293,Africa
3,American Samoa,8340.52343974691,295.545,59109,..,..,Oceania
4,Andorra,42674.7589681092,172.329787234043,80995,..,100,Europe


In [11]:
# Normalise the country names with `str.casefold`.
df = df.assign(name=df["name"].str.casefold())

In [13]:
# Overwrite the world economic data CSV with the casefolded names (index omitted).
df.to_csv(DATA_PATH.joinpath("world_economic_data.csv"), index=False)

In [18]:
# `df` is rebound here: from this cell on it holds the Sakila customers table.
df = pd.read_csv(DATA_PATH.joinpath("sakila_customers.csv"))
df.head()

Unnamed: 0,id,country
0,218,Afghanistan
1,441,Algeria
2,69,Algeria
3,176,Algeria
4,320,American Samoa


In [19]:
# Normalise the customers' country names with `str.casefold`, then preview the result.
df = df.assign(country=df["country"].str.casefold())
df.head()

Unnamed: 0,id,country
0,218,afghanistan
1,441,algeria
2,69,algeria
3,176,algeria
4,320,american samoa


In [16]:
# Distinct country values present in the customers table.
df["country"].unique()

array(['afghanistan', 'algeria', 'american samoa', 'angola', 'anguilla',
       'argentina', 'armenia', 'austria', 'azerbaijan', 'bahrain',
       'bangladesh', 'belarus', 'bolivia', 'brazil', 'brunei', 'bulgaria',
       'cambodia', 'cameroon', 'canada', 'chad', 'chile', 'china',
       'colombia', 'congo, the democratic republic of the',
       'czech republic', 'dominican republic', 'ecuador', 'egypt',
       'estonia', 'ethiopia', 'faroe islands', 'finland', 'france',
       'french guiana', 'french polynesia', 'gambia', 'germany', 'greece',
       'greenland', 'holy see (vatican city state)', 'hong kong',
       'hungary', 'india', 'indonesia', 'iran', 'iraq', 'israel', 'italy',
       'japan', 'kazakstan', 'kenya', 'kuwait', 'latvia', 'liechtenstein',
       'lithuania', 'madagascar', 'malawi', 'malaysia', 'mexico',
       'moldova', 'morocco', 'mozambique', 'myanmar', 'nauru', 'nepal',
       'netherlands', 'new zealand', 'nigeria', 'north korea', 'oman',
       'pakistan', 'par

In [20]:
# Write the casefolded customers table. `index=False` keeps the positional index out
# of the file (it would otherwise be written as an extra unnamed first column),
# matching the other CSV exports in this notebook.
df.to_csv(DATA_PATH / "sakila_customers_out.csv", index=False)