In [1]:
import pandas as pd
import pycountry
import json

## Clean population and GDP data

In [2]:
def last_known_values(raw_df, value_name):
    """Return the most recent non-missing numeric value for every row.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Wide table with a "Country Code" column plus numeric value columns.
        Columns are scanned right to left, so the right-most numeric column
        is treated as the latest one (assumes columns are ordered oldest to
        newest - TODO confirm against the raw files).
    value_name : str
        Name of the value column in the returned frame (e.g. "population").

    Returns
    -------
    pd.DataFrame
        Columns "country" (taken from "Country Code") and ``value_name``,
        with a fresh 0..n-1 index. Rows that have no numeric value at all
        are dropped.

    Raises
    ------
    KeyError
        If ``raw_df`` has no "Country Code" column.
    """
    # Restrict to numeric columns so identifier strings (names, codes) can
    # never be mistaken for a measurement, and integer-typed columns are
    # accepted just like float ones.
    numeric_cols = raw_df.select_dtypes(include="number")
    if numeric_cols.shape[1] == 0:
        latest = pd.Series(float("nan"), index=raw_df.index)
    else:
        # Forward-filling along each row carries the last known value into
        # the right-most column, which is then read off in one step.
        latest = numeric_cols.ffill(axis=1).iloc[:, -1]

    clean_df = pd.DataFrame({
        "country": raw_df["Country Code"],
        value_name: latest,
    })
    # Rows without any known value are dropped instead of inheriting a value
    # left over from a previously processed row.
    return clean_df.dropna(subset=[value_name]).reset_index(drop=True)


# population data
pop_df_raw = pd.read_csv("population_data/API_SP.POP/API_SP.POP.csv")
pop_df = last_known_values(pop_df_raw, "population")
pop_df.to_csv("population_data/pop_clean.csv")

# gdp data
gdp_df_raw = pd.read_csv("population_data/API_NY.GDP/API_NY.GDP.csv")
gdp_df = last_known_values(gdp_df_raw, "gdp")
gdp_df.to_csv("population_data/gdp_clean.csv")

## Add population and GDP data to the LCC data

In [6]:
# Load the per-country stats, keyed by alpha-2 code.
with open("stats/lcc_country.json", "r", encoding="utf8") as in_file:
    countries = json.load(in_file)

# Build alpha-3 -> value lookups once instead of scanning both frames for
# every country. drop_duplicates keeps the first row per code, matching a
# "first match wins" lookup, and to_dict() yields plain Python numbers that
# json.dump can always serialise.
population_by_alpha_3 = (
    pop_df.drop_duplicates(subset="country")
    .set_index("country")["population"]
    .to_dict()
)
gdp_by_alpha_3 = (
    gdp_df.drop_duplicates(subset="country")
    .set_index("country")["gdp"]
    .to_dict()
)

for alpha_2, data in countries.items():
    # Keys that pycountry cannot resolve as alpha-2 codes (e.g. a non-country
    # bucket such as "INTERNATIONAL") are left without population/gdp.
    country = pycountry.countries.get(alpha_2=alpha_2)
    if country is None:
        continue
    alpha_3 = country.alpha_3
    # Only set a field when a value is known; missing ones stay absent.
    if alpha_3 in population_by_alpha_3:
        data["population"] = population_by_alpha_3[alpha_3]
    if alpha_3 in gdp_by_alpha_3:
        data["gdp"] = gdp_by_alpha_3[alpha_3]

with open("stats/lcc_pop_gdp_country.json", "w", encoding="utf8") as out_file:
    json.dump(countries, out_file)

# Tabular view of the enriched stats: one row per key of `countries`.
# Rows with a zero `count` are removed, as are rows where both population
# and gdp are missing. The index is left unnamed, so the first header cell
# of the exported CSV is empty.
country_df = (
    pd.DataFrame.from_dict(countries, orient="index")
    .query("count != 0")
    .dropna(subset=["population", "gdp"], how="all")
)
country_df.to_csv("stats/lcc_pop_gdp_country.csv")
country_df

Unnamed: 0,ds_nodes,largest_cc_size,largest_cc_coverage,count,population,gdp
CA,1112.370833,1552.127083,0.764373,480,37589262.0,1.736426e+12
GB,2674.062500,3638.099085,0.813568,656,66834405.0,2.829108e+12
HK,1539.268750,2453.662500,0.590035,160,7507400.0,3.657115e+11
CO,6355.829787,5791.212766,0.615037,47,50339443.0,3.236160e+11
US,2533.549163,3650.748745,0.762718,4780,328239523.0,2.143323e+13
...,...,...,...,...,...,...
WS,67.000000,83.000000,0.945878,2,197097.0,8.522502e+08
VU,107.666667,43.333333,0.645401,3,299882.0,9.342400e+08
PW,176.000000,178.500000,0.989645,2,18008.0,2.683549e+08
CU,4196.000000,3072.000000,0.560482,1,11333483.0,1.000230e+11


In [4]:
# Spot check: look up a single alpha-2 code ("MF") and display the record
# pycountry returns for it, including its alpha-3 code.
country = pycountry.countries.get(alpha_2="MF")
country

Country(alpha_2='MF', alpha_3='MAF', name='Saint Martin (French part)', numeric='663')