In [14]:
# Here we create the master table to the regression analysis
# The objective is to have all complexity measures, all the controls, and all the targets we're interested to predict
import glob
import pandas as pd

In [15]:
# We start from the target variables, GDP PC (+PPP)
# The data is in constant USD and comes from the World Bank
# This is the "test" data, spanning from 2014 to 2017 (the training was from 2011 to 2014)
gdppc = pd.read_csv("../../data/raw/gdppc.csv")
gdppcppp = pd.read_csv("../../data/raw/gdppcppp.csv")

gdppc = gdppc[["exporter", "2014", "2017"]].dropna()
gdppc.columns = ("exporter", "gdppc_start", "gdppc_end")
gdppcppp = gdppcppp[["exporter", "2014", "2017"]].dropna()
gdppcppp.columns = ("exporter", "gdppcppp_start", "gdppcppp_end")

# df will contain our final output table
df = gdppc.merge(gdppcppp, on = "exporter", how = "outer")
df

Unnamed: 0,exporter,gdppc_start,gdppc_end,gdppcppp_start,gdppcppp_end
0,ABW,26647.938101,29007.693003,36444.262057,38442.413838
1,AFG,613.856689,519.884773,2069.424642,2058.383832
2,AGO,5408.410496,4095.812942,8179.296007,7310.901738
3,ALB,4578.631994,4531.020806,11259.296700,12811.759436
4,AND,41303.929371,38962.880354,,
...,...,...,...,...,...
246,XKX,4080.330717,4045.614209,9043.183283,10530.481665
247,YEM,1673.146354,960.528534,,
248,ZAF,6433.187277,6132.479841,12520.713748,12703.421242
249,ZMB,1763.062571,1534.866751,3539.445584,3485.005238


In [16]:
# Now we load up all the controls. The following two datasets come from the World Bank and is in constant USD
# First, we want to know how much the country grew in exports of natural resources
# These need to be taken out because they depend on "luck" and not on complex economies
nat_res = pd.read_csv("../../data/raw/nat_res.csv")
nat_res["nat_res_share_increase"] = (nat_res["2017"] - nat_res["2014"]) / 100
df = df.merge(nat_res[["exporter", "nat_res_share_increase"]], on = "exporter", how = "left")

# Then we want to know how much the country grew in general export volumes
# We want to take this out because maybe the country opened up to trade without developing new capabilities
exports = pd.read_csv("../../data/raw/exports.csv")
exports["export_share_increase"] = (exports["2017"] - exports["2014"]) / 100
df = df.merge(exports[["exporter", "export_share_increase"]], on = "exporter", how = "left")

# We use the WGI data as an estimation of the quality of the government infrastructure
# The data comes from here: https://info.worldbank.org/governance/wgi/
# We average across the training years
for txt in glob.glob("../../data/raw/wgi*.csv"):
    wgi = pd.read_csv(txt, sep = "\t")
    label = txt.split('_')[1].split('.')[0]
    wgi[label] = wgi[["2011", "2012", "2013", "2014"]].mean(axis = 1)
    df = df.merge(wgi[["exporter", label]], on = "exporter", how = "left")

df

Unnamed: 0,exporter,gdppc_start,gdppc_end,gdppcppp_start,gdppcppp_end,nat_res_share_increase,export_share_increase,regqual,govteffectiv,polstab,rulelaw,corrupt,voice
0,ABW,26647.938101,29007.693003,36444.262057,38442.413838,-0.000004,-0.042234,1.352822,1.153897,1.256603,1.246278,1.089621,1.2775
1,AFG,613.856689,519.884773,2069.424642,2058.383832,0.001627,,-1.261256,-1.381238,-2.462760,-1.645689,-1.447653,-1.2475
2,AGO,5408.410496,4095.812942,8179.296007,7310.901738,-0.072286,-0.156909,-1.027559,-1.122507,-0.370756,-1.230242,-1.341616,-1.1175
3,ALB,4578.631994,4531.020806,11259.296700,12811.759436,-0.019683,0.033568,0.215953,-0.219621,0.037976,-0.457892,-0.664078,0.0675
4,AND,41303.929371,38962.880354,,,0.000000,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,XKX,4080.330717,4045.614209,9043.183283,10530.481665,-0.003916,0.041479,,,,,,
247,YEM,1673.146354,960.528534,,,-0.104635,,-0.786152,-1.262843,-2.474466,-1.232560,-1.319054,-1.3425
248,ZAF,6433.187277,6132.479841,12520.713748,12703.421242,-0.005317,-0.018410,0.374047,0.382591,-0.048510,0.150955,-0.046358,0.6025
249,ZMB,1763.062571,1534.866751,3539.445584,3485.005238,0.018934,-0.038294,-0.460750,-0.531928,0.440720,-0.331435,-0.332412,-0.1300


In [17]:
# We add diversity as a special control, which is the count of exported products
# We don't want our complex methods to be perfectly correlated with a simple row count!
div = pd.read_csv("../../data/processed/mcp_exp.csv", sep = "\t")

# RCA and RPOP diversities are different, which means
div_rca = div[div["rca"] > 1].groupby(by = "exporter").size().reset_index().rename(columns = {0: "diversity_rca"})
div_rpop = div[div["rpop"] > 0.25].groupby(by = "exporter").size().reset_index().rename(columns = {0: "diversity_rpop"})

df = df.merge(div_rca, on = "exporter")
df = df.merge(div_rpop, on = "exporter")

df

Unnamed: 0,exporter,gdppc_start,gdppc_end,gdppcppp_start,gdppcppp_end,nat_res_share_increase,export_share_increase,regqual,govteffectiv,polstab,rulelaw,corrupt,voice,diversity_rca,diversity_rpop
0,ABW,26647.938101,29007.693003,36444.262057,38442.413838,-0.000004,-0.042234,1.352822,1.153897,1.256603,1.246278,1.089621,1.2775,35,267
1,AFG,613.856689,519.884773,2069.424642,2058.383832,0.001627,,-1.261256,-1.381238,-2.462760,-1.645689,-1.447653,-1.2475,73,20
2,AGO,5408.410496,4095.812942,8179.296007,7310.901738,-0.072286,-0.156909,-1.027559,-1.122507,-0.370756,-1.230242,-1.341616,-1.1175,8,11
3,ALB,4578.631994,4531.020806,11259.296700,12811.759436,-0.019683,0.033568,0.215953,-0.219621,0.037976,-0.457892,-0.664078,0.0675,147,164
4,ARE,43751.838886,40644.804043,73619.100584,67183.626557,-0.106386,0.000341,0.724570,1.204191,0.859782,0.578884,1.181367,-0.9950,95,901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,VNM,2030.261955,2365.621666,5745.158585,7155.745829,-0.032062,0.151887,-0.624072,-0.209613,0.171169,-0.492776,-0.512444,-1.4050,227,370
154,YEM,1673.146354,960.528534,,,-0.104635,,-0.786152,-1.262843,-2.474466,-1.232560,-1.319054,-1.3425,39,24
155,ZAF,6433.187277,6132.479841,12520.713748,12703.421242,-0.005317,-0.018410,0.374047,0.382591,-0.048510,0.150955,-0.046358,0.6025,216,523
156,ZMB,1763.062571,1534.866751,3539.445584,3485.005238,0.018934,-0.038294,-0.460750,-0.531928,0.440720,-0.331435,-0.332412,-0.1300,82,76


In [18]:
# And now we load up all the (exporter) complexity we calculated
lit = pd.read_csv("../01_estimate_complexities/literature_complexities_country.csv", sep = "\t")
df = df.merge(lit, on = "exporter")
    
ann_rca = pd.read_csv("../01_estimate_complexities/annihilv1_complexity_exp_rca.csv", sep = "\t")
df = df.merge(ann_rca, on = "exporter")

ann_rpop = pd.read_csv("../01_estimate_complexities/annihilv1_complexity_exp_rpop.csv", sep = "\t")
df = df.merge(ann_rpop, on = "exporter")

var_rca = pd.read_csv("../01_estimate_complexities/variance_complexity_rca.csv", sep = "\t")
df = df.merge(var_rca, on = "exporter")

var_rpop = pd.read_csv("../01_estimate_complexities/variance_complexity_rpop.csv", sep = "\t")
df = df.merge(var_rpop, on = "exporter")

moran_rca = pd.read_csv("../01_estimate_complexities/moran_complexity_rca.csv", sep = "\t")
df = df.merge(moran_rca, on = "exporter")

moran_rpop = pd.read_csv("../01_estimate_complexities/moran_complexity_rpop.csv", sep = "\t")
df = df.merge(moran_rpop, on = "exporter")

df.to_csv("growth_regression_table.csv", index = False, sep = "\t")

df

Unnamed: 0,exporter,gdppc_start,gdppc_end,gdppcppp_start,gdppcppp_end,nat_res_share_increase,export_share_increase,regqual,govteffectiv,polstab,...,eci_rca,eci_rpop,fitness12_rca,fitness12_rpop,ann_rca,ann_rpop,var_rca,var_rpop,moran_rca,moran_rpop
0,ABW,26647.938101,29007.693003,36444.262057,38442.413838,-0.000004,-0.042234,1.352822,1.153897,1.256603,...,0.122585,0.838788,0.103507,0.745492,1.276050,0.654559,0.308810,0.948828,0.006520,0.007402
1,AFG,613.856689,519.884773,2069.424642,2058.383832,0.001627,,-1.261256,-1.381238,-2.462760,...,-0.865256,-1.656564,0.205833,0.003582,1.130574,2.046344,0.147894,0.069057,0.023012,0.000427
2,AGO,5408.410496,4095.812942,8179.296007,7310.901738,-0.072286,-0.156909,-1.027559,-1.122507,-0.370756,...,-1.205506,-0.984400,0.000044,0.001116,2.524447,2.098274,0.044465,0.074250,0.005924,0.005980
3,ALB,4578.631994,4531.020806,11259.296700,12811.759436,-0.019683,0.033568,0.215953,-0.219621,0.037976,...,-0.311605,0.063788,0.531469,0.253260,0.871609,0.965354,0.268193,0.281332,0.031768,0.020953
4,ARE,43751.838886,40644.804043,73619.100584,67183.626557,-0.106386,0.000341,0.724570,1.204191,0.859782,...,0.190255,0.799657,0.373723,3.080079,0.822533,0.276051,0.399991,1.131100,0.009648,0.011189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,VNM,2030.261955,2365.621666,5745.158585,7155.745829,-0.032062,0.151887,-0.624072,-0.209613,0.171169,...,0.131371,0.600881,1.619043,1.023054,0.707402,0.747015,0.221260,0.587735,0.039189,0.022263
154,YEM,1673.146354,960.528534,,,-0.104635,,-0.786152,-1.262843,-2.474466,...,-1.398730,-1.461118,0.033184,0.004185,1.408130,1.949515,0.105991,0.057315,0.014303,0.003507
155,ZAF,6433.187277,6132.479841,12520.713748,12703.421242,-0.005317,-0.018410,0.374047,0.382591,-0.048510,...,0.227489,0.603744,1.052638,1.422032,0.595828,0.623731,0.432956,0.460394,0.023690,0.013767
156,ZMB,1763.062571,1534.866751,3539.445584,3485.005238,0.018934,-0.038294,-0.460750,-0.531928,0.440720,...,-0.375720,-0.359686,0.209657,0.086567,1.002492,1.095789,0.362109,0.322239,0.017999,0.006278


In [19]:
# Just out of curiosity, how much do some of these variables correlate with each other?

print(df[[
    "eci_rca",
    "fitness12_rca",
    "ann_rca",
    "var_rca",
    "moran_rca",
    "eci_rpop",
    "fitness12_rpop",
    "ann_rpop",
    "var_rpop",
    "moran_rpop",
    "gdppc_start",
    "gdppcppp_start",
    "nat_res_share_increase",
    "export_share_increase",
    "diversity_rca",
    "diversity_rpop"
]].corr())


                         eci_rca  fitness12_rca   ann_rca   var_rca  \
eci_rca                 1.000000       0.805053 -0.772430  0.781512   
fitness12_rca           0.805053       1.000000 -0.595747  0.634540   
ann_rca                -0.772430      -0.595747  1.000000 -0.769907   
var_rca                 0.781512       0.634540 -0.769907  1.000000   
moran_rca               0.321777       0.503516 -0.432935  0.156107   
eci_rpop                0.777212       0.545216 -0.524629  0.584704   
fitness12_rpop          0.839332       0.844536 -0.560851  0.660590   
ann_rpop               -0.794866      -0.605067  0.647556 -0.680591   
var_rpop                0.843961       0.727438 -0.535885  0.686561   
moran_rpop              0.447326       0.385036 -0.375073  0.365397   
gdppc_start             0.537528       0.396180 -0.148216  0.283098   
gdppcppp_start          0.465725       0.328045 -0.057346  0.241096   
nat_res_share_increase  0.160205       0.190185 -0.342067  0.173393   
export