# Maddison Database Project

In [1]:
import pandas as pd
from pathlib import Path
from owid import catalog
import plotly.express as px
import plotly.io as pio

#Loading MPD 2020
file = Path('data/mpd2020.xlsx')
mpd = pd.read_excel(file, sheet_name='Full data')

#Loading MPD 2020 - Regional data
file = Path('data/mpd2020.xlsx')
mpd_regional = pd.read_excel(file, sheet_name='Regional data', header=1)

#Loading Regional composition
file = Path('data/Regional composition_web.xlsx')
mpd_regionalcomp = pd.read_excel(file, sheet_name='Sheet2')

In [2]:
mpd

Unnamed: 0,countrycode,country,year,gdppc,pop
0,AFG,Afghanistan,1820,,3280.00000
1,AFG,Afghanistan,1870,,4207.00000
2,AFG,Afghanistan,1913,,5730.00000
3,AFG,Afghanistan,1950,1156.0000,8150.00000
4,AFG,Afghanistan,1951,1170.0000,8284.00000
...,...,...,...,...,...
21677,ZWE,Zimbabwe,2014,1594.0000,13313.99205
21678,ZWE,Zimbabwe,2015,1560.0000,13479.13812
21679,ZWE,Zimbabwe,2016,1534.0000,13664.79457
21680,ZWE,Zimbabwe,2017,1582.3662,13870.26413


In [3]:
catalog.find(table="maddison")

Unnamed: 0,table,dataset,version,namespace,channel,is_public,dimensions,path,format
91,maddison_gdp,ggdc_maddison,2020-10-01,ggdc,garden,True,"[""country"", ""year""]",garden/ggdc/2020-10-01/ggdc_maddison/maddison_gdp,feather


In [4]:
mpd_catalog = catalog.find(table="maddison", dataset="ggdc_maddison").load()
mpd_df = pd.DataFrame(mpd_catalog)
mpd_df = mpd_df.reset_index()
mpd_df

Unnamed: 0,country,year,gdp_per_capita,population,gdp
0,Afghanistan,1820,,3280000.0,
1,Afghanistan,1870,,4207000.0,
2,Afghanistan,1913,,5730000.0,
3,Afghanistan,1950,1156.000000,8150000.0,9.421400e+09
4,Afghanistan,1951,1170.000000,8284000.0,9.692280e+09
...,...,...,...,...,...
21849,Zimbabwe,2014,1594.000000,13313992.0,2.122250e+10
21850,Zimbabwe,2015,1560.000000,13479138.0,2.102745e+10
21851,Zimbabwe,2016,1534.000000,13664795.0,2.096179e+10
21852,Zimbabwe,2017,1582.366211,13870264.0,2.194784e+10


In [5]:
list(mpd_df['country'].unique())

['Afghanistan',
 'Albania',
 'Algeria',
 'Angola',
 'Argentina',
 'Armenia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Benin',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czechia',
 'Czechoslovakia',
 'Democratic Republic of Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'East Asia',
 'Eastern Europe',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Finland',
 'Former Sudan',
 'France',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Greece',
 'Guatemala',
 'Guinea',
 'Guinea-Bissau',
 'Haiti',
 'Honduras',
 'Hong Kong',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'I

In [6]:
mpd_df_regions = mpd_df[mpd_df['country'].isin(['Western Europe', 'Eastern Europe', 'Western Offshoots', 'Latin America',
                           'East Asia', 'South and South-East Asia', 'Middle East', 'Sub-Sahara Africa'])].copy().reset_index(drop=True)
mpd_df_regions

Unnamed: 0,country,year,gdp_per_capita,population,gdp
0,East Asia,1820,1088.584961,427756992.0,4.656498e+11
1,East Asia,1850,899.657654,455774016.0,4.100406e+11
2,East Asia,1870,989.477173,409152992.0,4.048476e+11
3,East Asia,1900,1086.328491,457057984.0,4.965151e+11
4,East Asia,1920,1160.000000,543904000.0,6.309286e+11
...,...,...,...,...,...
151,Western Offshoots,2000,44329.269531,336264992.0,1.490638e+13
152,Western Offshoots,2010,48090.152344,370321984.0,1.780884e+13
153,Western Offshoots,2016,51667.976562,389729984.0,2.013656e+13
154,Western Offshoots,2017,52597.218750,391730080.0,2.060391e+13


In [7]:
list(mpd_df_regions['country'].unique())

['East Asia',
 'Eastern Europe',
 'Latin America',
 'Middle East',
 'South and South-East Asia',
 'Sub-Sahara Africa',
 'Western Europe',
 'Western Offshoots']

In [8]:
mpd_regional = mpd_regional.rename(columns={"Region": "Year"})
mpd_regional = mpd_regional.drop(index=0).reset_index(drop=True)
mpd_regional

Unnamed: 0,Year,Western Europe,Eastern Europe,Western Offshoots,Latin America,Asia (East),Asia (South and South-East),Middle East,Sub-Sahara Africa,Western Europe.1,Western Offshoots.1,Eastern Europe.1,Latin America.1,Asia (South and South-East).1,Asia (East).1,Middle East.1,Sub-Sahara Africa.1,World,World GDP pc
0,1820,2306.96669,818.0,2513.045418,952.816821,1088.584951,929.0,974.0,800.0,132371.0,11231.0,90785.0,20099.0,255695.0,427757.0,35600.0,60000.0,1033538.0,1101.565459
1,1830,2384.314573,942.0,,,,,,,,,,,,,,,,
2,1840,2579.534884,907.0,,,,,,,,,,,,,,,,
3,1850,2678.0,985.0,3474.410149,1080.657319,899.657658,929.0,1000.0,800.0,165348.0,26760.0,117661.0,29485.0,278706.0,455774.0,42000.0,65000.0,1180734.0,1225.081067
4,1860,3034.0,1358.0,4214.440703,1588.0,,,,,,,,,,,,,,
5,1870,3301.304919,1575.0,4647.453901,1318.541091,989.477165,850.346572,1165.0,800.0,186566.0,46088.0,141626.0,37692.0,327692.0,409153.0,49839.0,70000.0,1268656.0,1497.982325
6,1880,3585.154363,1886.0,6019.123928,,,,,,,,,,,,,,,
7,1890,4079.313847,2204.0,6480.955612,1672.812962,,950.999236,,,,,,,,,,,,
8,1900,4724.295785,2700.0,7740.850451,1750.626743,1086.328445,994.419007,1300.0,850.0,232433.0,86396.0,194693.0,61004.0,366060.0,457058.0,56000.0,86000.0,1539644.0,2212.043316
9,1910,5134.759064,2282.571503,9354.690921,2193.539665,,1143.117595,,,,,,,,,,,,


In [9]:
mpd_regionalcomp

Unnamed: 0,country,year,region
0,ITA,1820,Western Europe
1,NLD,1820,Western Europe
2,PRT,1820,Western Europe
3,SWE,1820,Western Europe
4,GBR,1820,Western Europe
...,...,...,...
295,TZA,1950,Sub Saharan Africa
296,UGA,1950,Sub Saharan Africa
297,ZAF,1950,Sub Saharan Africa
298,ZMB,1950,Sub Saharan Africa


In [10]:
mpd_withregions = pd.merge(mpd, mpd_regionalcomp, left_on=['countrycode','year'], right_on=['country','year'], how='left', validate='many_to_many')
mpd_withregions

Unnamed: 0,countrycode,country_x,year,gdppc,pop,country_y,region
0,AFG,Afghanistan,1820,,3280.00000,,
1,AFG,Afghanistan,1870,,4207.00000,,
2,AFG,Afghanistan,1913,,5730.00000,,
3,AFG,Afghanistan,1950,1156.0000,8150.00000,AFG,South and South East Asia
4,AFG,Afghanistan,1951,1170.0000,8284.00000,,
...,...,...,...,...,...,...,...
21678,ZWE,Zimbabwe,2014,1594.0000,13313.99205,,
21679,ZWE,Zimbabwe,2015,1560.0000,13479.13812,,
21680,ZWE,Zimbabwe,2016,1534.0000,13664.79457,,
21681,ZWE,Zimbabwe,2017,1582.3662,13870.26413,,


In [11]:
mpd_withregions_notnull = mpd_withregions[~mpd_withregions['region'].isnull()].copy().reset_index(drop=True)
mpd_withregions_notnull = mpd_withregions_notnull.dropna(subset=['gdppc', 'pop']).copy().reset_index(drop=True)

mpd_withregions_notnull

Unnamed: 0,countrycode,country_x,year,gdppc,pop,country_y,region
0,AFG,Afghanistan,1950,1156.0,8150.000,AFG,South and South East Asia
1,AGO,Angola,1950,1677.0,4117.617,AGO,Sub Saharan Africa
2,ARE,United Arab Emirates,1950,25182.0,71.520,ARE,Middle East
3,ARG,Argentina,1820,1591.0,534.000,ARG,Latin America
4,ARG,Argentina,1890,3851.0,3376.000,ARG,Latin America
...,...,...,...,...,...,...,...
289,VNM,Viet Nam,1950,1049.0,25348.144,VNM,South and South East Asia
290,YEM,Yemen,1950,1452.0,4777.089,YEM,Middle East
291,ZAF,South Africa,1950,4041.0,13595.840,ZAF,Sub Saharan Africa
292,ZMB,Zambia,1950,1054.0,2553.000,ZMB,Sub Saharan Africa


In [12]:
mpd_withregions_notnull['gdp'] = mpd_withregions_notnull['gdppc'] * mpd_withregions_notnull['pop']

In [13]:
region_list = list(mpd_withregions_notnull['region'].unique())
region_list.sort()
year_list = list(mpd_withregions_notnull['year'].unique())
year_list.sort()

regionaldata = pd.DataFrame() #creates an empty dataframe to use it in the following interation

for i in year_list: #for each year with regional data
    db_peryear = mpd_withregions_notnull[mpd_withregions_notnull['year']==i].copy().reset_index(drop=True) #dataframe with data from country i
    
    for j in region_list:
        db_peryear_region = db_peryear[db_peryear['region']==j].copy().reset_index(drop=True)
        db_aux = pd.DataFrame()
    
        db_aux.loc[0,'year'] = i #assigns country i
        db_aux.loc[0,'region'] = j #assigns country i
        
        db_aux.loc[0,'gdp_sum'] = db_peryear_region['gdp'].sum()
        db_aux.loc[0,'pop_sum'] = db_peryear_region['pop'].sum()
        
        #regionaldata = regionaldata.append(db_aux, ignore_index=True)
        regionaldata = pd.concat([regionaldata, db_aux], ignore_index=True)
    
regionaldata['gdppc'] = regionaldata['gdp_sum']/regionaldata['pop_sum']
regionaldata

Unnamed: 0,year,region,gdp_sum,pop_sum,gdppc
0,1820.0,East Asia,376869000.0,412000.0,914.730583
1,1820.0,Eastern Europe,0.0,0.0,
2,1820.0,Latin America,14954460.0,15695.0,952.816821
3,1820.0,Middle East,9812076.0,10074.0,974.000000
4,1820.0,South and South East Asia,14825629.0,17927.0,827.000000
...,...,...,...,...,...
99,1990.0,Middle East,0.0,0.0,
100,1990.0,South and South East Asia,0.0,0.0,
101,1990.0,Sub Saharan Africa,0.0,0.0,
102,1990.0,Western Europe,0.0,0.0,


In [14]:
region_list

['East Asia',
 'Eastern Europe',
 'Latin America',
 'Middle East',
 'South and South East Asia',
 'Sub Saharan Africa',
 'Western Europe',
 'Western Offshoots']

In [15]:
list(mpd_df_regions['country'].unique())

['East Asia',
 'Eastern Europe',
 'Latin America',
 'Middle East',
 'South and South-East Asia',
 'Sub-Sahara Africa',
 'Western Europe',
 'Western Offshoots']

In [16]:
regionaldata.loc[regionaldata['region'] == 'South and South East Asia', 'region'] = 'South and South-East Asia'
regionaldata.loc[regionaldata['region'] == 'Sub Saharan Africa', 'region'] = 'Sub-Sahara Africa'

In [17]:
comparison = pd.merge(regionaldata, mpd_df_regions, left_on=['year','region'], right_on=['year','country'], how='left', validate='many_to_many')
comparison

Unnamed: 0,year,region,gdp_sum,pop_sum,gdppc,country,gdp_per_capita,population,gdp
0,1820.0,East Asia,376869000.0,412000.0,914.730583,East Asia,1088.584961,4.277570e+08,4.656498e+11
1,1820.0,Eastern Europe,0.0,0.0,,Eastern Europe,818.000000,9.078500e+07,7.426213e+10
2,1820.0,Latin America,14954460.0,15695.0,952.816821,Latin America,952.816833,2.009900e+07,1.915067e+10
3,1820.0,Middle East,9812076.0,10074.0,974.000000,Middle East,974.000000,3.560000e+07,3.467440e+10
4,1820.0,South and South-East Asia,14825629.0,17927.0,827.000000,South and South-East Asia,929.000000,2.556950e+08,2.375407e+11
...,...,...,...,...,...,...,...,...,...
99,1990.0,Middle East,0.0,0.0,,Middle East,6435.131348,3.089720e+08,1.988275e+12
100,1990.0,South and South-East Asia,0.0,0.0,,South and South-East Asia,2573.680664,1.556990e+09,4.007195e+12
101,1990.0,Sub-Sahara Africa,0.0,0.0,,Sub-Sahara Africa,1800.776855,5.078280e+08,9.144849e+11
102,1990.0,Western Europe,0.0,0.0,,Western Europe,25440.035156,3.742350e+08,9.520552e+12


In [18]:
comparison['ratio'] = comparison['gdppc']/comparison['gdp_per_capita']

In [31]:
comparison[['ratio']].describe()

Unnamed: 0,ratio
count,31.0
mean,1.127208
std,0.541606
min,0.82246
25%,0.98875
50%,1.0
75%,1.013654
max,3.587361


In [20]:
comparison[(comparison['region'] == 'East Asia') & (comparison['year'] == 1920)]

Unnamed: 0,year,region,gdp_sum,pop_sum,gdppc,country,gdp_per_capita,population,gdp,ratio
72,1920.0,East Asia,166000600.0,55818.0,2973.9623,East Asia,1160.0,543904000.0,630928600000.0,2.563761


In [21]:
regionaldata[(regionaldata['region'] == 'East Asia') & (comparison['year'] == 1920)]

Unnamed: 0,year,region,gdp_sum,pop_sum,gdppc
72,1920.0,East Asia,166000600.0,55818.0,2973.9623


In [22]:
mpd_withregions[(mpd_withregions['region'] == 'East Asia') & (mpd_withregions['year'] == 1920)]

Unnamed: 0,countrycode,country_x,year,gdppc,pop,country_y,region
3054,CHN,China,1920,,472000.0,CHN,East Asia
10546,JPN,Japan,1920,2973.9623,55818.0,JPN,East Asia


In [23]:
mpd_withregions_notnull[(mpd_withregions_notnull['region'] == 'East Asia') & (mpd_withregions_notnull['year'] == 1920)]

Unnamed: 0,countrycode,country_x,year,gdppc,pop,country_y,region,gdp
156,JPN,Japan,1920,2973.9623,55818.0,JPN,East Asia,166000600.0


In [26]:
pio.renderers.default = "notebook_connected" #"png" for GitHub, "notebook_connected" for local analysis

fig = px.scatter(comparison, x="year", y="ratio", 
                 hover_data=['gdppc', 'gdp_per_capita'], opacity=0.5, color='region', 
                 title="Regional GDP pc comparison: Maddison vs. Reconstructed Maddison",
                 log_x=False,
                 log_y=False,
                labels={
                     "ratio": "Reconstructed/Original: 1=equal",
                     "year": "Year",
                     "region": "Region"
                 })

fig.update_traces(marker=dict(size=20, line=dict(width=0, color='blue')))
fig.show()

In [42]:
comparison_excel = comparison[['year', 'region', 'gdp_sum', 'pop_sum', 'gdppc', 'gdp', 'population', 'gdp_per_capita', 'ratio']]
comparison_excel = comparison_excel.dropna(subset=['ratio']).copy().reset_index(drop=True)
comparison_excel

Unnamed: 0,year,region,gdp_sum,pop_sum,gdppc,gdp,population,gdp_per_capita,ratio
0,1820.0,East Asia,376869000.0,412000.0,914.730583,465649800000.0,427756992.0,1088.584961,0.840293
1,1820.0,Latin America,14954460.0,15695.0,952.816821,19150670000.0,20099000.0,952.816833,1.0
2,1820.0,Middle East,9812076.0,10074.0,974.0,34674400000.0,35600000.0,974.0,1.0
3,1820.0,South and South-East Asia,14825630.0,17927.0,827.0,237540700000.0,255695008.0,929.0,0.890205
4,1820.0,Western Europe,140131400.0,49618.0,2824.204244,305375500000.0,132371000.0,2306.966797,1.224207
5,1820.0,Western Offshoots,28224010.0,11231.0,2513.045418,28224010000.0,11231000.0,2513.04541,1.0
6,1830.0,Western Europe,236502600.0,93887.0,2519.013619,,,2384.314453,1.056494
7,1840.0,Western Europe,289426300.0,103388.0,2799.418556,,,2579.534912,1.085242
8,1850.0,East Asia,399448000.0,444000.0,899.657658,410040600000.0,455774016.0,899.657654,1.0
9,1850.0,Western Europe,436969600.0,162969.0,2681.304732,442801900000.0,165348000.0,2678.0,1.001234


In [43]:
#file = Path('data/comparison.xlsx')
#comparison_excel.to_excel(file, index=False)