# Precompute Neighbours

Gini and Mean Income based neighbour selection method:

For a given country/year pair, its neighbours are the country/year pairs closest to it in terms of gini/income distance.
We will calculate 50 neighbours for each country/year pair.

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(0, '../scripts/')

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Default figure size and resolution for every plot in this notebook.
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.dpi': 98,
})

In [3]:
%load_ext autoreload
%autoreload 1

In [4]:
%aimport shapeslib
%aimport etllib


In [7]:
# Input datapoint files from the fasttrack dataset. The `mhhinc` file is used as
# the income indicator and the `gini_2100` file as the gini indicator; the
# commented-out paths are the alternative `mincpcap_cppp` / `gini` files.
# income_file = '../../../ddf--gapminder--fasttrack/ddf--datapoints--mincpcap_cppp--by--country--time.csv'
# gini_file =  '../../../ddf--gapminder--fasttrack/ddf--datapoints--gini--by--country--time.csv'
income_file = '../../../ddf--gapminder--fasttrack/ddf--datapoints--mhhinc--by--country--time.csv'
gini_file =  '../../../ddf--gapminder--fasttrack/ddf--datapoints--gini_2100--by--country--time.csv'

In [8]:
# Fixture listing the country/year pairs for which shapes are known.
# NOTE(review): presumably derived from povcalnet, judging by the file name — confirm.
shapes_file = '../source/fixtures/povcal_country_year.csv'
known_shapes = pd.read_csv(shapes_file)

In [9]:
# Local alias for the etllib helper.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
bracket_number_from_income = etllib.bracket_number_from_income

In [10]:
# Load both indicators keyed by (country, time).
income = pd.read_csv(income_file).set_index(['country', 'time'])
gini = pd.read_csv(gini_file).set_index(['country', 'time'])

# Give both frames the same (country, year) index names and name their single
# value column after the indicator it holds.
for frame, indicator in ((income, 'income'), (gini, 'gini')):
    frame.index.names = ['country', 'year']
    frame.columns = [indicator]

In [11]:
# Put both indicators side by side, aligned on the shared (country, year) index.
income_gini = pd.concat([income, gini], axis=1)

In [12]:
# Preview the combined income/gini table.
income_gini

Unnamed: 0_level_0,Unnamed: 1_level_0,income,gini
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
afg,1800,456,30.50
afg,1801,477,30.50
afg,1802,499,30.50
afg,1803,522,30.50
afg,1804,546,30.50
...,...,...,...
zwe,2096,12509,50.86
zwe,2097,12778,50.86
zwe,2098,13053,50.86
zwe,2099,13334,50.86


In [13]:
# We need both income and gini, so drop the rows where either indicator is missing.
income_gini = income_gini.dropna(how='any')

In [14]:
# Distinct (income, gini) value pairs with the country/year index discarded, so
# neighbours only have to be computed once per distinct pair of values.
income_gini_noc = income_gini.reset_index(drop=True).drop_duplicates()

In [15]:
# double check: are there countries available in povcalnet for which we don't have gini/income data?

In [16]:
# Rows of the known-shapes table as plain Python lists.
known_shapes_list = known_shapes.values.tolist()

In [17]:
# Convert each row to a tuple so it can be used as a key into the MultiIndex.
known_shapes_list = list(map(tuple, known_shapes_list))

In [18]:
# Look up income/gini for every known-shape pair. As the output below shows,
# this raises KeyError when some pairs are absent from income_gini's index.
known_income_gini = income_gini.loc[known_shapes_list]

KeyError: "[('mhl', 1981), ('mhl', 1982), ('mhl', 1983), ('mhl', 1984), ('mhl', 1985), ('mhl', 1986), ('mhl', 1987), ('mhl', 1988), ('mhl', 1989), ('mhl', 1990), ('mhl', 1991), ('mhl', 1992), ('mhl', 1993), ('mhl', 1994), ('mhl', 1995), ('mhl', 1996), ('mhl', 1997), ('mhl', 1998), ('mhl', 1999), ('mhl', 2000), ('mhl', 2001), ('mhl', 2002), ('mhl', 2003), ('mhl', 2004), ('mhl', 2005), ('mhl', 2006), ('mhl', 2007), ('mhl', 2008), ('mhl', 2009), ('mhl', 2010), ('mhl', 2011), ('mhl', 2012), ('mhl', 2013), ('mhl', 2014), ('mhl', 2015), ('mhl', 2016), ('mhl', 2017), ('mhl', 2018), ('mhl', 2019), ('nru', 2004), ('nru', 2005), ('nru', 2006), ('nru', 2007), ('nru', 2008), ('nru', 2009), ('nru', 2010), ('nru', 2011), ('nru', 2012), ('nru', 2013), ('nru', 2014), ('nru', 2015), ('nru', 2016), ('nru', 2017), ('nru', 2018), ('nru', 2019), ('twn', 1981), ('twn', 1982), ('twn', 1983), ('twn', 1984), ('twn', 1985), ('twn', 1986), ('twn', 1987), ('twn', 1988), ('twn', 1989), ('twn', 1990), ('twn', 1991), ('twn', 1992), ('twn', 1993), ('twn', 1994), ('twn', 1995), ('twn', 1996), ('twn', 1997), ('twn', 1998), ('twn', 1999), ('twn', 2000), ('twn', 2001), ('twn', 2002), ('twn', 2003), ('twn', 2004), ('twn', 2005), ('twn', 2006), ('twn', 2007), ('twn', 2008), ('twn', 2009), ('twn', 2010), ('twn', 2011), ('twn', 2012), ('twn', 2013), ('twn', 2014), ('twn', 2015), ('twn', 2016), ('twn', 2017), ('twn', 2018), ('twn', 2019), ('kos', 2000), ('kos', 2001), ('kos', 2002), ('kos', 2003), ('kos', 2004), ('kos', 2005), ('kos', 2006), ('kos', 2007), ('kos', 2008), ('kos', 2009), ('kos', 2010), ('kos', 2011), ('kos', 2012), ('kos', 2013), ('kos', 2014), ('kos', 2015), ('kos', 2016), ('kos', 2017), ('kos', 2018), ('kos', 2019)] not in index"

In [19]:
# The KeyError above lists pairs from these countries only: they have shapes
# but no income/gini data, so their shapes cannot be used.
missing_countries = ['mhl', 'nru', 'twn', 'kos']
mask = known_shapes['country'].isin(missing_countries)

# Rebuild the key list without those countries and repeat the lookup.
known_shapes_list = [tuple(row) for row in known_shapes[~mask].values.tolist()]
known_income_gini = income_gini.loc[known_shapes_list]

In [20]:
# Preview the income/gini rows of the usable known-shape pairs.
known_income_gini

Unnamed: 0_level_0,Unnamed: 1_level_0,income,gini
country,year,Unnamed: 2_level_1,Unnamed: 3_level_1
ago,1981,1030,38.19
ago,1982,995,38.19
ago,1983,1002,38.19
ago,1984,1029,38.19
ago,1985,1033,38.19
...,...,...,...
zwe,2015,2311,43.94
zwe,2016,2291,44.14
zwe,2017,2253,44.34
zwe,2018,2310,47.21


In [21]:
# known_income_gini.to_csv('../wip/income_gini_for_known_shape_countries.csv')

In [22]:
from multiprocessing import Pool

In [23]:
# Number of worker processes used for the neighbour computation.
POOLSIZE = 8

def get_distances_res(v):
    """Compute the neighbour result for one (income, gini) value pair.

    ``v`` is an indexable pair holding the income at position 0 and the gini
    at position 1. The neighbours are searched in the module-level
    ``known_income_gini`` table via ``shapeslib.get_neighbors``.

    Returns ``((income, gini), (cno, neis))`` so that a list of results can be
    turned into a lookup dict keyed by the input values.
    """
    income_value, gini_value = v[0], v[1]
    neighbour_result = shapeslib.get_neighbors(income_value, gini_value, known_income_gini)
    cno, neis = neighbour_result
    return (income_value, gini_value), (cno, neis)

# Compute the neighbours of every distinct (income, gini) pair in parallel.
with Pool(POOLSIZE) as pool:
    res_distances = pool.map(get_distances_res, income_gini_noc.values)

In [24]:
# Lookup table: (income, gini) values -> neighbour result for those values.
res_distances_dict = {values: result for values, result in res_distances}

In [25]:
# Spot check: fetch the income/gini values of ('ago', 1800) and show the
# neighbour result precomputed for those values.
i, g = shapeslib.get_income_gini(('ago', 1800), income_gini)
res_distances_dict[(i, g)]

(7,
 [('lbr', 1995),
  ('cod', 2002),
  ('cod', 2001),
  ('lbr', 1996),
  ('lbr', 1994),
  ('cod', 2003),
  ('gin', 1986),
  ('gin', 1985),
  ('gin', 1987),
  ('cod', 2000),
  ('gin', 1991),
  ('cod', 2004),
  ('gin', 1984),
  ('gin', 1988),
  ('gin', 1990),
  ('gin', 1983),
  ('gin', 1989),
  ('gin', 1982),
  ('gin', 1981),
  ('mmr', 1991),
  ('mmr', 1988),
  ('mmr', 1990),
  ('mmr', 1989),
  ('cod', 1999),
  ('cod', 2005),
  ('mmr', 1992),
  ('moz', 1985),
  ('cod', 1998),
  ('moz', 1986),
  ('cod', 2006),
  ('mmr', 1993),
  ('cod', 1997),
  ('lbr', 1993),
  ('moz', 1987),
  ('mmr', 1987),
  ('mmr', 1994),
  ('cod', 2007),
  ('moz', 1984),
  ('moz', 1992),
  ('mli', 1982),
  ('cod', 1996),
  ('uga', 1986),
  ('moz', 1988),
  ('moz', 1983),
  ('uga', 1987),
  ('mmr', 1995),
  ('mli', 1983),
  ('uga', 1985),
  ('cod', 1995),
  ('mmr', 1986)])

In [26]:
# Pair every (country, year) key with its own income/gini values ...
pair_values = (
    (cy, shapeslib.get_income_gini(cy, income_gini))
    for cy in income_gini.index.values
)
# ... and attach the neighbour result that was precomputed for those values.
all_neighbours = [
    (cy, res_distances_dict[(i, g)])
    for cy, (i, g) in pair_values
]

In [27]:
# Index the (pair, neighbour result) entries by their (country, year) pair.
all_neighbours = {pair: result for pair, result in all_neighbours}

In [28]:
# Spot check: the neighbour result stored for ('ago', 1800).
all_neighbours[('ago', 1800)]

(7,
 [('lbr', 1995),
  ('cod', 2002),
  ('cod', 2001),
  ('lbr', 1996),
  ('lbr', 1994),
  ('cod', 2003),
  ('gin', 1986),
  ('gin', 1985),
  ('gin', 1987),
  ('cod', 2000),
  ('gin', 1991),
  ('cod', 2004),
  ('gin', 1984),
  ('gin', 1988),
  ('gin', 1990),
  ('gin', 1983),
  ('gin', 1989),
  ('gin', 1982),
  ('gin', 1981),
  ('mmr', 1991),
  ('mmr', 1988),
  ('mmr', 1990),
  ('mmr', 1989),
  ('cod', 1999),
  ('cod', 2005),
  ('mmr', 1992),
  ('moz', 1985),
  ('cod', 1998),
  ('moz', 1986),
  ('cod', 2006),
  ('mmr', 1993),
  ('cod', 1997),
  ('lbr', 1993),
  ('moz', 1987),
  ('mmr', 1987),
  ('mmr', 1994),
  ('cod', 2007),
  ('moz', 1984),
  ('moz', 1992),
  ('mli', 1982),
  ('cod', 1996),
  ('uga', 1986),
  ('moz', 1988),
  ('moz', 1983),
  ('uga', 1987),
  ('mmr', 1995),
  ('mli', 1983),
  ('uga', 1985),
  ('cod', 1995),
  ('mmr', 1986)])

In [29]:
import json

In [30]:
# Reshape the flat {(country, year): result} mapping into the nested
# {country: {year: {...}}} layout, with each neighbour pair stored as a list.
# NOTE(review): "countries" looks like the number of distinct countries among
# the neighbours — confirm against shapeslib.get_neighbors.
all_neighbours_json = {}

for (country, year), (n_countries, neighbour_pairs) in all_neighbours.items():
    per_country = all_neighbours_json.setdefault(country, {})
    per_country[year] = {
        "countries": n_countries,
        "neighbours": [list(pair) for pair in neighbour_pairs],
    }

In [31]:
# Spot check: the nested entry stored for afg in 1900.
all_neighbours_json['afg'][1900]

{'countries': 7,
 'neighbours': [['lbr', 1995],
  ['cod', 2002],
  ['cod', 2001],
  ['lbr', 1996],
  ['lbr', 1994],
  ['cod', 2003],
  ['gin', 1986],
  ['gin', 1985],
  ['gin', 1987],
  ['cod', 2000],
  ['gin', 1991],
  ['cod', 2004],
  ['gin', 1984],
  ['gin', 1988],
  ['gin', 1990],
  ['gin', 1983],
  ['gin', 1989],
  ['gin', 1982],
  ['gin', 1981],
  ['mmr', 1991],
  ['mmr', 1990],
  ['mmr', 1988],
  ['mmr', 1989],
  ['cod', 1999],
  ['cod', 2005],
  ['mmr', 1992],
  ['moz', 1985],
  ['cod', 1998],
  ['moz', 1986],
  ['cod', 2006],
  ['mmr', 1993],
  ['cod', 1997],
  ['lbr', 1993],
  ['moz', 1987],
  ['mmr', 1987],
  ['mmr', 1994],
  ['cod', 2007],
  ['moz', 1984],
  ['moz', 1992],
  ['mli', 1982],
  ['cod', 1996],
  ['uga', 1986],
  ['moz', 1988],
  ['moz', 1983],
  ['uga', 1987],
  ['mmr', 1995],
  ['mli', 1983],
  ['uga', 1985],
  ['cod', 1995],
  ['mmr', 1986]]}

In [32]:
# Serialise the nested mapping to a JSON string.
# NOTE(review): the year keys are not strings; json.dumps coerces plain int
# keys to strings — confirm the consumers of the fixture expect that.
k = json.dumps(all_neighbours_json)

In [33]:
# Write the serialised neighbours to the fixture file. The `with` block closes
# the file on exit, so no explicit close() call is needed.
with open('../source/fixtures/neighbours_list.json', 'w') as f:
    f.write(k)