# Wrangling data on subnational population

Data provided by [World Bank](https://databank.worldbank.org/reports.aspx?source=subnational-population#)

In [111]:
import pandas as pd
import numpy as np
import geopandas as gpd

## 1 Load  Data
Data downloaded as csv to local disc from above link.

In [112]:
pop = pd.read_csv('e5267ea5-0dea-4e0f-84b7-e0007b0a1c00_Data.csv')

print(pop.shape)
pop.sample(10)

(84, 13)


Unnamed: 0,Series Name,Series Code,Level_attr,Country Name,Country Code,2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016]
63,"Population, total",SP.POP.TOTL,Kenya,"Kenya, Nairobi",KEN_Nairobi_Province_KE.NP_51328_KEN004,3267000.0,3381000.0,3499000.0,3618000.0,3739000.0,3863000.0,3987000.0,4113000.0
51,"Population, total",SP.POP.TOTL,South Sudan,"South Sudan, Eastern Equatoria",SSD_Eastern_Equatoria_SS.EE_2750_SSD002,1061000.0,1104000.0,1146000.0,1187000.0,1226000.0,1265000.0,1303000.0,1342000.0
20,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Mara",TZA_Mara_TZ.MA_48366_TZA009,1747000.0,1799000.0,1852000.0,1906000.0,1961000.0,2017000.0,2075000.0,2134000.0
28,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Rukwa",TZA_Rukwa_TZ.RK_115008_TZA017,983000.0,1020000.0,1058000.0,1097000.0,1137000.0,1178000.0,1221000.0,1265000.0
50,"Population, total",SP.POP.TOTL,South Sudan,"South Sudan, Central Equatoria",SSD_Central_Equatoria_SS.BG_2748_SSD001,1292000.0,1345000.0,1396000.0,1445000.0,1493000.0,1540000.0,1587000.0,1634000.0
71,"Population, total",SP.POP.TOTL,Ethiopia,"Ethiopia, Beneshangul Gumu",ETH_Beneshangul_Gumu_ET.BE_1230_ETH004,908000.0,933000.0,958000.0,983000.0,1009000.0,1036000.0,1062000.0,1089000.0
57,"Population, total",SP.POP.TOTL,South Sudan,"South Sudan, Warab",SSD_Warab_SS.WR_2765_SSD008,1139000.0,1186000.0,1231000.0,1274000.0,1316000.0,1358000.0,1399000.0,1441000.0
7,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Dodoma",TZA_Dodoma_TZ.DO_48359_TZA003,2112000.0,2167000.0,2222000.0,2278000.0,2336000.0,2394000.0,2452000.0,2512000.0
46,"Population, total",SP.POP.TOTL,Sudan,"Sudan, Southern Darfur",SDN_Southern_Darfur_SD.SD_2764_SDN018,4442000.0,4522000.0,4607000.0,4697000.0,4790000.0,4887000.0,4984000.0,5083000.0
42,"Population, total",SP.POP.TOTL,Sudan,"Sudan, Northern Darfur",SDN_Northern_Darfur_SD.ND_2757_SDN011,2258000.0,2259000.0,2265000.0,2267000.0,2274000.0,2276000.0,2279000.0,2283000.0


## 2 Reshape Data
- Drop rows for national level data.
- Extract admin1 name from the 'Country Name' column.
- Rename columns for future ease of use.
- Melt data down to county-admin1-year level and losing excess data.

In [113]:
pop = pop[~pop['Level_attr'].isna()]

pop.head(2)

Unnamed: 0,Series Name,Series Code,Level_attr,Country Name,Country Code,2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016]
5,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Arusha",TZA_Arusha_City_115082,1682000.0,1737000.0,1793000.0,1851000.0,1910000.0,1970000.0,2032000.0,2095000.0
6,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Dar-es-salaam",TZA_Dar-es-salaam_TZ.DS_48358_TZA002,3957000.0,4206000.0,4469000.0,4747000.0,5041000.0,5353000.0,5682000.0,6031000.0


In [114]:
pop['admin1'] = pop['Country Name'].map(lambda x: x.split(', ')[1])

pop.head(2)

Unnamed: 0,Series Name,Series Code,Level_attr,Country Name,Country Code,2009 [YR2009],2010 [YR2010],2011 [YR2011],2012 [YR2012],2013 [YR2013],2014 [YR2014],2015 [YR2015],2016 [YR2016],admin1
5,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Arusha",TZA_Arusha_City_115082,1682000.0,1737000.0,1793000.0,1851000.0,1910000.0,1970000.0,2032000.0,2095000.0,Arusha
6,"Population, total",SP.POP.TOTL,Tanzania,"Tanzania, Dar-es-salaam",TZA_Dar-es-salaam_TZ.DS_48358_TZA002,3957000.0,4206000.0,4469000.0,4747000.0,5041000.0,5353000.0,5682000.0,6031000.0,Dar-es-salaam


In [115]:
naming = {
    '2009 [YR2009]': '2009',
    '2010 [YR2010]': '2010',
    '2011 [YR2011]': '2011',
    '2012 [YR2012]': '2012',
    '2013 [YR2013]': '2013',
    '2014 [YR2014]': '2014',
    '2015 [YR2015]': '2015',
    '2016 [YR2016]': '2016',
    'Level_attr': 'country'
}

pop.rename(columns=naming, inplace=True)

pop.sample(2)

Unnamed: 0,Series Name,Series Code,country,Country Name,Country Code,2009,2010,2011,2012,2013,2014,2015,2016,admin1
40,"Population, total",SP.POP.TOTL,Sudan,"Sudan, Nile River",SDN_Nile_River_SD.RN_2761_SDN004,1222000.0,1245000.0,1273000.0,1303000.0,1334000.0,1362000.0,1394000.0,1425000.0,Nile River
53,"Population, total",SP.POP.TOTL,South Sudan,"South Sudan, Jonglei",SSD_Jonglei_SS.JG_2751_SSD004,1591000.0,1656000.0,1719000.0,1779000.0,1838000.0,1896000.0,1954000.0,2012000.0,Jonglei


In [116]:
pop_long = pop.melt(id_vars = ['country', 'admin1'],
                   value_vars = ['2009', '2010', '2011', '2012', 
                                 '2013', '2014', '2015', '2016'],
                   var_name = 'year',
                   value_name = 'pop')

pop_long.sample(3)

Unnamed: 0,country,admin1,year,pop
128,South Sudan,Western Equatoria,2010,754000.0
348,South Sudan,Warab,2013,1316000.0
415,South Sudan,Central Equatoria,2014,1540000.0


In [117]:
pop_long.shape

(592, 4)

## 3 Merge with our baseline data set
- Load the geojson of region-month level hunger score data into a table.
- Link admin1 names  
    - difference the two sets
    - use levenshtein distance to build a map for converting admin1 names from population data
    - convert
- Merge the two data â€” outer join preserving all hunger scores and broad casting population stats to all months of that year.

### 3.1 Load

In [73]:
scores = gpd.read_file('region_scores.geojson')

print(scores.shape)
scores.sample(10)

(10124, 6)


Unnamed: 0,ADMIN0,ADMIN1,month,year,CS,geometry
8621,Somalia,Lower Juba,1,2010,2.5,"MULTIPOLYGON (((41.92260 -1.16289, 41.92260 -1..."
9048,Somalia,Mudug,4,2012,3.0,"POLYGON ((49.50000 7.57000, 49.64971 7.39783, ..."
8187,Somalia,Bari,7,2010,2.5,"POLYGON ((50.79877 11.98356, 50.80576 11.97939..."
9925,Tanzania,Kilimanjaro,1,2015,2.0,"POLYGON ((37.27619 -2.85750, 37.29011 -2.86061..."
4342,South Sudan,Western Equatoria,7,2010,1.5,"POLYGON ((28.03236 6.74766, 28.04826 6.73971, ..."
6149,Kenya,Mandera,4,2013,2.333333,"POLYGON ((40.99342 4.10780, 40.99861 4.10450, ..."
4933,Kenya,Elgeyo-Marakwet,7,2010,1.5,"POLYGON ((35.69886 1.27917, 35.69833 1.27657, ..."
1435,Sudan,Blue Nile,1,2013,2.0,"POLYGON ((34.48300 12.54923, 34.49492 12.54164..."
6498,Kenya,Murang'a,10,2014,1.5,"POLYGON ((36.74594 -0.56975, 36.74997 -0.57127..."
8586,Somalia,Hiiraan,6,2019,2.0,"POLYGON ((45.53511 5.48507, 45.67000 5.19000, ..."


In [118]:
naming = {
    'ADMIN0': 'country',
    'ADMIN1': 'admin1'
}

scores.rename(columns=naming, inplace=True)

scores.head(1)

Unnamed: 0,country,admin1,month,year,CS,geometry
0,Ethiopia,Addis Ababa,10,2018,1.0,"POLYGON ((38.78052 9.09681, 38.79296 9.08120, ..."


In [119]:
# convert year column of population dataframe from string to int before join
pop_long['year'] = pd.to_numeric(pop_long['year'], downcast='integer')

### 3.2 Link admin1

In [120]:
import Levenshtein as lev

In [121]:
zones_m = scores['admin1'].unique()
zones_j = pop_long['admin1'].unique()
wrong_zones = set(zones_j) - set(zones_m)

In [122]:
lev.ratio('aa', 'ab')

0.5

In [123]:
zone_matches = {}

for z1 in zones_m:
    for z2 in wrong_zones:
        if lev.ratio(z1, z2) > 0.7:
            print(z1, '----', z2)
            zone_matches[z2] = z1

Harari ---- Hareri
Benshangul Gumuz ---- Beneshangul Gumu
Central Darfur ---- Western Darfur
East Darfur ---- Western Darfur
Gedaref ---- Gadaref
North Darfur ---- Northern Darfur
North Darfur ---- Southern Darfur
North Kordofan ---- Northern Kordofan
North Kordofan ---- Southern Kordofan
Northern ---- North Eastern
Sinnar ---- Sennar
South Darfur ---- Northern Darfur
South Darfur ---- Southern Darfur
South Kordofan ---- Northern Kordofan
South Kordofan ---- Southern Kordofan
West Darfur ---- Western Darfur
Northern Bahr el Ghazal ---- Northern Bahr El Ghazal
Northern Bahr el Ghazal ---- Western Bahr El Ghazal
Warrap ---- Warab
Western Bahr el Ghazal ---- Northern Bahr El Ghazal
Western Bahr el Ghazal ---- Western Bahr El Ghazal
Dar es Salaam ---- Dar-es-salaam


In [124]:
# function for converting admin1
def link(zone):
    if zone in zone_matches:
        correct_zone = zone_matches[zone]
        return correct_zone
    else:
        return zone

In [125]:
pop_long['admin1'] = pop_long['admin1'].map(link)

### 3.3 Merge data

In [127]:
pop_j = scores.merge(pop_long, 
            on = ['country', 'admin1', 'year'],
            how = 'left')

print(pop_j.shape)
pop_j.sample(3)

(10394, 7)


Unnamed: 0,country,admin1,month,year,CS,geometry,pop
7672,Kenya,Turkana,7,2011,2.5,"POLYGON ((35.31929 5.33388, 35.33401 5.32995, ...",
7821,Kenya,Vihiga,6,2018,1.0,"POLYGON ((34.90278 0.19705, 34.90316 0.19655, ...",
4432,South Sudan,Western Bahr el Ghazal,4,2010,1.5,"POLYGON ((25.81703 10.43416, 25.81988 10.43083...",406000.0


In [132]:
pop_j.admin1.value_counts()

South Kordofan            260
South Darfur              233
Northern                  188
Western Bahr el Ghazal    178
Upper Nile                169
                         ... 
Dar es Salaam              10
Kigoma                     10
Njombe                      8
Katavi                      8
Rukwa                       8
Name: admin1, Length: 132, dtype: int64

In [137]:
zones_main_s = scores.admin1.unique()

zones_main_s.sort()
zones_main_s

array(['Addis Ababa', 'Afar', 'Amhara', 'Arusha', 'Awdal', 'Bakool',
       'Banaadir', 'Bari', 'Baringo', 'Bay', 'Benshangul Gumuz',
       'Blue Nile', 'Bomet', 'Bungoma', 'Busia', 'Central',
       'Central Darfur', 'Central Equatoria', 'Dar es Salaam',
       'Dire Dawa', 'Dodoma', 'East Darfur', 'Eastern',
       'Eastern Equatoria', 'Elgeyo-Marakwet', 'Embu', 'Galgaduud',
       'Gambela', 'Garissa', 'Gedaref', 'Gedo', 'Geita', 'Gezira',
       'Harari', 'Hiiraan', 'Homa Bay', 'Iringa', 'Isiolo', 'Jonglei',
       'Kagera', 'Kajiado', 'Kakamega', 'Kassala', 'Katavi', 'Kericho',
       'Khartoum', 'Kiambu', 'Kigoma', 'Kilifi', 'Kilimanjaro',
       'Kirinyaga', 'Kisii', 'Kisumu', 'Kitui', 'Kwale', 'Laikipia',
       'Lakes', 'Lamu', 'Lindi', 'Lower Juba', 'Lower Shabelle',
       'Machakos', 'Makueni', 'Mandera', 'Manyara', 'Mara', 'Marsabit',
       'Mbeya', 'Meru', 'Middle Juba', 'Middle Shabelle', 'Migori',
       'Mombasa', 'Morogoro', 'Mtwara', 'Mudug', "Murang'a", 'Mwanza',


In [138]:
scores[scores['admin1']=='South Kordofan'].head(20)

Unnamed: 0,country,admin1,month,year,CS,geometry
2597,Sudan,South Kordofan,12,2018,1.8,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2692,Sudan,South Kordofan,10,2018,3.0,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2625,Sudan,South Kordofan,1,2015,1.8,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2659,Sudan,South Kordofan,10,2017,2.5,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2655,Sudan,South Kordofan,10,2017,2.5,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2726,Sudan,South Kordofan,1,2012,2.0,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2723,Sudan,South Kordofan,7,2013,2.0,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2732,Sudan,South Kordofan,7,2009,1.5,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2683,Sudan,South Kordofan,4,2013,2.0,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
2734,Sudan,South Kordofan,10,2009,1.5,"POLYGON ((29.93368 12.74932, 29.97152 12.72099..."
