****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with the most projects.
2. Find the top 10 major project themes (using column 'mjtheme_namecode').
3. In step 2 above, you will notice that some entries have only the code, with the name missing. Create a dataframe with the missing names filled in.

In [1]:
import json
import pandas as pd
from pandas.io.json import json_normalize

# Read the raw World Bank project records. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open for the
# rest of the session.
with open('data/world_bank_projects.json') as json_file:
    jsondata = json.load(json_file)

# Flatten each record into one dataframe row; nested lists such as
# 'mjtheme_namecode' are kept as list-valued cells.
df = json_normalize(jsondata)

df.head()  # preview the first observations

Unnamed: 0,_id.$oid,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,source,status,supplementprojectflg,theme1.Name,theme1.Percent,theme_namecode,themecode,totalamt,totalcommamt,url
0,52b213b38594d8a2be17c780,1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,IBRD,Active,N,Education for all,100,"[{'code': '65', 'name': 'Education for all'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,52b213b38594d8a2be17c781,2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,IBRD,Active,N,Other economic management,30,"[{'code': '24', 'name': 'Other economic manage...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en
2,52b213b38594d8a2be17c782,2014,November,2013-11-01T00:00:00Z,MINISTRY OF FINANCE AND ECONOMIC DEVEL,,Tuvalu!$!TV,TV,Tuvalu,Tuvalu,...,IBRD,Active,Y,Regional integration,46,"[{'code': '47', 'name': 'Regional integration'...",52812547,6060000,6060000,http://www.worldbank.org/projects/P145310?lang=en
3,52b213b38594d8a2be17c783,2014,October,2013-10-31T00:00:00Z,MIN. OF PLANNING AND INT'L COOPERATION,,Republic of Yemen!$!RY,RY,Republic of Yemen,"Yemen, Republic of",...,IBRD,Active,N,Participation and civic engagement,50,"[{'code': '57', 'name': 'Participation and civ...",5957,0,1500000,http://www.worldbank.org/projects/P144665?lang=en
4,52b213b38594d8a2be17c784,2014,October,2013-10-31T00:00:00Z,MINISTRY OF FINANCE,2019-04-30T00:00:00Z,Kingdom of Lesotho!$!LS,LS,Kingdom of Lesotho,Lesotho,...,IBRD,Active,N,Export development and competitiveness,30,"[{'code': '45', 'name': 'Export development an...",4145,13100000,13100000,http://www.worldbank.org/projects/P144933/seco...


In [2]:
df.shape[0]  # number of rows, i.e. observations (projects)

500

In [3]:
# List the row labels that occur more than once (an empty list means none).
# Index.get_duplicates() is deprecated/removed in newer pandas releases;
# masking with .duplicated() is the documented replacement.
df.index[df.index.duplicated()].unique().tolist()

[]

In [4]:
# Show every column name produced by the normalization step.
list(df.columns)

['_id.$oid',
 'approvalfy',
 'board_approval_month',
 'boardapprovaldate',
 'borrower',
 'closingdate',
 'country_namecode',
 'countrycode',
 'countryname',
 'countryshortname',
 'docty',
 'envassesmentcategorycode',
 'grantamt',
 'ibrdcommamt',
 'id',
 'idacommamt',
 'impagency',
 'lendinginstr',
 'lendinginstrtype',
 'lendprojectcost',
 'majorsector_percent',
 'mjsector_namecode',
 'mjtheme',
 'mjtheme_namecode',
 'mjthemecode',
 'prodline',
 'prodlinetext',
 'productlinetype',
 'project_abstract.cdata',
 'project_name',
 'projectdocs',
 'projectfinancialtype',
 'projectstatusdisplay',
 'regionname',
 'sector',
 'sector1.Name',
 'sector1.Percent',
 'sector2.Name',
 'sector2.Percent',
 'sector3.Name',
 'sector3.Percent',
 'sector4.Name',
 'sector4.Percent',
 'sector_namecode',
 'sectorcode',
 'source',
 'status',
 'supplementprojectflg',
 'theme1.Name',
 'theme1.Percent',
 'theme_namecode',
 'themecode',
 'totalamt',
 'totalcommamt',
 'url']

In [5]:
# Three columns look relevant for the exercise:
#   '_id.$oid'         - appears to identify a project; should be unique per row.
#   'countryname'      - appears to hold the name of the project's country.
#   'mjtheme_namecode' - the major project theme column named in the question.
df.loc[:, ['mjtheme_namecode', 'countryname', '_id.$oid']].head()

Unnamed: 0,mjtheme_namecode,countryname,_id.$oid
0,"[{'code': '8', 'name': 'Human development'}, {...",Federal Democratic Republic of Ethiopia,52b213b38594d8a2be17c780
1,"[{'code': '1', 'name': 'Economic management'},...",Republic of Tunisia,52b213b38594d8a2be17c781
2,"[{'code': '5', 'name': 'Trade and integration'...",Tuvalu,52b213b38594d8a2be17c782
3,"[{'code': '7', 'name': 'Social dev/gender/incl...",Republic of Yemen,52b213b38594d8a2be17c783
4,"[{'code': '5', 'name': 'Trade and integration'...",Kingdom of Lesotho,52b213b38594d8a2be17c784


In [6]:
# '_id.$oid' is meant to identify each observation, so it must be populated
# on every row: count the rows where it is missing.
df['_id.$oid'].isnull().sum()

0

In [7]:
# List every '_id.$oid' value that occurs more than once (an empty list means
# the ids are unique). The check must run on the column's values: going
# through `.index` inspects the row labels instead, which says nothing about
# whether an id is repeated.
df.loc[df['_id.$oid'].duplicated(), '_id.$oid'].unique().tolist()

[]

In [8]:
# '_id.$oid' is treated as a valid project identifier, so make it the row index.
# drop=False keeps '_id.$oid' available as a regular column as well.
df = df.set_index('_id.$oid', drop=False)
df[['countryname']].head()

Unnamed: 0_level_0,countryname
_id.$oid,Unnamed: 1_level_1
52b213b38594d8a2be17c780,Federal Democratic Republic of Ethiopia
52b213b38594d8a2be17c781,Republic of Tunisia
52b213b38594d8a2be17c782,Tuvalu
52b213b38594d8a2be17c783,Republic of Yemen
52b213b38594d8a2be17c784,Kingdom of Lesotho


In [9]:
# [Question 1]: inspect the distinct 'countryname' values to confirm that the
# column really holds country names.
pd.unique(df['countryname'])

array(['Federal Democratic Republic of Ethiopia', 'Republic of Tunisia',
       'Tuvalu', 'Republic of Yemen', 'Kingdom of Lesotho',
       'Republic of Kenya', 'Republic of India',
       "People's Republic of China", 'Kingdom of Morocco',
       'Republic of South Sudan', 'Republic of Ghana',
       'Democratic Republic of Timor-Leste', 'Hashemite Kingdom of Jordan',
       'Samoa', 'Republic of Madagascar', 'Kingdom of Cambodia',
       'Kyrgyz Republic', 'Nepal', 'Republic of Tajikistan',
       'Republic of Azerbaijan', 'East Asia and Pacific',
       "Lao People's Democratic Republic", 'Pacific Islands',
       'Solomon Islands', 'Republic of Mozambique',
       "People's Republic of Angola", 'United Republic of Tanzania',
       'Federal Republic of Nigeria', 'Republic of Seychelles',
       "People's Republic of Bangladesh", 'Republic of Senegal',
       'Republic of the Union of Myanmar', 'West Bank and Gaza',
       'Argentine Republic', 'Republic of The Gambia',
       'Russ

In [10]:
# 'countryname' looks like the column we need.
# Before counting, check how many projects have no 'countryname' at all.
df['countryname'].isnull().sum()

0

In [11]:
# ANSWER TO QUESTION (1): number of projects per 'countryname', largest first.
df_Q1 = (
    df.groupby('countryname')[['_id.$oid']]
    .count()
    .sort_values(by='_id.$oid', ascending=False)
)
# Eleven rows are shown instead of ten so that it is visible whether the 11th
# entry has fewer projects than the 10th.
# NOTE(review): the 'countryname' values include region labels such as
# 'Africa', which are not countries - confirm whether they should be excluded
# before ranking.
df_Q1.head(11)

Unnamed: 0_level_0,_id.$oid
countryname,Unnamed: 1_level_1
People's Republic of China,19
Republic of Indonesia,19
Socialist Republic of Vietnam,17
Republic of India,16
Republic of Yemen,13
People's Republic of Bangladesh,12
Nepal,12
Kingdom of Morocco,12
Republic of Mozambique,11
Africa,11


In [12]:
# [Question 2]: 'mjtheme_namecode' is a nested list of code/name records, so
# normalize the already-loaded JSON again, this time with one row per theme
# entry and the project's country name and id carried along as metadata.
df_by_theme = json_normalize(
    jsondata,
    record_path='mjtheme_namecode',
    meta=['countryname', ['_id', '$oid']],
)
df_by_theme.head(10)

Unnamed: 0,code,name,_id.$oid,countryname
0,8,Human development,52b213b38594d8a2be17c780,Federal Democratic Republic of Ethiopia
1,11,,52b213b38594d8a2be17c780,Federal Democratic Republic of Ethiopia
2,1,Economic management,52b213b38594d8a2be17c781,Republic of Tunisia
3,6,Social protection and risk management,52b213b38594d8a2be17c781,Republic of Tunisia
4,5,Trade and integration,52b213b38594d8a2be17c782,Tuvalu
5,2,Public sector governance,52b213b38594d8a2be17c782,Tuvalu
6,11,Environment and natural resources management,52b213b38594d8a2be17c782,Tuvalu
7,6,Social protection and risk management,52b213b38594d8a2be17c782,Tuvalu
8,7,Social dev/gender/inclusion,52b213b38594d8a2be17c783,Republic of Yemen
9,7,Social dev/gender/inclusion,52b213b38594d8a2be17c783,Republic of Yemen


In [13]:
# List the distinct major-theme names in sorted order.
df_by_theme['name'].sort_values().unique()

array(['', 'Economic management',
       'Environment and natural resources management',
       'Financial and private sector development', 'Human development',
       'Public sector governance', 'Rule of law', 'Rural development',
       'Social dev/gender/inclusion',
       'Social protection and risk management', 'Trade and integration',
       'Urban development'], dtype=object)

In [14]:
# The listing above has 12 distinct 'name' values, one of them the empty string ''.
# Build a code -> name lookup: keep only the rows whose name is present,
# collapse repeated (code, name) pairs, and order the result by code.
df_code2name = (
    df_by_theme.loc[df_by_theme['name'] != '', ['code', 'name']]
    .drop_duplicates()
    .sort_values('code')
)
df_code2name

Unnamed: 0,code,name
2,1,Economic management
18,10,Rural development
6,11,Environment and natural resources management
5,2,Public sector governance
252,3,Rule of law
11,4,Financial and private sector development
4,5,Trade and integration
3,6,Social protection and risk management
8,7,Social dev/gender/inclusion
0,8,Human development


In [15]:
# Replace the partly-empty 'name' column with the complete names from the
# code -> name lookup. A left join keeps every theme row.
#  - drop(columns=...) replaces the positional axis argument, which newer
#    pandas releases no longer accept.
#  - validate='many_to_one' makes the merge fail loudly if a code ever maps to
#    more than one name, instead of silently duplicating theme rows.
df_by_theme = pd.merge(
    df_by_theme.drop(columns='name'),
    df_code2name,
    on='code',
    how='left',
    validate='many_to_one',
)
df_by_theme['name'].unique()  # check that no missing/empty 'name' remains

array(['Human development', 'Environment and natural resources management',
       'Economic management', 'Social protection and risk management',
       'Trade and integration', 'Public sector governance',
       'Social dev/gender/inclusion',
       'Financial and private sector development', 'Rural development',
       'Urban development', 'Rule of law'], dtype=object)

In [16]:
# ANSWER TO QUESTION (2): number of theme entries per major-theme name,
# largest first (see the output of this cell).
# QUESTION (3): 'df_by_theme' itself is the dataframe the question asks for.
df_Q2 = (
    df_by_theme.groupby('name')[['_id.$oid']]
    .count()
    .sort_values(by='_id.$oid', ascending=False)
)
# Eleven rows are shown to confirm that the 11th is truly below the 10th.
df_Q2.head(11)

Unnamed: 0_level_0,_id.$oid
name,Unnamed: 1_level_1
Environment and natural resources management,250
Rural development,216
Human development,210
Public sector governance,199
Social protection and risk management,168
Financial and private sector development,146
Social dev/gender/inclusion,130
Trade and integration,77
Urban development,50
Economic management,38
