# JSON examples and exercise
****
+ get familiar with packages for dealing with JSON
+ study examples with JSON strings and files 
+ work on exercise to be completed and submitted 
****
+ reference: http://pandas.pydata.org/pandas-docs/stable/io.html#io-json-reader
+ data source: http://jsonstudio.com/resources/
****

In [1]:
import pandas as pd

## imports for Python, Pandas

In [2]:
import json
from pandas.io.json import json_normalize

## JSON example, with string

+ demonstrates creation of normalized dataframes (tables) from a nested JSON-like structure (a Python list of dicts)
+ source: http://pandas.pydata.org/pandas-docs/stable/io.html#normalization

In [3]:
# nested, JSON-like sample records (a Python list of dicts, not a string):
# one record per state, each holding an 'info' sub-dict and a list of county dicts
data = [
    dict(
        state='Florida',
        shortname='FL',
        info=dict(governor='Rick Scott'),
        counties=[
            dict(name='Dade', population=12345),
            dict(name='Broward', population=40000),
            dict(name='Palm Beach', population=60000),
        ],
    ),
    dict(
        state='Ohio',
        shortname='OH',
        info=dict(governor='John Kasich'),
        counties=[
            dict(name='Summit', population=1234),
            dict(name='Cuyahoga', population=1337),
        ],
    ),
]

In [4]:
# flatten the nested 'counties' lists into a table with one row per county
json_normalize(data, record_path='counties')

Unnamed: 0,name,population
0,Dade,12345
1,Broward,40000
2,Palm Beach,60000
3,Summit,1234
4,Cuyahoga,1337


In [5]:
# same county rows, with fields from the enclosing state record attached as extra columns;
# a nested list such as ['info', 'governor'] is a path into the parent record
json_normalize(
    data,
    record_path='counties',
    meta=['state', 'shortname', ['info', 'governor']]
)

Unnamed: 0,name,population,info.governor,state,shortname
0,Dade,12345,Rick Scott,Florida,FL
1,Broward,40000,Rick Scott,Florida,FL
2,Palm Beach,60000,Rick Scott,Florida,FL
3,Summit,1234,John Kasich,Ohio,OH
4,Cuyahoga,1337,John Kasich,Ohio,OH


****
## JSON example, with file

+ demonstrates reading in a json file as parsed Python objects (via `json.load`) and as a table (via `pd.read_json`)
+ uses small sample file containing data about projects funded by the World Bank 
+ data source: http://jsonstudio.com/resources/

In [6]:
# load the json file into plain Python objects (json.load parses the text;
# it does not return a string)
# the context manager closes the file handle as soon as parsing is done,
# instead of leaving an open handle behind in the kernel
with open('data/world_bank_projects_less.json') as sample_file:
    sample_json = json.load(sample_file)
sample_json
None

In [7]:
# parse the same sample file directly into a Pandas dataframe and display it
sample_json_df = pd.read_json(path_or_buf='data/world_bank_projects_less.json')
sample_json_df

Unnamed: 0,_id,approvalfy,board_approval_month,boardapprovaldate,borrower,closingdate,country_namecode,countrycode,countryname,countryshortname,...,sectorcode,source,status,supplementprojectflg,theme1,theme_namecode,themecode,totalamt,totalcommamt,url
0,{u'$oid': u'52b213b38594d8a2be17c780'},1999,November,2013-11-12T00:00:00Z,FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA,2018-07-07T00:00:00Z,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia,...,"ET,BS,ES,EP",IBRD,Active,N,"{u'Percent': 100, u'Name': u'Education for all'}","[{u'code': u'65', u'name': u'Education for all'}]",65,130000000,130000000,http://www.worldbank.org/projects/P129828/ethi...
1,{u'$oid': u'52b213b38594d8a2be17c781'},2015,November,2013-11-04T00:00:00Z,GOVERNMENT OF TUNISIA,,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia,...,"BZ,BS",IBRD,Active,N,"{u'Percent': 30, u'Name': u'Other economic man...","[{u'code': u'24', u'name': u'Other economic ma...",5424,0,4700000,http://www.worldbank.org/projects/P144674?lang=en


****
## JSON exercise

Using data in file 'data/world_bank_projects.json' and the techniques demonstrated above,
1. Find the 10 countries with most projects
2. Find the top 10 major project themes (using column 'mjtheme_namecode')
3. In 2. above you will notice that some entries have only the code and the name is missing. Create a dataframe with the missing names filled in.

In [8]:
# read the full data file into parsed Python objects (json.load returns the
# decoded structure, not a string)
# open the file in a `with` block so the handle is closed after parsing

with open('data/world_bank_projects.json') as projects_file:
    projects_json = json.load(projects_file)

In [9]:
# load the full data file a second time, this time as a Pandas dataframe

projects_df = pd.read_json(path_or_buf='data/world_bank_projects.json')

In [10]:
# figure out the relevant column names for problem 1: top 10 countries with the most projects
# prints the position and label of every column whose name contains 'countr'

for ind, label in enumerate(projects_df.columns):
    if 'countr' in label:
        # build one formatted string so the line prints identically under
        # Python 2 and Python 3 (the bare `print a, b` statement is Python 2 only)
        print('%d %s' % (ind, label))

6 country_namecode
7 countrycode
8 countryname
9 countryshortname


In [11]:
# keep only the country-related columns for problem 1 and preview the first rows
prob1_df = projects_df[[
    'country_namecode',
    'countrycode',
    'countryname',
    'countryshortname',
]]
prob1_df.head(n=5)

Unnamed: 0,country_namecode,countrycode,countryname,countryshortname
0,Federal Democratic Republic of Ethiopia!$!ET,ET,Federal Democratic Republic of Ethiopia,Ethiopia
1,Republic of Tunisia!$!TN,TN,Republic of Tunisia,Tunisia
2,Tuvalu!$!TV,TV,Tuvalu,Tuvalu
3,Republic of Yemen!$!RY,RY,Republic of Yemen,"Yemen, Republic of"
4,Kingdom of Lesotho!$!LS,LS,Kingdom of Lesotho,Lesotho


In [12]:
# solution to problem 1: top 10 countries with the most projects
# count projects per short country name, order by count (largest first), keep ten
# NOTE(review): the displayed result includes 'Africa', which looks like a region
# label rather than a country - confirm whether it should be excluded

(
    prob1_df['countryshortname']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

China                 19
Indonesia             19
Vietnam               17
India                 16
Yemen, Republic of    13
Morocco               12
Bangladesh            12
Nepal                 12
Africa                11
Mozambique            11
Name: countryshortname, dtype: int64

In [13]:
# dataframe for problem 2 and 3: top 10 major project themes and filling in the missing names
# flatten every project's 'mjtheme_namecode' list into one row per theme entry

prob2_df = json_normalize(projects_json, record_path='mjtheme_namecode')
prob2_df.head(n=5)

Unnamed: 0,code,name
0,8,Human development
1,11,
2,1,Economic management
3,6,Social protection and risk management
4,5,Trade and integration


In [14]:
# figure out the possible values of 'code'
# codes are stored as strings; convert to int so the index sorts numerically

(
    prob2_df['code']
    .apply(int)
    .value_counts()
    .sort_index()
)

1      38
2     199
3      15
4     146
5      77
6     168
7     130
8     210
9      50
10    216
11    250
Name: code, dtype: int64

In [15]:
# build a dictionary of 'code' and the corresponding 'name'

# only rows that actually carry a name can contribute to the lookup
temp_df = prob2_df[prob2_df['name'] != '']
# take the first named row for each code and pick the 'name' column by label;
# this derives the set of codes from the data instead of hard-coding 1..11 and
# does not depend on 'name' being the second column of the dataframe
mydict = temp_df.drop_duplicates('code').set_index('code')['name'].to_dict()
mydict

{'1': u'Economic management',
 '10': u'Rural development',
 '11': u'Environment and natural resources management',
 '2': u'Public sector governance',
 '3': u'Rule of law',
 '4': u'Financial and private sector development',
 '5': u'Trade and integration',
 '6': u'Social protection and risk management',
 '7': u'Social dev/gender/inclusion',
 '8': u'Human development',
 '9': u'Urban development'}

In [16]:
# solution to problem 3: fill in the missing values of 'name'
# every row's name is looked up from its code, so blank names get replaced

prob2_df['name'] = [mydict[code] for code in prob2_df['code']]
prob2_df.head(n=5)

Unnamed: 0,code,name
0,8,Human development
1,11,Environment and natural resources management
2,1,Economic management
3,6,Social protection and risk management
4,5,Trade and integration


In [17]:
# solution to problem 2: top 10 major project themes
# count occurrences of each theme name, order by count (largest first), keep ten

(
    prob2_df['name']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

Environment and natural resources management    250
Rural development                               216
Human development                               210
Public sector governance                        199
Social protection and risk management           168
Financial and private sector development        146
Social dev/gender/inclusion                     130
Trade and integration                            77
Urban development                                50
Economic management                              38
Name: name, dtype: int64