In [1]:
import os
import pandas as pd
import numpy as np
import csv

In [2]:
# read data source
df = pd.read_json('data.json')
len(df)

51821

In [3]:
# check first 5 data
df.head()

Unnamed: 0,id,operatingUnit,spatialCoverage,region,province,commodityGroup,commodity,valueChainSegment,function,subprogram,...,investment_6,investmentTotal,municipalities_1,municipalities_2,municipalities_3,municipalities_4,municipalities_5,municipalities_6,fundSource,remarks
0,1,RFO 11,Regional,Region 11,Davao del Norte,Livestock,Chicken (Native),Functional Service,"Extension Support, Education and Training Serv...",,...,0.0,200000.0,0,0,1,0,0,0,DA,Tagum
1,2,RFO 11,Regional,Region 11,Davao del Norte,Livestock,Carabao,Functional Service,"Extension Support, Education and Training Serv...",,...,0.0,200000.0,0,0,1,0,0,0,DA,Tagum
2,3,RFO 11,Regional,Region 11,Davao City,High Value Crops,Vegetables,Production,Production Support Services,,...,0.0,400000.0,0,0,1,0,0,0,"DA, LGU",Davao City
3,4,RFO 11,Regional,Region 11,Davao del Norte,Livestock,Chicken (Native),Production,Production Support Services,,...,0.0,500000.0,0,0,1,0,0,0,DA,Tagum
4,5,RFO 11,Regional,Region 11,Davao Oriental,Rice,Rice,Input Supply,Production Support Services,,...,0.0,500000.0,0,0,1,0,0,0,DA,Boston


In [4]:
# rename columns
df.columns = ['id','operating_unit','spatial_coverage','region','province','commodity_group','commodity','value_chain_segment','program','subprogram','intervention_type','intervention','intervention_others','unit','intervention_details','multi_year','target_1','target_2','target_3','target_4','target_5','target_6','investment_1','investment_2','investment_3','investment_4','investment_5','investment_6','investment_total','municipalities_1','municipalities_2','municipalities_3','municipalities_4','municipalities_5','municipalities_6','fund_source','remarks']

In [5]:
df['province'].replace('','na', inplace=True)
df['unit'].replace('','na', inplace=True)
df['subprogram'].replace('','na', inplace=True)

In [6]:
# recode data
df['operating_unit'] = df['operating_unit'].astype('category')
df['operating_unit_id'] = df['operating_unit'].cat.codes
df['spatial_coverage'] = df['spatial_coverage'].astype('category')
df['spatial_coverage_id'] = df['spatial_coverage'].cat.codes
df['region'] = df['region'].astype('category')
df['region_id'] = df['region'].cat.codes
df['province'] = df['province'].astype('category')
df['province_id'] = df['province'].cat.codes
df['commodity_group'] = df['commodity_group'].astype('category')
df['commodity_group_id'] = df['commodity_group'].cat.codes
df['commodity'] = df['commodity'].astype('category')
df['commodity_id'] = df['commodity'].cat.codes
df['value_chain_segment'] = df['value_chain_segment'].astype('category')
df['value_chain_segment_id'] = df['value_chain_segment'].cat.codes
df['program'] = df['program'].astype('category')
df['program_id'] = df['program'].cat.codes
df['subprogram'] = df['subprogram'].astype('category')
df['subprogram_id'] = df['subprogram'].cat.codes
df['unit'] = df['unit'].astype('category')
df['unit_id'] = df['unit'].cat.codes
df['multi_year'] = df['multi_year'].astype('category')
df['multi_year'] = df['multi_year'].cat.codes

# get sample data
df.sample(5)


Unnamed: 0,id,operating_unit,spatial_coverage,region,province,commodity_group,commodity,value_chain_segment,program,subprogram,...,operating_unit_id,spatial_coverage_id,region_id,province_id,commodity_group_id,commodity_id,value_chain_segment_id,program_id,subprogram_id,unit_id
34507,34508,RFO 13,Regional,CARAGA,Regionwide (CARAGA),Corn,Cassava,Functional Service,"Extension Support, Education and Training Serv...","Extension Support, Education and Training Serv...",...,23,2,2,87,1,64,0,5,13,184
39691,39692,RFO 4A,Regional,Region 4A,Rizal,High Value Crops,Cashew \n,Production,"Agricultural Machinery, Equipment, and Facilit...","Agricultural Machinery, Equipment, and Facilit...",...,26,2,11,88,4,63,7,0,0,184
42725,42726,RFO 13,Regional,CARAGA,Dinagat Islands,Fisheries,Shellfish,Functional Service,"Extension Support, Education and Training Serv...","Extension Support, Education and Training Serv...",...,23,2,2,34,2,322,0,5,13,184
32697,32698,RFO 1,Regional,Region 1,Ilocos Norte,High Value Crops,Mango,Market,Market Development Services,Market Development Services,...,19,2,5,40,4,216,2,8,23,184
22567,22568,RFO 11,Regional,Region 11,Davao City,High Value Crops,Coffee,Functional Service,"Extension Support, Education and Training Serv...","Extension Support, Education and Training Serv...",...,21,2,7,29,4,86,0,5,13,184


In [7]:
df.dtypes

id                           int64
operating_unit            category
spatial_coverage          category
region                    category
province                  category
commodity_group           category
commodity                 category
value_chain_segment       category
program                   category
subprogram                category
intervention_type           object
intervention                object
intervention_others         object
unit                      category
intervention_details        object
multi_year                    int8
target_1                   float64
target_2                   float64
target_3                   float64
target_4                   float64
target_5                   float64
target_6                   float64
investment_1               float64
investment_2               float64
investment_3               float64
investment_4               float64
investment_5               float64
investment_6               float64
investment_total    

In [8]:
def writeToCsv(csv_file,csv_columns,dict_data):
    # convert dict to list
    data_list = dict_data.items()
    try:
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect='excel', quoting=csv.QUOTE_NONNUMERIC)
            writer.writerow(csv_columns)
            for data in data_list:
                writer.writerow(data)
    except IOError as err:
        errno, strerror = err.args
        print("I/O error({0}): {1}".format(errno, strerror))    
    return

In [9]:
csv_columns = ['id','name']

In [10]:
operating_unit = dict(enumerate(df['operating_unit'].cat.categories))
writeToCsv('operating_unit.csv',csv_columns,operating_unit)

In [11]:
program = dict(enumerate(df['program'].cat.categories))
writeToCsv('program.csv',csv_columns,program)

In [12]:
subprogram = dict(enumerate(df['subprogram'].cat.categories))
print(subprogram)
writeToCsv('subprogram.csv',csv_columns,subprogram)

{0: 'Agricultural Machinery, Equipment, and Facilities Support Services', 1: 'Agriculture and Fishery Policy Services', 2: 'Agriculture and Fishery Regulatory Support', 3: 'Block Farm Program', 4: 'Bridge Construction and Repair', 5: 'Coconut Industry Development Program', 6: 'Credit Facilitation Services', 7: 'Credit Facilitation Services\n', 8: 'Credit Support Services', 9: 'Crop Insurance Program', 10: 'Cross-Cutting', 11: 'Dairy Industry Development Program', 12: 'Development of organizational policies, plans and procedures', 13: 'Extension Support, Education and Training Services', 14: 'Farm-to-Mill Roads', 15: 'Farm-to-market Road Network Services', 16: 'Fisheries Infrastructure Development Program', 17: 'Formulation, monitoring and evaluation of agricultural and fishery policies, plans and programs', 18: 'Hatcheries maintained', 19: 'Information Support Services', 20: 'Irrigation Network Services', 21: 'Irrigation Systems Development Program', 22: 'Irrigation Systems Restoration

In [13]:
unit = dict(enumerate(df['unit'].cat.categories))
print(unit)
writeToCsv('unit.csv',csv_columns,unit)

{0: '4WD Tractor Distributed (number)', 1: 'Area (has)', 2: 'Area (has.)', 3: 'Area planted', 4: 'Bag', 5: 'Combine Harvester distributed (number)', 6: 'Farmers Insured (No.)', 7: 'Goat distributed', 8: 'Groups assisted to access credit', 9: 'HT distributed', 10: 'Head', 11: 'Hectares', 12: 'Individuals assisted (No.)', 13: 'Kilogram', 14: 'Kilometer', 15: 'Litre', 16: 'Metric-ton', 17: 'No of Corn Dehuskers distributed', 18: 'No of Multi-purpose drying pavement constructed', 19: 'No of bags', 20: 'No of trainings conducted', 21: 'No of units distributed', 22: 'No, of activities participated', 23: 'No.', 24: 'No. of 4 WD tractor distributed', 25: 'No. of 4-Wheel Drive Tractor', 26: 'No. of 4-row corn planter distributed', 27: 'No. of Cluster areas (Hybrid)', 28: 'No. of Common Service Facility established', 29: 'No. of FBS conducted', 30: 'No. of FSB conducted', 31: 'No. of Facility established', 32: 'No. of Fertlizer Bags provided', 33: 'No. of Field Day conducted', 34: 'No. of Mango 

In [14]:
value_chain_segment = dict(enumerate(df['value_chain_segment'].cat.categories))
writeToCsv('value_chain_segment.csv',csv_columns,value_chain_segment)

In [15]:
commodity = dict(enumerate(df['commodity'].cat.categories))
writeToCsv('commodity.csv',csv_columns,commodity)

In [16]:
commodity_group = dict(enumerate(df['commodity_group'].cat.categories))
writeToCsv('commodity_group.csv',csv_columns,commodity_group)

In [17]:
province = dict(enumerate(df['province'].cat.categories))
print(province)
writeToCsv('province.csv',csv_columns,province)

{0: 'Abra', 1: 'Agusan del Norte', 2: 'Agusan del Sur', 3: 'Aklan', 4: 'Albay', 5: 'Antique', 6: 'Apayao', 7: 'Aurora', 8: 'Basilan', 9: 'Bataan', 10: 'Batanes', 11: 'Batangas', 12: 'Benguet', 13: 'Biliran', 14: 'Bohol', 15: 'Bukidnon', 16: 'Bulacan', 17: 'Butuan City', 18: 'Cagayan', 19: 'Cagayan de Oro City', 20: 'Camarines Norte', 21: 'Camarines Sur', 22: 'Camiguin', 23: 'Capiz', 24: 'Catanduanes', 25: 'Cavite', 26: 'Cebu', 27: 'Compostela Valley', 28: 'Cotabato (North Cotabato)', 29: 'Davao City', 30: 'Davao Occidental', 31: 'Davao Oriental', 32: 'Davao del Norte', 33: 'Davao del Sur', 34: 'Dinagat Islands', 35: 'Eastern Samar', 36: 'General Santos City', 37: 'Guimaras', 38: 'Ifugao', 39: 'Iligan City', 40: 'Ilocos Norte', 41: 'Ilocos Sur', 42: 'Iloilo', 43: 'Iloilo City', 44: 'Inter-regional', 45: 'Isabela', 46: 'Kalinga', 47: 'La Union', 48: 'Laguna', 49: 'Lanao del Norte', 50: 'Lanao del Sur', 51: 'Leyte', 52: 'Maguindanao', 53: 'Marinduque', 54: 'Masbate', 55: 'Misamis Occident

In [18]:
region = dict(enumerate(df['region'].cat.categories))
writeToCsv('region.csv',csv_columns,region)

In [19]:
spatial_coverage = dict(enumerate(df['spatial_coverage'].cat.categories))
writeToCsv('spatial_coverage.csv',csv_columns,spatial_coverage)

In [20]:
columns_to_remove = ['operating_unit','spatial_coverage','region','province','commodity_group','commodity','value_chain_segment','program','subprogram','unit']
df.drop(columns_to_remove, axis=1, inplace=True)

In [21]:
df.head()

Unnamed: 0,id,intervention_type,intervention,intervention_others,intervention_details,multi_year,target_1,target_2,target_3,target_4,...,operating_unit_id,spatial_coverage_id,region_id,province_id,commodity_group_id,commodity_id,value_chain_segment_id,program_id,subprogram_id,unit_id
0,1,Others,Stregthen/Conduct of Poultry Production and He...,,No. of training conducted,0,0.0,0.0,1.0,0.0,...,21,2,7,32,5,76,0,5,43,184
1,2,Others,Training on Integrated Farming,,No. of integrated farming conducted,0,0.0,0.0,1.0,0.0,...,21,2,7,32,5,56,0,5,43,184
2,3,Others,Development of multiplier farms to increase th...,,No of multiplier farms developed,0,0.0,0.0,1.0,0.0,...,21,2,7,29,4,369,7,11,43,184
3,4,Others,Dispersal of Native Chicken,,No. of native chicken dispersed,0,0.0,0.0,2500.0,0.0,...,21,2,7,32,5,76,7,11,43,184
4,5,Others,Provision of certified rice seeds,,No of certified rice seeds provided,0,0.0,0.0,333.0,0.0,...,21,2,7,31,7,284,1,11,43,169


In [22]:
# export to csv
df.to_csv('interventions.csv', sep=',', float_format='%.2f', encoding='utf-8', index=False)