In [2608]:
import logging
import json
import csv
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 
from pandas import read_csv
from functools import reduce
from collections import defaultdict

## Configuration 

In [2589]:
# Pipeline to run. Must be one of the keys of ETL_PROFILES:
# 'curis2elastic', 'kobo2elastic' or 'oldcuris2newcuris'.
etl = 'curis2elastic'

# Per-pipeline inputs: the input schema, the input data file and the
# field-mapping CSV.
ETL_PROFILES = {
    # old curis to elasticsearch
    'curis2elastic': {
        'input_schema_file': 'schema/input/curisSchema.1-item.json',
        'input_data_file': 'data/curisData.1-items.json',
        'mapping_file': 'schema/map/couchbase2elastic.map.csv',
    },
    # kobo to elasticsearch
    'kobo2elastic': {
        'input_schema_file': 'schema/input/aqmSchema.complete.json',
        'input_data_file': 'data/aqmData.2-items.json',
        'mapping_file': 'schema/map/kobo2elastic.map.csv',
    },
    # old curis to new curis
    # NOTE(review): this profile points at the kobo2elastic mapping file;
    # that looks like a copy-paste leftover -- confirm the intended mapping.
    'oldcuris2newcuris': {
        'input_schema_file': 'schema/input/curisData.1-Schema.avro.json',
        'input_data_file': 'data/curisData.1-items.json',
        'mapping_file': 'schema/map/kobo2elastic.map.csv',
    },
}

# Fail here with a clear message instead of leaving the paths empty and
# failing later when the files are opened.
if etl not in ETL_PROFILES:
    raise ValueError(
        "Unknown etl %r; expected one of %s" % (etl, sorted(ETL_PROFILES))
    )

profile = ETL_PROFILES[etl]
input_schema_file = profile['input_schema_file']
input_data_file = profile['input_data_file']
mapping_file = profile['mapping_file']

## Get input JSON Schema (draft 07)

In [2590]:
# Load the input schema and flatten it: nested keys become dotted column
# names in `schema_df`, which is displayed transposed (one path per row).
# The file is read as UTF-8 explicitly so decoding does not depend on the
# platform's default encoding.
with open(input_schema_file, encoding='utf-8') as schema_file:
    schema_data = json.load(schema_file)

# NOTE(review): `json_normalize` is imported from the legacy path
# `pandas.io.json`; pandas 1.0+ exposes the same function as
# `pd.json_normalize`.
schema_df = json_normalize(schema_data)
schema_df.T

Unnamed: 0,0
$id,http://example.com/root.json
$schema,http://json-schema.org/draft-07/schema#
properties.address.$id,#/properties/address
properties.address.items.$id,#/properties/address/items
properties.address.items.properties.barangay.$id,#/properties/address/items/properties/barangay
properties.address.items.properties.barangay.pattern,^(.*)$
properties.address.items.properties.barangay.title,The Barangay Schema
properties.address.items.properties.barangay.type,string
properties.address.items.properties.country.$id,#/properties/address/items/properties/country
properties.address.items.properties.country.pattern,^(.*)$


## Get the valid index list only (for JSON Schema only)

In [2591]:
# Select the flattened schema columns to keep: paths that do not contain
# "._", have at least three dot-separated segments, and end in `type` or
# `title`.
required_field_df = pd.DataFrame()

required_index_list = [
    column
    for column in schema_df.columns
    if "._" not in column
    and column.count('.') >= 2
    and column.rsplit('.', 1)[-1] in ('type', 'title')
]

In [2592]:
# Narrow the flattened schema to the selected columns and show it transposed
# so that every path is a row.
required_field_df = schema_df.loc[:, required_index_list]
required_field_df.transpose()

Unnamed: 0,0
properties.address.items.properties.barangay.title,The Barangay Schema
properties.address.items.properties.barangay.type,string
properties.address.items.properties.country.title,The Country Schema
properties.address.items.properties.country.type,string
properties.address.items.properties.fax_number.items.properties.country_code.title,The Country_code Schema
properties.address.items.properties.fax_number.items.properties.country_code.type,string
properties.address.items.properties.fax_number.items.properties.number.title,The Number Schema
properties.address.items.properties.fax_number.items.properties.number.type,string
properties.address.items.properties.fax_number.items.title,The Items Schema
properties.address.items.properties.fax_number.items.type,object


## Clean value and index

In [2593]:
def clean_value(x):
    """Normalise a schema value such as a title or a type.

    The text is lower-cased, "/" is replaced by ".", and a leading "the "
    and a trailing " schema" are removed, so "The Barangay Schema" becomes
    "barangay". Only the wrapper words at the ends are removed; the letters
    "the" or "schema" inside a word (e.g. "mother") are left alone.

    Parameters
    ----------
    x : str
        Raw value taken from the flattened schema.

    Returns
    -------
    str
        The cleaned value with surrounding whitespace stripped.
    """
    prefix = "the "
    suffix = " schema"
    text = x.lower().replace("/", ".").strip()
    if text.startswith(prefix):
        text = text[len(prefix):]
    if text.endswith(suffix):
        text = text[:-len(suffix)]
    return text.strip()

def clean_index(x):
    """Reduce a flattened schema path to the dotted path of the field itself.

    The path is lower-cased, "/" is replaced by ".", and the structural
    segments `properties` and `items` are dropped, so
    "properties.address.items.properties.barangay.title" becomes
    "address.barangay.title".

    Segments are compared whole, so a field name that merely ends in
    "items" (e.g. "line_items") is kept intact. The segment that directly
    follows `properties` is always treated as a field name, and the last
    segment is always kept.

    Parameters
    ----------
    x : str
        Dotted (or slash-separated) path from the flattened schema.

    Returns
    -------
    str
        The cleaned dotted path.
    """
    segments = x.lower().replace("/", ".").strip().split(".")
    last_position = len(segments) - 1
    kept = []
    name_expected = False  # True right after a structural `properties` segment
    for position, segment in enumerate(segments):
        if name_expected or position == last_position:
            kept.append(segment)
            name_expected = False
        elif segment == "properties":
            name_expected = True
        elif segment != "items":
            kept.append(segment)
    return ".".join(kept)

# Build a two-column frame: `index` holds the cleaned dotted path and `value`
# the cleaned schema value, one row per selected schema column.
newSchema_df = (
    required_field_df.T[0]
    .apply(clean_value)
    .to_frame('value')
    .rename_axis('index')   # guarantees the column is called 'index' after reset
    .reset_index()
)
newSchema_df['index'] = newSchema_df['index'].apply(clean_index)

# Rows are deliberately left in the order produced by the schema flattening.
# Cleaned paths are not unique (an array and its `items` collapse to the same
# key), so sorting by `index` here would interleave entries that belong to
# different schema nodes.
newSchema_df.head(60)

Unnamed: 0,index,value
0,address.barangay.title,barangay
1,address.barangay.type,string
2,address.country.title,country
3,address.country.type,string
4,address.fax_number.country_code.title,country_code
5,address.fax_number.country_code.type,string
6,address.fax_number.number.title,number
7,address.fax_number.number.type,string
8,address.fax_number.title,items
9,address.fax_number.type,object


## Create columns for type and key

In [2594]:
# Pair every schema key with its declared type.
# Rows are matched on their '.type' suffix rather than on their position, so
# the result does not depend on title/type rows strictly alternating or on
# every key having a title.
TYPE_SUFFIX = '.type'

array_key = []
array_type = []
for path, value in zip(newSchema_df['index'], newSchema_df['value']):
    if path.endswith(TYPE_SUFFIX):
        array_key.append(path[:-len(TYPE_SUFFIX)])
        array_type.append(value)

valueSchema_df = pd.DataFrame({'source_key': array_key, 'source_type': array_type})
valueSchema_df = valueSchema_df.sort_values(['source_key']).reset_index(drop=True)

# Keys whose declared type is `array`.
csv_filename_df = (
    valueSchema_df
    .loc[valueSchema_df['source_type'] == 'array']
    .reset_index(drop=True)
)
csv_filename_df

Unnamed: 0,source_key,source_type
0,address,array
1,address.fax_number,array
2,address.landline_number,array
3,address.mobile_number,array
4,family_members,array
5,health_informations,array
6,health_informations.diagnosed,array
7,health_informations.family_history,array
8,households,array
9,households.amenities_present_in_house,array


## Create root or default file (e.g. main, resident)

In [2595]:
# Create the root CSV, or empty it if it already exists.
open('file/resident.csv', 'w').close()

## Create separate file for each array

In [2596]:
# One CSV per array-typed key; each file is created, or emptied if it exists.
csv_filename_list = csv_filename_df['source_key'].tolist()  # array type

for array_name in csv_filename_list:
    open('file/' + array_name + '.csv', 'w').close()

In [2597]:
# Keys whose declared type is `string`.
is_string = valueSchema_df['source_type'] == 'string'
csv_header_name_df = valueSchema_df.loc[is_string]

csv_header_name_list = csv_header_name_df['source_key'].tolist()  # string type

## Segregate headers for every field type

In [2598]:
# String-typed headers that are also array keys (presumably arrays whose items
# are plain strings -- confirm). A header is added once per matching array key.
array_headers = [
    header
    for header in csv_header_name_list
    for filename in csv_filename_list
    if header == filename
]

# NOTE(review): nothing populates `main_headers`. The only branch that ever
# appended to it required `header.split('.')` to return an empty list, which
# cannot happen. It is kept empty so current results do not change -- confirm
# the intended rule for root-level headers.
main_headers = []
object_header = []

## Create headers for type: lists

In [2599]:
# Append a header row (`_id`, `_index_map`, and the list's own column) to the
# CSV of every list-typed key.
for list_key in array_headers:
    target_path = 'file/' + list_key + '.csv'
    array_csv_df = pd.DataFrame(columns=['_id', '_index_map', list_key])
    array_csv_df.to_csv(target_path, encoding='utf-8', mode='a', index=False)

## Create headers for type: array of objects and string @object level

In [2600]:
# String-typed headers that are neither array keys nor root-level headers.
array_objects = set(csv_header_name_list) - set(csv_filename_list) - set(main_headers)

# Sorted so the list does not inherit the set's iteration order, which depends
# on string hashing and can differ between kernel sessions. Anything built
# from this list then comes out in a stable order.
array_objects_list = sorted(array_objects)

In [2601]:
# Keep the headers whose parent path (everything before the last dot) is one
# of the array keys.
other_header_object = [
    name
    for name in array_objects_list
    if name.rpartition('.')[0] in csv_filename_list
]

In [2602]:
# Group the nested headers by their parent path (everything before the last
# dot): parent path -> list of headers.
dd = defaultdict(list)
for nested_header in other_header_object:
    parent_path = nested_header.rpartition('.')[0]
    dd[parent_path].append(nested_header)

In [2603]:
# Append a header row to each parent's CSV: its nested headers followed by
# the `_id` and `_index_map` columns.
for parent_path, nested_headers in dd.items():
    columns_list = list(nested_headers) + ['_id', '_index_map']
    object_array_csv_df = pd.DataFrame(columns=columns_list)
    object_array_csv_df.to_csv(
        'file/' + parent_path + '.csv',
        encoding='utf-8',
        mode='a',
        index=False,
    )

## Create headers for type: object

In [2604]:
# Keys whose declared type is `object`.
is_object = valueSchema_df['source_type'] == 'object'
objects_header_df = valueSchema_df.loc[is_object]
objects_header_list = objects_header_df['source_key'].tolist()
objects_header_list

['address',
 'address.fax_number',
 'address.landline_number',
 'address.mobile_number',
 'health_informations',
 'health_informations.blood_pressure',
 'health_informations.blood_pressure.first_reading',
 'health_informations.blood_pressure.second_reading',
 'health_informations.blood_pressure.third_reading',
 'households',
 'identification',
 'identification.id1',
 'identification.id2',
 'identification.id3',
 'profile_picture',
 'profiles',
 'profiles.employment',
 'user-cam']

In [2605]:
# Object-typed keys that are not also array keys. The result is stored in
# `objects` as well as displayed, so the name no longer stays an empty set.
objects = set(objects_header_list) - set(csv_filename_list)
objects

{'health_informations.blood_pressure',
 'health_informations.blood_pressure.first_reading',
 'health_informations.blood_pressure.second_reading',
 'health_informations.blood_pressure.third_reading',
 'identification',
 'identification.id1',
 'identification.id2',
 'identification.id3',
 'profile_picture',
 'profiles.employment',
 'user-cam'}

## Create header for type: string @root level

In [2606]:
# Build the column list for the root-level CSV: drop duplicates from
# `main_header` (dict.fromkeys keeps first-seen order on Python 3.7+), append
# `main_header_object`, then add the `_id` and `_index_map` columns.
# NOTE(review): `main_header` and `main_header_object` are not assigned in any
# cell visible in this notebook (an earlier cell defines `main_headers`,
# plural). On a fresh kernel this cell raises NameError unless they are
# defined elsewhere -- confirm where they come from.
main_header_list = list(dict.fromkeys(main_header))
main_header_list = main_header_list  + main_header_object
main_header_list.append('_id')
main_header_list.append('_index_map')

In [2607]:
# Append the root-level header row to the root CSV and display the (empty)
# frame so the column order can be checked.
main_resident_df = pd.DataFrame(columns=main_header_list)
main_resident_df.to_csv(
    'file/resident.csv',
    encoding='utf-8',
    mode='a',
    index=False,
)
main_resident_df

Unnamed: 0,birthdate,family_members,first_name,gender,id,last_name,last_name_suffix,middle_name,nhid,organization,registered_at,type,_id,_index_map
