In [60]:
import logging
import json
import csv
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 
from pandas import read_csv
from functools import reduce
from collections import defaultdict

## Configuration 

In [61]:
# Select the ETL to run by leaving exactly one of these assignments active.
#etl = 'kobo2elastic'
etl = 'curis2elastic'
#etl = 'oldcuris2newcuris'

# Defaults, so that every setting exists whichever branch below is taken.
input_schema_file = ''
input_data_file = ''
mapping_file = ''
output_dir = ''

# Base name of the CSV file that receives the top-level fields.
root_object = 'resident'

if etl == 'curis2elastic':
    # old curis to elasticsearch
    input_schema_file = 'schema/input/curisSchema.1-item.json'
    input_data_file = 'data/curisData.13-items.json'
    mapping_file = 'schema/map/couchbase2elastic.map.csv'
    output_dir = 'file/curisSchema/'

elif etl == 'kobo2elastic':
    # kobo to elasticsearch
    input_schema_file = 'schema/input/aqmSchema.partial.json'
    input_data_file = 'data/aqmData.2-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'
    output_dir = 'file/koboSchema/'

elif etl == 'oldcuris2newcuris':
    # old curis to new curis
    input_schema_file = 'schema/input/curisData.1-Schema.avro.json'
    input_data_file = 'data/curisData.1-items.json'
    # NOTE(review): this branch reuses the kobo mapping file and sets no
    # output_dir, so the CSV files are written to the working directory
    # -- confirm both are intended.
    mapping_file = 'schema/map/kobo2elastic.map.csv'

else:
    # Fail early instead of continuing with empty paths.
    raise ValueError("Unknown etl: %r" % (etl,))

## Get input JSON Schema (draft 07)

In [62]:
def _get_schema():
    """Load the input schema file and flatten it into a DataFrame.

    The file named by the notebook-level ``input_schema_file`` setting is
    parsed as JSON and handed to ``json_normalize``.

    Returns:
        pandas.DataFrame: whatever ``json_normalize`` produces for the
        parsed schema.
    """
    with open(input_schema_file) as schema_handle:
        parsed_schema = json.load(schema_handle)

    return json_normalize(parsed_schema)

In [63]:
def _get_json_schema_properties(schema_df):
    _required_properties = []
    _required_fields = pd.DataFrame()

    for prop in list(schema_df):
        if "._" not in prop:
            if len(prop.split(sep='.')) > 2:
                if prop.split(sep='.')[-1] == 'type' or prop.split(sep='.')[-1] == 'title':
                    _required_properties.append(prop)
    
    _required_fields = schema_df[_required_properties]
    
    return _required_fields

In [64]:
def _clean_value(value):
    return value.lower().replace("/", ".").replace("the", "").replace("schema", "").strip()

def _clean_index(index):
    return index.lower().replace("/", ".").replace("properties.", "").replace("items.", "").strip()

In [65]:
def _clean_schema(required_fields_df):
    """Turn the selected schema columns into an ``index`` / ``value`` table.

    The row labelled 0 of ``required_fields_df`` is read column by column:
    each column name becomes an ``index`` entry (cleaned with
    ``_clean_index``) and each cell becomes a ``value`` entry (cleaned with
    ``_clean_value``).

    Args:
        required_fields_df: DataFrame holding the kept schema columns.

    Returns:
        pandas.DataFrame: columns ``index`` and ``value``. Row order follows
        the column order of the input; this function does not sort.
    """
    first_row = required_fields_df.T[0]

    cleaned_df = pd.DataFrame()
    cleaned_df['value'] = first_row.apply(_clean_value)

    # Move the former column names out of the index into an 'index' column.
    cleaned_df = cleaned_df.reset_index(level=0)
    cleaned_df['index'] = cleaned_df['index'].apply(_clean_index)

    return cleaned_df

In [66]:
def _schema_to_keyvalue(newSchema_df):
    _valueSchema_df = pd.DataFrame()
    _newSchema_df = newSchema_df

    schema_length = len(_newSchema_df)
    title_counter = 0
    type_counter = 1
    skip = 2

    title_property = []
    type_property = []

    while (title_counter < schema_length):
        title_property.append(_newSchema_df.iloc[title_counter]['index'].replace('.title',''))
        title_counter += skip

    while (type_counter < schema_length):
        type_property.append(_newSchema_df.iloc[type_counter]['value'])
        type_counter += skip

    _valueSchema_df['source_key'] = title_property
    _valueSchema_df['source_type'] = type_property

    _valueSchema_df = _valueSchema_df.sort_values(['source_key']).reset_index(drop=True)
    _valueSchema_df.loc[_valueSchema_df['source_type'] == 'array']

    return _valueSchema_df

## Create root or default file (e.g. main, resident)

In [67]:
def _write_type_main():
    """Create the root object's CSV file, emptying it if it already exists.

    The path is ``output_dir + root_object + '.csv'``, both taken from the
    notebook-level settings.
    """
    root_csv_path = output_dir + root_object + '.csv'
    # Opening in 'w' mode is enough to create or truncate the file.
    open(root_csv_path, 'w').close()

In [68]:
def _get_type_array(kv_schema_df):
    type_array_df = pd.DataFrame()
    type_array_df = kv_schema_df.loc[kv_schema_df['source_type'] == 'array']
    type_array_df = type_array_df.reset_index(drop=True)

    _csv_filename_list = []
    _csv_filename_list = list(type_array_df['source_key']) #array type
    return _csv_filename_list
    
def _write_type_array_file(csv_filename_list):
    
    for i in list(csv_filename_list):
        with open(output_dir + i + '.csv', 'w'):
            pass
        
    return

In [69]:
def _get_type_primitive(kv_schema_df):
    type_primitive_df = pd.DataFrame()
    type_primitive_df = kv_schema_df.loc[kv_schema_df['source_type'] == 'string']

    _csv_header_name_list = []
    _csv_header_name_list = list(type_primitive_df['source_key']) #string type
    return _csv_header_name_list

In [70]:
def _segregate_fields(property_type_primitives, property_type_array):
    property_types_dd = defaultdict(list)

    primitive_fields = []
    array_fields = []

    for primitive_field in property_type_primitives:
        for array_field in property_type_array:

            tmp_array = primitive_field.split(sep='.')

            if(len(tmp_array)) > 1:

                if primitive_field == array_field: #for array list
                    array_fields.append(primitive_field)

            elif(len(tmp_array)) == 1:

                if primitive_field == array_field:
                    array_fields.append(primitive_field)
                else:
                    primitive_fields.append(primitive_field)

    property_types_dd['primitive']  = list(dict.fromkeys(primitive_fields))
    property_types_dd['array']  = array_fields
    return property_types_dd

In [71]:
def _write_type_array_header(array_fields):
    for array_field in array_fields:
        array_csv_df  = pd.DataFrame(columns=[ array_field,'_id','_index_map'])
        array_csv_df.to_csv(output_dir + array_field + '.csv', encoding='utf-8', mode='a', index=False)
    return 

## Create headers for type: array of objects and primitive type (int, str) @object level

In [72]:
def _get_type_array_object(property_type_array, property_type_primitives, property_type):
    _array_objects = set()
    _array_objects = set(property_type_primitives) - set(property_type_array) - set(property_type['primitive'])
    _array_objects_list = list(_array_objects)
    
    object_type_dd = defaultdict(list)
    non_root_header_object = []
    root_header_object = []

    for name in _array_objects_list:
        str1 = name.split('.')
        str2 = '.'.join(str1[0:-1])
        #print(str2)
        if str2 in property_type_array:
            non_root_header_object.append(name)
        else:
            root_header_object.append(name)

    object_type_dd['non_root_header_object']  = non_root_header_object
    object_type_dd['root_header_object']  = root_header_object
    return object_type_dd

In [73]:
def _get_dd_objects(object_type_dd):
    
    _dd_objects = defaultdict(list)
    _dd_non_root = defaultdict(list)
    _dd_root = defaultdict(list)
    
    for header in object_type_dd['non_root_header_object']:
        filenames = header.split(sep=".")[0:-1]
        filenames = '.'.join(filenames)
        _dd_non_root[filenames].append(header)
    
    for header in object_type_dd['root_header_object']:
        filenames = header.split(sep=".")[0:1]
        filenames = '.'.join(filenames)

        if filenames in property_type_array:
            #print(filenames)
            _dd_non_root[filenames].append(header)
        else:
            _dd_root[root_object].append(header)
    
    _dd_objects['_dd_non_root'] = _dd_non_root
    _dd_objects['_dd_root'] = _dd_root
    
    return _dd_objects

## Write headers for type: array of object

In [74]:
def _write_type_array_object_header(dd_non_root):
    for header in dd_non_root:
        columns_list = []
        columns_list = list(dd_non_root[header])
        columns_list.append('_id')
        columns_list.append('_index_map')

        object_array_csv_df  = pd.DataFrame(columns=columns_list)
        object_array_csv_df.to_csv(output_dir + header + '.csv', encoding='utf-8', mode='a', index=False)

In [75]:
def _write_root_object_header(property_type, property_type_array,type_object):
    """Append the header row to the root object's CSV file.

    The header is made of, in this order:
      1. the ``property_type['primitive']`` names that are not array names,
      2. the headers collected for the root object in
         ``type_object['_dd_root']``,
      3. ``_id`` and ``_index_map``.

    The root headers are looked up under the notebook-level ``root_object``
    name, the same name used for the output file, and the column order
    follows the input lists so it is identical on every run.

    Args:
        property_type: mapping with a ``'primitive'`` list.
        property_type_array: iterable of array-typed keys.
        type_object: mapping whose ``'_dd_root'`` entry maps the root name
            to a list of header names.
    """
    array_names = set(property_type_array)

    # Exclude lists of strings: they get their own CSV file.
    main_header_list = [
        name for name in dict.fromkeys(property_type['primitive'])
        if name not in array_names
    ]
    main_header_list += list(type_object['_dd_root'].get(root_object, []))
    main_header_list.append('_id')
    main_header_list.append('_index_map')

    main_resident_df  = pd.DataFrame(columns = main_header_list)
    main_resident_df.to_csv(output_dir + root_object + '.csv', encoding='utf-8', mode='a', index=False)

In [76]:
def _write_schema_definition(property_type,type_object):
    """Write ``schema.csv``, a one-row description of the generated files.

    The row records the comma-joined file base names, their count, the
    current date, the source schema path and a version string. The listed
    names are the ``property_type['array']`` entries, the keys of
    ``type_object['_dd_non_root']`` and the notebook-level ``root_object``.

    Args:
        property_type: mapping with an ``'array'`` list.
        type_object: mapping whose ``'_dd_non_root'`` entry is keyed by
            file base name.

    Returns:
        pandas.DataFrame: the one-row description that was written.
    """
    filenames_list = property_type['array'] + list(type_object['_dd_non_root'])
    # The root file is named after root_object, so list it under that name.
    filenames_list.append(root_object)

    filenames_str = ",".join(filenames_list)

    schema_desc_df = pd.DataFrame()
    schema_desc_df['file_name'] = [filenames_str]
    schema_desc_df['file_count']  = len(filenames_list)
    schema_desc_df['date'] = pd.to_datetime('today')
    schema_desc_df['source_schema'] = input_schema_file
    schema_desc_df['version'] = '1.0'
    schema_desc_df.to_csv(output_dir + 'schema.csv', encoding='utf-8', mode='w', index=False)
    return schema_desc_df

In [77]:
# Schema pipeline: load the flattened schema, keep its nested type/title
# columns, normalise the names and values, then build the key/type table.
schema_df = _get_schema()
required_fields_df = _get_json_schema_properties(schema_df)
clean_schema_df = _clean_schema(required_fields_df)
kv_schema_df = _schema_to_keyvalue(clean_schema_df)

In [78]:
# Create (or empty) the root CSV file, then collect the array-typed keys
# and the primitive-typed keys from the key/type table.
# property_type_array is also read as a global by _get_dd_objects.
_write_type_main()
property_type_array = _get_type_array(kv_schema_df)
property_type_primitives = _get_type_primitive(kv_schema_df)

In [79]:
# Mapping with a 'primitive' list and an 'array' list of field names.
property_type = _segregate_fields(property_type_primitives,property_type_array)

In [80]:
# Create (or empty) one CSV per array key, then append the header row for
# the keys listed in property_type['array'].
_write_type_array_file(property_type_array)
_write_type_array_header(property_type['array'])

# Classify the remaining primitive keys and group them by target file.
object_type_dd = _get_type_array_object(property_type_array, property_type_primitives, property_type)
type_object = _get_dd_objects(object_type_dd)

In [81]:
# Append the header row to each file named by a key of '_dd_non_root'.
_write_type_array_object_header(type_object['_dd_non_root'])

In [82]:
# Append the header row to the root object's CSV file.
_write_root_object_header(property_type, property_type_array,type_object)

In [83]:
# Write schema.csv; the returned one-row frame is shown as the cell output.
_write_schema_definition(property_type,type_object)

Unnamed: 0,file_name,file_count,date,source_schema,version
0,"date_visits,family_members,health_informations...",10,2019-04-16 20:37:44.717067,schema/input/curisSchema.1-item.json,1.0


### BUGS LIST: 
### 1. unique identifier must be defined
### 2. fields starting with underscore are discarded