In [928]:
import logging
import json
import csv
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize 
from pandas import read_csv
from functools import reduce
from collections import defaultdict

## Configuration 

In [929]:
#etl = 'kobo2elastic'
etl = 'curis2elastic'
#etl = 'oldcuris2newcuris'

input_schema_file = ''
input_data_file = ''
mapping_file = ''

output_dir = 'file/curisSchema/'

if etl == 'curis2elastic':
    #old curis to elasticsearch
    input_schema_file = 'schema/input/curisSchema.1-item.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/couchbase2elastic.map.csv'
elif etl == 'kobo2elastic':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/aqmSchema.complete.json'
    input_data_file = 'data/aqmData.2-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'
elif etl == 'oldcuris2newcuris':
    #kobo to elasticsearch
    input_schema_file = 'schema/input/curisData.1-Schema.avro.json'
    input_data_file = 'data/curisData.1-items.json'
    mapping_file = 'schema/map/kobo2elastic.map.csv'

## Get input JSON Schema (draft 07)

In [930]:
schema_data = {}
schema_df = pd.DataFrame()

with open(input_schema_file) as f:
    schema_data = json.load(f)

schema_df = json_normalize(schema_data)
schema_df.T

Unnamed: 0,0
$id,http://example.com/root.json
$schema,http://json-schema.org/draft-07/schema#
properties.address.$id,#/properties/address
properties.address.properties.barangay.$id,#/properties/address/properties/barangay
properties.address.properties.barangay.pattern,^(.*)$
properties.address.properties.barangay.title,The Barangay Schema
properties.address.properties.barangay.type,string
properties.address.properties.country.$id,#/properties/address/properties/country
properties.address.properties.country.pattern,^(.*)$
properties.address.properties.country.title,The Country Schema


## Get valid index List only (for JSON Schema Only)

In [931]:
required_index_list = []
required_field_df = pd.DataFrame()

for indexList in list(schema_df):
    if "._" not in indexList:
        if len(indexList.split(sep='.')) > 2:
            if indexList.split(sep='.')[-1] == 'type' or indexList.split(sep='.')[-1] == 'title':
                required_index_list.append(indexList)

In [932]:
required_field_df = schema_df[required_index_list]
required_field_df.T

Unnamed: 0,0
properties.address.properties.barangay.title,The Barangay Schema
properties.address.properties.barangay.type,string
properties.address.properties.country.title,The Country Schema
properties.address.properties.country.type,string
properties.address.properties.lot_or_house_number.title,The Lot_or_house_number Schema
properties.address.properties.lot_or_house_number.type,string
properties.address.properties.postal_code.title,The Postal_code Schema
properties.address.properties.postal_code.type,string
properties.address.properties.province.title,The Province Schema
properties.address.properties.province.type,string


## Clean value and index

In [933]:
def clean_value(x):
    return x.lower().replace("/", ".").replace("the", "").replace("schema", "").strip()

def clean_index(x):
    return x.lower().replace("/", ".").replace("properties.", "").replace("items.", "").strip()

newSchema_df = pd.DataFrame()
newSchema_df['value'] = required_field_df.T[0].apply(clean_value)
newSchema_df.reset_index(level=0, inplace=True)
newSchema_df['index'] = newSchema_df['index'].apply(clean_index)
newSchema_df.sort_values(['index'])
newSchema_df.head(60)

Unnamed: 0,index,value
0,address.barangay.title,barangay
1,address.barangay.type,string
2,address.country.title,country
3,address.country.type,string
4,address.lot_or_house_number.title,lot_or_house_number
5,address.lot_or_house_number.type,string
6,address.postal_code.title,postal_code
7,address.postal_code.type,string
8,address.province.title,province
9,address.province.type,string


## Create columns for type and key

In [934]:
valueSchema_df = pd.DataFrame()

number = len(newSchema_df)
index = 0
counter = 0
counter1 = 1

array_key = []
array_type = []

while (counter < number):
    array_key.append(newSchema_df.iloc[counter]['index'].replace('.title',''))
    counter += 2
        
while (counter1 < number):
    array_type.append(newSchema_df.iloc[counter1]['value'])
    counter1 += 2

valueSchema_df['source_key'] = array_key
valueSchema_df['source_type'] = array_type

valueSchema_df = valueSchema_df.sort_values(['source_key']).reset_index(drop=True)
valueSchema_df.loc[valueSchema_df['source_type'] == 'array']

csv_filename_df = pd.DataFrame()
csv_filename_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'array']
#list(csv_filename_df['source_key'])
csv_filename_df = csv_filename_df.reset_index(drop=True)
csv_filename_df

Unnamed: 0,source_key,source_type
0,date_visits,array
1,family_members,array
2,health_informations,array
3,health_informations.diagnosed,array
4,health_informations.family_history,array
5,households,array
6,households.amenities_present_in_house,array
7,households.sanitary_type,array
8,profiles,array


## Create root or default file (e.g. main, resident)

In [935]:
with open(output_dir + 'resident.csv', 'w'):
    pass

FileNotFoundError: [Errno 2] No such file or directory: 'file/curisSchema/resident.csv'

## Create separate file for each array

In [None]:
csv_filename_list = []
csv_filename_list = list(csv_filename_df['source_key']) #array type

for i in list(csv_filename_list):
    with open('file/' + i + '.csv', 'w'):
        pass

In [None]:
csv_header_name_df = pd.DataFrame()
csv_header_name_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'string']

csv_header_name_list = []
csv_header_name_list = list(csv_header_name_df['source_key']) #string type

## Segregate headers for every field type

In [None]:
main_headers = []
array_headers = []
object_header = []

for header in csv_header_name_list:
    for filename in csv_filename_list:
        tmp_header = header.split(sep='.')
        #print(tmp_header)
        if(len(tmp_header)) > 1:
            #pass
            #print(tmp_header[:-1])
            #tmp_string = '.'.join(tmp_header[:-1])
            tmp_string = header
            
            #print(tmp_string)
            
            #print('============')
            if tmp_string == filename: #for array list
                #print('---array_list---')
                #print(header)
                #print('--header/field: ' + tmp_string + ' == --filename: ' + filename)
                array_headers.append(header)
        elif(len(tmp_header)) == 1:
            tmp_string = header
            if tmp_string == filename:
                array_headers.append(header)
            else:
                #print(tmp_header)
                main_headers.append(header)

## Write headers for type: lists

In [None]:
for header in array_headers:
    array_csv_df  = pd.DataFrame(columns=[ header,'_id','_index_map'])
    array_csv_df.to_csv(output_dir + header + '.csv', encoding='utf-8', mode='a', index=False)

## TODO filter other primitive type

In [None]:
primitive_header_list = []
primitive_header_df = pd.DataFrame() 
primitive_header_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'string']
primitive_header_list = list(primitive_header_df['source_key'])
primitive_header_list = set(primitive_header_list) - set(array_headers) #exclude list of string
primitive_header_list

## Create headers for type: array of objects and primitive type (int, str) @object level

In [None]:
array_objects = set()
array_objects = set(csv_header_name_list) - set(csv_filename_list) - set(main_headers)
array_objects_list = list(array_objects)
array_objects_list 

## TODO: make nested iterative

In [None]:
non_root_header_object = []
root_header_object = []

for name in array_objects_list:
    str1 = name.split('.')
    str2 = '.'.join(str1[0:-1])
    #print(str2)
    if str2 in csv_filename_list:
        non_root_header_object.append(name)
    else:
        root_header_object.append(name)
        
root_header_object

## Non-root object type

In [None]:
dd_non_root = defaultdict(list)
for header in non_root_header_object:
    filenames = header.split(sep=".")[0:-1]
    filenames = '.'.join(filenames)
    dd_non_root[filenames].append(header)
    
dd_non_root

## Root object type

In [None]:
dd_root = defaultdict(list)
for header in root_header_object:
    filenames = header.split(sep=".")[0:1]
    filenames = '.'.join(filenames)
    
    if filenames in csv_filename_list:
        #print(filenames)
        dd_non_root[filenames].append(header)
    else:
        dd_root['resident'].append(header)

list(dd_root['resident'])

## Write headers for type: array of object

In [None]:
for header in dd:
    columns_list = []
    columns_list = list(dd_non_root[header])
    columns_list.append('_id')
    columns_list.append('_index_map')
    object_array_csv_df  = pd.DataFrame(columns=columns_list)
    object_array_csv_df.to_csv(output_dir + header + '.csv', encoding='utf-8', mode='a', index=False)

## Get headers for type: object

In [None]:
objects_header_list = []
objects_header_df = pd.DataFrame()
objects_header_df = valueSchema_df.loc[valueSchema_df['source_type'] == 'object']
objects_header_list = list(objects_header_df['source_key'])
objects_header_list

## Get header for type: string @root level

In [None]:
main_header_list = []
main_header_list = list(dict.fromkeys(main_headers))
main_header_list =  list(set(main_header_list) - set(array_headers)) #exclude list of string
main_header_list

## Combine and Get headers for type: primitive and object @root level

In [None]:
main_header_list += (list(dd_root['resident']))
main_header_list

In [None]:
main_header_list.append('_id')
main_header_list.append('_index_map')

In [None]:
main_resident_df  = pd.DataFrame(columns = main_header_list)
main_resident_df.to_csv(output_dir + 'resident.csv', encoding='utf-8', mode='a', index=False)
main_resident_df

## Write schema definitions into a text

In [None]:
array_headers

In [None]:
filenames_list = array_headers + list(dd_non_root) 
#filenames_list = list(dd_non_root) 
filenames_list.append('resident')

foo = ",".join(filenames_list)
foo

In [None]:
schema_desc_df = pd.DataFrame() 

In [None]:
schema_desc_df['file_name'] = [foo]
schema_desc_df['file_count']  = len(filenames_list)
schema_desc_df['date'] = pd.to_datetime('today')
schema_desc_df['source_schema'] = input_schema_file 
schema_desc_df['version'] = '1.0'
schema_desc_df

In [None]:
schema_desc_df.to_csv(output_dir + 'schema.csv', encoding='utf-8', mode='w', index=False)