## Debugging Mapper

`mapper` is the main function flat-table uses to understand the structure of each dataframe. In this notebook, we step through the mapper logic to gain insight into its features and to track down bugs.

In [1]:
import os
import sys
# Prepend the parent directory (os.pardir, i.e. "..") to the module search path.
# NOTE(review): presumably so the `flat_table` imports in the next cell resolve
# to the local checkout rather than an installed copy - confirm.
sys.path.insert(0, os.pardir)

In [2]:
import datetime
import numpy as np
import pandas as pd
from flat_table import mapper, normalize
from flat_table._norm import get_type, to_columns, to_rows, print_parent_child_node

In [3]:
# Date of the last debugging session: evaluates to the current local
# date and time each time this cell is run.
datetime.datetime.today()

datetime.datetime(2020, 8, 23, 14, 16, 44, 368704)

### Sample Dataframe

In [4]:
# Three sample records, each a tuple of (id, user_info, address):
#   * user_info is a dict (the first record nests a `phones` dict) or NaN
#     when the record has no user information;
#   * address is a list of dicts, and the last record carries two entries.
data = [
    (
        1001,
        {
            'first_name': 'john',
            'last_name': 'smith',
            'phones': {'mobile': '201-..', 'home': '978-..'},
        },
        [{'zip': '07014', 'city': 'clifton'}],
    ),
    (1002, np.nan, [{'zip': '07014', 'address1': '1 Journal Square'}]),
    (
        1003,
        {'first_name': 'marry', 'last_name': 'kate', 'gender': 'female'},
        [
            {'zip': '10001', 'city': 'new york'},
            {'zip': '10008', 'city': 'brooklyn'},
        ],
    ),
]
df = pd.DataFrame(data=data, columns=['id', 'user_info', 'address'])

### Mapper

In [5]:
headers = ['parent', 'child', 'type', 'obj']

# Work list of (parent, child name, detected type, series) tuples, seeded with
# the top-level columns of `df`. `DataFrame.items()` is used because
# `DataFrame.iteritems()` was removed in pandas 2.0.
series_list = [('.', n, get_type(s), s) for n, s in df.items()]


def insert_to_series(p_name, child):
    """Queue an expanded series directly after the entry being processed.

    Parameters
    ----------
    p_name : str
        Name of the parent node. '.' or '' means there is no parent prefix,
        so the child keeps its own name; otherwise the child is named
        '<p_name>.<child.name>'.
    child : pandas.Series
        The expanded series to queue; its `name` attribute is used to build
        the child name.

    Notes
    -----
    Inserts into the module-level `series_list` at position `ind + 1`, where
    `ind` is the index of the entry the loop below is currently visiting
    (looked up when the function is called). Also prints the parent/child
    node via `print_parent_child_node`.
    """
    if p_name == '.' or p_name == '':
        c_name = child.name
    else:
        c_name = '.'.join((p_name, child.name))
    # Use the `child` argument itself rather than a variable leaked from the
    # calling scope, so the helper is correct for any caller.
    series_list.insert(ind + 1, (p_name, c_name, get_type(child), child))
    print_parent_child_node(p_name, child)


# `series_list` grows while it is being iterated: every insert lands at
# `ind + 1`, so freshly expanded series are visited next, and several inserts
# from one parent end up in reverse column order.
# The stored type is unpacked as `_type` rather than `_` to avoid shadowing
# IPython's last-output variable.
for ind, (parent, name, _type, child) in enumerate(series_list):
    inside = get_type(child)

    # fully-qualified name of the current node, used as prefix for its children
    if parent == '.' or parent == '':
        parent_name = name
    else:
        parent_name = '.'.join([parent, name])

    # parent child nodes '.' level
    print_parent_child_node(parent, child)

    if inside == 'list':
        # expand rowwise, add new series for processing
        insert_to_series('', to_rows(child))
    elif inside == 'dict':
        # expand columnwise, add new columns for processing
        temp = to_columns(child)
        if temp.shape[1] > 1:
            for _col, column in temp.items():
                insert_to_series(parent_name, column)
        else:
            insert_to_series(parent_name, temp.iloc[:, 0])

pd.DataFrame(data=series_list, columns=headers)

Unnamed: 0,parent,child,type,obj
0,.,id,int,"0 1001 1 1002 2 1003 Name: id, dtype:..."
1,.,user_info,dict,"0 {'first_name': 'john', 'last_name': 'smit..."
2,user_info,user_info.gender,str,0 NaN 1 NaN 2 female Name: gend...
3,user_info,user_info.phones.home,str,0 978-.. 1 NaN 2 NaN Name: phon...
4,user_info,user_info.phones.mobile,str,0 201-.. 1 NaN 2 NaN Name: phon...
5,user_info,user_info.last_name,str,0 smith 1 NaN 2 kate Name: last_na...
6,user_info,user_info.first_name,str,0 john 1 NaN 2 marry Name: first_n...
7,.,address,list,"0 [{'zip': '07014', 'city': 'cl..."
8,,address,dict,"0 {'zip': '07014', 'city': 'cl..."
9,address,address.address1,str,0 NaN 1 1 Journal Square 2 ...
