In [1]:
import pandas as pd
from numpy import nan
import string
import random
import pandas_tools as pt

In [2]:
# Set ourselves up to make a collection of "records",
# each of which contains data from a single observation in a nested dict format.
def rand_str(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    Every character is picked independently with ``random.choice``, so the
    same character may appear more than once. The default alphabet is the
    uppercase ASCII letters followed by the decimal digits.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

# Alias for random.random: a float drawn uniformly from [0.0, 1.0).
rand_float = random.random


def rand_int():
    """Return a random integer N with 0 <= N <= 100 (both ends included)."""
    # A named def instead of a lambda bound to a name, so the function has a
    # real __name__ and a docstring.
    return random.randint(0, 100)
# rand_value = lambda: random.choice([rand_str, rand_float, rand_int])()

In [3]:
def make_record(identifier):
    """Build one nested-dict record filled with random values.

    The record carries ``identifier`` under the 'identifier' key plus two
    category sub-dicts ('category0' and 'category1') that are deliberately
    nested to different depths. Values come from ``rand_float``,
    ``rand_int`` and ``rand_str``.
    """
    # --- category0: three scalar fields, then a nested field ---
    category0 = {
        'field0': rand_float(),
        'cat0_field0': rand_float(),
        'cat0_field1': rand_int(),
    }
    cat0_nested = {
        'cat0_field2_subfield0': rand_str(),
        'cat0_field2_subfield1': rand_float(),
    }
    cat0_nested['cat0_field2_subfield2'] = {'cat0_field2_subsubfield0': rand_int()}
    category0['cat0_field2'] = cat0_nested

    # --- category1: one scalar field, then a nested field ---
    category1 = {'field0': rand_float()}
    cat1_deepest = {
        'cat1_field0_subsubfield0': rand_str(),
        'cat1_field0_subsubfield1': rand_float(),
    }
    category1['cat1_field0'] = {
        'cat1_field0_subfield0': cat1_deepest,
        'cat1_field0_subfield1': rand_str(),
    }

    return {
        'identifier': identifier,
        'category0': category0,
        'category1': category1,
    }

In [4]:
# Build the demo data set: `n_records` records, each one tagged with a
# freshly generated random identifier.
n_records = 20
# Passed as `levels=` when the DataFrames are built below.
levels = ['category', 'field', 'subfield', 'subsubfield']
records = [make_record(rand_str()) for _ in range(n_records)]

In [5]:
# Create flat DataFrame, indexed by each record's 'identifier' value.
# With flat=True every nested key path shows up as a single dot-joined
# column name (e.g. 'category0.cat0_field2.cat0_field2_subfield0'), as the
# output of the next cell shows.
df = pt.df_from_records(records, index='identifier', levels=levels, flat=True)

In [6]:
# Not very pretty: every column name is the full dot-joined key path, which
# makes the header long and hard to scan.
df.head()

Unnamed: 0_level_0,category0.cat0_field0,category0.cat0_field1,category0.cat0_field2.cat0_field2_subfield0,category0.cat0_field2.cat0_field2_subfield1,category0.cat0_field2.cat0_field2_subfield2.cat0_field2_subsubfield0,category0.field0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield1,category1.cat1_field0.cat1_field0_subfield1,category1.field0
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6OG73D,0.829836,1,08Q3YH,0.31514,75,0.073855,1K9GF1,0.883122,4N2E3R,0.395438
QGGYMX,0.797732,22,5CSUF5,0.23611,93,0.129008,XM4LNQ,0.735944,TF05UI,0.674213
BLTNC6,0.650676,84,IKEEL1,0.010769,95,0.532688,EMIIP8,0.484757,9WFPA1,0.474376
0GSHWF,0.860281,8,7F6IYZ,0.746575,64,0.62473,1GBGA3,0.777581,9QVQL6,0.057096
PZZ7OV,0.167129,39,49SL8K,0.273909,84,0.157868,OPVQSP,0.118995,C3BV04,0.342933


In [7]:
# Create MultiIndexed DataFrame: with flat=False the output has one column
# header row per entry of `levels` ('category', 'field', 'subfield',
# 'subsubfield') instead of dot-joined names.
# NOTE: this rebinds `df`, replacing the flat frame built earlier.
df = pt.df_from_records(records, index='identifier', levels=levels, flat=False)

In [8]:
# Uneven nesting results in NaN levels: keys that sit at a shallower depth
# (e.g. category0 -> field0) show NaN in the deeper 'subfield' and
# 'subsubfield' header rows.
df.head()

category,category0,category0,category0,category0,category0,category0,category1,category1,category1,category1
field,cat0_field0,cat0_field1,cat0_field2,cat0_field2,cat0_field2,field0,cat1_field0,cat1_field0,cat1_field0,field0
subfield,NaN,NaN,cat0_field2_subfield0,cat0_field2_subfield1,cat0_field2_subfield2,NaN,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,NaN,NaN,NaN,NaN,cat0_field2_subsubfield0,NaN,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
6OG73D,0.829836,1,08Q3YH,0.31514,75,0.073855,1K9GF1,0.883122,4N2E3R,0.395438
QGGYMX,0.797732,22,5CSUF5,0.23611,93,0.129008,XM4LNQ,0.735944,TF05UI,0.674213
BLTNC6,0.650676,84,IKEEL1,0.010769,95,0.532688,EMIIP8,0.484757,9WFPA1,0.474376
0GSHWF,0.860281,8,7F6IYZ,0.746575,64,0.62473,1GBGA3,0.777581,9QVQL6,0.057096
PZZ7OV,0.167129,39,49SL8K,0.273909,84,0.157868,OPVQSP,0.118995,C3BV04,0.342933


In [9]:
# Direct access to a single column: the key needs one entry per column level,
# with nan standing in for the levels this column does not use.
# Possible, but not that useful.
print(df['category0', 'cat0_field1', nan, nan])

identifier
6OG73D     1
QGGYMX    22
BLTNC6    84
0GSHWF     8
PZZ7OV    39
346U8C    65
XMKLHB    75
EG0POW    74
FODRRN    88
DD4EW6    65
0QROTA    88
XPJF2B    93
07DGSV    44
04FTT3    70
EQ7Q8Q    28
PIN5YZ    37
S1MHGQ    61
MHDR5H    67
KZV5OF    97
A0LP67    82
Name: (category0, cat0_field1, nan, nan), dtype: int64


In [10]:
# Both spellings pass the same 4-tuple key to `df.__getitem__`, so they select
# the same column. Compare the contents with `.equals()` rather than `is`:
# `is` only tests object identity, which holds only if pandas happens to hand
# back the very same Series object for both lookups.
print(df['category0', 'cat0_field1', nan, nan].equals(df[('category0', 'cat0_field1', nan, nan)]))

True


In [11]:
# Transposed view: the four column levels become the row index and each
# identifier becomes a column.
df.T

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,identifier,6OG73D,QGGYMX,BLTNC6,0GSHWF,PZZ7OV,346U8C,XMKLHB,EG0POW,FODRRN,DD4EW6,0QROTA,XPJF2B,07DGSV,04FTT3,EQ7Q8Q,PIN5YZ,S1MHGQ,MHDR5H,KZV5OF,A0LP67
category,field,subfield,subsubfield,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
category0,cat0_field0,,,0.829836,0.797732,0.650676,0.860281,0.167129,0.402352,0.477864,0.888406,0.365331,0.509452,0.499796,0.864509,0.60243,0.277194,0.258307,0.0883084,0.328404,0.0186003,0.611418,0.420673
category0,cat0_field1,,,1,22,84,8,39,65,75,74,88,65,88,93,44,70,28,37,61,67,97,82
category0,cat0_field2,cat0_field2_subfield0,,08Q3YH,5CSUF5,IKEEL1,7F6IYZ,49SL8K,9SG5X4,0Y8ZT8,QOEB9L,14BI9O,RD840T,FKXLZD,I1JR7E,5SRY0Y,33D41T,20BFTC,ZYGUCA,2PYA9W,Z00AA6,CJYZ4K,OZKH1J
category0,cat0_field2,cat0_field2_subfield1,,0.31514,0.23611,0.0107689,0.746575,0.273909,0.644465,0.833957,0.115406,0.206583,0.33797,0.165385,0.323096,0.706863,0.593175,0.976734,0.0640289,0.598735,0.95734,0.613344,0.585091
category0,cat0_field2,cat0_field2_subfield2,cat0_field2_subsubfield0,75,93,95,64,84,70,98,22,50,35,58,96,6,24,65,60,58,56,91,22
category0,field0,,,0.0738547,0.129008,0.532688,0.62473,0.157868,0.345313,0.825667,0.854146,0.950249,0.864639,0.0489509,0.377769,0.547385,0.751581,0.672986,0.362963,0.497037,0.986266,0.529438,0.167172
category1,cat1_field0,cat1_field0_subfield0,cat1_field0_subsubfield0,1K9GF1,XM4LNQ,EMIIP8,1GBGA3,OPVQSP,OL2BBL,ITFEQ5,W4MFYF,A82GNZ,KWICOK,YW4GX0,F247SQ,FMJ3G6,PZMS4O,BZZSDF,SF7IWE,Q022SX,X61Z2O,0987HF,8EOROE
category1,cat1_field0,cat1_field0_subfield0,cat1_field0_subsubfield1,0.883122,0.735944,0.484757,0.777581,0.118995,0.993908,0.126963,0.27787,0.40676,0.549251,0.117123,0.642893,0.574413,0.766901,0.786028,0.799713,0.790778,0.0635174,0.789845,0.147905
category1,cat1_field0,cat1_field0_subfield1,,4N2E3R,TF05UI,9WFPA1,9QVQL6,C3BV04,5QCSJE,GFCJLS,JUVS36,G5N5LA,GO6YKH,IKUPIT,RTK3AI,ED8GHE,D5D9TN,9Z2KX9,URETUA,BUE51V,5HNHMM,B9OSDU,OOYRZ8
category1,field0,,,0.395438,0.674213,0.474376,0.0570964,0.342933,0.155704,0.645117,0.97768,0.823857,0.253858,0.0146478,0.584202,0.117734,0.776566,0.533778,0.655678,0.597158,0.853809,0.562852,0.92026


In [12]:
# Keep only the columns whose 'category' level is 'category1'. In the output
# below the 'category' level itself is no longer part of the header.
pt.cross_section(df, category='category1').head()

field,cat1_field0,cat1_field0,cat1_field0,field0
subfield,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
6OG73D,1K9GF1,0.883122,4N2E3R,0.395438
QGGYMX,XM4LNQ,0.735944,TF05UI,0.674213
BLTNC6,EMIIP8,0.484757,9WFPA1,0.474376
0GSHWF,1GBGA3,0.777581,9QVQL6,0.0570964
PZZ7OV,OPVQSP,0.118995,C3BV04,0.342933


In [13]:
# Same selection, but with drop_level=False the 'category' level stays in the
# header of the result.
pt.cross_section(df, category='category1', drop_level=False).head()

category,category1,category1,category1,category1
field,cat1_field0,cat1_field0,cat1_field0,field0
subfield,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
6OG73D,1K9GF1,0.883122,4N2E3R,0.395438
QGGYMX,XM4LNQ,0.735944,TF05UI,0.674213
BLTNC6,EMIIP8,0.484757,9WFPA1,0.474376
0GSHWF,1GBGA3,0.777581,9QVQL6,0.0570964
PZZ7OV,OPVQSP,0.118995,C3BV04,0.342933


In [14]:
# Select on a deeper level: every column whose 'field' level is 'field0',
# which matches one column in each category. Here it is the 'field' level
# that is missing from the resulting header.
pt.cross_section(df, field='field0').head()

category,category0,category1
subfield,NaN,NaN
subsubfield,NaN,NaN
identifier,Unnamed: 1_level_3,Unnamed: 2_level_3
6OG73D,0.073855,0.395438
QGGYMX,0.129008,0.674213
BLTNC6,0.532688,0.474376
0GSHWF,0.62473,0.057096
PZZ7OV,0.157868,0.342933


In [15]:
# Combine selectors: a single value for 'category' and a list of values for
# 'field'. In the output below both levels are still present in the header.
pt.cross_section(df, category='category1', field=['field0', 'cat1_field0']).head()

category,category1,category1,category1,category1
field,cat1_field0,cat1_field0,cat1_field0,field0
subfield,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
6OG73D,1K9GF1,0.883122,4N2E3R,0.395438
QGGYMX,XM4LNQ,0.735944,TF05UI,0.674213
BLTNC6,EMIIP8,0.484757,9WFPA1,0.474376
0GSHWF,1GBGA3,0.777581,9QVQL6,0.0570964
PZZ7OV,OPVQSP,0.118995,C3BV04,0.342933


In [16]:
# The same selection applied to the transposed frame, where the levels are on
# the row index; the result below contains the matching rows.
pt.cross_section(df.T, category='category1', field=['field0', 'cat1_field0']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,identifier,6OG73D,QGGYMX,BLTNC6,0GSHWF,PZZ7OV,346U8C,XMKLHB,EG0POW,FODRRN,DD4EW6,0QROTA,XPJF2B,07DGSV,04FTT3,EQ7Q8Q,PIN5YZ,S1MHGQ,MHDR5H,KZV5OF,A0LP67
category,field,subfield,subsubfield,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
category1,cat1_field0,cat1_field0_subfield0,cat1_field0_subsubfield0,1K9GF1,XM4LNQ,EMIIP8,1GBGA3,OPVQSP,OL2BBL,ITFEQ5,W4MFYF,A82GNZ,KWICOK,YW4GX0,F247SQ,FMJ3G6,PZMS4O,BZZSDF,SF7IWE,Q022SX,X61Z2O,0987HF,8EOROE
category1,cat1_field0,cat1_field0_subfield0,cat1_field0_subsubfield1,0.883122,0.735944,0.484757,0.777581,0.118995,0.993908,0.126963,0.27787,0.40676,0.549251,0.117123,0.642893,0.574413,0.766901,0.786028,0.799713,0.790778,0.0635174,0.789845,0.147905
category1,cat1_field0,cat1_field0_subfield1,,4N2E3R,TF05UI,9WFPA1,9QVQL6,C3BV04,5QCSJE,GFCJLS,JUVS36,G5N5LA,GO6YKH,IKUPIT,RTK3AI,ED8GHE,D5D9TN,9Z2KX9,URETUA,BUE51V,5HNHMM,B9OSDU,OOYRZ8
category1,field0,,,0.395438,0.674213,0.474376,0.0570964,0.342933,0.155704,0.645117,0.97768,0.823857,0.253858,0.0146478,0.584202,0.117734,0.776566,0.533778,0.655678,0.597158,0.853809,0.562852,0.92026


In [17]:
# Collapse the MultiIndexed columns back into single dot-joined names; the
# header below matches the one produced with flat=True earlier.
pt.flatten(df).head()

Unnamed: 0_level_0,category0.cat0_field0,category0.cat0_field1,category0.cat0_field2.cat0_field2_subfield0,category0.cat0_field2.cat0_field2_subfield1,category0.cat0_field2.cat0_field2_subfield2.cat0_field2_subsubfield0,category0.field0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield1,category1.cat1_field0.cat1_field0_subfield1,category1.field0
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6OG73D,0.829836,1,08Q3YH,0.31514,75,0.073855,1K9GF1,0.883122,4N2E3R,0.395438
QGGYMX,0.797732,22,5CSUF5,0.23611,93,0.129008,XM4LNQ,0.735944,TF05UI,0.674213
BLTNC6,0.650676,84,IKEEL1,0.010769,95,0.532688,EMIIP8,0.484757,9WFPA1,0.474376
0GSHWF,0.860281,8,7F6IYZ,0.746575,64,0.62473,1GBGA3,0.777581,9QVQL6,0.057096
PZZ7OV,0.167129,39,49SL8K,0.273909,84,0.157868,OPVQSP,0.118995,C3BV04,0.342933


In [18]:
# flatten applied to the transposed frame: the dot-joined names now appear as
# row labels instead of column names.
pt.flatten(df.T).head()

identifier,6OG73D,QGGYMX,BLTNC6,0GSHWF,PZZ7OV,346U8C,XMKLHB,EG0POW,FODRRN,DD4EW6,0QROTA,XPJF2B,07DGSV,04FTT3,EQ7Q8Q,PIN5YZ,S1MHGQ,MHDR5H,KZV5OF,A0LP67
category0.cat0_field0,0.829836,0.797732,0.650676,0.860281,0.167129,0.402352,0.477864,0.888406,0.365331,0.509452,0.499796,0.864509,0.60243,0.277194,0.258307,0.0883084,0.328404,0.0186003,0.611418,0.420673
category0.cat0_field1,1,22,84,8,39,65,75,74,88,65,88,93,44,70,28,37,61,67,97,82
category0.cat0_field2.cat0_field2_subfield0,08Q3YH,5CSUF5,IKEEL1,7F6IYZ,49SL8K,9SG5X4,0Y8ZT8,QOEB9L,14BI9O,RD840T,FKXLZD,I1JR7E,5SRY0Y,33D41T,20BFTC,ZYGUCA,2PYA9W,Z00AA6,CJYZ4K,OZKH1J
category0.cat0_field2.cat0_field2_subfield1,0.31514,0.23611,0.0107689,0.746575,0.273909,0.644465,0.833957,0.115406,0.206583,0.33797,0.165385,0.323096,0.706863,0.593175,0.976734,0.0640289,0.598735,0.95734,0.613344,0.585091
category0.cat0_field2.cat0_field2_subfield2.cat0_field2_subsubfield0,75,93,95,64,84,70,98,22,50,35,58,96,6,24,65,60,58,56,91,22
