In [1]:
import pandas as pd
from numpy import nan
import string
import random
import pandas_tools as pt

In [2]:
# Set ourselves up to make a collection of "records",
# each of which contains data from a single observation in a nested dict format.
def rand_str(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of ``size`` characters drawn from ``chars``.

    By default the alphabet is the uppercase ASCII letters plus the digits.
    Characters are picked one at a time with ``random.choice``.
    """
    picked = []
    for _ in range(size):
        picked.append(random.choice(chars))
    return ''.join(picked)

# Uniform random float in [0.0, 1.0); a plain alias for random.random.
rand_float = random.random


def rand_int():
    """Return a random integer between 0 and 100, both ends included."""
    return random.randint(0, 100)

In [3]:
def make_nasty_record(identifier):
    """Build one record: a dict with deliberately uneven nesting.

    The result has three top-level keys: ``'identifier'`` (the value passed
    in), ``'category0'`` and ``'category1'``. Leaves sit at different depths
    and hold random strings, floats and ints. Both categories contain a key
    named ``'field0'``.
    """
    # The random helpers are invoked in the same order as the keys are laid
    # out, so the sequence of RNG draws per record is fixed.
    category0 = {
        'field0': rand_float(),
        'cat0_field0': rand_float(),
        'cat0_field1': rand_int(),
    }
    category0['cat0_field2'] = {
        'cat0_field2_subfield0': rand_str(),
        'cat0_field2_subfield1': rand_float(),
        'cat0_field2_subfield2': {'cat0_field2_subsubfield0': rand_int()},
    }

    category1 = {'field0': rand_float()}
    cat1_deepest = {
        'cat1_field0_subsubfield0': rand_str(),
        'cat1_field0_subsubfield1': rand_float(),
    }
    category1['cat1_field0'] = {
        'cat1_field0_subfield0': cat1_deepest,
        'cat1_field0_subfield1': rand_str(),
    }

    return {
        'identifier': identifier,
        'category0': category0,
        'category1': category1,
    }

In [4]:
# Generate a list of records
# Seed the RNG first so that re-running this cell (or Restart & Run All)
# regenerates exactly the same records.
SEED = 0
random.seed(SEED)

n_records = 20
# Passed as `levels=` to pt.df_from_records in later cells.
levels = ['category', 'field', 'subfield', 'subsubfield']
# Each record gets a freshly drawn random identifier.
records = [make_nasty_record(rand_str()) for _ in range(n_records)]

In [5]:
# The naive approach doesn't do what we want: each top-level key becomes a
# single column whose cells still hold the raw nested dicts.
df = (
    pd.DataFrame
    .from_records(records)
    .set_index('identifier')
)
df.head()
# could use some more structure...

Unnamed: 0_level_0,category0,category1
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1
AY60M0,"{'field0': 0.7159498552077563, 'cat0_field0': ...","{'field0': 0.9383307483509256, 'cat1_field0': ..."
TU1S2F,"{'field0': 0.4123825135514271, 'cat0_field0': ...","{'field0': 0.9707514663021392, 'cat1_field0': ..."
NLEYKY,"{'field0': 0.5451587630399891, 'cat0_field0': ...","{'field0': 0.721439752200831, 'cat1_field0': {..."
LY7Z66,"{'field0': 0.03318838961716197, 'cat0_field0':...","{'field0': 0.9960775735273647, 'cat1_field0': ..."
Y1TSEJ,"{'field0': 0.015210012795852101, 'cat0_field0'...","{'field0': 0.6372873705461016, 'cat1_field0': ..."


In [6]:
# Create flat DataFrame
# With flat=True the resulting frame (displayed by the next cell) has one
# column per leaf value, labelled with its nested keys joined by '.'.
df = pt.df_from_records(records, index='identifier', levels=levels, flat=True)

In [7]:
# Better, but still not very pretty: every leaf is its own column now, yet
# the hierarchy survives only inside the long dotted column names.
df.head()

Unnamed: 0_level_0,category0.cat0_field0,category0.cat0_field1,category0.cat0_field2.cat0_field2_subfield0,category0.cat0_field2.cat0_field2_subfield1,category0.cat0_field2.cat0_field2_subfield2.cat0_field2_subsubfield0,category0.field0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield1,category1.cat1_field0.cat1_field0_subfield1,category1.field0
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AY60M0,0.385168,35,PLHXCX,0.255968,27,0.71595,OG8PPK,0.706275,479X8M,0.938331
TU1S2F,0.447821,65,IOP8CB,0.09692,47,0.412383,M3TYAP,0.29207,E12N75,0.970751
NLEYKY,0.32507,17,A2UISQ,0.132807,60,0.545159,M8SJP2,0.297235,OVVLT2,0.72144
LY7Z66,0.357351,42,KRT9BP,0.38205,40,0.033188,2CUUPM,0.714844,RWK4PP,0.996078
Y1TSEJ,0.417987,13,NODNOH,0.085444,70,0.01521,NTXVMI,0.856113,12L2O0,0.637287


In [8]:
# Create MultiIndexed DataFrame
# With flat=False the entries of `levels` appear as the names of the column
# levels (see the output of the next cell).
df = pt.df_from_records(records, index='identifier', levels=levels, flat=False)

In [9]:
# Uneven nesting results in NaN levels: a leaf that sits above the deepest
# level is labelled NaN for every level it does not use.
df.head()

category,category0,category0,category0,category0,category0,category0,category1,category1,category1,category1
field,cat0_field0,cat0_field1,cat0_field2,cat0_field2,cat0_field2,field0,cat1_field0,cat1_field0,cat1_field0,field0
subfield,NaN,NaN,cat0_field2_subfield0,cat0_field2_subfield1,cat0_field2_subfield2,NaN,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,NaN,NaN,NaN,NaN,cat0_field2_subsubfield0,NaN,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4
AY60M0,0.385168,35,PLHXCX,0.255968,27,0.71595,OG8PPK,0.706275,479X8M,0.938331
TU1S2F,0.447821,65,IOP8CB,0.09692,47,0.412383,M3TYAP,0.29207,E12N75,0.970751
NLEYKY,0.32507,17,A2UISQ,0.132807,60,0.545159,M8SJP2,0.297235,OVVLT2,0.72144
LY7Z66,0.357351,42,KRT9BP,0.38205,40,0.033188,2CUUPM,0.714844,RWK4PP,0.996078
Y1TSEJ,0.417987,13,NODNOH,0.085444,70,0.01521,NTXVMI,0.856113,12L2O0,0.637287


In [10]:
# Transposed view: the column levels become the row index (one row per leaf)
# and the record identifiers become the columns.
df.T.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,identifier,AY60M0,TU1S2F,NLEYKY,LY7Z66,Y1TSEJ,L54MFG,HFNEXJ,440EK2,2NMNJ6,TKXLH6,BPYUZL,NBXVTA,FE3Y1N,3TE8MU,UV8D63,HWXP1D,PXB6U5,ND1ZE2,R73KAR,STF76H
category,field,subfield,subsubfield,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
category0,cat0_field0,,,0.385168,0.447821,0.32507,0.357351,0.417987,0.562536,0.428113,0.0957001,0.287681,0.59059,0.405969,0.663312,0.360263,0.736502,0.950897,0.947354,0.906212,0.0357385,0.186766,0.488405
category0,cat0_field1,,,35,65,17,42,13,12,42,15,71,61,51,0,11,28,88,9,89,37,95,56
category0,cat0_field2,cat0_field2_subfield0,,PLHXCX,IOP8CB,A2UISQ,KRT9BP,NODNOH,9XJ41G,8VKT2Z,TQGY8S,HV9UAY,ROO2GK,JPPKLZ,5XF5VV,0VR8FL,8ZH2ZL,FSSHBX,VRJRZ3,3SI2FZ,7IO3NZ,H6Q9F3,VXLO88
category0,cat0_field2,cat0_field2_subfield1,,0.255968,0.0969197,0.132807,0.38205,0.0854444,0.647378,0.176737,0.626153,0.87496,0.678579,0.369423,0.106248,0.254578,0.0495011,0.697781,0.292897,0.994572,0.672165,0.151839,0.790194
category0,cat0_field2,cat0_field2_subfield2,cat0_field2_subsubfield0,27,47,60,40,70,26,3,22,92,82,5,65,60,49,87,86,11,94,53,91


In [11]:
# Direct access to a single column; not that useful.
# The key used here gives one label per column level, with nan standing in
# for the levels this leaf does not use.
print(df['category0', 'cat0_field1', nan, nan])

identifier
AY60M0    35
TU1S2F    65
NLEYKY    17
LY7Z66    42
Y1TSEJ    13
L54MFG    12
HFNEXJ    42
440EK2    15
2NMNJ6    71
TKXLH6    61
BPYUZL    51
NBXVTA     0
FE3Y1N    11
3TE8MU    28
UV8D63    88
HWXP1D     9
PXB6U5    89
ND1ZE2    37
R73KAR    95
STF76H    56
Name: (category0, cat0_field1, nan, nan), dtype: int64


In [12]:
# Bare comma-separated labels and an explicit tuple are the same key: Python
# passes an identical tuple to __getitem__ in both spellings.
# Compare the contents with .equals() rather than `is`: object identity only
# holds when pandas hands back a cached Series, which is an implementation
# detail and not something to rely on across pandas versions.
print(
    df['category0', 'cat0_field1', nan, nan].equals(
        df[('category0', 'cat0_field1', nan, nan)]
    )
)

True


In [13]:
# cross_section, a wrapper around pandas.DataFrame.xs, is a very useful function.
# Selecting category='category1' keeps only that category's columns; in the
# output below the `category` level itself no longer appears.
pt.cross_section(df, category='category1').head()

field,cat1_field0,cat1_field0,cat1_field0,field0
subfield,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
AY60M0,OG8PPK,0.706275,479X8M,0.938331
TU1S2F,M3TYAP,0.29207,E12N75,0.970751
NLEYKY,M8SJP2,0.297235,OVVLT2,0.72144
LY7Z66,2CUUPM,0.714844,RWK4PP,0.996078
Y1TSEJ,NTXVMI,0.856113,12L2O0,0.637287


In [14]:
# Same selection with drop_level=False: the output below keeps the
# `category` level in the column index.
pt.cross_section(df, category='category1', drop_level=False).head()

category,category1,category1,category1,category1
field,cat1_field0,cat1_field0,cat1_field0,field0
subfield,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
AY60M0,OG8PPK,0.706275,479X8M,0.938331
TU1S2F,M3TYAP,0.29207,E12N75,0.970751
NLEYKY,M8SJP2,0.297235,OVVLT2,0.72144
LY7Z66,2CUUPM,0.714844,RWK4PP,0.996078
Y1TSEJ,NTXVMI,0.856113,12L2O0,0.637287


In [15]:
# Constrain two levels at once; `field` is given a list of labels here.
# NOTE(review): the output below keeps the `category` level even though
# drop_level is not passed -- confirm this is the intended behaviour when a
# list of labels is supplied.
pt.cross_section(df, category='category1', field=['field0', 'cat1_field0']).head()

category,category1,category1,category1,category1
field,cat1_field0,cat1_field0,cat1_field0,field0
subfield,cat1_field0_subfield0,cat1_field0_subfield0,cat1_field0_subfield1,NaN
subsubfield,cat1_field0_subsubfield0,cat1_field0_subsubfield1,NaN,NaN
identifier,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4
AY60M0,OG8PPK,0.706275,479X8M,0.938331
TU1S2F,M3TYAP,0.29207,E12N75,0.970751
NLEYKY,M8SJP2,0.297235,OVVLT2,0.72144
LY7Z66,2CUUPM,0.714844,RWK4PP,0.996078
Y1TSEJ,NTXVMI,0.856113,12L2O0,0.637287


In [16]:
# Thanks to the handle_transpose decorator, operations work as expected on
# either df or df.T: here the same selection as in the previous cell is
# applied to the transposed frame, and the output below is filtered by rows.
pt.cross_section(df.T, category='category1', field=['field0', 'cat1_field0']).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,identifier,AY60M0,TU1S2F,NLEYKY,LY7Z66,Y1TSEJ,L54MFG,HFNEXJ,440EK2,2NMNJ6,TKXLH6,BPYUZL,NBXVTA,FE3Y1N,3TE8MU,UV8D63,HWXP1D,PXB6U5,ND1ZE2,R73KAR,STF76H
category,field,subfield,subsubfield,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
category1,cat1_field0,cat1_field0_subfield0,cat1_field0_subsubfield0,OG8PPK,M3TYAP,M8SJP2,2CUUPM,NTXVMI,1UVA8D,2GG9MT,V73MEA,C0QPKU,ECE49W,8HNA1G,5BKER1,HVTG04,SINFQO,OO6A3F,A2US05,046DDL,1ZKZI9,6Z1G2G,BWECPD
category1,cat1_field0,cat1_field0_subfield0,cat1_field0_subsubfield1,0.706275,0.29207,0.297235,0.714844,0.856113,0.67196,0.50569,0.112308,0.696624,0.205364,0.698738,0.638854,0.272181,0.698339,0.385319,0.669463,0.19501,0.0159193,0.311029,0.193853
category1,cat1_field0,cat1_field0_subfield1,,479X8M,E12N75,OVVLT2,RWK4PP,12L2O0,ZX50W0,L3LOLG,MWJJDJ,GXUNDR,AZQYP0,37V27J,HEBPFN,OCQ6QV,3ZCPL6,MSBPY3,O8OUV1,V4PA7L,HOLDEI,63WU07,KHYVLI
category1,field0,,,0.938331,0.970751,0.72144,0.996078,0.637287,0.805208,0.80846,0.303609,0.186492,0.112042,0.0934625,0.0964868,0.932292,0.405459,0.492861,0.345558,0.663512,0.0260501,0.191304,0.879931


In [17]:
# flatten turns the MultiIndex columns into single dot-joined labels; the
# output below has the same column labels as the flat=True frame built earlier.
pt.flatten(df).head()

Unnamed: 0_level_0,category0.cat0_field0,category0.cat0_field1,category0.cat0_field2.cat0_field2_subfield0,category0.cat0_field2.cat0_field2_subfield1,category0.cat0_field2.cat0_field2_subfield2.cat0_field2_subsubfield0,category0.field0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield0,category1.cat1_field0.cat1_field0_subfield0.cat1_field0_subsubfield1,category1.cat1_field0.cat1_field0_subfield1,category1.field0
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AY60M0,0.385168,35,PLHXCX,0.255968,27,0.71595,OG8PPK,0.706275,479X8M,0.938331
TU1S2F,0.447821,65,IOP8CB,0.09692,47,0.412383,M3TYAP,0.29207,E12N75,0.970751
NLEYKY,0.32507,17,A2UISQ,0.132807,60,0.545159,M8SJP2,0.297235,OVVLT2,0.72144
LY7Z66,0.357351,42,KRT9BP,0.38205,40,0.033188,2CUUPM,0.714844,RWK4PP,0.996078
Y1TSEJ,0.417987,13,NODNOH,0.085444,70,0.01521,NTXVMI,0.856113,12L2O0,0.637287
