In [2]:
import pandas as pd
import numpy as np

# I've returned

It's been a few weeks now that I've been looking at SQL and Rust while avoiding pandas and scikit-learn. My adventures have taught me a few things, but I must get back to the task at hand.

It might seem like I'm abandoning the project I called Queruntine. And that's because I am. Perhaps I should put a note in the README there that says so.

Anyhow, I've played with OOP, TDD and Python packaging. I've played with dynamic libraries and Python's ctypes module. Perhaps I'll come back to those some day.

Today I'm going to look at Data Wrangling with Python. Chapter 8 is called **Join, Combine and Reshape**.

## Hierarchical Indexing

In [3]:
# Nine random draws labelled by a two-level index: the first list supplies the
# outer (letter) labels, the second the inner (integer) labels.
# No seed is set in the cells shown here, so the values change on every run.
data = pd.Series(
    data=np.random.randn(9),
    index=[
        'a a a b b c c d d'.split(),
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)

In [4]:
data

a  1    1.859851
   2    0.082733
   3    1.414130
b  1    1.554929
   3    0.305905
c  1    0.096548
   2   -0.355811
d  2   -0.141533
   3   -0.224956
dtype: float64

In [5]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

We can select by index

In [6]:
# Selecting a single outer-level label returns that group's values, indexed by
# the remaining inner level only.
data['b']

1    1.554929
3    0.305905
dtype: float64

We can slice by index. Whoa nice.

In [7]:
# Label slices on the outer level include BOTH endpoints, so this keeps the
# 'b' and the 'c' groups.
data['b':'c']

b  1    1.554929
   3    0.305905
c  1    0.096548
   2   -0.355811
dtype: float64

We can select multiple subsets by index.

In [9]:
# A list of outer labels picks several groups at once; both index levels are
# kept in the result.
data[['b','d']]

b  1    1.554929
   3    0.305905
d  2   -0.141533
   3   -0.224956
dtype: float64

We can even use `loc`. Here we're saying:  

    give me everything in the top level and only the '2's in the second level.  

Note that this is not the 2nd item in the second level, it is the item labeled '2'.

In [4]:
data

a  1    1.859851
   2    0.082733
   3    1.414130
b  1    1.554929
   3    0.305905
c  1    0.096548
   2   -0.355811
d  2   -0.141533
   3   -0.224956
dtype: float64

In [10]:
# ':' keeps every outer label; 2 is matched against the inner-level LABELS
# (not positions), so groups without an inner label 2 drop out of the result.
data.loc[:,2]

a    0.082733
c   -0.355811
d   -0.141533
dtype: float64

In [11]:
# Pivot the inner index level into columns. (outer, inner) pairs that were
# absent from the series show up as missing values in the resulting frame.
data.unstack()

Unnamed: 0,1,2,3
a,1.859851,0.082733,1.41413
b,1.554929,,0.305905
c,0.096548,-0.355811,
d,,-0.141533,-0.224956


In [12]:
# Round trip: stack() moves the columns back into an inner index level. In the
# output recorded below this reproduces the original nine-row series.
data.unstack().stack()

a  1    1.859851
   2    0.082733
   3    1.414130
b  1    1.554929
   3    0.305905
c  1    0.096548
   2   -0.355811
d  2   -0.141533
   3   -0.224956
dtype: float64

## Either axis can have a hierarchical index

In [13]:
# Build a 4x3 frame of the integers 0..11 whose rows AND columns both carry a
# two-level index: rows are (letter, number) pairs, columns are
# (state, color) pairs.
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=['a a b b'.split(), [1, 2, 1, 2]],
    columns=['Ohio Ohio Colorado'.split(),
             'Green Red Green'.split()],
)

In [14]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


The levels can be named. The names show up when printed out. Nice.

In [16]:
# Name the two row-index levels. This mutates `frame` in place, so displays of
# `frame` from earlier cells no longer reflect its current state.
frame.index.names = ['key1', 'key2']

In [17]:
# Name the two column-index levels (also an in-place change to `frame`).
frame.columns.names = ['state', 'color']

In [18]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


Partial indexing

In [19]:
# Selecting a top-level column label returns the sub-frame beneath it; the
# remaining 'color' level becomes the column index.
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


`MultiIndex` can be created and reused.

In [21]:
# Level names and level values are kept as plain lists so they can be reused.
names = ['state', 'color']
colors = 'Green Red Green'.split()
states = 'Ohio Ohio Colorado'.split()

# Pair each state with its color position-by-position and label the two levels.
my_mix = pd.MultiIndex.from_arrays(arrays=[states, colors], names=names)

In [22]:
my_mix

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

## Reordering and sorting

In [23]:
# Swap the order of the two row-index levels. Only the labels are rearranged:
# the rows stay in their original order and `frame` itself is not modified
# (the result is just displayed).
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [24]:
# Sort rows by the second index level (position 1, i.e. 'key2'); the level
# order itself (key1, key2) is unchanged.
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [25]:
# Swap the levels by position so 'key2' becomes the outer level, then sort on
# that new outer level so equal key2 values sit together.
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


Summary statistics by level

In [26]:
# Sum the rows that share the same 'key2' label.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the index level and summing is the supported equivalent.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [27]:
# Sum the columns that share the same 'color' label, per row.
# DataFrame.sum(level=..., axis=1) was removed in pandas 2.0. Transposing lets
# the column level 'color' be grouped as an index level; transposing back
# keeps the rows keyed by (key1, key2) and the columns keyed by color.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## Indexing with a DataFrame's columns

In [28]:
# A plain (non-hierarchical) frame: two numeric columns plus 'c' and 'd',
# which the following cells move into the index.
# Note: this rebinds `frame`, replacing the hierarchical frame used above.
frame = pd.DataFrame(dict(
    a=range(7),
    b=range(7, 0, -1),
    c='one one one two two two two'.split(),
    d=[0, 1, 2, 0, 1, 2, 3],
))

In [29]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [30]:
# Promote columns 'c' and 'd' to a two-level row index. set_index returns a
# new frame, so `frame` keeps its original columns; by default the promoted
# columns are removed from the data columns of the result.
frame2 = frame.set_index(['c', 'd'])

In [31]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [32]:
# drop=False builds the same (c, d) index but also leaves 'c' and 'd' in place
# as ordinary columns.
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [33]:
# Inverse of set_index: the index levels become regular columns again (placed
# first, so the column order is c, d, a, b) and a default integer index is
# restored.
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
