In [2]:
import pandas as pd
import numpy as np

## Hierarchical Indexing

### Reordering and Sorting levels

In [3]:
# Build a 9-element Series with a two-level (hierarchical) index:
# letters on the outer level, integers on the inner level.
# The values come from a seeded generator so that Restart & Run All reproduces
# them; the previous unseeded np.random.randn call gave new numbers on every run,
# so any saved output must be refreshed by re-running the notebook.
data = pd.Series(np.random.default_rng(42).standard_normal(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])

In [4]:
# Display the Series; the outer index label is printed once per group of inner labels.
data

a  1   -1.261009
   2   -0.034555
   3   -0.882336
b  1    0.187133
   3    0.473821
c  1    0.619018
   2   -0.332330
d  2    0.183084
   3   -0.226375
dtype: float64

In [5]:
# The index is a MultiIndex made of (outer, inner) label tuples.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [6]:
# Partial indexing: select every entry whose outer label is 'b'; the result is indexed by the inner level.
data['b']

1    0.187133
3    0.473821
dtype: float64

In [7]:
# Select on the inner level: all outer labels, inner label 2 (outer labels without a 2 are absent).
data.loc[:,2]

a   -0.034555
c   -0.332330
d    0.183084
dtype: float64

In [8]:
# Pivot the inner index level into columns; (outer, inner) pairs that do not exist become NaN.
data.unstack()

Unnamed: 0,1,2,3
a,-1.261009,-0.034555,-0.882336
b,0.187133,,0.473821
c,0.619018,-0.33233,
d,,0.183084,-0.226375


In [9]:
# stack() reverses unstack(): columns go back to an inner index level; in the output shown the NaN cells are gone.
data.unstack().stack()

a  1   -1.261009
   2   -0.034555
   3   -0.882336
b  1    0.187133
   3    0.473821
c  1    0.619018
   2   -0.332330
d  2    0.183084
   3   -0.226375
dtype: float64

In [11]:
# Frame with hierarchical labels on both axes: two row levels and two column levels.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [14]:
# The row labels form a two-level MultiIndex; df.columns can be inspected the same way.
df.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [17]:
# Name the levels of the row index and of the column index.
# NOTE: these assignments modify the existing df in place, so every later cell sees the named levels.
df.index.names = ['key1', 'key2']
df.columns.names = ['state', 'color']
df

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [23]:
# Partial column indexing: picking the outer label 'Ohio' leaves the inner 'color' level as the columns.
df['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [24]:
# Exchange the two row levels by name; the rows stay in their original order, only the label order changes.
df.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [28]:
# Sort the rows by index level 1 (key2); within equal key2 values the output shows key1 in ascending order.
df.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [32]:
# After the swap, level 1 is key1, so the rows are ordered by key1 first (then key2).
# NOTE(review): this gives the same row order as the plain swaplevel above;
# sort_index(level=0) would order by the new outer level (key2) instead — confirm which was intended.
df.swaplevel(0,1).sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


### Summary Statistics by Level

In [33]:
# Aggregate by one index level: add up the rows that share the same 'key2' label.
# DataFrame.sum(level=...) was deprecated in pandas 1.3 and removed in 2.0;
# grouping on the level is the supported equivalent and yields the same table.
df.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


### Indexing with a DataFrame’s columns

In [34]:
# Flat frame whose 'c' and 'd' columns will serve as index candidates below.
df = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [0, 1, 2, 0, 1, 2, 3],
})
df

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [39]:
# Promote columns 'c' and 'd' to a two-level row index; they no longer appear among the columns.
df2 = df.set_index(keys=['c', 'd'])
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [40]:
# reset_index does the opposite of set_index: the hierarchical index levels are moved back into columns.
df2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [38]:
# drop=False keeps 'c' and 'd' as ordinary columns in addition to using them as the index.
df.set_index(['c','d'],drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


## Combining and Merging Datasets

pandas.merge connects rows in DataFrames based on one or more keys. This will be familiar to users of SQL or 
other relational databases, as it implements database join operations.

|Operation|Description|
|---|---|
||
||
|| 
|| 
|| 
||
||
||

### many to one
- df1 has multiple rows labeled a and b, whereas df2 has only one row for each value in the key column

In [42]:
# Left-hand table for the many-to-one merge: the 'key' values repeat.
df1 = pd.DataFrame({
    'key': list('bbacaab'),
    'data1': range(7),
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [44]:
# Right-hand table for the many-to-one merge: each 'key' value occurs exactly once.
df2 = pd.DataFrame({
    'key': list('abd'),
    'data2': range(3),
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [46]:
# Without an `on` argument merge joins on the overlapping column names; here the key is named explicitly.
# Only keys present in both frames survive ('c' and 'd' are absent from the result).
pd.merge(df1, df2,on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [47]:
# Same data as df1/df2, but the key column is named differently in each frame
# ('key3' vs 'key4'), so the join keys must be given separately for each side.
df3 = pd.DataFrame({
    'key3': list('bbacaab'),
    'data1': range(7),
})
df4 = pd.DataFrame({
    'key4': list('abd'),
    'data2': range(3),
})

In [48]:
# The key columns have different names, so each side's key is given explicitly; both key columns are kept in the result.
pd.merge(df3, df4, left_on='key3', right_on='key4')

Unnamed: 0,key3,data1,key4,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


我们可以看到合并后'c','d'消失了，事实上我们可以指定合并的方式  
How Arguments  

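|Arguments|Description|
|---|---|
|'inner'| Default. Use only the key combinations observed in both tables|
|'outer'| Use all key combinations observed in either table|
|'left'| Use all key combinations found in the left table|
|'right'| Use all key combinations found in the right table|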

In [52]:
# how='outer' keeps keys from both sides: 'c' (only in df3) and 'd' (only in df4) appear with NaN on the missing side.
pd.merge(df3, df4, left_on='key3', right_on='key4',how='outer')

Unnamed: 0,key3,data1,key4,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [None]:
# NOTE(review): leftover scratch cell — it only evaluates an empty string and was never executed; safe to delete.
''

### many to many

Many-to-many joins form the Cartesian product of the rows. 笛卡尔积，这样好理解多了

In [54]:
# Left-hand table for the many-to-many merge: 'a' and 'b' each occur more than once.
df1 = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [55]:
# Right-hand table for the many-to-many merge: 'a' and 'b' are duplicated here as well.
df2 = pd.DataFrame({
    'key': list('ababd'),
    'data2': range(5),
})
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [56]:
# Left join with duplicated keys on both sides: every df1 row is paired with each df2 row sharing its key;
# 'c' has no match in df2, so its data2 is NaN.
pd.merge(df1,df2,on='key',how='left')

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [60]:
# Many-to-many variant with differently named key columns: 'key3' and 'key4'
# both contain repeated 'a' and 'b' values.
df3 = pd.DataFrame({
    'key3': list('bbacaab'),
    'data1': range(7),
})
df4 = pd.DataFrame({
    'key4': list('ababd'),
    'data2': range(5),
})

In [68]:
# Print how often each key occurs on the left side, then on the right side.
print(df3['key3'].value_counts(), df4['key4'].value_counts(), sep='\n')

a    3
b    3
c    1
Name: key3, dtype: int64
a    2
b    2
d    1
Name: key4, dtype: int64


从上述代码不难看出，如果我们选择how='left'的话，那么合并的数据集中将有3*2个a

In [59]:
# Left join on differently named keys: each df3 row is paired with every df4 row whose key4 equals its key3;
# 'c' has no match, so key4 and data2 are NaN for that row.
pd.merge(df3,df4,left_on='key3',right_on='key4',how='left')

Unnamed: 0,key3,data1,key4,data2
0,b,0,b,1.0
1,b,0,b,3.0
2,b,1,b,1.0
3,b,1,b,3.0
4,a,2,a,0.0
5,a,2,a,2.0
6,c,3,,
7,a,4,a,0.0
8,a,4,a,2.0
9,a,5,a,0.0
