In [1]:
import pandas as pd
import numpy as np
import datetime

## 合并客户表和订单表

In [2]:
customers= {'CustomerID':[10, 11], 
            'Name':['Mike', 'Marcia'],
           'Address':['Address for Mike', 'Address for Marcia']}
customers = pd.DataFrame(customers)
customers

Unnamed: 0,Address,CustomerID,Name
0,Address for Mike,10,Mike
1,Address for Marcia,11,Marcia


In [3]:
orders = {'CustomerID':[10,11,10],
         'OrderDate':[datetime.datetime(2016, 12,1),
                     datetime.datetime(2016, 12, 1),
                     datetime.datetime(2016,12,2)]}
orders = pd.DataFrame(orders)
orders

Unnamed: 0,CustomerID,OrderDate
0,10,2016-12-01
1,11,2016-12-01
2,10,2016-12-02


In [4]:
customers.merge(orders)

Unnamed: 0,Address,CustomerID,Name,OrderDate
0,Address for Mike,10,Mike,2016-12-01
1,Address for Mike,10,Mike,2016-12-02
2,Address for Marcia,11,Marcia,2016-12-01


In [5]:
customers.map(orders)    # 看来map试试用在原地替换，而不是关联的！ 

AttributeError: 'DataFrame' object has no attribute 'map'

In [6]:
customers.merge?

In [7]:
left_data = {'key1':['a', 'b', 'c'],
            'key2':['x', 'y', 'z'],
             'lval1':[0, 1, 2]
            }
right_data = {'key1':['a', 'b', 'c'],
             'key2':['x', 'a', 'z'],
              'rval1': [6, 7, 8]
             }
left = pd.DataFrame(left_data, index=[0, 1, 2])
right = pd.DataFrame(right_data, index=[1, 2, 3])
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [8]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [9]:
left.merge(right)

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


> 有关做索引的话，如果两个表的索引的名称是相同的，可以直接使用**`on`**参数进行制定;而如果两边的索引的名称是不同的时候，需要使用**`left_on`**来指定左边的索引，用**`right_on`**来指定右边的索引。通过这种方式进行**merge**操作。

In [11]:
left.merge(right, on='key1')    # 对其他两边都有的列名会自动进行处理。

Unnamed: 0,key1,key2_x,lval1,key2_y,rval1
0,a,x,0,x,6
1,b,y,1,a,7
2,c,z,2,z,8


In [12]:
left.merge(right, on=['key1', 'key2'])    # 和缺省的情况下是相同的

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [13]:
# 而且可以通过index来进行merge
pd.merge(left, right, left_index=True, right_index=True)

Unnamed: 0,key1_x,key2_x,lval1,key1_y,key2_y,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


## merge进阶(实际上和数据库表的操作是类似的)
* inner:两个DataFrame的key交集；
* outer:两个DataFrame的key并集；
* left:只使用左边的DataFrame的key；
* right:只使用右边的DataFrame的key；

In [14]:
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [15]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [16]:
left.merge(right, how='outer')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6.0
1,b,y,1.0,
2,c,z,2.0,8.0
3,b,a,,7.0


In [17]:
left.merge(right, how='left')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6.0
1,b,y,1,
2,c,z,2,8.0


In [18]:
left.merge(right, how='right')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6
1,c,z,2.0,8
2,b,a,,7


# join
**使用 *`index label`* **

In [19]:
left.join?

In [20]:
left.join(right, lsuffix='_left', rsuffix='_right')   # 在这个地方使用的是缺省的how，也就是left，如果有相同的列，需要给出后缀

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0,,,
1,b,y,1,a,x,6.0
2,c,z,2,b,a,7.0


In [21]:
left.join(right,  lsuffix='_left', rsuffix='_right', how='inner')

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


In [22]:
left.join(right,  lsuffix='_left', rsuffix='_right', how='outer')

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0.0,,,
1,b,y,1.0,a,x,6.0
2,c,z,2.0,b,a,7.0
3,,,,c,z,8.0


# concat

In [33]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), columns=['a', 'b', 'c'])

In [34]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [35]:
df2

Unnamed: 0,a,b,c
0,9,10,11
1,12,13,14
2,15,16,17


In [36]:
pd.concat([df1, df2])

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
0,9,10,11
1,12,13,14
2,15,16,17


In [37]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3), columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3), columns=['a', 'c', 'd'])

In [38]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [39]:
df2

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [40]:
pd.concat([df1, df2])    # 会自动进行扩展

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [42]:
c = pd.concat([df1, df2], keys=['df1', 'df2'])    # 等于是添加了一层index
c

Unnamed: 0,Unnamed: 1,a,b,c,d
df1,0,0,1.0,2,
df1,1,3,4.0,5,
df1,2,6,7.0,8,
df2,0,9,,10,11.0
df2,1,12,,13,14.0
df2,2,15,,16,17.0


In [43]:
c.ix['df2']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


> 在是用`concat`的时候，也可以使用**`axis`**,通过设定值为1，进行水平合并，默认是0，垂直合并。

In [44]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,a,b,c,a.1,c.1,d
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17
