In [1]:
import pandas as pd

# Use merge to combine frames whose contents do not line up exactly.
# The PE ratios are stored as integers rather than strings so the column has
# a numeric dtype and supports arithmetic, sorting and aggregation.
df1 = pd.DataFrame(
    {'PE': [10, 20, 25, 30]},
    index=['a', 'b', 'c', 'd'],
)
df1

Unnamed: 0,PE
a,10
b,20
c,25
d,30


In [2]:
# Second frame, sharing the a-d row labels. The PB ratios are stored as
# floats rather than strings so the column has a numeric dtype.
df2 = pd.DataFrame(
    {'PB': [2.5, 3.2, 2.5, 2.0]},
    index=['a', 'b', 'c', 'd'],
)
df2

Unnamed: 0,PB
a,2.5
b,3.2
c,2.5
d,2.0


In [3]:
# One ROE series keyed by the same a-d labels as both frames; assigning it
# as a column aligns on those labels, so df1 and df2 receive identical values.
ROE = pd.Series({'a': 0.12, 'b': 0.06, 'c': 0.08, 'd': 0.02})
df1['ROE'] = ROE
df2['ROE'] = ROE

In [4]:
# Display df1, which now carries the ROE column next to PE.
df1

Unnamed: 0,PE,ROE
a,10,0.12
b,20,0.06
c,25,0.08
d,30,0.02


In [5]:
# Display df2, which now carries the ROE column next to PB.
df2

Unnamed: 0,PB,ROE
a,2.5,0.12
b,3.2,0.06
c,2.5,0.08
d,2.0,0.02


In [6]:
# Merge on the column both frames share (ROE). Rows are paired by equal key
# values, and the result gets a fresh 0..n-1 integer index: the original
# a-d row labels are not carried over.
df1.merge(df2, on='ROE')

Unnamed: 0,PE,ROE,PB
0,10,0.12,2.5
1,20,0.06,3.2
2,25,0.08,2.5
3,30,0.02,2.0


In [7]:
# Index-aligned join. ROE exists in both frames, so the left and right copies
# are kept side by side and told apart by the given suffixes.
df1.join(
    df2,
    lsuffix='_l',
    rsuffix='_r',
)

Unnamed: 0,PE,ROE_l,PB,ROE_r
a,10,0.12,2.5,0.12
b,20,0.06,3.2,0.06
c,25,0.08,2.5,0.08
d,30,0.02,2.0,0.02


In [8]:
# Replace the ROE columns with two separate series. roe2 differs from roe1
# only at label 'a' (0.2 vs 0.12), so the frames no longer agree on ROE.
roe1 = pd.Series({'a': 0.12, 'b': 0.06, 'c': 0.08, 'd': 0.02})
roe2 = pd.Series({'a': 0.2, 'b': 0.06, 'c': 0.08, 'd': 0.02})
df1['ROE'] = roe1
df2['ROE'] = roe2

In [9]:
# Display df1 after its ROE column was reassigned from roe1.
df1

Unnamed: 0,PE,ROE
a,10,0.12
b,20,0.06
c,25,0.08
d,30,0.02


In [10]:
# Display df2 after its ROE column was reassigned from roe2
# (row 'a' is now 0.2).
df2

Unnamed: 0,PB,ROE
a,2.5,0.2
b,3.2,0.06
c,2.5,0.08
d,2.0,0.02


In [11]:
# Outer merge aligned on the row labels of both frames. The suffixes argument
# is optional; pandas falls back to '_x'/'_y' for overlapping column names
# when it is omitted.
df1.merge(
    df2,
    how='outer',
    left_index=True,
    right_index=True,
    suffixes=['_df1', '_df2'],
)

Unnamed: 0,PE,ROE_df1,PB,ROE_df2
a,10,0.12,2.5,0.2
b,20,0.06,3.2,0.06
c,25,0.08,2.5,0.08
d,30,0.02,2.0,0.02


In [12]:
# pd.merge also accepts left_on=/right_on= for key columns that are named
# differently in each frame (not demonstrated in this notebook).
import numpy as np  # NOTE(review): consider grouping this with the imports in the first cell

# Hierarchical (two-level) index: outer labels 'a'/'b', inner labels 1..3.
# A seeded generator yields the same random values on every
# Restart & Run All, which keeps the notebook reproducible.
rng = np.random.default_rng(42)
df = pd.Series(
    rng.standard_normal(5),
    index=[['a','a','b','b','b'],[1,2,1,2,3]]
)

In [13]:
# Display the Series; both index levels are shown in the left margin.
df

a  1   -0.698100
   2    0.439249
b  1    0.446036
   2   -0.438677
   3   -1.076864
dtype: float64

In [14]:
# Inspect the index object: a MultiIndex of (outer, inner) label tuples.
df.index

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2),
            ('b', 3)],
           )

In [15]:
# Partial indexing: selecting outer label 'b' returns its rows keyed by the
# inner level only.
df['b']

1    0.446036
2   -0.438677
3   -1.076864
dtype: float64

In [16]:
# Hierarchical indexes in reshaping: unstack() pivots the inner index level
# into columns, leaving a missing value where a combination does not exist
# (here: 'a' has no inner label 3).
df.unstack()  # rearrange the data into a 2-D table

Unnamed: 0,1,2,3
a,-0.6981,0.439249,
b,0.446036,-0.438677,-1.076864


In [18]:
# stack() folds the columns back into an inner index level, i.e. the reverse
# reshape of unstack(). In the recorded run this round-trips to the original
# five rows.
df.unstack().stack()

a  1   -0.698100
   2    0.439249
b  1    0.446036
   2   -0.438677
   3   -1.076864
dtype: float64

In [19]:
# Sum within each outer index label. The `level=` argument of Series.sum was
# deprecated in pandas 1.3 and removed in 2.0; grouping on the index level
# is the supported equivalent and yields the same result.
df.groupby(level=0).sum()

a   -0.258852
b   -1.069505
dtype: float64

In [20]:
# Group rows by the outer index level (level 0) and sum each group.
df.groupby(level=0).sum()

a   -0.258852
b   -1.069505
dtype: float64

In [21]:
# Sum within each inner index label. The `level=` argument of Series.sum was
# deprecated in pandas 1.3 and removed in 2.0; grouping on the index level
# is the supported equivalent and yields the same result.
df.groupby(level=1).sum()

1   -0.252065
2    0.000572
3   -1.076864
dtype: float64