In [1]:
import pandas as pd
import numpy as np

In [2]:
# Part 1: filter rows by combining a value condition with a row-label condition
# Each inner list holds the values of one column; transposing turns them into columns.
column_values = np.array([
    [4, 5, 6, 7],
    [10, 20, 30, 40],
    [100, 50, -30, -50],
])
df1 = pd.DataFrame(column_values.T, columns=['AAA', 'BBB', 'CCC'])
print(df1)

   AAA  BBB  CCC
0    4   10  100
1    5   20   50
2    6   30  -30
3    7   40  -50


In [3]:
# Keep the rows whose AAA value is at most 6 AND whose row label is one of 0, 2, 4
aaa_at_most_6 = df1['AAA'].le(6)
label_selected = df1.index.isin([0, 2, 4])
a = df1.loc[aaa_at_most_6 & label_selected]
print(a)

   AAA  BBB  CCC
0    4   10  100
2    6   30  -30


In [4]:
# Part 2: slice by label with loc, slice by position with iloc
# Same data as before, but the rows now carry string labels.
column_values = np.array([
    [4, 5, 6, 7],
    [10, 20, 30, 40],
    [100, 50, -30, -50],
])
df1 = pd.DataFrame(
    column_values.T,
    index=['foo', 'bar', 'boo', 'kar'],
    columns=['AAA', 'BBB', 'CCC'],
)
print(df1)

     AAA  BBB  CCC
foo    4   10  100
bar    5   20   50
boo    6   30  -30
kar    7   40  -50


In [5]:
# Part 2-1: label slicing is NOT Python-style slicing -- the end label is included
label_window = df1.loc['bar':'kar']
print(label_window)

     AAA  BBB  CCC
bar    5   20   50
boo    6   30  -30
kar    7   40  -50


In [6]:
# Part 2-2: positional slicing follows Python slicing -- the end position is excluded
position_window = df1.iloc[:3]
print(position_window)

     AAA  BBB  CCC
foo    4   10  100
bar    5   20   50
boo    6   30  -30


In [7]:
# Part 2-3: an integer index that does not start at 0, or does not increase one
# step at a time, makes label-based and position-based lookups easy to confuse
one_based_labels = range(1, 5)
df2 = df1.copy()
df2.index = one_based_labels
print(df2)

   AAA  BBB  CCC
1    4   10  100
2    5   20   50
3    6   30  -30
4    7   40  -50


In [8]:
# Note: no matter how the index labels are named, iloc still selects by position
# (rows at positions 1 and 2; all columns)
a = df2.iloc[1:3, :]
print(a)

   AAA  BBB  CCC
2    5   20   50
3    6   30  -30


In [9]:
# loc, in contrast, selects by label, so pay attention to the actual index labels
# (rows labelled 1 through 3, end label included; all columns)
b = df2.loc[1:3, :]
print(b)

   AAA  BBB  CCC
1    4   10  100
2    5   20   50
3    6   30  -30


In [10]:
# Part 2-4: use the inversion operator (~) to select everything the mask excludes
mask = (df1['AAA'] <= 6) & df1.index.isin(['foo', 'boo', 'kar'])
a = df1[~mask]
print(a)

     AAA  BBB  CCC
bar    5   20   50
kar    7   40  -50


In [11]:
# Part 3: creating new columns
# Part 3-1: build new columns dynamically with an element-wise mapping
df3 = pd.DataFrame(
    {
        'AAA': [1, 2, 1, 3],
        'BBB': [1, 1, 2, 2],
        'CCC': [2, 1, 3, 1],
    }
)
print(df3)

   AAA  BBB  CCC
0    1    1    2
1    2    1    1
2    1    2    3
3    3    2    1


In [12]:
# Translate every integer code in the source columns into its category name and
# store the result in parallel '<column>_cat' columns.
categories = {1:'Alpha',2:'Beta',3:'Charlie'}

# Leave out columns that already end in '_cat', so that re-running this cell does
# not feed its own output back in and create 'AAA_cat_cat', 'BBB_cat_cat', ...
is_source = [not str(col).endswith('_cat') for col in df3.columns]
source_cols = df3.columns[is_source]
new_cols = [str(x) + '_cat' for x in source_cols]

# DataFrame.applymap is deprecated since pandas 2.1 (it was renamed DataFrame.map).
# Mapping column by column with Series.map avoids the FutureWarning and also runs
# on older pandas releases. categories.get stays the mapper, as before.
df3[new_cols] = df3[source_cols].apply(lambda col: col.map(categories.get))
print(df3)

   AAA  BBB  CCC  AAA_cat BBB_cat  CCC_cat
0    1    1    2    Alpha   Alpha     Beta
1    2    1    1     Beta   Alpha    Alpha
2    1    2    3    Alpha    Beta  Charlie
3    3    2    1  Charlie    Beta    Alpha


In [13]:
# Part 3-2: pick the row holding the minimum value within each group
df4 = pd.DataFrame(
    {
        'AAA': [1, 1, 1, 2, 2, 2, 3, 3],
        'BBB': [2, 1, 3, 4, 5, 1, 2, 3],
    }
)
print(df4)

   AAA  BBB
0    1    2
1    1    1
2    1    3
3    2    4
4    2    5
5    2    1
6    3    2
7    3    3


In [14]:
# Approach 1: idxmin() gives, for each AAA group, the row label of the smallest BBB;
# those labels are then used to pull the full rows out of df4
min_bbb_labels = df4.groupby('AAA')['BBB'].idxmin()
a = df4.loc[min_bbb_labels]
print(a)

   AAA  BBB
1    1    1
5    2    1
6    3    2


In [15]:
# Approach 2: sort by BBB first, then take the first entry of every AAA group
# (as_index=False keeps AAA as a regular column, so the result gets a fresh 0..n-1 index)
sorted_by_bbb = df4.sort_values(by='BBB')
a = sorted_by_bbb.groupby('AAA', as_index=False).first()
print(a)

   AAA  BBB
0    1    1
1    2    1
2    3    2
