# dataframe 的常见操作

In [2]:
import pandas as pd

In [3]:
df1 = pd.DataFrame({'x1': ["a", "b", "c"], "x2": [
                   "11.2", "23.2", "23.4"], "x3": ["11.2", "23.2", "23.4"]})
print(df1)
df2 = pd.DataFrame({'x1': ["a", "b", "d"], "x2": ["20.23", 'NaN', "90.4"]})
print(df2)


  x1    x2    x3
0  a  11.2  11.2
1  b  23.2  23.2
2  c  23.4  23.4
  x1     x2
0  a  20.23
1  b    NaN
2  d   90.4


## 一、索引的最常见操作

### 1. 设置索引

In [22]:
df1.set_index('x1')

# 设置多重索引
print(df1.set_index(['x1', 'x3']))

# drop=False 表示保留原先"YY"列的数据
df1_drop = df1.set_index("x1", drop=False)  
print(df1_drop)

# index索引对象可以转换成列表类型
print(list(df1.index))  # ['ll', 'll', 'mm']


           x2
x1 x3        
a  11.2  11.2
b  23.2  23.2
c  23.4  23.4
   x1    x2    x3
x1               
a   a  11.2  11.2
b   b  23.2  23.2
c   c  23.4  23.4
[0, 1, 2]


### 2. 初始设置索引

In [23]:
my_df = pd.DataFrame({
    'Person': ['Alice', 'Steven', 'Neesham', 'Chris', 'Alice'],
    'City': ['Berlin', 'Montreal', 'Toronto', 'Rome', 'Munich'],
    'Mother Tongue': ['German', 'French', 'English', 'Italian', 'German'],
    'Age':  [37, 20, 38, 23, 35],

}, index=["A", "B", "C", "D", "E"])

print(my_df)

    Person      City Mother Tongue  Age
A    Alice    Berlin        German   37
B   Steven  Montreal        French   20
C  Neesham   Toronto       English   38
D    Chris      Rome       Italian   23
E    Alice    Munich        German   35


### 3. 使用 reset_index() 方法删除 Pandas DataFrame 的索引

In [24]:
my_df = pd.DataFrame({
    'Person': ['Alice', 'Steven', 'Neesham', 'Chris', 'Alice'],
    'City': ['Berlin', 'Montreal', 'Toronto', 'Rome', 'Munich'],
    'Mother Tongue': ['German', 'French', 'English', 'Italian', 'German'],
    'Age':  [37, 20, 38, 23, 35],

}, index=["A", "B", "C", "D", "E"])

df_reset = my_df.reset_index()

print("Before reseting Index:")
print(my_df, "\n")

print("After reseting Index:")
print(df_reset)


Before reseting Index:
    Person      City Mother Tongue  Age
A    Alice    Berlin        German   37
B   Steven  Montreal        French   20
C  Neesham   Toronto       English   38
D    Chris      Rome       Italian   23
E    Alice    Munich        German   35 

After reseting Index:
  index   Person      City Mother Tongue  Age
0     A    Alice    Berlin        German   37
1     B   Steven  Montreal        French   20
2     C  Neesham   Toronto       English   38
3     D    Chris      Rome       Italian   23
4     E    Alice    Munich        German   35


> 它将重置 DataFrame 的索引，但现在的索引将显示为 index 列。如果我们想删除 index 列，我们可以在 reset_index() 方法中设置 drop=True

In [26]:
my_df = pd.DataFrame({
    'Person': ['Alice', 'Steven', 'Neesham', 'Chris', 'Alice'],
    'City': ['Berlin', 'Montreal', 'Toronto', 'Rome', 'Munich'],
    'Mother Tongue': ['German', 'French', 'English', 'Italian', 'German'],
    'Age':  [37, 20, 38, 23, 35],

},index=["A","B","C","D","E"])

# 重置索引，并且删除原来的索引
df_reset=my_df.reset_index(drop=True)

print("Before reseting Index:")
print(my_df,"\n")

print("After reseting Index:")
print(df_reset)

Before reseting Index:
    Person      City Mother Tongue  Age
A    Alice    Berlin        German   37
B   Steven  Montreal        French   20
C  Neesham   Toronto       English   38
D    Chris      Rome       Italian   23
E    Alice    Munich        German   35 

After reseting Index:
    Person      City Mother Tongue  Age
0    Alice    Berlin        German   37
1   Steven  Montreal        French   20
2  Neesham   Toronto       English   38
3    Chris      Rome       Italian   23
4    Alice    Munich        German   35


## 二、合并数据

In [11]:
# 以左边列为基准
pd.merge(df1, df2, how='left', on='x1')

# 以右边列为基准
pd.merge(df1, df2, how='right', on='x1')

# 已交集
pd.merge(df1, df2, how='inner', on='x1')

# 全部
pd.merge(df1, df2, how='outer', on='x1')


Unnamed: 0,x1,x2_x,x2_y
0,a,11.2,20.23
1,b,23.2,
2,c,23.4,
3,d,,90.4


## 三、行、列的转置

> 原先的索引变成了行，所以说要转置时，先设置一下索引，不然转换后colums 变成了数字索引

In [27]:
print(df1)
# 
print(df1.T)

  x1    x2    x3
0  a  11.2  11.2
1  b  23.2  23.2
2  c  23.4  23.4
       0     1     2
x1     a     b     c
x2  11.2  23.2  23.4
x3  11.2  23.2  23.4


### 正确的方式


In [32]:
print(df1.set_index('x1'))

print('\n 转换后:\n')

print(df1.set_index('x1').T)


      x2    x3
x1            
a   11.2  11.2
b   23.2  23.2
c   23.4  23.4

 转换后:

x1     a     b     c
x2  11.2  23.2  23.4
x3  11.2  23.2  23.4


## 三、行列获取

### 1. 列获取

方式一： 按标签（最常用）

In [48]:
print(df1)

print('获取单列')
print(df1['x1'])

print('\n获取多列\n')
print(df1[['x1','x3']])

  x1    x2    x3
0  a  11.2  11.2
1  b  23.2  23.2
2  c  23.4  23.4
获取单列
0    a
1    b
2    c
Name: x1, dtype: object

获取多列

  x1    x3
0  a  11.2
1  b  23.2
2  c  23.4


方式二： 按索引

In [59]:
# 单索引
print(df1.iloc[:, 0])

# 多索引
print(df1.iloc[:, 0:])

0    a
1    b
2    c
Name: x1, dtype: object
  x1    x2    x3
0  a  11.2  11.2
1  b  23.2  23.2
2  c  23.4  23.4


### 2. 行获取

方式一： 按标签获取：

In [41]:
df3 = df1.set_index('x1').T
print(df3)
# 按标签获取
df3.loc['x2']


x1     a     b     c
x2  11.2  23.2  23.4
x3  11.2  23.2  23.4


x1
a    11.2
b    23.2
c    23.4
Name: x2, dtype: object

方式二：按索引获取

In [45]:
print('正常获取：')
print(df3.iloc[0])

print('\n区间获取\n')
print(df3.iloc[0:1])


正常获取：
x1
a    11.2
b    23.2
c    23.4
Name: x2, dtype: object

区间获取

x1     a     b     c
x2  11.2  23.2  23.4


### 3. 精准取值

In [66]:
print(df1)

print('方法一：按索引精准取值')
name = df1.iloc[0, 0]  # 获取第一行，指定列数据
print("\n第一行第一列的值：" + name)

print('\n方法二：按标签精准取值\n')
name = df1.loc[0, 'x1']
print(name)
print("\n第一行第一列的值：" + name)

  x1    x2    x3
0  a  11.2  11.2
1  b  23.2  23.2
2  c  23.4  23.4
方法一：按索引精准取值

第一行第一列的值：a

方法二：按标签精准取值

a


## 四、行列删除

axis=0 : 行删除(默认)

axis=1 : 列删除 

In [27]:
# print(df1)
print(df1)
print('\n 删除行\n')
print(df1.drop(['x1','x3']))

print('\n 删除列数据\n')
print(df1.drop([0], axis=1))


       0     1     2
x1     a     b     c
x2  11.2  23.2  23.4
x3  11.2  23.2  23.4

 删除行

       0     1     2
x2  11.2  23.2  23.4
       1     2
x1     b     c
x2  23.2  23.4
x3  23.2  23.4


## 五、列的数据格式化操作

### 1. map方式

In [80]:
# map 方式一
print(df1)

def test(x):
  return f'{x}1'


df1.columns = df1.columns.map(test)
print(df1)



  x1    x2    x3
0  a  11.2  11.2
1  b  23.2  23.2
2  c  23.4  23.4
  x11   x21   x31
0   a  11.2  11.2
1   b  23.2  23.2
2   c  23.4  23.4


In [85]:
# map 方式二
df1['x11'] = df1['x11'].map({
  'a': 'a1',
  'b': 'b1',
  'c': 'c1',
})

print(df1)

   x11   x21   x31
0  NaN  11.2  11.2
1  NaN  23.2  23.2
2  NaN  23.4  23.4


## 六、行、列名称更改

### 1. 单独更改

In [37]:
print(df1)

print('\n---指定行更改--- \n')

print(df1.rename({ 'x1': 'x11' }))

print('\n---指定行、列更改--- \n')

df1.rename(index={'x2': 'bj'}, columns={
                 0: 11})  # 为某个 index 单独修改名称


       0     1     2
x1     a     b     c
x2  11.2  23.2  23.4
x3  11.2  23.2  23.4

---指定行更改--- 

        0     1     2
x11     a     b     c
x2   11.2  23.2  23.4
x3   11.2  23.2  23.4

---指定行、列更改--- 



Unnamed: 0,11,1,2
x1,a,b,c
bj,11.2,23.2,23.4
x3,11.2,23.2,23.4


### 2. 批量更改

In [39]:
print(df1)

# 自定义map函数
def test_map(x):
    return x+'_ABC'


# print(df1.index.map(test_map))
# 输出 Index(['BEIJING_ABC', 'SHANGHAI_ABC', 'GUANGZHOU_ABC'], dtype='object')

print(df1.rename(index=test_map))


       0     1     2
x1     a     b     c
x2  11.2  23.2  23.4
x3  11.2  23.2  23.4
           0     1     2
x1_ABC     a     b     c
x2_ABC  11.2  23.2  23.4
x3_ABC  11.2  23.2  23.4
