数据转换

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame
from pandas import Series

# DataFrame.drop_duplicates - 移除重复数据

In [2]:
# Seven-row frame in which several (k1, k2) pairs occur more than once;
# used to demonstrate duplicate detection and removal.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [3]:
# Boolean mask: True for every row that repeats an earlier row (the first
# occurrence is the one that is not flagged).
data.duplicated(keep='first')

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [4]:
# Drop fully duplicated rows, keeping the first occurrence of each.
data.drop_duplicates(keep='first')


Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [7]:
# Add a column that makes every row unique, then de-duplicate by looking
# only at the 'k1' column: one row per distinct k1 value survives.
data['v1'] = range(7)
data.drop_duplicates(subset=['k1'])


Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


# Series.map - 利用函数或映射进行数据转换

In [11]:
# Foods (with inconsistent capitalisation) and their weight in ounces.
data = DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Lower-case food name -> animal the meat comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [14]:
# Series.map calls the function once per element and returns a new Series
# with the same index -- conceptually Series([f(s) for s in sq]).
# Each name is lower-cased first so 'Bacon' and 'Pastrami' hit the dict keys.
data['food'].map(lambda food: meat_to_animal[food.lower()])


0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

# Series.replace - 替换值

In [15]:
# Float series containing the values -999 and -1000 that the replace()
# examples operate on.
data = Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [16]:
# Replace a single value (-999) with NaN; returns a new Series.
data.replace(to_replace=-999, value=np.nan)


0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [17]:
# Replace several values with the same replacement (NaN) in one call.
data.replace(to_replace=[-999, -1000], value=np.nan)


0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [18]:
# Pairwise replacement: the i-th value in to_replace becomes the i-th value.
data.replace(to_replace=[-999, -1000], value=['JJJ', 'Thousand'])


0         1.0
1         JJJ
2         2.0
3         JJJ
4    Thousand
5         3.0
dtype: object

In [19]:
# Same pairwise replacement as the list form, written as an
# {old value: new value} mapping.
data.replace(to_replace={-999: 'JJJ', -1000: 'Thousand'})


0         1.0
1         JJJ
2         2.0
3         JJJ
4    Thousand
5         3.0
dtype: object

# DataFrame.rename - 重命名轴索引

In [20]:
# 3x4 frame of the integers 0..11 with string row and column labels.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Index.map applies the function to every label and returns a new Index;
# `data` itself is not modified here.
data.index.map(str.upper)

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [21]:
# Assigning to .index replaces the row labels of `data` itself.
data.index = data.index.str.upper()
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [22]:
# rename() returns a renamed copy: title-case the row labels, then
# upper-case the column labels. `data` is left untouched.
data.rename(str.title, axis='index').rename(str.upper, axis='columns')


Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [23]:
# Selective renaming: only the labels that appear as dict keys are changed.
data.rename(index={'OHIO': 'INDIANA'}).rename(columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [24]:
# Rename in place: with inplace=True, rename() modifies `data` directly and
# returns None, so the result is not assigned. (Binding it to `_` would also
# shadow IPython's "last output" variable.)
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


# pd.cut / pd.qcut - 离散化和面元划分

In [25]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
# Bin edges; by default each bin is open on the left and closed on the right.
bins = [18, 25, 35, 60, 100]
# Assign every age to the interval between consecutive edges it falls into.
cats = pd.cut(ages, bins=bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [26]:
cats.codes # integer bin code for each age (the older `.labels` attribute is no longer used)


array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [27]:
# right=False flips which side is closed: bins become [left, right).
pd.cut(ages, bins=[18, 26, 36, 61, 100], right=False)


[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [28]:
# One display name per bin, in the same order as the bins.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins=bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [29]:
# 20 uniform random numbers in [0, 1).
data = np.random.rand(20)
# qcut bins by sample quantiles; q=4 cuts the data into quartiles.
cats = pd.qcut(data, q=4)
cats

[(0.729, 0.928], (0.535, 0.729], (0.535, 0.729], (0.0418, 0.232], (0.232, 0.535], ..., (0.0418, 0.232], (0.535, 0.729], (0.232, 0.535], (0.0418, 0.232], (0.232, 0.535]]
Length: 20
Categories (4, interval[float64]): [(0.0418, 0.232] < (0.232, 0.535] < (0.535, 0.729] < (0.729, 0.928]]

In [30]:
# Number of observations per interval. The top-level pd.value_counts()
# is deprecated (pandas 2.1+); Series.value_counts() is the supported form.
pd.Series(cats).value_counts()


(0.0418, 0.232]    5
(0.232, 0.535]     5
(0.535, 0.729]     5
(0.729, 0.928]     5
dtype: int64

In [31]:
# Custom quantile edges (fractions between 0 and 1) instead of a bin count.
pd.qcut(data, q=[0, 0.1, 0.5, 0.9, 1.0])


[(0.878, 0.928], (0.535, 0.878], (0.535, 0.878], (0.0418, 0.15], (0.15, 0.535], ..., (0.15, 0.535], (0.535, 0.878], (0.15, 0.535], (0.15, 0.535], (0.15, 0.535]]
Length: 20
Categories (4, interval[float64]): [(0.0418, 0.15] < (0.15, 0.535] < (0.535, 0.878] < (0.878, 0.928]]

# 检测和过滤异常值

In [33]:
# Fix the global NumPy seed so the random frame is reproducible.
np.random.seed(12345)
# 1000 x 4 frame of standard-normal draws.
data = DataFrame(np.random.standard_normal((1000, 4)))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [34]:
col = data[3]
# Entries of column 3 whose absolute value exceeds 3.
col[col.abs() > 3]

97     3.927528
305   -3.399312
400   -3.745356
Name: 3, dtype: float64

In [35]:
# Rows in which at least one value has absolute value greater than 3.
# `axis` must be passed by keyword: DataFrame.any() takes keyword-only
# arguments in pandas 2.0+, so the positional form `.any(1)` raises TypeError.
data[(np.abs(data) > 3).any(axis=1)]


Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [36]:
# Cap every element whose absolute value exceeds 3 at +3 or -3.
# np.sign gives -1, 0 or 1 per element, so 3 * sign keeps the original sign.
data[data.abs() > 3] = 3 * np.sign(data)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


# DataFrame.take - 排列和随机采样

In [37]:
# 5x4 frame holding 0..19 in row-major order.
df = DataFrame(np.arange(20).reshape((5, 4)))
# Random ordering of the row positions 0..4.
sampler = np.random.permutation(5)
print(df)
print(sampler)

    0   1   2   3
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
[1 0 2 3 4]


In [38]:
# Reorder the rows by position, following the permutation in `sampler`.
df.take(sampler, axis=0)


Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [40]:
# Pick 3 rows at random without replacement: the first three entries of a
# fresh permutation of the row positions.
df.take(np.random.permutation(df.shape[0])[:3])


Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3


In [47]:
bag = np.array([5, 7, -1, 6, 4])
# Ten random positions into `bag`; positions may repeat, i.e. sampling
# with replacement.
sampler = np.random.randint(low=0, high=bag.size, size=10)
sampler

array([0, 4, 4, 4, 3, 4, 2, 2, 4, 3])

In [48]:
# Look up the value stored at each sampled position.
draws = np.take(bag, sampler)
draws

array([ 5,  4,  4,  4,  6,  4, -1, -1,  4,  6])

# pd.get_dummies - 计算指标/哑变量

get_dummies 是 pandas 实现 one-hot 编码（one-hot encoding）的方式。

one-hot的基本思想：将离散型特征的每一种取值都看成一种状态，若你的这一特征中有N个不相同的取值，那么我们就可以将该特征抽象成N种不同的状态，one-hot编码保证了每一个取值只会使得一种状态处于“激活态”，也就是说这N种状态中只有一个状态位值为1，其他状态位都是0。

In [49]:
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                'data1': range(6)})
print(df)
# One indicator column per distinct value of 'key' (a, b, c): a row whose key
# is 'b' gets 1 in column b and 0 in columns a and c.
# dtype=int keeps the indicators as 0/1 on every pandas version; since
# pandas 2.0 the default dtype is bool, which would display True/False.
pd.get_dummies(df['key'], dtype=int)

  key  data1
0   b      0
1   b      1
2   a      2
3   c      3
4   a      4
5   b      5


Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [50]:
# prefix='key' names the indicator columns key_a, key_b, key_c.
# dtype=int keeps the indicators as 0/1 on every pandas version; since
# pandas 2.0 the default dtype is bool, which would display True/False.
dummies = pd.get_dummies(df['key'], prefix='key', dtype=int)
print(dummies)
# DataFrame.join aligns the two frames on their row index (not on the values
# of 'data1'): each row of `dummies` is attached to the row of df[['data1']]
# that carries the same index label.
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

   key_a  key_b  key_c
0      0      1      0
1      0      1      0
2      1      0      0
3      0      0      1
4      1      0      0
5      0      1      0


Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0
