In [1]:
import pandas as pd
import numpy as np
from numpy import nan as NA

## Handling Missing Data

Python内置None与np.nan同样被视作NA

|Operation|Description|
|---|---|
|x.dropna()|
|x.fillna()|
|x.isnull()| 
|| 
|| 
||
||
|| 

### 删除缺失值

In [3]:
data = pd.Series([1,NA,3.5,NA,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [4]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [5]:
~ data.isnull()

0     True
1    False
2     True
3    False
4     True
dtype: bool

In [6]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                    [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [41]:
data.dropna() #删除带有na的行(axis=1代表删除带有NA的列)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [42]:
data.dropna(how='all') #只有某行全是NA是才会被删除

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [8]:
df = pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,-1.453392,0.472913,-0.501346
1,-0.444413,1.017139,0.036336
2,-0.405712,-0.226625,-0.724988
3,-0.054696,-1.83637,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [9]:
df.iloc[:4,1] = NA
df

Unnamed: 0,0,1,2
0,-1.453392,,-0.501346
1,-0.444413,,0.036336
2,-0.405712,,-0.724988
3,-0.054696,,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [10]:
df.iloc[:2,2] = NA
df

Unnamed: 0,0,1,2
0,-1.453392,,
1,-0.444413,,
2,-0.405712,,-0.724988
3,-0.054696,,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [11]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [12]:
df.dropna(thresh=2) # 只有NA数大于等于的行才会被删除

Unnamed: 0,0,1,2
2,-0.405712,,-0.724988
3,-0.054696,,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


### 缺失值填补

缺失值填补主要使用 df.fillna()  

|Arguments|Description|
|---|---|
|value|
|method|'ffill','bfill'
|limit| 填充的数量
|axis| 默认对行进行操作 
|inplace| 
|| 

In [13]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.453392,0.0,0.0
1,-0.444413,0.0,0.0
2,-0.405712,0.0,-0.724988
3,-0.054696,0.0,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [20]:
df.fillna({1:pd.Series([1,2]), 2:0})

Unnamed: 0,0,1,2
0,-1.453392,1.0,0.0
1,-0.444413,2.0,0.0
2,-0.405712,,-0.724988
3,-0.054696,,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [21]:
df.fillna(0,inplace=True)
df

Unnamed: 0,0,1,2
0,-1.453392,0.0,0.0
1,-0.444413,0.0,0.0
2,-0.405712,0.0,-0.724988
3,-0.054696,0.0,-0.026491
4,-0.892594,-0.630672,1.493214
5,-0.992107,-0.925506,-1.772655
6,0.876843,0.537129,-0.507653


In [22]:
df = pd.DataFrame(np.random.randn(6,3))

In [23]:
df

Unnamed: 0,0,1,2
0,0.188122,0.427843,-0.842252
1,-0.326904,1.35949,1.042908
2,0.548933,-0.087215,1.041747
3,-1.977306,0.163115,-1.282022
4,-0.09961,-1.129355,-1.358225
5,-1.250438,-1.241016,-0.113104


In [24]:
df.iloc[2:,1] = NA
df.iloc[4:,2] = NA

In [25]:
df

Unnamed: 0,0,1,2
0,0.188122,0.427843,-0.842252
1,-0.326904,1.35949,1.042908
2,0.548933,,1.041747
3,-1.977306,,-1.282022
4,-0.09961,,
5,-1.250438,,


In [26]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.188122,0.427843,-0.842252
1,-0.326904,1.35949,1.042908
2,0.548933,1.35949,1.041747
3,-1.977306,1.35949,-1.282022
4,-0.09961,1.35949,-1.282022
5,-1.250438,1.35949,-1.282022


In [29]:
df.fillna(method='ffill',limit=2)

Unnamed: 0,0,1,2
0,0.188122,0.427843,-0.842252
1,-0.326904,1.35949,1.042908
2,0.548933,1.35949,1.041747
3,-1.977306,1.35949,-1.282022
4,-0.09961,,-1.282022
5,-1.250438,,-1.282022


In [30]:
df.apply(lambda x:x.sum())

0   -2.917203
1    1.787333
2   -0.039619
dtype: float64

In [63]:
df.apply(lambda x:x.fillna(x.mean()))

Unnamed: 0,0,1,2
0,-1.60317,-0.079563,-0.500835
1,-0.957619,-0.366135,-0.999799
2,-0.800368,-0.222849,2.265898
3,1.075965,-0.222849,-1.541992
4,0.582558,-0.222849,-0.194182
5,0.513905,-0.222849,-0.194182


## Data Transformation

### Removing Duplicates

The DataFrame method duplicated returns a boolean Series indicating whether each row   
is a duplicate (has been observed in a previous row) or not
当然也可以输入重复的条件

|Operation|Description|
|---|---|
|data.duplicated()|
||
|| 
|| 
|| 
||
||
||

In [31]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [32]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [33]:
data.drop_duplicates()
#data[~data.duplicated()] #等价命令

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [34]:
data['v1'] = range(7)

In [35]:
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [36]:
data.drop_duplicates(['k1']) #可指定条件

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


### Transforming Data Using a  Mapping

In [37]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                              'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [38]:
# Constructing a map from meat to animal
meat_to_animal = {
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

In [39]:
data['animal'] = data['food'].str.lower().map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [83]:
data['food'].apply(lambda x:meat_to_animal[x.lower()])
#data['food'].map(lambda x:meat_to_animal[x.lower()]) #等价操作

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values

In [40]:
data = pd.Series([1,-999,2,-999,-1000,3])
data

0       1
1    -999
2       2
3    -999
4   -1000
5       3
dtype: int64

In [41]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [42]:
data.replace([-999,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

### Renaming Axis Indexes

In [43]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [44]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


### Discretization and Binning

1. pd.cut(value,bins,right=False,labels)
2. pd.cut(value,n,precision=2) #均匀划分至4个区间
3. pd.qcut(value,n) #按照分位数进行划分

In [45]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100] 

In [46]:
cats = pd.cut(ages,bins,right=False)
cats

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [47]:
pd.value_counts(cats)

[18, 25)     4
[25, 35)     4
[35, 60)     3
[60, 100)    1
dtype: int64

In [48]:
pd.cut(ages,bins,right=False,labels=['Youth', 'YoungAdult', 'MiddleAged', 'Senior'])

['Youth', 'Youth', 'YoungAdult', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [49]:
data = np.random.rand(20)
pd.cut(data,4,precision=2,right=False)

[[0.73, 0.95), [0.73, 0.95), [0.73, 0.95), [0.52, 0.73), [0.52, 0.73), ..., [0.73, 0.95), [0.52, 0.73), [0.077, 0.3), [0.3, 0.52), [0.3, 0.52)]
Length: 20
Categories (4, interval[float64]): [[0.077, 0.3) < [0.3, 0.52) < [0.52, 0.73) < [0.73, 0.95)]

In [50]:
data = np.random.randn(1000)
cats = pd.qcut(data,4)
cats

[(-0.71, 0.00902], (-3.436, -0.71], (0.00902, 0.711], (0.711, 4.248], (0.00902, 0.711], ..., (-0.71, 0.00902], (0.00902, 0.711], (-3.436, -0.71], (-3.436, -0.71], (0.711, 4.248]]
Length: 1000
Categories (4, interval[float64]): [(-3.436, -0.71] < (-0.71, 0.00902] < (0.00902, 0.711] < (0.711, 4.248]]

In [51]:
cats.value_counts()

(-3.436, -0.71]     250
(-0.71, 0.00902]    250
(0.00902, 0.711]    250
(0.711, 4.248]      250
dtype: int64

### Detecting and Filtering Outliers

In [52]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.002906,0.024739,0.058855,-0.019709
std,0.99828,1.004116,0.99661,1.007609
min,-2.660602,-3.407326,-2.733763,-4.272054
25%,-0.701761,-0.649228,-0.667293,-0.692557
50%,0.017954,0.016423,0.070055,-0.032327
75%,0.655848,0.718847,0.76837,0.619928
max,3.396289,3.265242,3.01964,2.950316


In [55]:
data[2][np.abs(data[2])>3]

518    3.01964
Name: 2, dtype: float64

In [67]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
130,-0.977149,1.148275,1.217327,-3.024307
207,3.085382,0.929086,1.771039,0.805121
327,-0.067051,0.778013,-0.370949,-3.044222
412,-0.684635,0.114746,1.035879,-4.272054
518,-0.928886,-0.01045,3.01964,1.168011
590,3.396289,-2.052999,-1.533283,-0.476666
620,-1.995117,-3.407326,0.525342,-1.084951
980,0.048915,3.265242,-0.464412,-3.021557


In [71]:
data.apply(lambda x:((np.abs(x) > 3)*1).sum(),axis=1).value_counts()

0    992
1      7
2      1
dtype: int64

In [72]:
data[data.apply(lambda x:((np.abs(x) > 3)*1).sum()>=1,axis=1)]

Unnamed: 0,0,1,2,3
130,-0.977149,1.148275,1.217327,-3.024307
207,3.085382,0.929086,1.771039,0.805121
327,-0.067051,0.778013,-0.370949,-3.044222
412,-0.684635,0.114746,1.035879,-4.272054
518,-0.928886,-0.01045,3.01964,1.168011
590,3.396289,-2.052999,-1.533283,-0.476666
620,-1.995117,-3.407326,0.525342,-1.084951
980,0.048915,3.265242,-0.464412,-3.021557


In [125]:
data[np.abs(data) > 3] = np.sign(data)*3

In [126]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.023485,0.053406,-0.016465,0.034582
std,1.035202,1.008535,0.991336,0.996691
min,-3.0,-3.0,-3.0,-3.0
25%,-0.704672,-0.623233,-0.670571,-0.607741
50%,-0.001411,0.015089,-0.02299,0.072292
75%,0.657321,0.734557,0.628689,0.705554
max,3.0,3.0,3.0,3.0


### Permutation and Random Sampling

样本选择在机器学习中极为重要，因此后续我会补充这一章节的内容

In [128]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [157]:
sampler = np.random.permutation(5)
sampler

array([0, 1, 3, 4, 2])

In [184]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
0,0,1,2,3


### Computing Indiicator/Dummy Variables

In [86]:
df = pd.DataFrame({'keys': ['b','b','a','c','a','b'],
                   'data1': range(6)})
df

Unnamed: 0,keys,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [87]:
pd.get_dummies(df['keys'],prefix='key',prefix_sep='_') #所谓的one-hot编码

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [198]:
dummies = pd.get_dummies(df['keys'],prefix='key',prefix_sep='_')

In [201]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


## String Manipulation

### String Object Methods

|Operation|Description|
|---|---|
|x.count()|
|x.endswith()|
|x.join()| 
|x.strip()| 
|x.split()| 
|x.lower()|
||
||

In [89]:
val = 'a,b,  guido'

In [90]:
val.split(',')

['a', 'b', '  guido']

In [91]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [92]:
pieces[0]+'::'+pieces[1]+'::'+pieces[2]

'a::b::guido'

In [93]:
'::'.join(pieces)

'a::b::guido'

In [94]:
val.count(',')

2

In [95]:
val.replace(' ','')

'a,b,guido'

In [96]:
val.replace(',','::')

'a::b::  guido'

### Regular Expressions