# Data Cleaning and Preparation

In [1]:
import numpy as np
import pandas as pd

## Handling Missing Data

In [2]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
string_data[0] = None#Python内置的None值也会被当做NA值处理
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [5]:
from numpy import nan as NA
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()#过滤掉缺失数据，dropna返回一个仅含非空数据和索引值的Series

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
data[data.notnull()]#通过布尔型数据也可以达到目的

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()#对于DataFrame对象，dropna默认丢弃任何含有缺失值的行
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
data.dropna(how='all')#参数how='all'将只丢弃全为NA的行

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [11]:
data.dropna(axis=1, how='all')#传入轴，根据轴操作数据

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [35]:
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.332883,,
1,-1.541996,,
2,0.28635,,-0.753887
3,0.331286,,0.069877
4,0.246674,-0.011862,1.004812
5,1.327195,-0.919262,-1.549106
6,0.022185,0.758363,-0.660524


In [36]:
df.dropna()

Unnamed: 0,0,1,2
4,0.246674,-0.011862,1.004812
5,1.327195,-0.919262,-1.549106
6,0.022185,0.758363,-0.660524


In [37]:
df.dropna(thresh=1)#利用thresh,留下一部分观测数据，thresh=1等于1表示每行数据至少有一个非NA值

Unnamed: 0,0,1,2
0,0.332883,,
1,-1.541996,,
2,0.28635,,-0.753887
3,0.331286,,0.069877
4,0.246674,-0.011862,1.004812
5,1.327195,-0.919262,-1.549106
6,0.022185,0.758363,-0.660524


### Filling In Missing Data

In [38]:
df.fillna(0)#用0填充NA值

Unnamed: 0,0,1,2
0,0.332883,0.0,0.0
1,-1.541996,0.0,0.0
2,0.28635,0.0,-0.753887
3,0.331286,0.0,0.069877
4,0.246674,-0.011862,1.004812
5,1.327195,-0.919262,-1.549106
6,0.022185,0.758363,-0.660524


In [39]:
df.fillna({1: 0.5, 2: 3})#通过一个字典调用fillna，就可以实现对不同的列填充不同的值

Unnamed: 0,0,1,2
0,0.332883,0.5,3.0
1,-1.541996,0.5,3.0
2,0.28635,0.5,-0.753887
3,0.331286,0.5,0.069877
4,0.246674,-0.011862,1.004812
5,1.327195,-0.919262,-1.549106
6,0.022185,0.758363,-0.660524


In [40]:
df.fillna(0, inplace=True)#fillna默认会返回一个新对象，要实现对原始对象的修改，需要用参数inplace
df

Unnamed: 0,0,1,2
0,0.332883,0.0,0.0
1,-1.541996,0.0,0.0
2,0.28635,0.0,-0.753887
3,0.331286,0.0,0.069877
4,0.246674,-0.011862,1.004812
5,1.327195,-0.919262,-1.549106
6,0.022185,0.758363,-0.660524


In [41]:
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,0.86258,-0.010032,0.050009
1,0.670216,0.852965,-0.955869
2,-0.023493,,-0.652469
3,-1.218302,,1.074623
4,0.723642,,
5,-0.503087,,


In [42]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.86258,-0.010032,0.050009
1,0.670216,0.852965,-0.955869
2,-0.023493,0.852965,-0.652469
3,-1.218302,0.852965,1.074623
4,0.723642,0.852965,1.074623
5,-0.503087,0.852965,1.074623


In [43]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,0.86258,-0.010032,0.050009
1,0.670216,0.852965,-0.955869
2,-0.023493,0.852965,-0.652469
3,-1.218302,0.852965,1.074623
4,0.723642,,1.074623
5,-0.503087,,1.074623


In [44]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())#传入数据的平均值

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

# 层次化索引

In [1]:
import numpy as np
import pandas as pd

In [3]:
data = pd.Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])

In [4]:
data

a  1    0.924914
   2   -0.296786
   3    0.955626
b  1    0.814573
   2   -0.492970
   3    1.240019
c  1   -1.244214
   2    0.640298
d  2   -0.173886
   3    0.533197
dtype: float64

In [5]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 1, 2]])

In [6]:
data['b']

1    0.814573
2   -0.492970
3    1.240019
dtype: float64

In [7]:
data['b':'c']

b  1    0.814573
   2   -0.492970
   3    1.240019
c  1   -1.244214
   2    0.640298
dtype: float64

In [8]:
data.loc[['b','d']]

b  1    0.814573
   2   -0.492970
   3    1.240019
d  2   -0.173886
   3    0.533197
dtype: float64

In [9]:
data[:,2]

a   -0.296786
b   -0.492970
c    0.640298
d   -0.173886
dtype: float64

In [10]:
data.unstack()#层次化索引在数据重塑和基于分组的操作（如透视表生成）中扮演着重要的角色

Unnamed: 0,1,2,3
a,0.924914,-0.296786,0.955626
b,0.814573,-0.49297,1.240019
c,-1.244214,0.640298,
d,,-0.173886,0.533197


In [11]:
data.unstack().stack()#unstack的逆运算是stack

a  1    0.924914
   2   -0.296786
   3    0.955626
b  1    0.814573
   2   -0.492970
   3    1.240019
c  1   -1.244214
   2    0.640298
d  2   -0.173886
   3    0.533197
dtype: float64

In [13]:
frame = pd.DataFrame(np.arange(12).reshape((4,3)),
                     index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])

In [14]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [17]:
frame.index.names=['key1','key2']

In [18]:
frame.columns.names=['state','color']

In [19]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [21]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [27]:
frame.sortlevel(1)#sortlevel根据单个级别中值对数据进行排序

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [31]:
frame.sortlevel(0)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [32]:
frame.swaplevel('key1','key2')#swaplevel接受两个级别编号或名称，并返回一个互换了级别的新对象（数据不会发生变化）

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [33]:
frame.swaplevel(0,1).sortlevel(0)

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


In [34]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [35]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [36]:
frame.sum(level='color',axis=1)

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


In [45]:
frame=pd.DataFrame({'a':range(7),'b':range(7,0,-1),
                'c':['one','one','one','two','two','two','two'],
                'd':[0,1,2,0,1,2,3]})
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [46]:
frame1 = frame.set_index(['c','d'])#set_index函数会将其一个或多个列转换为行索引，并创建一个新的DataFrame
frame1

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [47]:
frame2 = frame.set_index(['c','d'],drop=False)#默认那些列会从DataFrame中移除，但也可以将其保留下来
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [48]:
frame1.reset_index()#层次化索引的级别会被转移到列里面

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


In [4]:
import pandas_datareader.data as web
import fix_yahoo_finance as fy
fy.pdr_override()

In [5]:
all_data = pd.Panel(dict((ticker,web.get_data_yahoo(ticker,'2000-01-01','2010-01-01'))
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']))

[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded


Panel is deprecated and will be removed in a future version.
The recommended way to represent these types of 3-dimensional data are with a MultiIndex on a DataFrame, via the Panel.to_frame() method
Alternatively, you can use the xarray package http://xarray.pydata.org/en/stable/.
Pandas provides a `.to_xarray()` method to help automate this conversion.

  exec(code_obj, self.user_global_ns, self.user_ns)


In [7]:
all_data

<class 'pandas.core.panel.Panel'>
Dimensions: 4 (items) x 2516 (major_axis) x 6 (minor_axis)
Items axis: AAPL to GOOG
Major_axis axis: 1999-12-31 00:00:00 to 2009-12-31 00:00:00
Minor_axis axis: Open to Volume

## Data Transformation

### Removing Duplicates

In [2]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [3]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [4]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [5]:
data['v1'] = range(7)
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [6]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [7]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [8]:
meat_to_animal = { #编写一个不同肉类到动物的映射：
  'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}

In [10]:
lowercased = data['food'].str.lower()#使用Series的str.lower方法，将各个值转换为小写：
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [11]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [12]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values

In [15]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [16]:
data.replace(-999, np.nan)#-999这个值可能是一个表示缺失数据的标记值。要将其替换为pandas能够理解的NA值

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [17]:
data.replace([-999, -1000], np.nan)#一次性替换多个值，可以传入一个由待替换值组成的列表以及一个替换值

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [18]:
data.replace([-999, -1000], [np.nan, 0])#要让每个值有不同的替换值，可以传递一个替换列表

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [19]:
data.replace({-999: np.nan, -1000: 0})#传入的参数也可以是字典

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

In [21]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [22]:
transform = lambda x: x[:4].upper()
transform

<function __main__.<lambda>(x)>

In [23]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [24]:
data.index = data.index.map(transform)#跟Series一样，轴索引也有一个map方法
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [25]:
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [26]:
data.rename(index={'OHIO': 'INDIANA'},
            columns={'three': 'peekaboo'})#rename可以结合字典型对象实现对部分轴标签的更新

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [27]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)#修改某个数据集，传入inplace=True即可
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

In [28]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [29]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [30]:
cats.codes#年龄数据的标签

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [31]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [32]:
pd.value_counts(cats)#pd.value_counts(cats)是pandas.cut结果的面元计数

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [33]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)#哪边是闭端可以通过right=False进行修改

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [34]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)#通过传递一个列表或数组到labels，设置自己的面元名称

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [36]:
data = np.random.rand(20)
data

array([0.06050571, 0.02155674, 0.3000682 , 0.33754147, 0.51050683,
       0.55660192, 0.04217927, 0.41251207, 0.81602687, 0.70377713,
       0.44140983, 0.85304182, 0.85205771, 0.42852607, 0.11041682,
       0.33298874, 0.43681807, 0.04207933, 0.42985678, 0.82782209])

In [37]:
pd.cut(data, 4, precision=2)#将均匀分布的数据分成四组，precision=2，限定小数只有两位

[(0.021, 0.23], (0.021, 0.23], (0.23, 0.44], (0.23, 0.44], (0.44, 0.65], ..., (0.23, 0.44], (0.23, 0.44], (0.021, 0.23], (0.23, 0.44], (0.65, 0.85]]
Length: 20
Categories (4, interval[float64]): [(0.021, 0.23] < (0.23, 0.44] < (0.44, 0.65] < (0.65, 0.85]]

In [38]:
data = np.random.randn(1000)  # Normally distributed
cats = pd.qcut(data, 4)  # Cut into quartiles
cats

[(0.0665, 0.717], (0.0665, 0.717], (-0.603, 0.0665], (0.0665, 0.717], (-0.603, 0.0665], ..., (-0.603, 0.0665], (0.717, 3.863], (-0.603, 0.0665], (0.0665, 0.717], (0.0665, 0.717]]
Length: 1000
Categories (4, interval[float64]): [(-2.862, -0.603] < (-0.603, 0.0665] < (0.0665, 0.717] < (0.717, 3.863]]

In [39]:
pd.value_counts(cats)

(0.717, 3.863]      250
(0.0665, 0.717]     250
(-0.603, 0.0665]    250
(-2.862, -0.603]    250
dtype: int64

In [40]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(0.0665, 1.319], (0.0665, 1.319], (-1.184, 0.0665], (0.0665, 1.319], (-1.184, 0.0665], ..., (-1.184, 0.0665], (1.319, 3.863], (-1.184, 0.0665], (0.0665, 1.319], (0.0665, 1.319]]
Length: 1000
Categories (4, interval[float64]): [(-2.862, -1.184] < (-1.184, 0.0665] < (0.0665, 1.319] < (1.319, 3.863]]

### Detecting and Filtering Outliers

In [58]:
data = pd.DataFrame(np.random.randn(1000, 4))#含有正态分布数据的DataFrame
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.053085,-0.022446,-0.005432,-0.034049
std,1.021336,0.985415,0.990964,1.027572
min,-2.965843,-2.97247,-3.403758,-3.480482
25%,-0.748893,-0.678581,-0.644625,-0.716541
50%,-0.13099,-0.012038,-0.020301,-0.039693
75%,0.574873,0.633187,0.66601,0.698538
max,3.235062,3.119554,3.306575,3.154539


In [59]:
col = data[2]
col[np.abs(col) > 3]#找出某列中绝对值大小超过3的值

68     3.211230
142   -3.403758
335    3.006063
349    3.306575
418    3.118993
550    3.192697
Name: 2, dtype: float64

In [60]:
(np.abs(data) > 3).any(1)

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
970    False
971    False
972    False
973    False
974    False
975    False
976    False
977    False
978    False
979    False
980    False
981    False
982    False
983    False
984    False
985    False
986    False
987    False
988    False
989    False
990    False
991    False
992    False
993    False
994    False
995    False
996    False
997    False
998    False
999    False
Length: 1000, dtype: bool

In [61]:
np.abs(data) > 3

Unnamed: 0,0,1,2,3
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
5,False,False,False,False
6,False,False,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [62]:
data[(np.abs(data) > 3).any(1)]#要选出全部含有“超过3或－3的值”的行

Unnamed: 0,0,1,2,3
68,-0.062575,-0.462621,3.21123,0.919112
142,-0.686097,-0.393451,-3.403758,0.185783
251,3.078268,1.838858,-0.158636,0.816858
335,2.415031,-0.309732,3.006063,-0.296616
349,-0.935891,1.190067,3.306575,0.114232
418,-0.310585,1.61536,3.118993,-2.182223
517,1.117814,0.577946,-1.057513,-3.480482
550,-0.383575,-0.583709,3.192697,0.895604
562,3.183962,0.378528,-0.025929,0.065486
600,0.703335,-0.385199,0.788114,3.154539


In [64]:
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.053582,-0.022566,-0.005864,-0.033723
std,1.0198,0.98504,0.98702,1.025601
min,-2.965843,-2.97247,-3.0,-3.0
25%,-0.748893,-0.678581,-0.644625,-0.716541
50%,-0.13099,-0.012038,-0.020301,-0.039693
75%,0.574873,0.633187,0.66601,0.698538
max,3.0,3.0,3.0,3.0


In [56]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,1.0,-1.0,1.0,1.0
2,1.0,-1.0,-1.0,1.0
3,-1.0,1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,1.0


### Permutation and Random Sampling

In [75]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
sampler#产生一个表示新顺序的整数数组

array([4, 0, 1, 3, 2])

In [76]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [77]:
df.take(sampler)#在基于iloc的索引操作或take函数中使用该数组

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
2,8,9,10,11


In [82]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
1,4,5,6,7


In [71]:
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)#要通过替换的方式产生样本（允许重复选择），可以传递replace=True到sample
draws

3    6
4    4
2   -1
1    7
2   -1
0    5
0    5
4    4
3    6
4    4
dtype: int64

### Computing Indicator/Dummy Variables

In [83]:
df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                   'data1': range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [84]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [85]:
dummies = pd.get_dummies(df['key'], prefix='key')#添加前缀
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [86]:
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [88]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('C:/Users/owolf/Desktop/pydata/datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames)
movies[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [89]:
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))#extend() 函数用于在列表末尾一次性追加另一个序列中的多个值
all_genres

['Animation',
 "Children's",
 'Comedy',
 'Adventure',
 "Children's",
 'Fantasy',
 'Comedy',
 'Romance',
 'Comedy',
 'Drama',
 'Comedy',
 'Action',
 'Crime',
 'Thriller',
 'Comedy',
 'Romance',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Thriller',
 'Comedy',
 'Drama',
 'Romance',
 'Comedy',
 'Horror',
 'Animation',
 "Children's",
 'Drama',
 'Action',
 'Adventure',
 'Romance',
 'Drama',
 'Thriller',
 'Drama',
 'Romance',
 'Thriller',
 'Comedy',
 'Action',
 'Action',
 'Comedy',
 'Drama',
 'Crime',
 'Drama',
 'Thriller',
 'Thriller',
 'Drama',
 'Sci-Fi',
 'Drama',
 'Romance',
 'Drama',
 'Drama',
 'Romance',
 'Adventure',
 'Sci-Fi',
 'Drama',
 'Drama',
 'Drama',
 'Sci-Fi',
 'Adventure',
 'Romance',
 "Children's",
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Drama',
 'Documentary',
 'Comedy',
 'Comedy',
 'Romance',
 'Drama',
 'Drama',
 'War',
 'Action',
 'Crime',
 'Drama',
 'Drama',
 'Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Drama',
 'Romance',
 'Crime',
 'Thrill

In [90]:
genres = pd.unique(all_genres)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [92]:
zero_matrix = np.zeros((len(movies), len(genres)))#从一个全零DataFrame开始
zero_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [93]:
zero_matrix.shape

(3883, 18)

In [107]:
dummies = pd.DataFrame(zero_matrix, columns=genres)
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
gen = movies.genres[0]
gen

"Animation|Children's|Comedy"

In [97]:
gen.split('|')

['Animation', "Children's", 'Comedy']

In [98]:
dummies.columns.get_indexer(gen.split('|'))#使用dummies.columns来计算每个类型的列索引

array([0, 1, 2], dtype=int64)

In [109]:
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
dummies

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))#添加前缀，再与movies合并起来
movies_windic 

Unnamed: 0,movie_id,title,genres,Genre_Animation,Genre_Children's,Genre_Comedy,Genre_Adventure,Genre_Fantasy,Genre_Romance,Genre_Drama,...,Genre_Crime,Genre_Thriller,Genre_Horror,Genre_Sci-Fi,Genre_Documentary,Genre_War,Genre_Musical,Genre_Mystery,Genre_Film-Noir,Genre_Western
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,Heat (1995),Action|Crime|Thriller,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,Sabrina (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Tom and Huck (1995),Adventure|Children's,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9,Sudden Death (1995),Action,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10,GoldenEye (1995),Action|Adventure|Thriller,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
movies_windic.iloc[0]

movie_id                                       1
title                           Toy Story (1995)
genres               Animation|Children's|Comedy
Genre_Animation                                1
Genre_Children's                               1
Genre_Comedy                                   1
Genre_Adventure                                0
Genre_Fantasy                                  0
Genre_Romance                                  0
Genre_Drama                                    0
Genre_Action                                   0
Genre_Crime                                    0
Genre_Thriller                                 0
Genre_Horror                                   0
Genre_Sci-Fi                                   0
Genre_Documentary                              0
Genre_War                                      0
Genre_Musical                                  0
Genre_Mystery                                  0
Genre_Film-Noir                                0
Genre_Western       

In [113]:
np.random.seed(12345)
values = np.random.rand(10)
values


array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [114]:
pd.cut(values, bins)

[(0.8, 1.0], (0.2, 0.4], (0.0, 0.2], (0.2, 0.4], (0.4, 0.6], (0.4, 0.6], (0.8, 1.0], (0.6, 0.8], (0.6, 0.8], (0.6, 0.8]]
Categories (5, interval[float64]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [106]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


## String Manipulation

### String Object Methods

In [115]:
val = 'a,b,  guido'
val.split(',')#以逗号分隔的字符串可以用split拆分成数段

['a', 'b', '  guido']

In [116]:
pieces = [x.strip() for x in val.split(',')]#split常常与strip一起使用，以去除空白符（包括换行符）：
pieces

['a', 'b', 'guido']

In [117]:
first, second, third = pieces
first + '::' + second + '::' + third#利用加法，可以将这些子字符串以双冒号分隔符的形式连接起来，这种方式并不是很实用

'a::b::guido'

In [118]:
'::'.join(pieces)#向字符串"::"的join方法传入一个列表或元组

'a::b::guido'

In [119]:
'guido' in val#检测子串的最佳方式是利用Python的in关键字，还可以使用index和find：

True

In [120]:
val.index(',')

1

In [123]:
val.find(',')

1

In [125]:
val.find(':')

-1

In [124]:
val.index(':')#find和index的区别：如果找不到字符串，index将会引发一个异常（而不是返回－1）：

ValueError: substring not found

In [126]:
val.count(',')#count可以返回指定子串的出现次数

2

In [127]:
val

'a,b,  guido'

In [128]:
val.replace(',', '::')#replace用于将指定模式替换为另一个模式

'a::b::  guido'

In [129]:
val.replace(',', '')#通过传入空字符串，它也常常用于删除模式

'ab  guido'

### Regular Expressions

In [130]:
import re
text = "foo    bar\t baz  \tqux"
re.split('\s+', text)

['foo', 'bar', 'baz', 'qux']

In [131]:
regex = re.compile('\s+')
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [132]:
regex.findall(text)

['    ', '\t ', '  \t']

In [133]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

# re.IGNORECASE makes the regex case-insensitive
regex = re.compile(pattern, flags=re.IGNORECASE)

In [134]:
regex.findall(text)#对text使用findall将得到一组电子邮件地址

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [135]:
m = regex.search(text)#search返回的是文本中第一个电子邮件地址
m#匹配项对象只能告诉我们模式在原字符串中的起始和结束位置

<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>

In [136]:
text[m.start():m.end()]

'dave@google.com'

In [137]:
print(regex.match(text))#regex.match则将返回None，因为它只匹配出现在字符串开头的模式

None


In [138]:
print(regex.sub('REDACTED', text))#sub方法可以将匹配到的模式替换为指定字符串，并返回所得到的新字符串

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [139]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags=re.IGNORECASE)

In [140]:
m = regex.match('wesm@bright.net')
m.groups()#通过其groups方法返回一个由模式各段组成的元组

('wesm', 'bright', 'net')

In [141]:
regex.findall(text)#对于带有分组功能的模式，findall会返回一个元组列表

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [143]:
text

'Dave dave@google.com\nSteve steve@gmail.com\nRob rob@gmail.com\nRyan ryan@yahoo.com\n'

In [142]:
print(regex.sub(r'Username: \1, Domain: \2, Suffix: \3', text))
#sub还能通过诸如\1、\2之类的特殊符号访问各匹配项中的分组。符号\1对应第一个匹配的组，\2对应第二个匹配的组

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



### Vectorized String Functions in pandas

In [144]:
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [145]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [146]:
data.str.contains('gmail')#str.contains检查各个电子邮件地址是否含有"gmail"

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [147]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [148]:
data.str.findall(pattern, flags=re.IGNORECASE)#也可以使用正则表达式，还可以加上任意re选项（如IGNORECASE）

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [149]:
matches = data.str.match(pattern, flags=re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [152]:
data.str[:5]#对字符串进行截取

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [157]:
data.str.get(0)

Dave       d
Steve      s
Rob        r
Wes      NaN
dtype: object

In [158]:
data.str.get(1)

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

In [159]:
data.str[1]

Dave       a
Steve      t
Rob        o
Wes      NaN
dtype: object

## Conclusion