In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
import matplotlib.pyplot as plt

# 一、创建对象

1. 可以通过传递一个list对象来创建一个Series，pandas会默认创建整型索引：

In [4]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

2.通过传递一个numpy array，时间索引以及列标签来创建一个DataFrame：

In [6]:
dates = pd.date_range('20130101', periods=6)

In [7]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [9]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


3.通过传递一个能够被转换成类似序列结构的字典对象来创建一个DataFrame：

In [10]:
df2 = pd.DataFrame({'A':1.,
                    'B':pd.Timestamp('20130102'),
                    'C':pd.Series(1, index=list(range(4)),dtype='float32'),
                    'D':np.array([3] * 4, dtype='int32'),
                    'E':pd.Categorical(['test','train', 'test','train']),
                    'F':'foo'
                   })

In [11]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


4.查看不同列的数据类型：

In [12]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

5.使用Tab自动补全功能会自动识别所有的属性以及自定义的列

# 二、查看数据

1.查看Frame中头部和尾部的行：

In [13]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963


In [14]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


2.显示索引、列和底层的numpy数据：

In [15]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [17]:
df.values

array([[ 0.44206516,  1.87175904,  0.68952605,  0.59916045],
       [ 1.56644317, -1.13939627,  0.98201792,  0.85803086],
       [ 1.62864103,  0.93313214, -0.45349953,  0.30856721],
       [-0.8711091 , -1.16594867,  0.87481922,  0.68709076],
       [-0.11202451, -0.51065026,  0.31986275, -1.10096304],
       [-0.7892594 , -0.93273128, -1.08564992, -0.58875731]])

3.describe()函数对于数据的快速统计汇总：

In [18]:
df.describe()
#平均值，标准差，最小值，最大值
#第25百分位数又称第一个四分位数（First Quartile），
#用Q1表示；第50百分位数又称第二个四分位数（Second Quartile），
#用Q2表示；第75百分位数又称第三个四分位数（Third Quartile）,用Q3表示。若求得第p百分位数为小数，可完整为整数。

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.310793,-0.157306,0.221179,0.127188
std,1.106019,1.26603,0.824403,0.79047
min,-0.871109,-1.165949,-1.08565,-1.100963
25%,-0.619951,-1.08773,-0.260159,-0.364426
50%,0.16502,-0.721691,0.504694,0.453864
75%,1.285349,0.572187,0.828496,0.665108
max,1.628641,1.871759,0.982018,0.858031


4.对数据的转置(tranverse):

In [19]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.442065,1.566443,1.628641,-0.871109,-0.112025,-0.789259
B,1.871759,-1.139396,0.933132,-1.165949,-0.51065,-0.932731
C,0.689526,0.982018,-0.4535,0.874819,0.319863,-1.08565
D,0.59916,0.858031,0.308567,0.687091,-1.100963,-0.588757


5.按轴进行排序

In [20]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.59916,0.689526,1.871759,0.442065
2013-01-02,0.858031,0.982018,-1.139396,1.566443
2013-01-03,0.308567,-0.4535,0.933132,1.628641
2013-01-04,0.687091,0.874819,-1.165949,-0.871109
2013-01-05,-1.100963,0.319863,-0.51065,-0.112025
2013-01-06,-0.588757,-1.08565,-0.932731,-0.789259


6.按值进行排序

In [21]:
df['A'].sort_values

<bound method Series.sort_values of 2013-01-01    0.442065
2013-01-02    1.566443
2013-01-03    1.628641
2013-01-04   -0.871109
2013-01-05   -0.112025
2013-01-06   -0.789259
Freq: D, Name: A, dtype: float64>

# 三、选择

## 获取

1.选择一个单独的列，这将会返回一个Series，等同于df.A:

In [22]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


In [23]:
df['A']

2013-01-01    0.442065
2013-01-02    1.566443
2013-01-03    1.628641
2013-01-04   -0.871109
2013-01-05   -0.112025
2013-01-06   -0.789259
Freq: D, Name: A, dtype: float64

2.通过[]进行选择，这将会对行进行切片

In [24]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567


In [25]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091


## 通过标签选择

1.使用标签来获取一个交叉的区域

In [26]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


In [27]:
dates
df.loc[dates[0]]

A    0.442065
B    1.871759
C    0.689526
D    0.599160
Name: 2013-01-01 00:00:00, dtype: float64

2.通过标签来在多个轴上进行选择

In [28]:
df.loc[:,['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.442065,1.871759
2013-01-02,1.566443,-1.139396
2013-01-03,1.628641,0.933132
2013-01-04,-0.871109,-1.165949
2013-01-05,-0.112025,-0.51065
2013-01-06,-0.789259,-0.932731


3.标签切片

In [29]:
df.loc['20130102':'20130104', ['A','B']]

Unnamed: 0,A,B
2013-01-02,1.566443,-1.139396
2013-01-03,1.628641,0.933132
2013-01-04,-0.871109,-1.165949


4.对于返回的对象进行维度缩减

In [30]:
df.loc['20130102', ['A','B']]

A    1.566443
B   -1.139396
Name: 2013-01-02 00:00:00, dtype: float64

5.获取一个标量

In [31]:
df.loc[dates[0], 'A']

0.44206516374128041

6.快速访问一个标量（与上一个方法等价）

In [32]:
df.at[dates[0], 'A']

0.44206516374128041

## 通过位置选择

1.通过传递数值进行位置选择（选择的是行）

In [33]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


In [34]:
df.iloc[3]

A   -0.871109
B   -1.165949
C    0.874819
D    0.687091
Name: 2013-01-04 00:00:00, dtype: float64

2.通过数值进行切片，与numpy/python中的情况类似

In [35]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.871109,-1.165949
2013-01-05,-0.112025,-0.51065


3.通过指定一个位置的列表，与numpy/python中的情况类似

In [36]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,1.566443,0.982018
2013-01-03,1.628641,-0.4535
2013-01-05,-0.112025,0.319863


4.对行进行切片

In [37]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567


5.对列进行切片

In [38]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,1.871759,0.689526
2013-01-02,-1.139396,0.982018
2013-01-03,0.933132,-0.4535
2013-01-04,-1.165949,0.874819
2013-01-05,-0.51065,0.319863
2013-01-06,-0.932731,-1.08565


6.获取特定的值

In [39]:
df.iloc[1,1]
df

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


In [40]:
df.iat[1,1]
#df

-1.1393962726697382

## 布尔索引

In [42]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567
2013-01-04,-0.871109,-1.165949,0.874819,0.687091
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757


1.使用一个单独列的值来选择数据：

In [43]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,-1.139396,0.982018,0.858031
2013-01-03,1.628641,0.933132,-0.4535,0.308567


2.使用where操作来选择数据：

In [44]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,0.442065,1.871759,0.689526,0.59916
2013-01-02,1.566443,,0.982018,0.858031
2013-01-03,1.628641,0.933132,,0.308567
2013-01-04,,,0.874819,0.687091
2013-01-05,,,0.319863,
2013-01-06,,,,


3.使用isin()方法来过滤：

In [45]:
df2 = df.copy()

In [46]:
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']

In [47]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,0.442065,1.871759,0.689526,0.59916,one
2013-01-02,1.566443,-1.139396,0.982018,0.858031,one
2013-01-03,1.628641,0.933132,-0.4535,0.308567,two
2013-01-04,-0.871109,-1.165949,0.874819,0.687091,three
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963,four
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757,three


In [48]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.628641,0.933132,-0.4535,0.308567,two
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963,four


## 设置

1.设置一个新的列：

In [49]:
s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

In [50]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [51]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.442065,1.871759,0.689526,0.59916,
2013-01-02,1.566443,-1.139396,0.982018,0.858031,1.0
2013-01-03,1.628641,0.933132,-0.4535,0.308567,2.0
2013-01-04,-0.871109,-1.165949,0.874819,0.687091,3.0
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757,5.0


In [52]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.442065,1.871759,0.689526,0.59916,
2013-01-02,1.566443,-1.139396,0.982018,0.858031,1.0
2013-01-03,1.628641,0.933132,-0.4535,0.308567,2.0
2013-01-04,-0.871109,-1.165949,0.874819,0.687091,3.0
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757,5.0


2.通过标签设置新的值：

In [53]:
df.at[dates[0],'A'] = 1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,1.871759,0.689526,0.59916,
2013-01-02,1.566443,-1.139396,0.982018,0.858031,1.0
2013-01-03,1.628641,0.933132,-0.4535,0.308567,2.0
2013-01-04,-0.871109,-1.165949,0.874819,0.687091,3.0
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757,5.0


3.通过位置设置新的值：

In [54]:
df.iat[0,1] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,0.59916,
2013-01-02,1.566443,-1.139396,0.982018,0.858031,1.0
2013-01-03,1.628641,0.933132,-0.4535,0.308567,2.0
2013-01-04,-0.871109,-1.165949,0.874819,0.687091,3.0
2013-01-05,-0.112025,-0.51065,0.319863,-1.100963,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,-0.588757,5.0


4.通过一个numpy数值设置一组新值：

In [55]:
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,5,
2013-01-02,1.566443,-1.139396,0.982018,5,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0
2013-01-05,-0.112025,-0.51065,0.319863,5,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,5,5.0


上述操作结果如下：

In [56]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,5,
2013-01-02,1.566443,-1.139396,0.982018,5,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0
2013-01-05,-0.112025,-0.51065,0.319863,5,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,5,5.0


5.通过where操作来设置新的值

In [57]:
df2 = df.copy()

In [58]:
df2[df2 > 0] = -df2

In [59]:
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.0,0.0,-0.689526,-5,
2013-01-02,-1.566443,-1.139396,-0.982018,-5,-1.0
2013-01-03,-1.628641,-0.933132,-0.4535,-5,-2.0
2013-01-04,-0.871109,-1.165949,-0.874819,-5,-3.0
2013-01-05,-0.112025,-0.51065,-0.319863,-5,-4.0
2013-01-06,-0.789259,-0.932731,-1.08565,-5,-5.0


# 四、缺失值处理

In [60]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,5,
2013-01-02,1.566443,-1.139396,0.982018,5,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0
2013-01-05,-0.112025,-0.51065,0.319863,5,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,5,5.0


1.reindex()方法可以对指定轴上的索引进行改变/增加/删除操作，这将返回原始数据的一个拷贝：

In [61]:
df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + ['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1.0,0.0,0.689526,5,,
2013-01-02,1.566443,-1.139396,0.982018,5,1.0,
2013-01-03,1.628641,0.933132,-0.4535,5,2.0,
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0,


In [62]:
df1.loc[dates[0]:dates[1], 'E'] = 1

In [63]:
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1.0,0.0,0.689526,5,,1.0
2013-01-02,1.566443,-1.139396,0.982018,5,1.0,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0,
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0,


2.去掉包含缺失值的行：

In [64]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1.566443,-1.139396,0.982018,5,1.0,1.0


3.对缺失值进行填充：

In [65]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,1.0,0.0,0.689526,5,5.0,1.0
2013-01-02,1.566443,-1.139396,0.982018,5,1.0,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0,5.0
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0,5.0


4.对数据进行布尔填充：

In [66]:
pd.isnull(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


# 五、相关操作

### 统计（相关操作通常情况下不包括缺失值）

In [67]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,5,
2013-01-02,1.566443,-1.139396,0.982018,5,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0
2013-01-05,-0.112025,-0.51065,0.319863,5,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,5,5.0


1.执行描述性统计：

In [68]:
df.mean()

A    0.403782
B   -0.469266
C    0.221179
D    5.000000
F    3.000000
dtype: float64

2.在其他轴上进行相同的操作：

In [69]:
df.mean(1)

2013-01-01    1.672382
2013-01-02    1.481813
2013-01-03    1.821655
2013-01-04    1.367552
2013-01-05    1.739438
2013-01-06    1.438472
Freq: D, dtype: float64

3.对于拥有不同维度，需要对齐的对象进行操作，pandas会自动的沿着指定的维度进行广播

In [70]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [71]:
s = pd.Series([1,3,4,np.nan,6,8], index=dates).shift(2)

In [72]:
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    4.0
2013-01-06    NaN
Freq: D, dtype: float64

### Apply

1.对数据应用函数：

In [73]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,5,
2013-01-02,1.566443,-1.139396,0.982018,5,1.0
2013-01-03,1.628641,0.933132,-0.4535,5,2.0
2013-01-04,-0.871109,-1.165949,0.874819,5,3.0
2013-01-05,-0.112025,-0.51065,0.319863,5,4.0
2013-01-06,-0.789259,-0.932731,-1.08565,5,5.0


In [74]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,1.0,0.0,0.689526,5,
2013-01-02,2.566443,-1.139396,1.671544,10,1.0
2013-01-03,4.195084,-0.206264,1.218044,15,3.0
2013-01-04,3.323975,-1.372213,2.092864,20,6.0
2013-01-05,3.211951,-1.882863,2.412726,25,10.0
2013-01-06,2.422691,-2.815594,1.327076,30,15.0


In [75]:
df.apply(lambda x: x.max() - x.min())

A    2.499750
B    2.099081
C    2.067668
D    0.000000
F    4.000000
dtype: float64

### 字符串方法

In [76]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [77]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

# 六、合并

### Concat

In [78]:
df = pd.DataFrame(np.random.randn(10, 4))

In [79]:
df

Unnamed: 0,0,1,2,3
0,0.852709,1.307638,-1.241765,-0.758652
1,-0.37515,-0.618729,-0.114554,-0.497521
2,-0.010219,1.716184,1.497151,-0.439226
3,1.341002,-1.943102,-0.324484,-0.149623
4,-0.93016,-0.702275,0.638499,-0.195413
5,1.271237,0.500441,-0.589175,0.457156
6,-0.923047,-0.20415,0.893807,-1.130128
7,0.465922,0.325187,-0.984255,-0.053131
8,-0.398562,-0.437032,0.464287,1.404858
9,1.638671,0.278995,0.866129,0.670588


In [80]:
pieces = [df[:3], df[3:7], df[7:]]

In [81]:
# break it into pieces
pieces

[          0         1         2         3
 0  0.852709  1.307638 -1.241765 -0.758652
 1 -0.375150 -0.618729 -0.114554 -0.497521
 2 -0.010219  1.716184  1.497151 -0.439226,
           0         1         2         3
 3  1.341002 -1.943102 -0.324484 -0.149623
 4 -0.930160 -0.702275  0.638499 -0.195413
 5  1.271237  0.500441 -0.589175  0.457156
 6 -0.923047 -0.204150  0.893807 -1.130128,
           0         1         2         3
 7  0.465922  0.325187 -0.984255 -0.053131
 8 -0.398562 -0.437032  0.464287  1.404858
 9  1.638671  0.278995  0.866129  0.670588]

In [82]:
type(pieces)

list

In [83]:
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,0.852709,1.307638,-1.241765,-0.758652
1,-0.37515,-0.618729,-0.114554,-0.497521
2,-0.010219,1.716184,1.497151,-0.439226
3,1.341002,-1.943102,-0.324484,-0.149623
4,-0.93016,-0.702275,0.638499,-0.195413
5,1.271237,0.500441,-0.589175,0.457156
6,-0.923047,-0.20415,0.893807,-1.130128
7,0.465922,0.325187,-0.984255,-0.053131
8,-0.398562,-0.437032,0.464287,1.404858
9,1.638671,0.278995,0.866129,0.670588


### Join类似于SQL类型的合并

In [84]:
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1,2]})

In [85]:
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

In [86]:
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [87]:
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [88]:
pd.merge(left, right, on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5


### Append将一行连接到一个DataFrame上，

In [89]:
df
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])

In [90]:
df

Unnamed: 0,A,B,C,D
0,-1.638798,-1.286304,-0.596458,1.145713
1,0.1304,0.269957,0.224982,0.70152
2,-0.764139,0.224541,-0.459561,0.752854
3,-1.410062,0.616932,-0.362604,0.070799
4,-0.813619,0.947622,-0.139044,0.152099
5,-1.793255,2.562868,2.082493,-1.655158
6,-0.339012,-0.725321,-0.74959,0.566787
7,0.99949,1.668385,-0.948115,-0.021901


In [91]:
s = df.iloc[3]
s

A   -1.410062
B    0.616932
C   -0.362604
D    0.070799
Name: 3, dtype: float64

In [92]:
df.append(s, ignore_index=True)

Unnamed: 0,A,B,C,D
0,-1.638798,-1.286304,-0.596458,1.145713
1,0.1304,0.269957,0.224982,0.70152
2,-0.764139,0.224541,-0.459561,0.752854
3,-1.410062,0.616932,-0.362604,0.070799
4,-0.813619,0.947622,-0.139044,0.152099
5,-1.793255,2.562868,2.082493,-1.655158
6,-0.339012,-0.725321,-0.74959,0.566787
7,0.99949,1.668385,-0.948115,-0.021901
8,-1.410062,0.616932,-0.362604,0.070799


# 七、分组

对于“group by”操作，我们通常是指以下一个或多个操作步骤：
+ （Splitting）按照一些规则将数据分为不同的组
+ （Applying）对于每组数据分别执行一个函数
+ （Combining）将结果组合刀一个数据结构中


In [93]:
df = pd.DataFrame({
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8)
    })

In [94]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-2.041893,0.861878
1,bar,one,-0.242472,-0.591112
2,foo,two,-1.635621,-0.332347
3,bar,three,-0.830087,-0.909797
4,foo,two,0.344128,-0.558188
5,bar,two,0.38346,-0.028417
6,foo,one,0.464912,0.920408
7,foo,three,-0.271126,0.307632


1.分组并对每个分组执行sum函数：

In [95]:
df.groupby('A').sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.689099,-1.529326
foo,-3.139599,1.199383


2.通过多个列进行分组形成一个层次索引，然后执行函数：

In [96]:
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.242472,-0.591112
bar,three,-0.830087,-0.909797
bar,two,0.38346,-0.028417
foo,one,-1.576981,1.782286
foo,three,-0.271126,0.307632
foo,two,-1.291493,-0.890535


### 数据透视表

In [97]:
df = pd.DataFrame({
        'A': ['one', 'one', 'two', 'three'] * 3,
        'B': ['A', 'B', 'C'] * 4,
        'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
        'D': np.random.randn(12),
        'E': np.random.randn(12)
    })

In [98]:
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.498207,0.529522
1,one,B,foo,0.491093,-2.049362
2,two,C,foo,-0.028618,-0.72758
3,three,A,bar,-0.691521,0.826481
4,one,B,bar,-0.330742,0.276043
5,one,C,bar,1.476844,0.214604
6,two,A,foo,-0.885335,-0.996861
7,three,B,foo,1.729131,0.794151
8,one,C,foo,-1.04755,-0.341753
9,one,A,bar,1.335092,0.372476


可以从这个数据中轻松的生成数据透视表：

In [100]:
pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,1.335092,-0.498207
one,B,-0.330742,0.491093
one,C,1.476844,-1.04755
three,A,-0.691521,
three,B,,1.729131
three,C,-1.313441,
two,A,,-0.885335
two,B,-0.384095,
two,C,,-0.028618


# 九、时间序列

pandas在对频率转换进行重新采样时拥有简单、强大且高效的功能（如将按秒采样的数据转换为按5分钟为单位进行采样的数据）。这种操作在金融领域非常常见

In [101]:
rng = pd.date_range('1/1/2012', periods=100, freq='S')

In [102]:
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

In [103]:
new_ts = ts.resample('1Min', how='sum')
new_ts

the new syntax is .resample(...).sum()
  """Entry point for launching an IPython kernel.


2012-01-01 00:00:00    14271
2012-01-01 00:01:00    10236
Freq: T, dtype: int32

In [104]:
new_ts

2012-01-01 00:00:00    14271
2012-01-01 00:01:00    10236
Freq: T, dtype: int32

# 十、文件读写

1.写入csv文件

In [105]:

df.to_csv('foo.csv',index=n)



SyntaxError: invalid syntax (<ipython-input-105-521dcf5f0a00>, line 2)

2.从csv文件中读取：

In [197]:
pd.read_csv('foo.csv',index_col=None)
df

Unnamed: 0,A,B,C,D,E
0,one,A,foo,-0.224964,0.065033
1,one,B,foo,0.336278,-0.103077
2,two,C,foo,0.406325,-1.149243
3,three,A,bar,0.304723,-0.778987
4,one,B,bar,0.108799,0.71347
5,one,C,bar,0.834365,-0.692851
6,two,A,foo,0.607851,-0.281253
7,three,B,foo,-0.767675,-0.867703
8,one,C,foo,0.946897,1.491578
9,one,A,bar,0.472619,0.038084
