pandas 是基于 NumPy 构建的，让以 NumPy 为中心的应用变得更加简单。
pandas主要包括三类数据结构，分别是：

Series：一维数组，与NumPy中的一维array类似。二者与Python基本的数据结构List也很相近，其区别是：List中的元素可以是不同的数据类型，而array和Series中则只允许存储相同的数据类型，这样可以更有效地使用内存，提高运算效率。

DataFrame：二维的表格型数据结构。很多功能与R中的data.frame类似。可以将DataFrame理解为Series的容器。以下的内容主要以DataFrame为主。

Panel：三维的数组，可以理解为DataFrame的容器。（注意：Panel 自 pandas 0.20 起已被弃用，并在 0.25 版本中被移除；新代码建议改用带 MultiIndex 的 DataFrame。）

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Build a Series from a list; the np.nan entry is kept as a missing value
# and, per the output below, the values are stored as float64.
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Inspect the index; none was passed, so the output shows a default RangeIndex.
s.index

RangeIndex(start=0, stop=6, step=1)

In [5]:
# Generate six consecutive calendar days starting at 2013-01-01
# (daily frequency, as shown by freq='D' in the output).
dates = pd.date_range('20130101', periods = 6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# 6x4 frame of uniform random numbers in [0, 1), indexed by `dates` with
# columns named A-D. No seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.rand(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.703329,0.281207,0.063754,0.883012
2013-01-02,0.02708,0.662146,0.492727,0.350462
2013-01-03,0.856018,0.649287,0.395601,0.170433
2013-01-04,0.913669,0.699145,0.868384,0.062347
2013-01-05,0.492939,0.412321,0.201269,0.782908
2013-01-06,0.754802,0.559775,0.337048,0.392036


In [12]:
# Build a DataFrame from a dict of columns. The scalar entries ('A', 'B', 'E')
# are broadcast to the length of the array-like entries ('C', 'D'), and each
# column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E
0,1,2013-01-02,1.0,3,foo
1,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo
3,1,2013-01-02,1.0,3,foo


In [13]:
# Each column has its own dtype; the string column 'E' is reported as object.
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object

In [14]:
# head() returns the first rows (five by default); df2 has only four, so all are shown.
df2.head()

Unnamed: 0,A,B,C,D,E
0,1,2013-01-02,1.0,3,foo
1,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo
3,1,2013-01-02,1.0,3,foo


In [17]:
# Row labels of df2 (the integers 0-3).
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [18]:
# Column labels of df2.
df2.columns

Index([u'A', u'B', u'C', u'D', u'E'], dtype='object')

In [19]:
# Transpose: row labels become column labels and vice versa.
df2.T

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1,1,1,1
D,3,3,3,3
E,foo,foo,foo,foo


选取列

In [23]:
# Selecting one column by label with [] returns a Series.
df2['B']

0   2013-01-02
1   2013-01-02
2   2013-01-02
3   2013-01-02
Name: B, dtype: datetime64[ns]

In [24]:
# Passing a list of labels to [] returns a DataFrame with just those columns.
df2[['B','D']]

Unnamed: 0,B,D
0,2013-01-02,3
1,2013-01-02,3
2,2013-01-02,3
3,2013-01-02,3


选取行

In [22]:
# Slicing with [] selects rows by position; the end (3) is excluded,
# so rows 1 and 2 are returned.
df2[1:3]

Unnamed: 0,A,B,C,D,E
1,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo


In [25]:
# Show the full frame again for reference before the .loc examples.
df2

Unnamed: 0,A,B,C,D,E
0,1,2013-01-02,1.0,3,foo
1,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo
3,1,2013-01-02,1.0,3,foo


In [26]:
# .loc is label-based: row label 0, column label 'B' -> a single scalar.
df2.loc[0,'B']

Timestamp('2013-01-02 00:00:00')

In [30]:
# Label slices with .loc include BOTH endpoints, so this returns rows 0, 1 and 2
# (compare with the positional slice df2[1:3], which excludes its end).
df2.loc[0:2]

Unnamed: 0,A,B,C,D,E
0,1,2013-01-02,1.0,3,foo
1,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo


In [28]:
# All rows, only the columns labelled 'A' and 'C'.
df2.loc[:,['A','C']]

Unnamed: 0,A,C
0,1,1.0
1,1,1.0
2,1,1.0
3,1,1.0


In [29]:
# All rows of a single column; a scalar column label yields a Series.
df2.loc[:,'D']

0    3
1    3
2    3
3    3
Name: D, dtype: int32

In [33]:
# Show the full frame again for reference before the .iloc examples.
df2

Unnamed: 0,A,B,C,D,E
0,1,2013-01-02,1.0,3,foo
1,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo
3,1,2013-01-02,1.0,3,foo


In [35]:
# .iloc is position-based: second row, third column (column 'C') -> 1.0.
df2.iloc[1,2]

1.0

In [36]:
# Rows at positions 0 and 2, all columns.
df2.iloc[[0,2],:]

Unnamed: 0,A,B,C,D,E
0,1,2013-01-02,1.0,3,foo
2,1,2013-01-02,1.0,3,foo


In [40]:
# All rows of the column at position 1 (column 'B'), returned as a Series.
df2.iloc[:,1]

0   2013-01-02
1   2013-01-02
2   2013-01-02
3   2013-01-02
Name: B, dtype: datetime64[ns]

In [41]:
# The row at position 1 as a Series; its values span several dtypes,
# so the resulting Series has dtype object.
df2.iloc[1,:]

A                      1
B    2013-01-02 00:00:00
C                      1
D                      3
E                    foo
Name: 1, dtype: object

In [43]:
# Show the date-indexed random frame again before the boolean-filter examples.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.703329,0.281207,0.063754,0.883012
2013-01-02,0.02708,0.662146,0.492727,0.350462
2013-01-03,0.856018,0.649287,0.395601,0.170433
2013-01-04,0.913669,0.699145,0.868384,0.062347
2013-01-05,0.492939,0.412321,0.201269,0.782908
2013-01-06,0.754802,0.559775,0.337048,0.392036


In [44]:
# Boolean indexing: keep only the rows whose value in column A is greater than 0.5.
df[df.A > 0.5]

Unnamed: 0,A,B,C,D
2013-01-01,0.703329,0.281207,0.063754,0.883012
2013-01-03,0.856018,0.649287,0.395601,0.170433
2013-01-04,0.913669,0.699145,0.868384,0.062347
2013-01-06,0.754802,0.559775,0.337048,0.392036


In [45]:
# Combine conditions element-wise with &. Each comparison needs its own
# parentheses because & binds tighter than > and <.
df[(df.A > 0.5) & (df.B < 0.6)]

Unnamed: 0,A,B,C,D
2013-01-01,0.703329,0.281207,0.063754,0.883012
2013-01-06,0.754802,0.559775,0.337048,0.392036


In [46]:
# A boolean DataFrame used as a mask keeps the frame's shape; cells where the
# condition is False become NaN (shown as empty cells in the output).
df[df > 0.5]

Unnamed: 0,A,B,C,D
2013-01-01,0.703329,,,0.883012
2013-01-02,,0.662146,,
2013-01-03,0.856018,0.649287,,
2013-01-04,0.913669,0.699145,0.868384,
2013-01-05,,,,0.782908
2013-01-06,0.754802,0.559775,,


In [48]:
# Assigning through a boolean mask modifies df in place:
# every cell greater than 0.5 is set to 0.
df[df > 0.5] = 0
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.281207,0.063754,0.0
2013-01-02,0.02708,0.0,0.492727,0.350462
2013-01-03,0.0,0.0,0.395601,0.170433
2013-01-04,0.0,0.0,0.0,0.062347
2013-01-05,0.492939,0.412321,0.201269,0.0
2013-01-06,0.0,0.0,0.337048,0.392036


In [49]:
# isin() tests exact membership: only rows whose B is exactly 0 or 0.4 match.
# No value equals 0.4 here, so just the rows whose B was zeroed are kept.
df[df['B'].isin([0,0.4])]

Unnamed: 0,A,B,C,D
2013-01-02,0.02708,0.0,0.492727,0.350462
2013-01-03,0.0,0.0,0.395601,0.170433
2013-01-04,0.0,0.0,0.0,0.062347
2013-01-06,0.0,0.0,0.337048,0.392036


In [51]:
# The boolean Series produced by isin(); this is the row mask the filter uses.
df['B'].isin([0,0.4])

2013-01-01    False
2013-01-02     True
2013-01-03     True
2013-01-04     True
2013-01-05    False
2013-01-06     True
Freq: D, Name: B, dtype: bool

In [62]:
# 4x6 frame of uniform random numbers in [0, 1) with row labels A-D and
# column labels aa-ff; it replaces the date-indexed df used so far.
df = pd.DataFrame(
    np.random.rand(4, 6),
    index=list('ABCD'),
    columns=['aa', 'bb', 'cc', 'dd', 'ee', 'ff'],
)
df

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [67]:
# Mask out values that are not greater than 0.1; those cells become NaN,
# giving a frame with missing data for the examples that follow.
df1 = df[df > 0.1]
df1

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,,0.414991,,0.870996,0.958523
C,0.537097,0.144073,0.334811,,,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [68]:
# dropna() returns a new frame without any row that contains a NaN
# (rows B and C here); df1 itself is not modified.
df1.dropna()

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [76]:
# fillna(0) returns a new frame in which every NaN is replaced by 0;
# df1 itself is not modified.
df1.fillna(0)

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.0,0.414991,0.0,0.870996,0.958523
C,0.537097,0.144073,0.334811,0.0,0.0,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [71]:
# Element-wise missing-value test: True wherever df1 holds NaN.
pd.isnull(df1)

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,False,False,False,False,False,False
B,False,True,False,True,False,False
C,False,False,False,True,True,False
D,False,False,False,False,False,False


In [73]:
# Column-wise mean (the default axis=0): one value per column.
df.mean()

aa    0.351775
bb    0.261152
cc    0.426735
dd    0.368801
ee    0.511624
ff    0.560260
dtype: float64

In [74]:
# Row-wise mean: axis=1 averages across the columns of each row.
# The axis is passed by keyword so its meaning is explicit.
df.mean(axis=1)

A    0.461779
B    0.443614
C    0.303360
D    0.444811
dtype: float64

In [75]:
# NaN values are skipped by default, so bb, dd and ee are averaged over
# their remaining non-missing entries only.
df1.mean()

aa    0.351775
bb    0.345389
cc    0.426735
dd    0.714091
ee    0.651792
ff    0.560260
dtype: float64

In [77]:
# With skipna=False, any row that contains a NaN (B and C) gets a NaN mean.
df1.mean(axis=1, skipna = False)

A    0.461779
B         NaN
C         NaN
D    0.444811
dtype: float64

In [78]:
# Stack the two frames vertically; the original row labels are kept,
# so A-D appear twice in the result.
pd.concat([df,df1])

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,,0.414991,,0.870996,0.958523
C,0.537097,0.144073,0.334811,,,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [84]:
# Take the first row (position 0) of df as a Series; its name is the row label 'A'.
s = df.iloc[0]
s

aa    0.252356
bb    0.742167
cc    0.405788
dd    0.531968
ee    0.367456
ff    0.470940
Name: A, dtype: float64

In [85]:
# Show df again before appending a row to it.
df

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [91]:
# Append the Series as a new row. DataFrame.append() was deprecated in
# pandas 1.4 and removed in 2.0, so turn the Series into a one-row frame
# (to_frame().T) and concatenate instead; the new row keeps its label 'A'.
z = pd.concat([df, s.to_frame().T])
z

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094


In [92]:
# duplicated() flags rows whose values repeat an earlier row;
# the appended copy of row A is the only duplicate.
z.duplicated()

A    False
B    False
C    False
D    False
A     True
dtype: bool

In [93]:
# Same append, but ignore_index=True discards the row labels and renumbers
# the result from 0. Uses pd.concat because DataFrame.append() was removed
# in pandas 2.0.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,aa,bb,cc,dd,ee,ff
0,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
1,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
2,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
3,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746
4,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094


In [94]:
# df itself is unchanged: each append returned a new frame.
df

Unnamed: 0,aa,bb,cc,dd,ee,ff
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746


In [96]:
# Sort by row label (axis=0) in descending order.
df.sort_index(axis=0, ascending=False)

Unnamed: 0,aa,bb,cc,dd,ee,ff
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094


In [97]:
# Sort rows by the values in column 'cc' (ascending by default).
df.sort_values(by='cc')

Unnamed: 0,aa,bb,cc,dd,ee,ff
C,0.537097,0.144073,0.334811,0.025231,0.09112,0.68783
A,0.252356,0.742167,0.405788,0.531968,0.367456,0.47094
B,0.386946,0.008439,0.414991,0.021791,0.870996,0.958523
D,0.230702,0.149929,0.55135,0.896215,0.716926,0.123746
