In [1]:
# 引包
import pandas as pd
import numpy as np
import warnings;
warnings.simplefilter('ignore')

In [2]:
# Build a single-column float32 frame of scores labelled 'a' through 'e'.
df = pd.DataFrame(
    {'Scores': [30, 40, 20, 48, 28]},
    index=list('abcde'),
    dtype='float32',
)
df

Unnamed: 0,Scores
a,30.0
b,40.0
c,20.0
d,48.0
e,28.0


In [3]:
# 查看dtype
df.dtypes

Scores    float32
dtype: object

In [4]:
# 查看index索引
df.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
# 查看列名
df.columns

Index(['Scores'], dtype='object')

In [6]:
# Select the row labelled 'c'. The .ix indexer was removed in pandas 1.0;
# .loc is the label-based replacement.
df.loc['c']

Scores    20.0
Name: c, dtype: float32

In [7]:
# Select several rows by label. The .ix indexer was removed in pandas 1.0;
# .loc is the label-based replacement.
df.loc[['a','c','e']]

Unnamed: 0,Scores
a,30.0
c,20.0
e,28.0


In [8]:
# 可以看出‘Scores’是Series
df.Scores.dtype

dtype('float32')

In [9]:
# Select the rows whose labels sit at index positions 1 and 2 ('b' and 'c').
# df.index[1:3] yields labels, so the label-based .loc indexer is used
# (.ix was removed in pandas 1.0).
df.loc[df.index[1:3]]

Unnamed: 0,Scores
b,40.0
c,20.0


In [10]:
# Positional slice of rows 1 and 2. With a non-integer index the removed .ix
# indexer treated an integer slice as positional; .iloc states that explicitly.
df.iloc[1:3]

Unnamed: 0,Scores
b,40.0
c,20.0


In [11]:
df.iloc[1:3]

Unnamed: 0,Scores
b,40.0
c,20.0


In [12]:
# 按行求和
df.sum(axis=1)

a    30.0
b    40.0
c    20.0
d    48.0
e    28.0
dtype: float32

In [13]:
# 按列求和
df.sum(axis=0)

Scores    166.0
dtype: float32

In [14]:
# Rename a column of the DataFrame. rename() returns a new frame, so the result
# is bound back to df instead of mutating the original with inplace=True.
df = df.rename(columns={'Scores': 'score'})
df

Unnamed: 0,score
a,30.0
b,40.0
c,20.0
d,48.0
e,28.0


In [15]:
# Convert the element type, e.g. from float to integer
df['score'].astype('int')

a    30
b    40
c    20
d    48
e    28
Name: score, dtype: int64

In [16]:
# 上面修改的方法，并没有永久保存
df['score']

a    30.0
b    40.0
c    20.0
d    48.0
e    28.0
Name: score, dtype: float32

In [17]:
# 如果需要永久保存，则需要如下操作：
df['score'] = df['score'].astype('int')
df

Unnamed: 0,score
a,30
b,40
c,20
d,48
e,28


In [19]:
df['score'].dtype

dtype('int64')

In [20]:
### Modify the DataFrame
# Build a frame from a dict of columns: scalar values (Date, Company) are
# broadcast to every row, and the Series are aligned on their integer index,
# so the three-element Course_name leaves a missing value in row 3.
dict_data = {
    # pd.datetime was deprecated in pandas 1.0 and removed in 2.0;
    # pd.Timestamp builds the same date without an extra import.
    'Date': pd.Timestamp(2017, 8, 30),
    'Number': pd.Series([6,7,8,9]),
    'Course_name': pd.Series(['python','Finance','CFA']),
    'Company': 'JCAQF'
}

df = pd.DataFrame(dict_data)
df

Unnamed: 0,Date,Number,Course_name,Company
0,2017-08-30,6,python,JCAQF
1,2017-08-30,7,Finance,JCAQF
2,2017-08-30,8,CFA,JCAQF
3,2017-08-30,9,,JCAQF


In [21]:
# 增加列
df['Period'] = range(21,25)
df.head()

Unnamed: 0,Date,Number,Course_name,Company,Period
0,2017-08-30,6,python,JCAQF,21
1,2017-08-30,7,Finance,JCAQF,22
2,2017-08-30,8,CFA,JCAQF,23
3,2017-08-30,9,,JCAQF,24


In [23]:
# 删除列
del(df['Period'])
df

Unnamed: 0,Date,Number,Course_name,Company
0,2017-08-30,6,python,JCAQF
1,2017-08-30,7,Finance,JCAQF
2,2017-08-30,8,CFA,JCAQF
3,2017-08-30,9,,JCAQF


In [24]:
# Start over with a fresh integer-valued score frame labelled 'a' through 'e'.
df = pd.DataFrame(
    {'Scores': [80, 5, 90, 58, 88]},
    index=list('abcde'),
)
df

Unnamed: 0,Scores
a,80
b,5
c,90
d,58
e,88


In [26]:
df['Hours'] = (2.1,1.5,1.6,2.2,3.4)
df

Unnamed: 0,Scores,Hours
a,80,2.1
b,5,1.5
c,90,1.6
d,58,2.2
e,88,3.4


In [31]:
# Add a column from a Series. Values are aligned on index labels rather than
# position, so 'Theta' (labelled 'b') lands in row b although it is fourth in
# the list. A Series is the natural single-column container for this.
df['Names'] = pd.Series(['Alpha','Beta','Gamma','Theta','lambda'],index=['a','d','e','b','c'])
df

Unnamed: 0,Scores,Hours,Names
a,80,2.1,Alpha
b,5,1.5,Theta
c,90,1.6,lambda
d,58,2.2,Beta
e,88,3.4,Gamma


In [32]:
## DataFrame alignment
# Two frames of uniform random values with different shapes; df2's columns and
# row labels are a subset of df1's.
df1 = pd.DataFrame(np.random.random((6, 6)), columns=list('abcdef'))
df2 = pd.DataFrame(np.random.random((3, 3)), columns=list('abc'))

In [33]:
df1

Unnamed: 0,a,b,c,d,e,f
0,0.465408,0.495879,0.505336,0.386017,0.74347,0.069921
1,0.578116,0.892323,0.384626,0.487768,0.239119,0.67613
2,0.797274,0.449718,0.108592,0.077277,0.62328,0.639454
3,0.697353,0.692568,0.460249,0.161062,0.101751,0.733001
4,0.851391,0.195252,0.302354,0.641758,0.736826,0.508903
5,0.35665,0.613143,0.938661,0.415965,0.769477,0.059336


In [34]:
df2

Unnamed: 0,a,b,c
0,0.72612,0.056678,0.789474
1,0.374774,0.163061,0.753447
2,0.729537,0.590851,0.859514


In [41]:
df4 = df1 + df2
df4

Unnamed: 0,a,b,c,d,e,f
0,1.191528,0.552557,1.294809,,,
1,0.95289,1.055383,1.138072,,,
2,1.526811,1.040569,0.968106,,,
3,,,,,,
4,,,,,,
5,,,,,,


In [42]:
# Replace the missing values left by the misaligned addition with 0.
# fillna() returns a new frame, so rebind df4 rather than using inplace=True.
df4 = df4.fillna(0)

In [43]:
df4

Unnamed: 0,a,b,c,d,e,f
0,1.191528,0.552557,1.294809,0.0,0.0,0.0
1,0.95289,1.055383,1.138072,0.0,0.0,0.0
2,1.526811,1.040569,0.968106,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Apply pandas to stock screening: one row per company, one column per metric.
companies = ['Company a', 'Company b', 'Company c', 'Company d']
d = {
    'PE': pd.Series([10, 20, 30, 40], index=companies),
    'PB': pd.Series([2., 3., 2.5, 4.], index=companies),
    'ROE': pd.Series([0.06, 0.1, 0.08, 0.02], index=companies),
}

df = pd.DataFrame(d)
df

Unnamed: 0,PE,PB,ROE
Company a,10,2.0,0.06
Company b,20,3.0,0.1
Company c,30,2.5,0.08
Company d,40,4.0,0.02


In [46]:
#  下列计算的结果是布尔值
df.PE < 25

Company a     True
Company b     True
Company c    False
Company d    False
Name: PE, dtype: bool

In [48]:
# 如果想赋值0或者1进行分类，则可以：
(df.PE < 25) * 1

Company a    1
Company b    1
Company c    0
Company d    0
Name: PE, dtype: int64

In [47]:
# 选择PE < 25 而且 PB < 3
df[(df.PE < 25) & (df.PB < 3)]

Unnamed: 0,PE,PB,ROE
Company a,10,2.0,0.06


In [49]:
# 按照两个条件选择股票
df[(df.PE < 25)*1 + (df.PB < 3)*1 == 2]

Unnamed: 0,PE,PB,ROE
Company a,10,2.0,0.06


In [50]:
# Screen on three conditions: each comparison becomes a 0/1 indicator, and the
# rows kept are those where exactly two of the three hold (indicator sum == 2).
# NOTE(review): compare against 3 instead if all three conditions must hold.
df[(df.PE < 25)*1 + (df.PB < 3)*1 + (df.ROE > 0.07)*1 == 2]

Unnamed: 0,PE,PB,ROE
Company a,10,2.0,0.06
Company b,20,3.0,0.1
Company c,30,2.5,0.08


### apply运算

In [52]:
a = np.random.randn(9,6)
df = pd.DataFrame(a)
df

Unnamed: 0,0,1,2,3,4,5
0,-1.261175,-0.471882,-0.494304,-0.538618,0.426745,1.837617
1,1.095561,-1.76603,-0.032056,-1.53837,-2.428174,-0.728698
2,-0.760391,0.03763,1.813364,-0.670896,-0.049036,0.329247
3,0.051095,0.205866,0.02798,-0.035998,0.232137,-0.682931
4,1.091159,-1.194238,0.36905,-0.515901,-0.020182,0.804268
5,-0.864316,0.158061,0.049852,0.459555,1.050492,-1.356752
6,0.693988,-0.204393,1.783967,-0.971828,-0.36175,-0.35612
7,1.025137,0.238043,1.28789,0.720181,0.317643,-0.880538
8,-0.384215,-0.512327,-1.681151,0.020829,-0.386279,0.228696


In [70]:
df.columns = ['a','b','c','d','e','f']
df

Unnamed: 0,a,b,c,d,e,f
0,-1.261175,-0.471882,-0.494304,-0.538618,0.426745,1.837617
1,1.095561,-1.76603,-0.032056,-1.53837,-2.428174,-0.728698
2,-0.760391,0.03763,1.813364,-0.670896,-0.049036,0.329247
3,0.051095,0.205866,0.02798,-0.035998,0.232137,-0.682931
4,1.091159,-1.194238,0.36905,-0.515901,-0.020182,0.804268
5,-0.864316,0.158061,0.049852,0.459555,1.050492,-1.356752
6,0.693988,-0.204393,1.783967,-0.971828,-0.36175,-0.35612
7,1.025137,0.238043,1.28789,0.720181,0.317643,-0.880538
8,-0.384215,-0.512327,-1.681151,0.020829,-0.386279,0.228696


In [71]:
dates = pd.date_range('2017-1-1',periods=9,freq='M')
df.index = dates
df

Unnamed: 0,a,b,c,d,e,f
2017-01-31,-1.261175,-0.471882,-0.494304,-0.538618,0.426745,1.837617
2017-02-28,1.095561,-1.76603,-0.032056,-1.53837,-2.428174,-0.728698
2017-03-31,-0.760391,0.03763,1.813364,-0.670896,-0.049036,0.329247
2017-04-30,0.051095,0.205866,0.02798,-0.035998,0.232137,-0.682931
2017-05-31,1.091159,-1.194238,0.36905,-0.515901,-0.020182,0.804268
2017-06-30,-0.864316,0.158061,0.049852,0.459555,1.050492,-1.356752
2017-07-31,0.693988,-0.204393,1.783967,-0.971828,-0.36175,-0.35612
2017-08-31,1.025137,0.238043,1.28789,0.720181,0.317643,-0.880538
2017-09-30,-0.384215,-0.512327,-1.681151,0.020829,-0.386279,0.228696


In [72]:
def square_fun(x):
    """Return ``x`` raised to the power of two.

    ``x`` may be anything that supports exponentiation; for a pandas Series
    or DataFrame the operation is applied element-wise.
    """
    return pow(x, 2)

In [73]:
# 对所有元素进行平方
df.apply(square_fun)

Unnamed: 0,a,b,c,d,e,f
2017-01-31,1.590562,0.222673,0.244337,0.290109,0.182111,3.376836
2017-02-28,1.200253,3.118863,0.001028,2.366582,5.896027,0.531001
2017-03-31,0.578194,0.001416,3.28829,0.450102,0.002404,0.108403
2017-04-30,0.002611,0.042381,0.000783,0.001296,0.053888,0.466395
2017-05-31,1.190627,1.426204,0.136198,0.266153,0.000407,0.646846
2017-06-30,0.747041,0.024983,0.002485,0.21119,1.103532,1.840776
2017-07-31,0.481619,0.041776,3.182539,0.944449,0.130863,0.126821
2017-08-31,1.050906,0.056664,1.65866,0.518661,0.100897,0.775348
2017-09-30,0.147621,0.262479,2.82627,0.000434,0.149212,0.052302


In [74]:
df.apply(lambda x: x.min(),axis=1)

2017-01-31   -1.261175
2017-02-28   -2.428174
2017-03-31   -0.760391
2017-04-30   -0.682931
2017-05-31   -1.194238
2017-06-30   -1.356752
2017-07-31   -0.971828
2017-08-31   -0.880538
2017-09-30   -1.681151
Freq: M, dtype: float64

In [75]:
df.apply(lambda x: x**0.5)

Unnamed: 0,a,b,c,d,e,f
2017-01-31,,,,,0.653257,1.355587
2017-02-28,1.04669,,,,,
2017-03-31,,0.193985,1.346612,,,0.5738
2017-04-30,0.226042,0.453724,0.167273,,0.481806,
2017-05-31,1.044585,,0.607495,,,0.89681
2017-06-30,,0.397569,0.223276,0.677905,1.024935,
2017-07-31,0.833059,,1.335652,,,
2017-08-31,1.01249,0.487896,1.134852,0.848635,0.563598,
2017-09-30,,,,0.144322,,0.478222


In [76]:
def find_min(x):
    """Return the minimum of ``x`` by calling its ``min()`` method."""
    return x.min()

# With DataFrame.apply, axis=1 passes each row to the function (one result per
# index label, as shown below); axis=0, the default, passes each column.
df.apply(find_min,axis=1)

2017-01-31   -1.261175
2017-02-28   -2.428174
2017-03-31   -0.760391
2017-04-30   -0.682931
2017-05-31   -1.194238
2017-06-30   -1.356752
2017-07-31   -0.971828
2017-08-31   -0.880538
2017-09-30   -1.681151
Freq: M, dtype: float64

In [77]:
# 按照index进行排序,降序排列
df.sort_index(ascending=False)

Unnamed: 0,a,b,c,d,e,f
2017-09-30,-0.384215,-0.512327,-1.681151,0.020829,-0.386279,0.228696
2017-08-31,1.025137,0.238043,1.28789,0.720181,0.317643,-0.880538
2017-07-31,0.693988,-0.204393,1.783967,-0.971828,-0.36175,-0.35612
2017-06-30,-0.864316,0.158061,0.049852,0.459555,1.050492,-1.356752
2017-05-31,1.091159,-1.194238,0.36905,-0.515901,-0.020182,0.804268
2017-04-30,0.051095,0.205866,0.02798,-0.035998,0.232137,-0.682931
2017-03-31,-0.760391,0.03763,1.813364,-0.670896,-0.049036,0.329247
2017-02-28,1.095561,-1.76603,-0.032056,-1.53837,-2.428174,-0.728698
2017-01-31,-1.261175,-0.471882,-0.494304,-0.538618,0.426745,1.837617


In [78]:
df.sort_index(axis=1,ascending=False)

Unnamed: 0,f,e,d,c,b,a
2017-01-31,1.837617,0.426745,-0.538618,-0.494304,-0.471882,-1.261175
2017-02-28,-0.728698,-2.428174,-1.53837,-0.032056,-1.76603,1.095561
2017-03-31,0.329247,-0.049036,-0.670896,1.813364,0.03763,-0.760391
2017-04-30,-0.682931,0.232137,-0.035998,0.02798,0.205866,0.051095
2017-05-31,0.804268,-0.020182,-0.515901,0.36905,-1.194238,1.091159
2017-06-30,-1.356752,1.050492,0.459555,0.049852,0.158061,-0.864316
2017-07-31,-0.35612,-0.36175,-0.971828,1.783967,-0.204393,0.693988
2017-08-31,-0.880538,0.317643,0.720181,1.28789,0.238043,1.025137
2017-09-30,0.228696,-0.386279,0.020829,-1.681151,-0.512327,-0.384215


In [80]:
# 按照值进行排序,降序
df.sort_values(by='a',ascending=False)

Unnamed: 0,a,b,c,d,e,f
2017-02-28,1.095561,-1.76603,-0.032056,-1.53837,-2.428174,-0.728698
2017-05-31,1.091159,-1.194238,0.36905,-0.515901,-0.020182,0.804268
2017-08-31,1.025137,0.238043,1.28789,0.720181,0.317643,-0.880538
2017-07-31,0.693988,-0.204393,1.783967,-0.971828,-0.36175,-0.35612
2017-04-30,0.051095,0.205866,0.02798,-0.035998,0.232137,-0.682931
2017-09-30,-0.384215,-0.512327,-1.681151,0.020829,-0.386279,0.228696
2017-03-31,-0.760391,0.03763,1.813364,-0.670896,-0.049036,0.329247
2017-06-30,-0.864316,0.158061,0.049852,0.459555,1.050492,-1.356752
2017-01-31,-1.261175,-0.471882,-0.494304,-0.538618,0.426745,1.837617


In [81]:
# Built-in aggregation: sum across the columns of each row (axis=1)
df.sum(axis=1)

2017-01-31   -0.501618
2017-02-28   -5.397767
2017-03-31    0.699918
2017-04-30   -0.201851
2017-05-31    0.534156
2017-06-30   -0.503109
2017-07-31    0.583865
2017-08-31    2.708355
2017-09-30   -2.714448
Freq: M, dtype: float64

In [82]:
# 求平均值
df.mean()

a    0.076316
b   -0.389919
c    0.347177
d   -0.341227
e   -0.135378
f   -0.089468
dtype: float64

In [83]:
df.cumsum()

Unnamed: 0,a,b,c,d,e,f
2017-01-31,-1.261175,-0.471882,-0.494304,-0.538618,0.426745,1.837617
2017-02-28,-0.165614,-2.237912,-0.52636,-2.076988,-2.001429,1.108919
2017-03-31,-0.926005,-2.200282,1.287004,-2.747884,-2.050465,1.438165
2017-04-30,-0.874911,-1.994416,1.314985,-2.783882,-1.818327,0.755234
2017-05-31,0.216248,-3.188654,1.684034,-3.299782,-1.838509,1.559502
2017-06-30,-0.648067,-3.030593,1.733886,-2.840228,-0.788018,0.20275
2017-07-31,0.04592,-3.234986,3.517854,-3.812055,-1.149768,-0.15337
2017-08-31,1.071057,-2.996943,4.805743,-3.091874,-0.832125,-1.033908
2017-09-30,0.686843,-3.50927,3.124592,-3.071045,-1.218404,-0.805212


In [84]:
# 对df的描述
df.describe()

Unnamed: 0,a,b,c,d,e,f
count,9.0,9.0,9.0,9.0,9.0,9.0
mean,0.076316,-0.389919,0.347177,-0.341227,-0.135378,-0.089468
std,0.931197,0.691837,1.132244,0.7072,0.965298,0.99075
min,-1.261175,-1.76603,-1.681151,-1.53837,-2.428174,-1.356752
25%,-0.760391,-0.512327,-0.032056,-0.670896,-0.36175,-0.728698
50%,0.051095,-0.204393,0.049852,-0.515901,-0.020182,-0.35612
75%,1.025137,0.158061,1.28789,0.020829,0.317643,0.329247
max,1.095561,0.238043,1.813364,0.720181,1.050492,1.837617
