In [40]:
import pandas as pd
import numpy as np 
from pandas import DataFrame
from pandas import Series

|函数|描述|
|---|---|
|pd.Series||
|pd.DataFrame||


## 金融数据结构

### sequence

1.sequence支持向量化计算

In [2]:
numbers = [1,2,3,4,5,6,7,-1,-2,-3]

In [4]:
print(numbers[0])
print(numbers[1])
print(numbers[0:6])

1
2
[1, 2, 3, 4, 5, 6]


In [5]:
np.exp(numbers)

array([2.71828183e+00, 7.38905610e+00, 2.00855369e+01, 5.45981500e+01,
       1.48413159e+02, 4.03428793e+02, 1.09663316e+03, 3.67879441e-01,
       1.35335283e-01, 4.97870684e-02])

### Series

Series 作为Sequence的进阶版，不仅保留了Sequence的性质，还支持如同R语言中的向量化运算

In [6]:
numbers = [1,2,3,4,5,6,7,-1,-2,-3]
obj1 = Series([numbers])
print(obj1)
print(obj1.values)
print(obj1.index)

0    [1, 2, 3, 4, 5, 6, 7, -1, -2, -3]
dtype: object
[list([1, 2, 3, 4, 5, 6, 7, -1, -2, -3])]
RangeIndex(start=0, stop=1, step=1)


In [7]:
obj2 = Series([1,-1,2,4],index=['a','b','c','d'])
print(obj2)
print(obj2.values)
print(obj2.index)

a    1
b   -1
c    2
d    4
dtype: int64
[ 1 -1  2  4]
Index(['a', 'b', 'c', 'd'], dtype='object')


In [12]:
#索引
print(obj2[0:2])  #python的区间为左闭右开
print(obj2['a'])
print(obj2>0) # 返回Boole量
print(obj2[obj2>0]) # 可用Boole量进行索引

a    1
b   -1
dtype: int64
1
a     True
b    False
c     True
d     True
dtype: bool
a    1
c    2
d    4
dtype: int64


In [103]:
print('a' in obj2) # 索引

True


TypeError: 'Index' object is not callable

In [18]:
print(2*obj2)
print(obj2+obj2)
print(np.exp(obj2))

a    2
b   -2
c    4
d    8
dtype: int64
a    2
b   -2
c    4
d    8
dtype: int64
a     2.718282
b     0.367879
c     7.389056
d    54.598150
dtype: float64


Dictionary to Series

In [20]:
# Dict -> Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [21]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states) # 自动与dict的key匹配
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [25]:
print(pd.isna(obj4))
print(pd.isnull(obj4))

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [27]:
obj3+obj4 #数据能够自动对齐

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [33]:
obj4.name = 'Population'
obj4.index.name = 'State'
obj4

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

### DataFrame

DataFrame可以看作

|函数|描述|
|---|---|
|df.sort_values||
|df.columns||
|df.index||
|df.loc||
|df.iloc||

In [None]:
pd.DataFrame

In [111]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data) # key对应frame的列名
df

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [117]:
#df的特征
print(df.columns)
print(df.index)

Index(['state', 'year', 'pop'], dtype='object')
RangeIndex(start=0, stop=5, step=1)


In [46]:
#可按照指定列重排
df = pd.DataFrame(data,columns=['year','state','pop'])
df.sort_values('year')

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
3,2001,Nevada,2.4
2,2002,Ohio,3.6
4,2002,Nevada,2.9


In [58]:
#关于DataFrame的索引
#print(df['year'])
#print(df[['year','pop']])
print(df.state)

0      Ohio
1      Ohio
2      Ohio
3    Nevada
4    Nevada
Name: state, dtype: object


In [109]:
#df.loc
df.loc[:,'year'] #取year列
df.loc[0,:] #取0行
df.loc[0:2,['year','pop']]

Unnamed: 0,year,pop
0,2000,1.5
1,2001,1.7
2,2002,3.6


df.iloc|df.loc改版后需要记住如下命令    
list(df.columns).index('year')


In [110]:
#df.iloc
#通过行列标进行索引()
list(df.columns).index('year') #!!!!!!!!!
df.iloc[:,list(df.columns).index('year')]

0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64

In [56]:
df2 = pd.DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five']) # 分别指定行列名字，缺失值自动填充，比如debt列。
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [120]:
df2['debt'] = 16.5
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [121]:
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df2['debt'] = val # 索引不匹配的话自动补NaN
df2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [140]:
df2['Eastern'] = df2['state'] == 'Ohio'
df2

Unnamed: 0,year,state,pop,debt,Eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [138]:
del df2['Eastern']

In [141]:
df2.T

Unnamed: 0,one,two,three,four,five
year,2000,2001,2002,2001,2002
state,Ohio,Ohio,Ohio,Nevada,Nevada
pop,1.5,1.7,3.6,2.4,2.9
debt,,-1.2,,-1.5,-1.7
Eastern,True,True,True,False,False


#### 关于index的操作

|函数|描述|
|---|---|
|append|      连接另一个Index对象，产生一个新的Index|
|diff|     计算差集，并得到一个Index
|intersection|计算交集
|union|       计算并集
|isin|        计算一个指示各值是否都包含在参数集合中的布尔型数组
|delete|      删除索引i处的元素，并得到新的Index
|drop|       删除传入的值，并得到新的Index
|insert|      将元素插入到索引i处，并得到新的Index
|is_monotonic|如果单调增长，返回True
|is_unique|   当Index没有重复值时，返回True
|unique|      计算Index中唯一值得数组

In [148]:
df2.set_index('year')

Unnamed: 0,year,state,pop,debt,Eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [162]:
df2.index.isin(['one','two'])

array([ True,  True, False, False, False])