### 对象创建

In [58]:
import pandas as pd
import numpy as np

# Build a Series from a plain Python list; pandas adds a default integer index.
# The np.nan entry is a missing value, and the resulting dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [59]:
# Build a 6x4 DataFrame of standard-normal samples, indexed by six
# consecutive days starting on 2013-01-01, with columns labelled A-D.
# NOTE(review): np.random is not seeded, so the values differ on every run.
dates = pd.date_range(start='20130101', periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.746418,0.255828,0.824974,-0.349544
2013-01-02,1.717532,1.30838,-1.182478,0.796164
2013-01-03,-0.077508,-0.763724,-2.192679,0.984442
2013-01-04,0.281134,-0.769068,-0.958983,0.163895
2013-01-05,0.863026,-0.044164,0.240164,-2.380434
2013-01-06,0.320326,0.662902,0.197434,0.292789


### 查看数据

In [60]:
# Show the first n rows; n=5 is also the default when head() gets no argument.
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,1.746418,0.255828,0.824974,-0.349544
2013-01-02,1.717532,1.30838,-1.182478,0.796164
2013-01-03,-0.077508,-0.763724,-2.192679,0.984442
2013-01-04,0.281134,-0.769068,-0.958983,0.163895
2013-01-05,0.863026,-0.044164,0.240164,-2.380434


In [61]:
# Show the last n rows (here the final three).
df.tail(n=3)

Unnamed: 0,A,B,C,D
2013-01-04,0.281134,-0.769068,-0.958983,0.163895
2013-01-05,0.863026,-0.044164,0.240164,-2.380434
2013-01-06,0.320326,0.662902,0.197434,0.292789


In [62]:
# Inspect the row index (the row labels of the DataFrame).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [63]:
# Inspect the column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [64]:
# Convert the values to a NumPy array (a 2-D ndarray, not an np.matrix);
# the index and column labels are not part of the result.
df.to_numpy()

array([[ 1.7464178 ,  0.25582752,  0.82497431, -0.34954362],
       [ 1.71753196,  1.30837982, -1.18247794,  0.79616363],
       [-0.07750804, -0.76372391, -2.19267895,  0.98444217],
       [ 0.28113376, -0.76906785, -0.95898314,  0.16389452],
       [ 0.86302563, -0.04416414,  0.24016352, -2.38043375],
       [ 0.32032598,  0.66290198,  0.19743418,  0.29278868]])

In [65]:
# Summary statistics for each column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.808488,0.108359,-0.511928,-0.082115
std,0.775943,0.815228,1.125217,1.221784
min,-0.077508,-0.769068,-2.192679,-2.380434
25%,0.290932,-0.583834,-1.126604,-0.221184
50%,0.591676,0.105832,-0.380774,0.228342
75%,1.503905,0.561133,0.229481,0.67032
max,1.746418,1.30838,0.824974,0.984442


In [66]:
# Transpose the data: the column labels become the index and the dates become columns.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.746418,1.717532,-0.077508,0.281134,0.863026,0.320326
B,0.255828,1.30838,-0.763724,-0.769068,-0.044164,0.662902
C,0.824974,-1.182478,-2.192679,-0.958983,0.240164,0.197434
D,-0.349544,0.796164,0.984442,0.163895,-2.380434,0.292789


In [67]:
# Sort by axis labels. The columns axis is selected, so the columns are
# reordered by name in descending order (D, C, B, A); row order is unchanged.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.349544,0.824974,0.255828,1.746418
2013-01-02,0.796164,-1.182478,1.30838,1.717532
2013-01-03,0.984442,-2.192679,-0.763724,-0.077508
2013-01-04,0.163895,-0.958983,-0.769068,0.281134
2013-01-05,-2.380434,0.240164,-0.044164,0.863026
2013-01-06,0.292789,0.197434,0.662902,0.320326


In [68]:
# Sort rows by value: column A in descending order, with ties broken by
# column B in ascending order (one ascending flag per sort key).
df.sort_values(by=['A', 'B'], ascending=[False, True])

Unnamed: 0,A,B,C,D
2013-01-01,1.746418,0.255828,0.824974,-0.349544
2013-01-02,1.717532,1.30838,-1.182478,0.796164
2013-01-05,0.863026,-0.044164,0.240164,-2.380434
2013-01-06,0.320326,0.662902,0.197434,0.292789
2013-01-04,0.281134,-0.769068,-0.958983,0.163895
2013-01-03,-0.077508,-0.763724,-2.192679,0.984442


### 数据选择

In [69]:
# Select the single column labelled A (returned as a Series); equivalent to df.A.
df['A']

2013-01-01    1.746418
2013-01-02    1.717532
2013-01-03   -0.077508
2013-01-04    0.281134
2013-01-05    0.863026
2013-01-06    0.320326
Freq: D, Name: A, dtype: float64

In [70]:
# Slice rows by position. The end of the slice is exclusive, so this
# returns rows 0, 1 and 2 (three rows), not row 3.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,1.746418,0.255828,0.824974,-0.349544
2013-01-02,1.717532,1.30838,-1.182478,0.796164
2013-01-03,-0.077508,-0.763724,-2.192679,0.984442


In [71]:
# loc selects by label: the row labelled dates[0], restricted to columns A and B.
df.loc[dates[0], ['A','B']]

A    1.746418
B    0.255828
Name: 2013-01-01 00:00:00, dtype: float64

In [72]:
# iloc selects by integer position: row 0, restricted to columns 0 and 1.
df.iloc[0, [0, 1]]

A    1.746418
B    0.255828
Name: 2013-01-01 00:00:00, dtype: float64

In [73]:
# Boolean indexing: keep only the rows whose value in column A is positive.
df.loc[df['A'] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.746418,0.255828,0.824974,-0.349544
2013-01-02,1.717532,1.30838,-1.182478,0.796164
2013-01-04,0.281134,-0.769068,-0.958983,0.163895
2013-01-05,0.863026,-0.044164,0.240164,-2.380434
2013-01-06,0.320326,0.662902,0.197434,0.292789


In [74]:
# Element-wise boolean mask: the shape is preserved, and every entry that
# is not greater than 0 is shown as NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.746418,0.255828,0.824974,
2013-01-02,1.717532,1.30838,,0.796164
2013-01-03,,,,0.984442
2013-01-04,0.281134,,,0.163895
2013-01-05,0.863026,,0.240164,
2013-01-06,0.320326,0.662902,0.197434,0.292789


In [75]:
# isin: keep the rows whose value in column E is one of the listed values.
# A copy is used so the extra column E is added to df2 only, not to df.
df2 = df.copy()
df2["E"] = ["one", "one", "two", "three", "four", "three"]
df2[df2["E"].isin(["two", "four"])]

Unnamed: 0,A,B,C,D,E
2013-01-03,-0.077508,-0.763724,-2.192679,0.984442,two
2013-01-05,0.863026,-0.044164,0.240164,-2.380434,four


### 数据赋值（Setting）

In [76]:
# Insert a new column F from a Series. The assignment aligns on the index:
# s1 starts at 2013-01-02, so the 2013-01-01 row gets NaN, and s1's last
# entry (2013-01-07) has no matching row in df and is dropped.
# NOTE: this modifies df in place; every later cell sees the extra column.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.746418,0.255828,0.824974,-0.349544,
2013-01-02,1.717532,1.30838,-1.182478,0.796164,1.0
2013-01-03,-0.077508,-0.763724,-2.192679,0.984442,2.0
2013-01-04,0.281134,-0.769068,-0.958983,0.163895,3.0
2013-01-05,0.863026,-0.044164,0.240164,-2.380434,4.0
2013-01-06,0.320326,0.662902,0.197434,0.292789,5.0


In [77]:
# Set a single value by label (row dates[0], column A).
# Note: loc can select several rows and columns at once, whereas at
# addresses exactly one cell.
df.at[dates[0], 'A'] = 0

In [78]:
# Set a single value by integer position (row 0, column 1, i.e. column B).
df.iat[0, 1] = 0

In [79]:
# Overwrite every row of column D with the constant 5, supplied as a
# NumPy integer array with one element per row of df.
df.loc[:, 'D'] = np.full(len(df), 5)

In [80]:
# Assignment through a boolean mask: every positive entry is replaced by
# its negation, so no value in df2 stays positive.
# Works on a fresh copy of df (rebinding the name df2), so df is untouched.
df2 = df.copy()
df2[df2 > 0] = -df2
df2

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.824974,-5,
2013-01-02,-1.717532,-1.30838,-1.182478,-5,-1.0
2013-01-03,-0.077508,-0.763724,-2.192679,-5,-2.0
2013-01-04,-0.281134,-0.769068,-0.958983,-5,-3.0
2013-01-05,-0.863026,-0.044164,-0.240164,-5,-4.0
2013-01-06,-0.320326,-0.662902,-0.197434,-5,-5.0


### 缺失值处理

In [81]:
# Reindex to the first four dates and append a column E that does not
# exist in df; the new column therefore contains only missing values.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.824974,5,,
2013-01-02,1.717532,1.30838,-1.182478,5,1.0,
2013-01-03,-0.077508,-0.763724,-2.192679,5,2.0,
2013-01-04,0.281134,-0.769068,-0.958983,5,3.0,


In [82]:
# dropna: drop every row that contains at least one missing value.
# Column E is entirely NaN in df1, so no rows survive.
df1.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D,F,E


In [83]:
# fillna: replace every missing value with the constant 5.
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.824974,5,5.0,5.0
2013-01-02,1.717532,1.30838,-1.182478,5,1.0,5.0
2013-01-03,-0.077508,-0.763724,-2.192679,5,2.0,5.0
2013-01-04,0.281134,-0.769068,-0.958983,5,3.0,5.0


In [84]:
# isna: boolean mask of the same shape, True wherever a value is missing.
df1.isna()

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,True,True
2013-01-02,False,False,False,False,False,True
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


### 统计

In [90]:
# Mean of each column (one value per column label); the NaN in column F
# is skipped rather than propagated.
df.mean(axis=0)

A    0.517418
B    0.065721
C   -0.511928
D    5.000000
F    3.000000
dtype: float64

In [87]:
# Mean of each row (one value per date), computed across the columns.
df.mean(axis=1)

2013-01-01    1.456244
2013-01-02    1.568687
2013-01-03    0.793218
2013-01-04    1.310617
2013-01-05    2.011805
2013-01-06    2.236132
Freq: D, dtype: float64

In [93]:
# Histogramming: draw ten random integers from 0..6 (the upper bound 7 is
# exclusive) and count how many times each value occurs.
# NOTE: this rebinds s, replacing the Series created at the top of the
# notebook, and the draw is unseeded, so counts differ between runs.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s.value_counts()

6    3
1    3
2    2
5    1
4    1
dtype: int64

### apply

In [94]:
# Apply np.cumsum to each column: a running total down the rows.
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.824974,5,
2013-01-02,1.717532,1.30838,-0.357504,10,1.0
2013-01-03,1.640024,0.544656,-2.550183,15,3.0
2013-01-04,1.921158,-0.224412,-3.509166,20,6.0
2013-01-05,2.784183,-0.268576,-3.269002,25,10.0
2013-01-06,3.104509,0.394326,-3.071568,30,15.0


In [95]:
# Range of each column: the largest value minus the smallest value.
df.apply(lambda col: col.max() - col.min())

A    1.795040
B    2.077448
C    3.017653
D    0.000000
F    4.000000
dtype: float64