# 汇总和计算描述统计


In [14]:
import os

import numpy as np
import pandas as pd
import tushare as ts
from pandas import DataFrame
from pandas import Series

In [5]:
# Small demo frame with missing values: row 'c' is entirely NaN and
# row 'a' has no value in column 'two'.
df = DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=list('abcd'),
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [6]:
df.sum() # default axis=0: reduces down the rows (top to bottom), i.e. sums each column.


one    9.25
two   -5.80
dtype: float64

In [7]:
df.sum(axis=1) # axis=1: sums across the columns of each row; NaN is skipped, so the all-NaN row 'c' gives 0.

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

## 约简方法常用选项


`axis`：  约简的轴方向。0=沿行方向从上到下（对每列约简），1=沿列方向从左到右（对每行约简）。

`skipna`：排除缺失值，默认为True。

`level`： 如果使用层次化索引（MultiIndex），则根据level分组约简（该参数在 pandas 2.0 中已移除，新版本请改用 `groupby(level=...)`）。

In [9]:
df.idxmax() # row label of the largest element in each column

one    b
two    d
dtype: object

In [10]:
df.cumsum() # cumulative sum; the default is axis=0, so it accumulates down the rows of each column.


# Walk-through for column 'one':
# one['a'] = 1.4
# one['b'] = one['a'] + 7.1 = 1.4 + 7.1 = 8.5
# one['c'] stays NaN (the missing entry is left untouched)
# one['d'] = one['b'] + 0.75 = 8.5 + 0.75 = 9.25 (one['c'] is NaN, so the sum carries on from 'b')

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [11]:
df.describe() # per-column summary statistics; count only includes non-NaN entries (3 for 'one', 2 for 'two')

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [12]:
# describe() also works on non-numeric data.
obj = Series(list('aabc') * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

### 描述和汇总统计

||作用|
|-|-|
|1|**基本属性**
|`count`|非NA值的数量
|`describe`|针对Series和DataFrame列计算汇总统计
|`min/max`|最小/最大值
|`argmin/argmax`|计算能够获取最小/最大值的索引位置（整数）
|`idxmin/idxmax`|计算能够获取最小/最大值的索引值
|`quantile`|计算样本的分位数（0到1）
|`sum`|求和
|`mean`|求平均
|`median`|中位数
|`mad`|根据平均值计算平均绝对离差（pandas 1.5 起弃用，2.0 起已移除）
|`var`|样本的方差
|`std`|样本的标准差
|`skew`|样本值的偏度（三阶矩）
|`kurt`|样本值的峰度（四阶矩）
|`cumsum`|累计和
|`cummin/cummax`|累计最小/最大值
|`cumprod`|累计积
|`diff`|计算一阶差分（对时间序列很有用）
|`pct_change`|计算百分数变化

## 相关系数与协方差

In [15]:
# Legacy tushare interface: each call prints a notice that it will stop being
# updated (see the output below). The same three names are re-assigned from
# the Pro API in a later cell.
gsyh = ts.get_hist_data('601398',start='2017-01-01',end='2017-06-30') # ICBC (Industrial and Commercial Bank of China)
jsyh = ts.get_hist_data('601939',start='2017-01-01',end='2017-06-30') # CCB (China Construction Bank)
jtyh = ts.get_hist_data('601328',start='2017-01-01',end='2017-06-30') # Bank of Communications

本接口即将停止更新，请尽快使用Pro版接口：https://tushare.pro/document/2
本接口即将停止更新，请尽快使用Pro版接口：https://tushare.pro/document/2
本接口即将停止更新，请尽快使用Pro版接口：https://tushare.pro/document/2


In [34]:
# The legacy interface used above is obsolete and no longer works; use the Pro API instead.
# The Pro API needs a personal token: registering a tushare account gives 100 initial
# points plus 20 for completing the profile, which is enough for the most basic datasets.
# The token is read from the TUSHARE_TOKEN environment variable so that it is never
# hard-coded (and accidentally committed) in the notebook; a missing variable raises KeyError.
ts.set_token(os.environ['TUSHARE_TOKEN'])
pro = ts.pro_api()


In [50]:
# Fetch daily bars for the three banks over one shared date window.
# Ticker order: 601398.SH (ICBC), 601939.SH (CCB), 601328.SH (Bank of Communications).
gsyh, jsyh, jtyh = (
    pro.daily(ts_code=code, start_date='20200101', end_date='20200110')
    for code in ('601398.SH', '601939.SH', '601328.SH')
)


In [53]:
# Inspect the ICBC daily bars fetched through pro.daily.
gsyh

Unnamed: 0,ts_code,trade_date,open,high,low,close,pre_close,change,pct_chg,vol,amount
0,601398.SH,20200110,5.91,5.93,5.86,5.91,5.91,0.0,0.0,978377.73,576580.178
1,601398.SH,20200109,5.95,5.96,5.88,5.91,5.91,0.0,0.0,1377134.29,813806.686
2,601398.SH,20200108,5.96,5.97,5.9,5.91,6.01,-0.1,-1.6639,1585590.9,940552.725
3,601398.SH,20200107,5.98,6.04,5.98,6.01,5.97,0.04,0.67,1168043.53,701615.827
4,601398.SH,20200106,5.96,6.05,5.95,5.97,5.99,-0.02,-0.3339,2265097.05,1359917.424
5,601398.SH,20200103,5.97,6.02,5.96,5.99,5.97,0.02,0.335,1522130.47,911951.562
6,601398.SH,20200102,5.92,6.03,5.91,5.97,5.88,0.09,1.5306,2349493.97,1404442.753


In [54]:
# Gather each bank's daily price change into one frame (one column per bank),
# then reverse the row order before previewing the first rows.
# NOTE(review): the three series are matched on their positional index, which
# presumes every frame covers exactly the same trade dates — confirm, or align
# on 'trade_date' instead.
change_df = DataFrame({
    'gsyh': gsyh['change'],
    'jsyh': jsyh['change'],
    'jtyh': jtyh['change'],
}).iloc[::-1]
change_df.head()

Unnamed: 0,gsyh,jsyh,jtyh
6,0.09,0.08,0.05
5,0.02,0.0,0.01
4,-0.02,-0.07,-0.03
3,0.04,-0.01,0.03
2,-0.1,-0.09,-0.05


In [55]:
change_df.corr() # correlation matrix

Unnamed: 0,gsyh,jsyh,jtyh
gsyh,1.0,0.884864,0.953617
jsyh,0.884864,1.0,0.90879
jtyh,0.953617,0.90879,1.0


In [56]:
change_df.corrwith(change_df.gsyh) # correlation of every column with one given column (here gsyh)


gsyh    1.000000
jsyh    0.884864
jtyh    0.953617
dtype: float64

## 唯一值、值计数以及成员资格

- `isin`：        计算一个表示“Series各值是否包含于传入的值序列中”的布尔型数组
- `unique`：      计算Series中的唯一值数组，按发现的顺序返回。
- `value_counts`：返回一个Series，其索引为唯一值，其值为各唯一值出现的次数（频数），按计数值降序排列。

In [57]:
# Collect the distinct values of the series.
obj = Series(list('cadaabbcc'))
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [58]:
obj.value_counts() # number of occurrences of each value


c    3
a    3
b    2
d    1
dtype: int64

In [59]:
# Count occurrences without sorting the result by count.
# The top-level `pd.value_counts` function is deprecated in pandas 2.1+,
# so call the Series method directly instead.
obj.value_counts(sort=False)


c    3
b    2
d    1
a    3
dtype: int64