In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### 分位数与桶分析

In [2]:
# Two columns, each holding 1000 draws from the standard normal distribution.
frame = pd.DataFrame(
    {column: np.random.randn(1000) for column in ('data1', 'data2')}
)
# Cut data1 into 4 bins; preview the bin assigned to the first ten rows.
quartiles = pd.cut(frame['data1'], bins=4)
quartiles.head(10)

0    (-1.282, 0.0959]
1     (0.0959, 1.474]
2    (-1.282, 0.0959]
3      (1.474, 2.852]
4    (-2.666, -1.282]
5     (0.0959, 1.474]
6    (-1.282, 0.0959]
7     (0.0959, 1.474]
8     (0.0959, 1.474]
9      (1.474, 2.852]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.666, -1.282] < (-1.282, 0.0959] < (0.0959, 1.474] < (1.474, 2.852]]

In [3]:
def get_stats(group):
    """Summarise ``group`` as a dict with keys 'min', 'max', 'count' and 'mean'.

    Each value is the result of calling the same-named method on ``group``.
    """
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}

# Group data2 by the bin that data1 fell into, then tabulate the
# per-bin statistics with one row per bin and one column per statistic.
grouped = frame['data2'].groupby(quartiles)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.666, -1.282]",-2.013013,2.51143,99.0,-0.004946
"(-1.282, 0.0959]",-3.760824,2.772838,445.0,-0.002909
"(0.0959, 1.474]",-2.942111,3.542781,385.0,0.007129
"(1.474, 2.852]",-2.25138,2.792812,71.0,-0.064327


In [4]:
# Split data1 into 10 quantile buckets; labels=False returns the bucket
# number for each row rather than an interval.
grouping = pd.qcut(frame['data1'], q=10, labels=False)
grouped = frame['data2'].groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-2.013013,2.51143,100.0,0.009398
1,-3.760824,2.772838,100.0,-0.103921
2,-2.125327,2.427292,100.0,-0.013019
3,-2.344342,2.117623,100.0,-0.013584
4,-2.331338,2.391834,100.0,0.088147
5,-2.942111,1.779084,100.0,-0.015261
6,-1.92299,2.752777,100.0,-0.082236
7,-2.315468,3.542781,100.0,0.060979
8,-2.814398,2.022416,100.0,0.012922
9,-2.25138,3.056323,100.0,0.020511


### 使用指定分组值填充缺失值

In [5]:
# Six random values, with every other entry (positions 0, 2, 4) set to NaN.
s = pd.Series(np.random.randn(6))
s.iloc[::2] = np.nan

# Fill the NA entries with the mean of the remaining values.
s.fillna(value=s.mean())

0   -0.185305
1    0.632230
2   -0.185305
3   -0.590040
4   -0.185305
5   -0.598105
dtype: float64

In [6]:
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
# One random value per state, with three of them knocked out as missing.
data = pd.Series(np.random.randn(8), index=states)
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan

# First four states are labelled 'East', last four 'West'.
group_key = [region for region in ('East', 'West') for _ in range(4)]
data.groupby(group_key).mean()

East    0.002137
West   -1.396665
dtype: float64

In [7]:
def fill_mean(g):
    """Return ``g`` with its NA entries replaced by the mean of ``g``."""
    return g.fillna(g.mean())


# Fill the NA values of every group with that group's own mean.
# group_keys=False keeps the state names as the only index level; newer
# pandas releases otherwise prepend the group label to the result index.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio          0.251238
New York     -0.870607
Vermont       0.002137
Florida       0.625779
Oregon       -0.753870
Nevada       -1.396665
California   -2.039459
Idaho        -1.396665
dtype: float64

In [8]:
fill_values = {'East': 0.5, 'West': -1}  # predefined fill value for each group


def fill_func(g):
    """Return ``g`` with its NA entries replaced by ``fill_values[g.name]``.

    ``g.name`` is the label of the group currently being processed.
    """
    return g.fillna(fill_values[g.name])


# group_keys=False keeps the state names as the only index level; newer
# pandas releases otherwise prepend the group label to the result index.
data.groupby(group_key, group_keys=False).apply(fill_func)

Ohio          0.251238
New York     -0.870607
Vermont       0.500000
Florida       0.625779
Oregon       -0.753870
Nevada       -1.000000
California   -2.039459
Idaho        -1.000000
dtype: float64

### 随机采样与排列

In [9]:
# Build a deck of playing cards (no jokers).
# Suits: H = Hearts, S = Spades, C = Clubs, D = Diamonds
suits = ['H', 'S', 'C', 'D']
# Ranks in the order A, 2..10, J, Q, K so they line up with card_val below.
base_names = ['A'] + list(range(2, 11)) + ['J', 'Q', 'K']
# Ace is worth 1, number cards their face value, and J, Q, K are all worth 10;
# the 13 values are repeated once per suit.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)

# Card labels are the rank followed by the suit letter, e.g. 'AH', '10S', 'KD'.
cards = [str(name) + suit for suit in suits for name in base_names]

deck = pd.Series(card_val, index=cards)

In [10]:
# Randomly draw cards from a deck (five by default).
def draw(deck, n=5):
    """Return ``n`` randomly sampled entries of ``deck``."""
    hand = deck.sample(n=n)
    return hand

# Deal one hand using the default size of five cards.
draw(deck, n=5)

8C     8
3H     3
AD     1
JD    10
QH    10
dtype: int64

In [11]:
# Randomly draw two cards from every suit.
def get_suit(card):
    """Return the last character of a card label, which is its suit."""
    return card[-1]


deck.groupby(get_suit).apply(draw, n=2)
# Alternative: deck.groupby(get_suit, group_keys=False).apply(draw, n=2)
# gives the same cards without the outer suit level in the result index.

C  9C      9
   8C      8
D  10D    10
   8D      8
H  3H      3
   9H      9
S  2S      2
   KS     10
dtype: int64

### 分组加权平均值与相关系数

In [12]:
df = pd.DataFrame({
    'category': list('aaaabbbb'),
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})

grouped = df.groupby(by='category')


def get_wavg(g):
    """Weighted average of the 'data' column, weighted by the 'weights' column."""
    return np.average(g['data'], weights=g['weights'])


grouped.apply(get_wavg)

category
a    0.048426
b   -0.584961
dtype: float64

In [13]:
# Load the price table: the first column becomes the index and is parsed as dates.
close_px = pd.read_csv(
    '../data/examples/stock_px_2.csv',
    index_col=0,
    parse_dates=True,
)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    2214 non-null   float64
 1   MSFT    2214 non-null   float64
 2   XOM     2214 non-null   float64
 3   SPX     2214 non-null   float64
dtypes: float64(4)
memory usage: 86.5 KB


In [14]:
# Show the last four rows of the price table.
close_px.tail(4)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2011-10-11,400.29,27.0,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66
2011-10-14,422.0,27.27,78.11,1224.58


In [15]:
def spx_corr(x):
    """Correlate every column of ``x`` with its 'SPX' column."""
    return x.corrwith(x['SPX'])


def get_year(x):
    """Return the ``year`` attribute of ``x``; used as the grouping key."""
    return x.year


# Row-over-row percent changes, with rows containing NA dropped.
rets = close_px.pct_change().dropna()
# Passing a function to groupby applies it to each index label.
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [16]:
# Per-year correlation between the 'AAPL' and 'MSFT' columns.
by_year.apply(lambda year_rets: year_rets['AAPL'].corr(year_rets['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

### 分组线性回归

In [17]:
import statsmodels.api as sm
def regress(data, yvar, xvars):
    """Run an ordinary least squares (OLS) regression on one chunk of data.

    Parameters
    ----------
    data : DataFrame
        Chunk containing both the response and the explanatory columns.
    yvar : label
        Column of ``data`` used as the response.
    xvars : list of labels, or a single string label
        Column(s) of ``data`` used as regressors. A constant column named
        'intercept' is appended to them.

    Returns
    -------
    The ``params`` attribute of the fitted ``sm.OLS`` result.
    """
    if isinstance(xvars, str):
        # Accept a bare column name; a list is needed to select a DataFrame.
        xvars = [xvars]
    Y = data[yvar]
    # assign() builds a new frame, so the intercept column is never written
    # into a selection taken from ``data`` (avoids SettingWithCopyWarning and
    # guarantees the caller's frame is left untouched).
    X = data[xvars].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

In [18]:
# For each year, regress the 'AAPL' column on the 'SPX' column (plus intercept).
by_year.apply(regress, yvar='AAPL', xvars=['SPX'])

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514
