# 分组级运算和转换

In [28]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas import DataFrame, Series


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.772639,-3.352329
1,a,two,1.533222,1.422569
2,b,one,0.159989,-0.491793
3,b,two,-2.341067,-0.098958
4,a,one,-0.073373,1.279004


In [3]:
# Per-group means of the numeric columns only. numeric_only=True keeps the
# string column 'key2' out of the aggregation (pandas >= 2.0 raises a
# TypeError on it otherwise); add_prefix renames the columns to 'mean_*'.
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.077496,-0.216919
b,-1.090539,-0.295376


In [4]:
pd.merge(df, k1_means, left_on='key1', right_index=True)


Unnamed: 0,key1,key2,data1,data2,mean_data1,mean_data2
0,a,one,1.772639,-3.352329,1.077496,-0.216919
1,a,two,1.533222,1.422569,1.077496,-0.216919
4,a,one,-0.073373,1.279004,1.077496,-0.216919
2,b,one,0.159989,-0.491793,-1.090539,-0.295376
3,b,two,-2.341067,-0.098958,-1.090539,-0.295376


In [4]:
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,1.943775,2.342499,1.484798,0.761255,1.72854
Steve,-0.190772,-0.899242,-0.495156,2.336061,0.185272
Wes,-0.113463,-0.565391,-0.331119,1.467056,1.723454
Jim,0.363096,-1.302187,0.917975,-1.260895,0.993835
Travis,-0.191768,-0.042166,0.553096,1.42353,0.491962


In [6]:
# 自定义每行名称
key = ['one', 'two', 'one', 'two', 'one'] # 每一行的名称
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.546181,0.578314,0.568925,1.217281,1.314652
two,0.086162,-1.100714,0.21141,0.537583,0.589554


## grouped.transform() - 返回与原对象形状相同的结果，并用各分组的聚合值填充对应位置。

In [7]:
# transform returns an object shaped like the input, with every row filled by
# the aggregate of the group it belongs to. The string alias 'mean' selects the
# built-in grouped mean; recent pandas versions warn when a NumPy callable such
# as np.mean is passed here instead.
people.groupby(key).transform('mean')


Unnamed: 0,a,b,c,d,e
Joe,0.546181,0.578314,0.568925,1.217281,1.314652
Steve,0.086162,-1.100714,0.21141,0.537583,0.589554
Wes,0.546181,0.578314,0.568925,1.217281,1.314652
Jim,0.086162,-1.100714,0.21141,0.537583,0.589554
Travis,0.546181,0.578314,0.568925,1.217281,1.314652


In [8]:
# 自定义函数，并用transform作用于每一列。
def demean(arr):
    """Return *arr* shifted so that its mean becomes zero."""
    center = arr.mean()
    return arr - center

demeaned = people.groupby(key).transform(demean) # 
demeaned

# cc：这个方法非常好用

Unnamed: 0,a,b,c,d,e
Joe,1.397594,1.764185,0.915873,-0.456025,0.413888
Steve,-0.276934,0.201473,-0.706565,1.798478,-0.404281
Wes,-0.659645,-1.143705,-0.900045,0.249776,0.408802
Jim,0.276934,-0.201473,0.706565,-1.798478,0.404281
Travis,-0.73795,-0.62048,-0.015829,0.20625,-0.82269


In [9]:
demeaned.groupby(key).mean() # 因为前面每个值都减去了平均值，所以应该是0。显示不为0是因为浮点数计算误差。


Unnamed: 0,a,b,c,d,e
one,-7.401487e-17,1.480297e-16,0.0,1.850372e-17,0.0
two,0.0,0.0,-5.5511150000000004e-17,0.0,0.0


## grouped.apply()

### 一般性的用法：“拆分－应用－合并”

In [10]:
tips = pd.read_csv('data/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill'] # 新加一列，小费与账单金额的比例。
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [17]:
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Rows come back in ascending order of ``column`` (largest last).

    Parameters
    ----------
    df : DataFrame to select from.
    n : number of rows to keep; 0 yields an empty frame.
    column : label of the column to rank by.
    """
    # tail(n) rather than the slice [-n:]: [-0:] selects the *whole* frame,
    # whereas tail(0) is empty. For any other n the two are equivalent.
    return df.sort_values(by=column).tail(n)
top(tips, n=3)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [18]:
# Group by smoker status, then call `top` on each group to get the two rows
# with the highest tip percentage per group.

tips.groupby('smoker').apply(top, n=2, column='tip_pct') 

# Note: the extra arguments of `top` (n, column) are passed straight through apply.
# Note: DataFrame.apply is NOT the "single group" counterpart of this call: it
# passes each column to the function as a Series (the next cell fails for that
# reason). To treat the whole frame as one group, call top(tips, ...) directly.

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [19]:
tips.apply(top, n=2, column='tip_pct') 
# Why this fails: DataFrame.apply calls `top` once per column, handing it a
# Series, and Series.sort_values() takes no `by` argument, which is the
# TypeError shown below. Use top(tips, n=2, column='tip_pct') to rank the
# whole frame.

TypeError: sort_values() got an unexpected keyword argument 'by'

In [21]:
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [14]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587


In [15]:
result.unstack('smoker') # 把列添加到索引，作用在smoker外。


       smoker
count  No        5.000000
mean   No        0.134633
std    No        0.043359
min    No        0.059447
25%    No        0.139780
50%    No        0.146808
75%    No        0.160542
max    No        0.166587
dtype: float64

In [16]:
result.apply(lambda x: x.describe()) # result.describe()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587
std,,,,,,,,
min,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587
25%,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587
50%,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587
75%,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587
max,5.0,0.134633,0.043359,0.059447,0.13978,0.146808,0.160542,0.166587


### group_keys=False - 参数，用于禁止把分组键加入结果的索引

In [22]:
tips.groupby('smoker', group_keys=False).apply(top) # do not add the group key as an index level

# The result carries only the original row labels (no outer 'smoker' level);
# unlike reset_index(), the original row index is preserved.

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [23]:
# cc：如果不禁止，就构成了多重索引
tips.groupby('smoker').apply(top) # 构成多重索引

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


### pd.cut() / pd.qcut() - 分位数和桶分析

In [25]:
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4) # 切4份
factor[:5] # 前5个元素所在区间

0     (-0.00717, 1.608]
1    (-1.623, -0.00717]
2     (-0.00717, 1.608]
3     (-0.00717, 1.608]
4        (1.608, 3.224]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.244, -1.623] < (-1.623, -0.00717] < (-0.00717, 1.608] < (1.608, 3.224]]

In [29]:
def get_stats(group):
    """Summarise *group* as a dict holding its min, max, count and mean."""
    stat_names = ('min', 'max', 'count', 'mean')
    return {stat: getattr(group, stat)() for stat in stat_names}
grouped = frame.data2.groupby(factor) # 根据data1的分段，对data2进行分组。
# cc：首先，到这一步的用法就很有用了！
# factor 是创建好的区间，虽然没有在frame中，但是可以按照factor进行分组；

result = grouped.apply(get_stats)
result
result.unstack()


data1                    
(-3.244, -1.623]    min       -1.596762
                    max        2.388572
                    count     60.000000
                    mean       0.103069
(-1.623, -0.00717]  min       -3.239306
                    max        2.314984
                    count    443.000000
                    mean      -0.064414
(-0.00717, 1.608]   min       -2.709442
                    max        2.840688
                    count    447.000000
                    mean       0.058181
(1.608, 3.224]      min       -2.294837
                    max        3.533058
                    count     50.000000
                    mean       0.050865
Name: data2, dtype: float64

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.244, -1.623]",-1.596762,2.388572,60.0,0.103069
"(-1.623, -0.00717]",-3.239306,2.314984,443.0,-0.064414
"(-0.00717, 1.608]",-2.709442,2.840688,447.0,0.058181
"(1.608, 3.224]",-2.294837,3.533058,50.0,0.050865


In [21]:
# Cut data1 into 10 equal-sized quantile buckets labelled 'A'..'J';
# labels=False would return integer bucket codes instead.
grouping = pd.qcut(frame.data1, 10, labels=list('ABCDEFGHIJ'))
grouping.head() # bucket label of the first five elements

0    E
1    E
2    H
3    A
4    G
Name: data1, dtype: category
Categories (10, object): ['A' < 'B' < 'C' < 'D' ... 'G' < 'H' < 'I' < 'J']

In [22]:
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,min,max,count,mean
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,-2.351596,2.521797,100.0,0.018743
B,-2.637367,2.556429,100.0,0.045279
C,-2.67421,2.040353,100.0,0.126277
D,-2.497992,1.999256,100.0,-0.206787
E,-2.39899,2.518462,100.0,0.005545
F,-2.180804,2.424534,100.0,0.085148
G,-2.207227,2.367427,100.0,0.063048
H,-2.975735,2.044182,100.0,-0.141416
I,-2.054868,2.339424,100.0,0.081223
J,-2.574812,2.691533,100.0,0.004016


### fillna - 用特定于分组的值填充缺失值

In [23]:
s = Series(np.random.randn(6))
s[::2] = np.nan
s.fillna(s.mean()) # 用平均数填充缺失值

0    0.691767
1    0.685475
2    0.691767
3    0.666042
4    0.691767
5    0.723786
dtype: float64

In [24]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4 # 前4个东部州，后4个西部州。
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio          0.057784
New York      0.975776
Vermont            NaN
Florida       1.087341
Oregon       -0.609988
Nevada             NaN
California   -0.259613
Idaho              NaN
dtype: float64

In [25]:
data.groupby(group_key).mean() # 非NA求平均值


East    0.706967
West   -0.434800
dtype: float64

In [26]:
def fill_mean(g):
    """Return *g* with its missing values replaced by the mean of *g*."""
    return g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean) # 分组之后用每组的平均值填充

Ohio          0.057784
New York      0.975776
Vermont       0.706967
Florida       1.087341
Oregon       -0.609988
Nevada       -0.434800
California   -0.259613
Idaho        -0.434800
dtype: float64

In [28]:
fill_values = {'East': 0.5, 'West': -1} # fill value to use for each group name

def fill_func(g):
    """Fill missing values of group *g* with the value registered for ``g.name``.

    Raises KeyError when ``g.name`` has no entry in ``fill_values``.
    """
    return g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

Ohio          0.057784
New York      0.975776
Vermont       0.500000
Florida       1.087341
Oregon       -0.609988
Nevada       -1.000000
California   -0.259613
Idaho        -1.000000
dtype: float64

### np.random.permutation() - 随机采样和排列

In [30]:
# Suits: Hearts, Spades, Clubs, Diamonds.
suits = ['H', 'S', 'C', 'D']
# Point values for one suit (A=1, 2-10 at face value, three face cards worth
# 10 each), repeated once per suit. In Python 3 range() is a lazy sequence, so
# it has to be converted with list() before it can be concatenated.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Card labels are rank + suit, e.g. 'AH' or '10S'. Iterate the suits list
# itself so labels and values cannot drift apart.
cards = [str(rank) + suit for suit in suits for rank in base_names]
deck = Series(card_val, index=cards)
deck.head() # first few cards with their point values

AH    1
2H    2
3H    3
4H    4
5H    5
dtype: int64

In [34]:
(list(range(1, 11)) + [10] * 3)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10]

In [33]:
# Suits: Hearts, Spades, Clubs, Diamonds.
suits = ['H', 'S', 'C', 'D']
# Point values for one suit (A=1, 2-10 at face value, three face cards worth
# 10 each), repeated once per suit. In Python 3 range() is a lazy sequence, so
# it has to be converted with list() before it can be concatenated.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Card labels are rank + suit, e.g. 'AH' or '10S'. Iterate the suits list
# itself so labels and values cannot drift apart.
cards = [str(rank) + suit for suit in suits for rank in base_names]
deck = Series(card_val, index=cards)
deck.head() # first few cards with their point values

AH    1
2H    2
3H    3
4H    4
5H    5
dtype: int64

In [39]:
def draw(deck, n=5):
    """Pick *n* random entries from *deck* without replacement."""
    shuffled = np.random.permutation(len(deck))  # random ordering of positions
    picked = shuffled[:n]
    return deck.take(picked)
draw(deck)

# cc：又遇到这个写法了：np.random.permutation(len(deck))，作用是：随机排列一个序列，返回一个排列的序列
# 也就是洗牌啊~

9D    9
6D    6
AH    1
4H    4
7C    7
dtype: int64

In [40]:
# Goal of this cell: draw two random cards from every suit.
def get_suit(card):
    """Return the suit of *card*, i.e. the last character of its label."""
    return card[-1]
# A function passed to groupby is called on each index label; the last
# character of every label is the suit, so this groups the cards by suit.
deck.groupby(get_suit).apply(draw, n=2)

# cc：这也太。。。厉害了！？？？？

C  AC     1
   6C     6
D  6D     6
   4D     4
H  9H     9
   QH    10
S  3S     3
   8S     8
dtype: int64

In [38]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2) # 效果一样，但是不用多重索引。


6C      6
QC     10
6D      6
KD     10
6H      6
10H    10
8S      8
3S      3
dtype: int64

## 分组加权平均数和相关系数

In [39]:
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,-2.426934,0.090998
1,a,0.654896,0.80836
2,a,0.407956,0.888513
3,a,0.888946,0.036545
4,b,-0.828638,0.376539
5,b,1.090013,0.612026
6,b,0.581775,0.769727
7,b,-1.070719,0.051034


In [40]:
grouped = df.groupby('category')


In [41]:
def get_wavg(g):
    """Return the mean of ``g['data']`` weighted by ``g['weights']``.

    np.average divides by the sum of the weights, so they need not sum to 1.
    """
    return np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg) # 分组计算

category
a    0.385607
b    0.413561
dtype: float64

In [50]:
close_px = pd.read_csv('data/stock_px.csv', parse_dates=True, index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 1990-02-01 to 2011-10-14
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AA      5472 non-null   float64
 1   AAPL    5472 non-null   float64
 2   GE      5472 non-null   float64
 3   IBM     5472 non-null   float64
 4   JNJ     5472 non-null   float64
 5   MSFT    5472 non-null   float64
 6   PEP     5471 non-null   float64
 7   SPX     5472 non-null   float64
 8   XOM     5472 non-null   float64
dtypes: float64(9)
memory usage: 427.5 KB


In [51]:
close_px.head()


Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.0,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33


In [52]:
close_px.pct_change()
# pct_change gives the fractional change of each element relative to the
# previous row; with periods=n the comparison is against the row n steps
# earlier (not against the previous n rows taken together).

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,,,,,,,,,
1990-02-02,0.012048,0.017812,0.000000,0.005956,0.023419,0.000000,0.008278,0.006478,0.019608
1990-02-05,0.005952,0.022500,0.000000,0.025459,-0.006865,0.000000,-0.006568,0.002810,0.001603
1990-02-06,-0.011834,-0.007335,0.003484,0.013857,-0.004608,0.000000,0.016529,-0.006599,-0.003200
1990-02-07,0.005988,-0.043103,0.010417,0.021071,0.013889,0.000000,0.003252,0.012407,0.016051
...,...,...,...,...,...,...,...,...,...
2011-10-10,0.039135,0.051406,0.041290,0.023192,0.020592,0.026286,0.013930,0.034125,0.036977
2011-10-11,0.020813,0.029526,0.000000,-0.008681,-0.007295,0.002227,-0.014870,0.000544,-0.000131
2011-10-12,-0.024272,0.004747,0.016109,0.006054,0.005785,-0.001481,0.028712,0.009795,0.011669
2011-10-13,0.004975,0.015515,-0.010976,0.003761,-0.001554,0.008160,-0.005423,-0.002974,-0.010238


In [53]:
rets = close_px.pct_change().dropna() # 扔掉有空数据的行
def spx_corr(x):
    """Return the correlation of every column of *x* with its 'SPX' column."""
    return x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year) # 指定用那个函数去做group
by_year.apply(spx_corr) # 按年分组并计算与SPX的相关系数

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.595024,0.545067,0.752187,0.738361,0.801145,0.586691,0.783168,1.0,0.517586
1991,0.453574,0.365315,0.759607,0.557046,0.646401,0.524225,0.641775,1.0,0.569335
1992,0.39818,0.498732,0.632685,0.262232,0.51574,0.492345,0.473871,1.0,0.318408
1993,0.259069,0.238578,0.447257,0.211269,0.451503,0.425377,0.385089,1.0,0.318952
1994,0.428549,0.26842,0.572996,0.385162,0.372962,0.436585,0.450516,1.0,0.395078
1995,0.291532,0.161829,0.519126,0.41639,0.315733,0.45366,0.413144,1.0,0.368752
1996,0.292344,0.191482,0.750724,0.388497,0.569232,0.564015,0.421477,1.0,0.538736
1997,0.564427,0.211435,0.827512,0.646823,0.703538,0.606171,0.509344,1.0,0.695653
1998,0.533802,0.379883,0.815243,0.623982,0.591988,0.698773,0.494213,1.0,0.369264
1999,0.099033,0.425584,0.710928,0.486167,0.517061,0.631315,0.336593,1.0,0.315383


In [54]:
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT'])) # 计算两个股票之间的相关系数


1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
1995    0.258642
1996    0.147539
1997    0.196144
1998    0.364106
1999    0.329484
2000    0.275298
2001    0.563156
2002    0.571435
2003    0.486262
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

## 面向分组的线性回归

In [55]:
def regress(data, yvar, xvars):
    """Fit an OLS regression of ``data[yvar]`` on ``data[xvars]`` plus a constant.

    Parameters
    ----------
    data : DataFrame holding the response and regressor columns.
    yvar : label of the response column.
    xvars : list of regressor column labels (a single string label is also
        accepted).

    Returns
    -------
    The ``params`` of the fitted model: one entry per regressor plus
    'intercept'.
    """
    if isinstance(xvars, str):
        # data['SPX'] would be a Series, and assigning 'intercept' to a Series
        # adds a row instead of a column.
        xvars = [xvars]
    y = data[yvar]
    # Explicit copy: the constant column is added to our own frame, never to a
    # selection derived from the caller's data (avoids SettingWithCopyWarning).
    x = data[xvars].copy()
    x['intercept'] = 1.
    result = sm.OLS(y, x).fit()
    return result.params

by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,SPX,intercept
1990,1.512772,0.001395
1991,1.187351,0.000396
1992,1.832427,0.000164
1993,1.39047,-0.002657
1994,1.190277,0.001617
1995,0.858818,-0.001423
1996,0.829389,-0.001791
1997,0.749928,-0.001901
1998,1.164582,0.004075
1999,1.384989,0.003273
