# 1、回测区间：回归(2014-01-01~2016-01-01)
#           回测2016-01-01  ~  2018-01-01
# 2、选股：
#   选股区间：沪深300
#   选股因子：经过因子分析之后的若干因子，可以不知方向
#   选股权重：回归训练的权重
#   数据处理：缺失值、去极值、标准化、市值中心化处理（防止选股集中）
# 3、调仓周期：
#   调仓：每月进行一次调仓
#   交易规则：卖出已持有的股票
#          买入新的股票池当中的股票

In [96]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [97]:
dates = get_trading_dates(start_date='2016-01-01',end_date='2016-07-01')

In [98]:
#每天日期--》每月月末
#每月最后一个交易日，按月计算收益率
month_date = []
for i in range(len(dates)-1):
    if dates[i].year != dates[i+1].year:
        month_date.append(dates[i])
    elif dates[i].month != dates[i+1].month:
        month_date.append(dates[i])

#把最后一个交易日加入
month_date.append(dates[-1])

In [99]:
#获取沪深300 的股票列表
stocks = index_components('000300.XSHG')

###2、特征值都是该月的因子数据（避免下个月在日期列表当中不存在）

In [100]:
all_data = pd.DataFrame()
#获得沪深300的股票列表
for date in month_date:
    q = query(
        fundamentals.eod_derivative_indicator.pe_ratio, 
        fundamentals.eod_derivative_indicator.pb_ratio, 
        fundamentals.eod_derivative_indicator.market_cap, 
        fundamentals.financial_indicator.ev, 
        fundamentals.financial_indicator.return_on_asset_net_profit, 
        fundamentals.financial_indicator.du_return_on_equity, 
        fundamentals.financial_indicator.earnings_per_share, 
        fundamentals.income_statement.revenue, 
        fundamentals.income_statement.total_expense
    ).filter(fundamentals.stockcode.in_(stocks))
    
    #查询因子数据
    fund = get_fundamentals(q,entry_date=date).iloc[:,0,:]
    
    fund['date'] = date
    
    #进行每月数据拼接
    all_data = pd.concat([all_data,fund])



In [101]:
#删除缺失值
all_data = all_data.dropna()

In [102]:
all_data['next_month_return'] = np.nan

#1\
获取价格数据计算对应的收益率

In [103]:
all_price = pd.DataFrame()
for date in month_date:    
    price = get_price(stocks,start_date=date,end_date=date,fields='close')
    all_price = pd.concat([all_price,price])

In [104]:
all_price = all_price.T

#2计算收益率

In [105]:
for i in range(len(all_price.columns)-1):
    #利用每次后一个月的收盘价-循环的这个月的收盘价/循环的这个月的收盘价
    all_price.iloc[:,i] = all_price.iloc[:,i+1]/all_price.iloc[:,i] - 1

In [106]:
#3将收益率填充到因子对应的下个月收益率列当中

In [107]:
for i in range(len(all_data)):
    #每个样本填充对应收益率
    stock = all_data.index[i]
    
    date = all_data.ix[i,'date']
    
    #在all_price里面寻找收益率
    if stock in all_price.index and date in all_price.columns:
        all_data.ix[i,'next_month_return'] = all_price.loc[stock,date]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':


In [108]:
all_data = all_data.dropna()

特征值和目标值处理

In [109]:
y = all_data['next_month_return']
x = all_data.drop(['next_month_return','date'],axis=1)
x_market_cap = x['market_cap']

In [110]:
#自己实现标准化
def stand(factor):
    mean = factor.mean()
    std = factor.std()
    
    return (factor - mean)/std

#3、去极值的方法
def mad(factor):
    #1、
    me = np.median(factor)
    
    #2、3、
    mad = np.median(abs(factor - me))
    
    up = me + (3 * 1.4826 * mad)
    down = me - (3 * 1.4826 * mad)
    
    #利用3倍中位数的值去极值
    factor = np.where(factor > up,up,factor)
    factor = np.where(factor < down,down,factor)
    
    return factor

In [111]:
for name in x.columns:
    x[name] = mad(x[name])
    x[name] = stand(x[name])

In [112]:
x.head()

Unnamed: 0,earnings_per_share,ev,total_expense,revenue,market_cap,pb_ratio,return_on_asset_net_profit,pe_ratio,du_return_on_equity
000001.XSHE,1.85628,1.81305,1.55742,1.57723,1.88921,-1.0786,-0.633706,-0.843391,1.05891
000002.XSHE,1.04653,1.77468,1.55742,1.57723,1.88921,-0.361062,-0.25229,-0.502819,0.30657
000063.XSHE,1.07613,0.127925,1.55742,1.57723,0.101902,-0.633272,0.0584776,-0.352897,0.695783
000069.XSHE,0.130761,-0.104262,0.627544,0.669116,-0.071852,-0.881046,0.164656,-0.647053,0.443895
000100.XSHE,-0.410345,-0.0430015,1.55742,1.57723,-0.285442,-0.704076,0.0138433,-0.406361,0.640836


In [113]:
#市值中性化处理
#特征值，市值因子（不能是处理过的）,目标其他因子
for name in x.columns:
    if name == 'market_cap':
        continue
        
    #准备特征值、目标值
    y_factor = x[name]
    
    #线性回归方程建立
    lr = LinearRegression()
    
    lr.fit(x_market_cap.values.reshape(-1,1),y_factor)
    
    #得出真实值与预测值之间的误差当作新的因子值
    y_predict = lr.predict(x_market_cap.values.reshape(-1,1))
    
    #得出真实值与预测之间的误差当做新的因子值
    x[name] = y_factor - y_predict

In [114]:
#收益率目标值y
y = stand(y)

In [115]:
#建立特征值因子数据（处理过的）与目标值（标准化）下期收益率之间的回归方程
x.head()

Unnamed: 0,earnings_per_share,ev,total_expense,revenue,market_cap,pb_ratio,return_on_asset_net_profit,pe_ratio,du_return_on_equity
000001.XSHE,1.37999,0.170909,0.558199,0.46336,1.88921,-0.448901,-0.515504,-0.456318,0.768662
000002.XSHE,0.570242,0.132537,0.558199,0.46336,1.88921,0.268635,-0.134087,-0.115746,0.0163206
000063.XSHE,1.05044,0.0393489,1.50352,1.51714,0.101902,-0.599306,0.0648534,-0.332019,0.680127
000069.XSHE,0.148876,-0.0418063,0.665547,0.711479,-0.071852,-0.904995,0.16016,-0.661775,0.454934
000100.XSHE,-0.338382,0.205111,1.70839,1.74552,-0.285442,-0.799217,-0.004016,-0.464845,0.68469


In [116]:
lr = LinearRegression()

In [117]:
lr.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [118]:
lr.coef_

array([ 0.23539855, -0.00391808, -0.24177885,  0.14179757,  0.03446925,
        0.08535851,  0.10545642,  0.02562047, -0.38349568])