In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline   
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
#使用tushare获取数据
import tushare as ts

ts.set_token('8a13051b514249491b029cb46bcf1cd4e059b83bdeb516fc53c9f630')
pro = ts.pro_api()

In [7]:
########## 获取原始数据 ##########
#获取当前交易的股票代码和名称
def get_code():
    df = pro.stock_basic(exchange='', list_status='L')
    #剔除2017年以后上市的新股次新股
    df = df[df['list_date'].apply(int).values<20180101]
    #剔除st股
    df = df[-df['name'].apply(lambda x:x.startswith('*ST'))]
    codes = df.ts_code.values
    return codes


#获取股票行业分类
def get_index_member():
    # 获取申万一级行业分类
    df = pro.index_classify(level='L1', src='SW')
    index_classify = list(zip(df.index_code,df.industry_name))
    # 获取行业包含的股票代码并拼接
    index_member = pro.index_member(index_code=index_classify[0][0])
    index_member['industry_name'] = index_classify[0][1]
    for i,j in index_classify[1:]:
        temp_index_member = pro.index_member(index_code=i)
        temp_index_member['industry_name'] = j
        index_member=pd.concat([index_member,temp_index_member])
    index_member.columns = ['index_code','ts_code','in_date','out_date','industry_name']
    return index_member


#获取所有股票每日指标
def get_daily_basic():
    daily_basic = pro.daily_basic(ts_code=get_code()[0],start_date='20110101',end_date='20201031')
    for code in get_code()[1:]:
        temp_daily_basic = pro.daily_basic(ts_code=code,start_date='20110101',end_date='20201031')
        daily_basic = pd.concat([daily_basic,temp_daily_basic])
        print(code)
    return daily_basic


#获取所有股票日线行情
def get_daily():
    daily = pro.daily(ts_code=get_code()[0],start_date='20110101',end_date='20201031')
    for code in get_code()[1:]:
        temp_daily = pro.daily(ts_code=code,start_date='20110101',end_date='20201031')
        daily = pd.concat([daily,temp_daily])
        print(code)
    return daily


# 获取shibor
def get_shibor():
    shibor = pro.shibor(start_date='20110101', end_date='20201031')
    return shibor

    
def start_get_data():
    daily_basic = get_daily_basic()
    daily_basic.to_csv('./raw_data/股票每日指标.csv',index=False)
    
    index_member = get_index_member()
    index_member.to_csv('./raw_data/股票行业分类.csv',index=False)
    
    daily = get_daily()
    daily.to_csv('./raw_data/股票日线行情.csv',index=False)
    
    shibor = get_shibor()
    shibor.to_csv('./raw_data/shibor.csv',index=False)


仅按截面末交易的股票进入股票池，要计算每月交易天数

In [5]:
########## 数据处理 ##########
# 对数据进行缩尾处理
def winsor_data(data):
    q = data.quantile([0.01,0.99])
    data[data < q.iloc[0]] = q.iloc[0]
    data[data > q.iloc[1]] = q.iloc[1]
    return data


# 数据标准化
def MaxMinNormal(data):
    data = (data - data.mean()) / (data.std())
    return data


def data_process(data):
    # 生成年月日
    data['trade_date'] = pd.to_datetime(data['trade_date'].astype(str))
    data['year'] = data['trade_date'].dt.year
    data['month'] = data['trade_date'].dt.month
    data['day'] = data['trade_date'].dt.day

    # 计算每月个股累积收益率
    lreturn = data[['lreturn','adjustment_return','ts_code','year','month']].groupby(['ts_code','year','month']).sum().reset_index()

    # 每个股票取每月最后一个交易日
    data = data.groupby(['ts_code','year','month']).last().reset_index()
    data = data[data['trade_date'].isin(pd.to_datetime(data.groupby(['year','month']).max().day.reset_index()))]
    data.dropna(inplace=True)
    data.drop(['day','lreturn','adjustment_return'],axis=1, inplace=True)

    # 数据合并
    processed_data = pd.merge(lreturn,data,on=['ts_code','year','month'],how='inner')
    processed_data.drop(['year','month'],axis=1, inplace=True)
    print(data.shape)
    
    # 截面数据去极值和标准化
    processed_data = processed_data.set_index(['trade_date','ts_code']).sort_index()
    for i in processed_data.columns[2:]:
        processed_data[i] = processed_data[i].groupby('trade_date').apply(winsor_data)
        processed_data[i] = processed_data[i].groupby('trade_date').apply(MaxMinNormal)
    
    # T期因子与T+1期收益对齐
    processed_data = processed_data.reset_index().set_index('ts_code')
    processed_data[['lreturn','adjustment_return']] = processed_data[['lreturn','adjustment_return']].groupby('ts_code').shift(-1)
    processed_data.dropna(inplace=True)
    
    # 原始数据与行业数据合并
    index_member = pd.read_csv("./raw_data/股票行业分类.csv")
    trade_date_list=processed_data.trade_date.unique()
    industry_processed_data = pd.merge(processed_data[processed_data.trade_date==trade_date_list[0]],index_member[['ts_code','industry_name']],how='inner',on='ts_code')
    for i in trade_date_list[1:]:
        temp_industry_processed_data = pd.merge(processed_data[processed_data.trade_date==i],index_member[['ts_code','industry_name']],how='inner',on='ts_code')
        industry_processed_data=pd.concat([industry_processed_data,temp_industry_processed_data])
    # 生成行业哑变量
    industry_processed_data = pd.concat([industry_processed_data,pd.get_dummies(industry_processed_data['industry_name'])],axis=1)
    industry_processed_data.drop(['industry_name'],axis=1, inplace=True)
    print(data.shape)
    
    return industry_processed_data.reset_index()

In [6]:
data = pd.read_csv("./processed_data/raw_factor.csv")
industry_processed_data = data_process(data)
industry_processed_data.to_csv("./processed_data/processed_factor.csv",index=False)
industry_processed_data.head()

(114445, 33)
(114445, 33)


Unnamed: 0,index,ts_code,trade_date,lreturn,adjustment_return,pe,pb,return_1m,return_2m,return_3m,return_6m,return_12m,wgt_return_1m,wgt_return_2m,wgt_return_3m,wgt_return_6m,wgt_return_12m,high_low_1m,high_low_2m,high_low_3m,high_low_6m,high_low_12m,std_1m,std_2m,std_3m,std_6m,std_12m,ln_price,turnover_1m,turnover_2m,turnover_3m,turnover_6m,turnover_12m,size_factor,交通运输,休闲服务,传媒,公用事业,农林牧渔,化工,医药生物,商业贸易,国防军工,家用电器,建筑材料,建筑装饰,房地产,有色金属,机械设备,汽车,电子,电气设备,纺织服装,综合,计算机,轻工制造,通信,采掘,钢铁,银行,非银金融,食品饮料
0,0,000028.SZ,2012-12-31,0.002264,0.023903,0.355775,2.236821,0.699988,0.616073,-0.084945,1.444789,2.37832,1.459859,0.619496,0.789587,1.776209,1.426618,0.255222,-0.005747,-0.303117,-0.702886,0.383798,0.838498,0.242534,0.100309,0.264018,0.538356,2.010079,-0.707227,-0.650736,-0.594642,-0.555396,-0.395789,0.218815,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,000090.SZ,2012-12-31,0.024725,0.031091,-0.366738,-0.871772,0.312197,0.58306,0.982273,0.810041,0.512867,-0.005539,0.618362,0.911317,0.185912,0.736401,0.354554,-0.113563,-0.439193,-0.786355,-0.299288,1.376857,0.798041,0.508711,0.360218,1.026215,-0.55511,1.367661,1.024095,0.668366,0.470465,0.638832,-0.432372,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,000099.SZ,2012-12-31,0.083642,0.107874,0.211462,-0.231939,-0.333772,-0.358965,-0.42882,-0.902162,0.196715,-0.087634,0.007833,0.047049,-0.034521,0.190818,-0.559722,-0.100915,-0.429594,-0.151639,-0.473386,-0.861699,-0.529136,-0.657113,-0.122687,0.237438,-0.535437,-0.220361,-0.233857,-0.31559,-0.051804,0.452873,-0.409788,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,000159.SZ,2012-12-31,0.169802,0.202524,-0.92694,-0.6329,0.289403,-1.508645,-1.310711,-1.07383,-0.989217,0.739849,-1.987627,-1.420851,-1.094347,-1.877156,0.0896,0.859141,0.850941,0.577924,0.426154,-0.130708,0.407266,0.109678,-0.198845,0.257493,-0.831909,-0.540134,-0.514793,-0.449611,-0.507694,-0.271039,-0.570992,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,000402.SZ,2012-12-31,0.027759,0.012997,-0.885021,-0.934013,-0.087177,0.914811,0.812685,0.34758,0.515719,-0.113835,1.226756,0.821181,0.184451,0.654288,-0.356191,-0.556226,-0.775151,-0.816114,-1.019293,-0.844476,-1.042163,-1.069439,-0.953879,-0.964344,-0.750854,-0.183351,-0.311379,-0.368899,-0.413138,-0.35792,0.681682,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
