In [2]:
import datetime
import math
import time
from datetime import timedelta #两个日期间隔
import pandas as pd
import numpy as np
import statsmodels.api as sm #统计分析库
import matplotlib
from matplotlib import pyplot as plt

#机器学习
import sklearn #机器学习库
from sklearn.utils import shuffle #随机打乱数据集
from sklearn.model_selection import train_test_split #分割训练集

#Joinquant因子
from jqlib.technical_analysis import * #技术因子
from jqfactor import Factor, calc_factors #自定义因子
from jqfactor import get_factor_values #已有的因子
from jqfactor import neutralize #因子中性化
from jqfactor import winsorize #去极值
from jqfactor import standardlize #标准化

In [3]:
from jqfactor import get_all_factors
all_factors = get_all_factors()
print(type(all_factors))
all_factors.to_csv("test1.csv",index=False)

<class 'pandas.core.frame.DataFrame'>


## 函数定义

In [4]:
#给定起始日期和终止日期，输出起始日期按序排列的日期列表
def timerange_list(begin,end,N):
    list1= []
    now = end
    for i in range(N-1):
        now = pd.to_datetime(now)
        start_ = str(datetime.datetime(now.year, now.month, 1))[0:10]
        end_ = str(pd.to_datetime(start_) - timedelta(days=1))[0:10]
        list1.append(start_)
        list1.append(end_)
        now = end_
    now = pd.to_datetime(now)
    list1.append(str(datetime.datetime(now.year, now.month, 1))[0:10])
    list1 = list1[::-1]
    list1.append(end[0:4]+"-"+end[4:6]+"-"+end[6:8])
    
    return list1

#将日期列表分割成月初和月末列表
def get_s_e(l1:list):
    
    l_start = []
    l_end = []
    for i in range(len(l1)):
        if i%2 == 0:
            l_start.append(l1[i])
        if i%2 == 1:
            l_end.append(l1[i])
            
    return l_start,l_end

In [5]:
def get_month_data(train_s,train_e,test_s,test_e):
    
    ending_day = str(list(get_price('000300.XSHG', start_date=train_s, end_date=train_e).index)[-1])[0:10]
    stock_list = list(get_index_weights("000300.XSHG", date= test_s).index)

    #factor panel: factors of Joinquant
    factor_data = get_factor_values(securities=stock_list, 
                                factors= factors,
                                start_date= ending_day , end_date=ending_day)
    df_factor = pd.DataFrame()
    factor_name = list(factor_data.keys())
    for name in factor_name:
        df_factor = pd.concat([df_factor,factor_data[name]])
    df_factor = df_factor.T
    df_factor.columns = factor_name
    #df_factor.isnull().sum()
    filter_list = list(df_factor.index)
    df_factor.index = filter_list
    
    
    #factor panel: factors of personally defined factors
    factor_data2 = calc_factors(filter_list, factors2, 
             start_date = ending_day, end_date = ending_day,  
             use_real_price=False, skip_paused=False)
    df_factor2 = pd.DataFrame()
    factor_name = list(factor_data2.keys())
    for name in factor_name:
        df_factor2 = pd.concat([df_factor2, factor_data2[name]])
    df_factor2 = df_factor2.T
    df_factor2.columns = factor_name
    df_factor2.index = filter_list
    
    #merge panel1 and panel2
    df_neu = pd.merge(df_factor, df_factor2, left_index = True, right_index = True)
    
    
    #data processing
    #df_neu = neutralize(df_factor, how=['sw_l1','market_cap'], date=ending_day, 
                    #axis=0, fillna='sw_l1', add_constant= True) 
    df_extreme = winsorize(df_neu, qrange=[0.05,0.93], 
                       inclusive=True, inf2nan=True, axis=0) #deal with extreme value
    df_stand = standardlize(df_extreme, inf2nan=True, axis=0) #standarlization


    #next month return
    return_next = []
    for code in filter_list:
        dq = np.array(get_price(code, start_date= test_s, end_date= test_e,
                 frequency='daily', skip_paused=False, fq='pre')['close'])
        return_next.append(dq[-1]/dq[0]-1)
    df_stand['return'] = return_next
    df_stand = df_stand[df_stand['return']!=0]

    #sort by return
    df_sort = df_stand.sort_values(by = "return", ascending=False).dropna(axis = 0)

    #pos and nega
    data_positive = df_sort.iloc[0:40,:]
    data_negative = df_sort.iloc[-40:,:]
    
    return data_positive, data_negative

def get_data(train_s:list,train_e:list,test_s:list,test_e:list):
    
    data_posi = pd.DataFrame()
    data_nega = pd.DataFrame()
    
    train_start = train_s
    train_end = train_e
    test_start = test_s
    test_end = test_e
    
    len_month = len(train_s)
    
    train_start_l = []
    train_end_l = []
    test_start_l = []
    test_end_l = []
    
    for j in range(len_month):
        
        train_s = train_start[j]
        train_e = train_end[j]
        test_s = test_start[j]
        test_e = test_end[j]
        
        df_pos,df_neg = get_month_data(train_s,train_e,test_s,test_e)
        data_posi = pd.concat([data_posi,df_pos])
        data_nega = pd.concat([data_nega,df_neg])
        
        # train_start_l.append(train_s)
        # train_end_l.append(train_e)
        # test_start_l.append(test_s)
        # test_end_l.append(test_e)
        
        # time_l = pd.DataFrame({'train_start':train_start_l,'train_end':train_end_l,'test_start':test_start_l, 'test_end':test_end_l})
        # time_l = time_l.reset_index(drop=True)

    data_ = pd.concat([data_posi,data_nega])
    # data_l = data_l.reset_index(drop=True)
    # data_ = pd.concat([data_l,time_l],axis=1)
    data_ = shuffle(data_)
    
    return data_

In [6]:
class GROSSPROFITABILITY_YOY(Factor):
    
    name = 'grossprofitability_yoy'
    max_window = 1
    dependencies = ['total_operating_revenue','total_operating_cost','total_assets',
                    'total_operating_revenue_4','total_operating_cost_4','total_assets_4']
    
    def calc(self, data):
        
        total_operating_revenue = data['total_operating_revenue']
        total_operating_revenue_4 = data['total_operating_revenue_4']
        
        total_operating_cost = data['total_operating_cost']
        total_operating_cost_4 = data['total_operating_cost_4']
        
        total_assets = data['total_assets']
        total_assets_4 = data['total_assets_4']
        
        gross_profitability = (total_operating_revenue - total_operating_cost)/total_assets
        gross_profitability_4 = (total_operating_revenue_4 - total_operating_cost_4)/total_assets_4
        
        return gross_profitability.mean()-gross_profitability_4.mean()

In [7]:
class ROE_YOY(Factor):
    
    name = 'roe_yoy'
    max_window = 1
    dependencies = ['net_profit','total_assets','total_liability',
                    'net_profit_4','total_assets_4','total_liability_4']
    

    def calc(self, data):
        
        roe = data['net_profit']/(data['total_assets']-data['total_liability'])
        roe_y = data['net_profit_4']/(data['total_assets_4']-data['total_liability_4'])
        result = roe.mean()-roe_y.mean()
        
        return result

## 筛选因子

In [8]:
from jqfactor import get_all_factors 
factors_set = get_all_factors()
factors = list(factors_set.iloc[:,0])
factors2 = [GROSSPROFITABILITY_YOY(),ROE_YOY()] # 毛利增长,净资产收益率同比增长
length = len(factors) + len(factors2)
train_date_list = timerange_list("20191201","20211130",24)
test_date_list = timerange_list("20200101","20211231",24)
train_start, train_end = get_s_e(train_date_list)
test_start, test_end = get_s_e(test_date_list)
data_train = get_data(train_start,train_end,test_start,test_end)
X_train = np.array(data_train.iloc[:,0:length])
y_train = np.array(data_train.iloc[:,length])
print("the number of training samples:", X_train.shape)

the number of training samples: (106, 262)


In [9]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
import warnings
warnings.filterwarnings('ignore')

scaler=StandardScaler()
df_sc= scaler.fit_transform(data_train)
df_sc = pd.DataFrame(df_sc, columns=data_train.columns)
y = df_sc['return']
X = df_sc.drop('return', axis=1) # becareful inplace= False
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
alpha_lasso = 10**np.linspace(-3,1,100)
lasso = Lasso()
coefs_lasso = []

for i in alpha_lasso:
    lasso.set_params(alpha = i)
    lasso.fit(X_train, y_train)
    coefs_lasso.append(lasso.coef_)
    
plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_lasso, coefs_lasso)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Lasso regression coefficients Vs. alpha')
plt.savefig('test2.jpg')
plt.show()

In [11]:
lasso = Lasso(alpha=10**(-2))
model_lasso = lasso.fit(X_train, y_train)
coef = pd.Series(model_lasso.coef_,index=X_train.columns)
coef_top = coef[coef != 0].abs().sort_values(ascending = False)
pd.DataFrame(coef_top).to_csv("coef.csv")

## 模型构建

In [12]:
coeftop_l = list(pd.DataFrame(coef_top).index)
coef_l = list(pd.DataFrame(coef).index) 
for i in coef_l:
    if i not in coeftop_l:
        data_train.drop(i,axis=1, inplace=True)

In [13]:
factors = coeftop_l
factors2 = [GROSSPROFITABILITY_YOY()] # 毛利增长,净资产收益率同比增长
length = len(factors) + len(factors2)
train_date_list = timerange_list("20181201","20211130",36)
test_date_list = timerange_list("20190101","20211231",36)
train_start, train_end = get_s_e(train_date_list)
test_start, test_end = get_s_e(test_date_list)
data_train = get_data(train_start,train_end,test_start,test_end)
X_train = np.array(data_train.iloc[:,0:length])
y_train = np.array(data_train.iloc[:,length])
print("the number of training samples:", X_train.shape)

the number of training samples: (2010, 48)


In [14]:
y = data_train['return']
X = data_train.drop('return', axis=1)

In [106]:
cor = data_train.corr()
cor.to_csv('cor.csv')

In [15]:
X = X.drop('grossprofitability_yoy',axis=1)
return_label = []
for i in y:
    if i > 0:
        return_label.append(1)
    else:
        return_label.append(0)
y = pd.DataFrame({'return_label':return_label})

In [68]:
data_train.to_csv("data.csv")

In [17]:
describe1 = X.describe()

In [18]:
describe1.to_csv("describe1.csv",index=False)

In [None]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

RFC = RandomForestClassifier(oob_score=True,random_state=10)
RFC.fit(X,y)
print(RFC.oob_score_)

In [None]:
y_predict=RFC.predict(X)
mae_RFC = mean_absolute_error(y, y_predict)
mse_RFC = mean_squared_error(y, y_predict)
rmse_RFC = round(mse_RFC **0.5,4)
r2_RFC = round(r2_score(y, y_predict),4)
print("MAE: %.4f" % mae_RFC) # 平均绝对误AA差
print("MSE: %.4f" % mse_RFC) # 均方误差
print("RMSE: %.4f" % rmse_RFC) # 误差均方根
print("R2: %.4f "% r2_RFC )

## 随机森林调参

In [19]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_test1 = {"n_estimators":range(1,200,10)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(),param_grid=param_test1,
                        scoring='roc_auc',cv=10)
gsearch1.fit(X,y)

scores = pd.DataFrame(gsearch1.grid_scores_).iloc[:,1]
print(gsearch1.best_params_)
print("best accuracy:%f" % gsearch1.best_score_)

In [None]:
numbers = [1,11,21,31,41,51,61,71,81,91]
plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(numbers,scores)

In [None]:
param_test2 = {"max_features":range(2,14,2)}
gsearch1 = GridSearchCV(estimator=RandomForestClassifier(n_estimators=121,
                        random_state=10),
                        param_grid = param_test2,scoring='roc_auc',cv=10)
gsearch1.fit(X,y)
print(gsearch1.best_params_)
print('best accuracy:%f' % gsearch1.best_score_)

In [30]:
numbers = []
i = 2
while i <= 12:
    numbers.append(i)
    i += 2

In [None]:
scores = pd.DataFrame(gsearch1.grid_scores_).iloc[:,1]
plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(numbers,scores)

In [None]:
RFC = RandomForestClassifier(n_estimators = 121, random_state=10, max_features=2,oob_score = True)
RFC.fit(X,y)
print(RFC.oob_score_)

In [None]:
# 评估指标
y_predict=RFC.predict(X)
mae_RFC = mean_absolute_error(y, y_predict)
mse_RFC = mean_squared_error(y, y_predict)
rmse_RFC = round(mse_RFC **0.5,4)
r2_RFC = round(r2_score(y, y_predict),4)
print("MAE: %.4f" % mae_RFC) # 平均绝对误AA差
print("MSE: %.4f" % mse_RFC) # 均方误差
print("RMSE: %.4f" % rmse_RFC) # 误差均方根
print("R2: %.4f "% r2_RFC )
print(RFC.feature_importances_)
columns = X.columns
importance = list(RFC.feature_importances_)
importance_l = pd.DataFrame({'item':columns,'point':importance})

In [None]:
importance_l = importance_l.sort_values(by=['point'],ascending=False)
plt.figure(figsize=(12,8))
plt.barh(importance_l['item'],importance_l['point'])
plt.title('the importance features')
plt.show()

## 模型稳定性

In [70]:
factors = coeftop_l
factors2 = [GROSSPROFITABILITY_YOY()] # 毛利增长,净资产收益率同比增长
length = len(factors) + len(factors2)
train_date_list = timerange_list("20211201","20221130",12)
test_date_list = timerange_list("20220101","20221231",12)
train_start, train_end = get_s_e(train_date_list)
test_start, test_end = get_s_e(test_date_list)
data_train_1 = get_data(train_start,train_end,test_start,test_end)

In [71]:
y1 = data_train_1['return']
X1 = data_train_1.drop('return', axis=1)
X1 = X1.drop('grossprofitability_yoy',axis=1)
return_label = []
for i in y1:
    if i > 0:
        return_label.append(1)
    else:
        return_label.append(0)
y1 = pd.DataFrame({'return_label':return_label})

In [73]:
RFC.score(X1,y1)

0.545929018789144

## 模型预测

In [87]:
factors = coeftop_l
factors2 = [GROSSPROFITABILITY_YOY()] # 毛利增长,净资产收益率同比增长
length = len(factors) + len(factors2)
train_date_list = timerange_list("20211201","20221130",12)
test_date_list = timerange_list("20220101","20221231",12)
train_start, train_end = get_s_e(train_date_list)
test_start, test_end = get_s_e(test_date_list)
data_train_1 = get_data(train_start,train_end,test_start,test_end)

In [148]:
X1 = data_train.iloc[:,:-1]
y1 = data_train.iloc[:,-1]
return_label = []
for i in y1:
    if i > 0:
        return_label.append(1)
    else:
        return_label.append(0)
y1 = pd.DataFrame({'return_label':return_label})

In [149]:
print(RFC.score(X1,y1))

0.5481171548117155


## 根据模型选股

In [90]:
factors = coeftop_l
factors2 = [GROSSPROFITABILITY_YOY()] # 毛利增长,净资产收益率同比增长
length = len(factors) + len(factors2)
train_date_list = timerange_list("20221101","20230228",4)
test_date_list = timerange_list("20221201","20230331",4)
train_start, train_end = get_s_e(train_date_list)
test_start, test_end = get_s_e(test_date_list)
data_train_2 = get_data(train_start,train_end,test_start,test_end)

In [91]:
y2 = data_train_2['return']
X2 = data_train_2.drop('return', axis=1)
X2 = X2.drop('grossprofitability_yoy',axis=1)
return_label = []
for i in y2:
    if i > 0:
        return_label.append(1)
    else:
        return_label.append(0)
y2 = pd.DataFrame({'return_label':return_label})

In [92]:
y2_predict=pd.DataFrame({'pre':RFC.predict(X2)})

In [93]:
X2.to_csv('data_predict.csv')
y2_predict.to_csv()

In [None]:
IC = []
for i in f1.columns[:-1]:
    IC_0 = round(np.corrcoef(f1[i].rank(axis=0,ascending = True),f1['return'].rank(axis=0,ascending = False))[0,1],3)
    IC.append(IC_0)
for i in range(len(IC)):
    print(f'{f1.columns[i]}，{IC[i]}')
columns = f1.columns[:-1]
c = pd.DataFrame({'name':columns,'IC':IC})