In [3]:
import tushare as ts
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels import regression
import ffn
from sklearn.svm import SVR
import matplotlib.pyplot as plt
from collections import defaultdict
import os

  from pandas.core import datetools


In [4]:
class Model:
    
    #初始化参数
    def __init__(self, data, train_start, train_end, test_start, test_end, n):
        
        self.code = data['code']
        self.train_start = train_start
        self.train_end = train_end
        self.test_start = test_start
        self.test_end = test_end
        self.n = n
        
    #获取单只股票初始训练集和测试集
    def getInitData(self, code):
        
        train_start = self.train_start
        train_end = self.train_end
        
        test_start = self.test_start
        test_end = self.test_end
        
        train_data = ts.get_hist_data(code, start = train_start, end = train_end)
        test_data = ts.get_hist_data(code, start = test_start, end = test_end)

        train_data = train_data.loc[:,['open','high','close','low','price_change','p_change','turnover']]
        test_data = test_data.loc[:,['open','high','close','low','price_change','p_change','turnover']]
        
        return train_data, test_data
    
    #训练和预测股票的N期收益
    def train(self):
        #使用scikit-learn导入SVR进行训练和预测
        x_train = []
        y_label = []
        x_test = []
        
        for code in self.code:
            train_data, test_data = self.getInitData(code)
            
            #计算特征
            '''
            以后特征的计算，写在feature的class里即可
            '''
            feature = Feature(train_data, test_data, self.n)
            
            
            train_data, test_data = feature.cal_mean()
            train_data = feature.cal_nReturn(train_data)

            svr_rbf = SVR(kernel = 'rbf', C = 1e3, gamma = 0.1)
            X = train_data.iloc[:,-9:-2]
            X1 = test_data.iloc[:,-8:-1]
            y = train_data.iloc[:,-1:]
            for i in range(len(X)):
                x_train.append(list(X.iloc[i]))
            for i in range(len(y)):
                y_label.append(list(y.iloc[i]))
            for i in range(len(X1)):    
                x_test.append(list(X1.iloc[i]))

        y_rbf = svr_rbf.fit(x_train,y_label).predict(x_test)
        
        return y_rbf

    #得到预期代码

#     def getCode():
        
#         return 
        

In [5]:
class Feature:
    
    def __init__(self, train_data, test_data, n):
        self.train_data = train_data
        self.test_data = test_data
        self.n = n
        
    #计算开盘到收盘之间的价格均值
    def cal_mean(self):
        train_data = self.train_data
        test_data = self.test_data
        
        first_data_open = train_data.loc[:,['open']]
        first_data_close = train_data.loc[:,['close']]

        second_data_open = test_data.loc[:,['open']]
        second_data_close = test_data.loc[:,['close']]

        first_data_open.rename(columns = {'open':'mean'},inplace=True)
        first_data_close.rename(columns = {'close':'mean'},inplace=True)
        first_data_mean = (first_data_close + first_data_open)/2

        second_data_open.rename(columns = {'open':'mean'},inplace=True)
        second_data_close.rename(columns = {'close':'mean'},inplace=True)
        second_data_mean = (second_data_close + second_data_open)/2

        train_data['mean'] = first_data_mean
        test_data['mean'] = second_data_mean
        
        return train_data,test_data
    
    #计算N期收益
    def cal_nReturn(self, train_data):
        n = self.n
        #训练集的N期平均收益率
        origin_mean = train_data.loc[:,['mean']]
        origin_mean_n = origin_mean.shift(n)
        train_simpleret = (origin_mean - origin_mean_n) / origin_mean_n
        train_data['n_simpleret'] = train_simpleret
        #填充缺失的值
        train_data = train_data.fillna(method='bfill')
        
        return train_data
    

In [6]:
class Season_Feature:
    
    def __init__(self,year,season):
        self.year = year
        self.season = season
    
    
    
        
    #基本面数据的处理
    #
    #将基本面数据中的中文名称hash
    def name_to_code(name,data):
        d = defaultdict(list)
        real_name = []
        num = 0
        for i in name:
            if (not len(d[i])):
                d[i].append(num)
                num+=1
            real_name.append(d[i][0])
        text = str(num)
        return real_name
    
    #change_name 默认为基本面数据的列名，可以根据需要修改
    def need_change(data,change_name = ['name','industry','area']):
        for i in change_name:
            name = data[i]
            real_name = name_to_code(name,data)
            del data[i]
            data[i] = real_name
        return data
    
    #获取2017年第二季度的基本面数据
    def get_basic(self):
        year = self.year
        season = self.season
        
        base = ts.get_stock_basics()
        #获取2017年第2季度的业绩报表数据
        table1 = ts.get_report_data(year,season)
        #获取 2017年第二季度的盈利能力数据
        table2 = ts.get_profit_data(year,season)
        #获取2017年第2季度的营运能力数据
        table3 = ts.get_operation_data(year,season)
        #获取2017年第2季度的成长能力数据
        table4 = ts.get_growth_data(year,season)
        #获取2017年第2季度的偿债能力数据
        table5 = ts.get_debtpaying_data(year,season)
        #获取2017年第2季度的现金流量数据
        table6 = ts.get_cashflow_data(year,season)
        
        return base,table1,table2,table3,table4,table5,table6
    
    #将基本面的后6个表按 code，name 和并
    def merge_basic(self):
        year = self.year
        season = self.season
        
        base,table1,table2,table3,table4,table5,table6 = self.get_basic()
        #合并后6个表的数据
        result = pd.merge(table1,table2,on=['code','name'])
        result = pd.merge(result,table3,on=['code','name'])
        result = pd.merge(result,table4,on=['code','name'])
        result = pd.merge(result,table5,on=['code','name'])
        result = pd.merge(result,table6,on=['code','name'])
        #result = result.set_index('code')
        #合并前两个表的数据
        
        return result


In [7]:
#主函数
model = Model(ts.get_sz50s(), '2017-03-01', '2017-06-30', '2017-07-01', '2017-08-31', 15)

print (model.train())

  y = column_or_1d(y, warn=True)


[-0.01226435  0.00128179  0.00536256 ...,  0.05089279  0.04711263
  0.04966329]


In [8]:
'''
season2 = Season_Feature(2017,2)
season2 = season2.merge_basic()
season2 = season2.drop('name',1)
season2.to_csv('2017_season2.csv',sep='\t',na_rep='NaN',encoding='UTF-8')
'''


"\nseason2 = Season_Feature(2017,2)\nseason2 = season2.merge_basic()\nseason2 = season2.drop('name',1)\nseason2.to_csv('2017_season2.csv',sep='\t',na_rep='NaN',encoding='UTF-8')\n"

In [9]:
'''
season1 = Season_Feature(2017,1)
season1 = season1.merge_basic()
season1 = season1.drop('name',1)
#season1 = season1.set_index('code')
season1.to_csv('2017_season1.csv',sep='\t',na_rep='NaN',encoding='UTF-8')
'''

"\nseason1 = Season_Feature(2017,1)\nseason1 = season1.merge_basic()\nseason1 = season1.drop('name',1)\n#season1 = season1.set_index('code')\nseason1.to_csv('2017_season1.csv',sep='\t',na_rep='NaN',encoding='UTF-8')\n"

In [10]:
#load data
season1 = pd.read_csv('2017_season1.csv',sep='\t',index_col=0)
season2 = pd.read_csv('2017_season2.csv',sep='\t',index_col=0)

In [11]:
#获取两个季度股票代码
def get_season_codes(season1,season2):
    season1_codes = season1['code']
    season2_codes = season2['code']
    season1_codes = list(set(season1_codes))
    season2_codes = list(set(season2_codes))
    return season1_codes,season2_codes

In [12]:
season1_codes,season2_codes = get_season_codes(season1,season2)

In [14]:
#获取季度内股票的历史数据 返回的是股票代码和收益率的label
def get_season_hist_data(season_codes,season_start='2017-01-01',season_end='2017-03-31'):
    y_label = []
    x_code = []
    for code in season_codes:
        stock = str(code)
        if len(stock)<6:
            continue
        data = ts.get_hist_data(stock,start=season_start,end=season_end)
        #需要考虑在当前季度中没有该股票的历史数据的情况
        #没有数据无法进行计算，直接抛弃。
      
        if (type(data) == 'NoneType'):
            continue
        elif data.empty == True:
            continue
        else:
            #获取需要计算收益的两列数据
            data_open = data.open
            data_open = list(data_open)
            data_close = data.close
            data_close = list(data_close)
            # 获取季度有交易日区间
            days = len(data.index.tolist())
        
            #找出收益率 >=15% 和 收益率 <= -5%的股票并给出对应的y_label
            
            ret_flag = 0 #收益率的标记 为0表示在该季度的交易日内没能达到要求
            
            for i in range(0,days):
                for j in range(i+1,days):
                #计算收益率
                #只要在季度交易区间内达到了限制条件，就设置相应的label 在交易期间没达到就按照季初-季末算收益率
                    ret = (data_close[j] - data_open[i]) / data_open[i]
        
                    if ret >= 0.15 :
                        x_code.append(code)
                        y_label.append(0.15) 
                        ret_flag = 1
                        break
                    elif ret <= -0.05:
                        x_code.append(code)
                        y_label.append(-0.05) 
                        ret_flag = 1
                        break
                if ret_flag == 1:
                    break
            #若没有达到要求，将label 设置成季初到季末的收益率
            if ret_flag == 0:
                x_code.append(code)
                ret = (data_close[days-1]- data_open[0]) / data_open[0]
                y_label.append(ret) 
               
                   
    return x_code,y_label
            
        
        

In [15]:
x_code,y_label = get_season_hist_data(season1_codes,'2017-01-01','2017-03-31')

In [None]:
len(x_code)

In [None]:
x_code
len(x_code)

In [None]:
len(y_label)

In [16]:
def code_to_return(code,ret):
    code_to_ret = {}
    for i in range(len(code)):
        code_to_ret[str(code[i])] = ret[i]
    return code_to_ret
                   

In [17]:
code2return = code_to_return(x_code,y_label)

In [18]:
def add_feature_to_csv(season_codes,code2return,data_start,data_end,op=1):
    hist_feature = ['open','high','close','low','price_change','p_change','turnover']
    if op == 1:
        filename = 'train_data.csv'
    
        for stock in season_codes:
            code = str(stock)

            df = ts.get_hist_data(code,start=data_start,end=data_end)
            df = df.loc[:,hist_feature]
            ret = code2return.get(code)
            y_label = []
            for i in range(len(df.index.tolist())):
                y_label.append(ret)
            df['y'] = y_label
            if os.path.exists(filename):
                df.to_csv(filename, mode='a', header=None)
            else:
                df.to_csv(filename)
    else:
        filename = 'test_data.csv'
        for stock in season_codes:
            code = str(stock)
            df = ts.get_hist_data(code,start=data_start,end=data_end)
            df = df.loc[:,hist_feature]
            if os.path.exists(filename):
                df.to_csv(filename, mode='a', header=None)
            else:
                df.to_csv(filename)
    

In [19]:
add_feature_to_csv(x_code,code2return,data_start='2017-01-01',data_end='2017-03-31',op=1)

In [20]:
add_feature_to_csv(x_code,code2return,data_start='2017-04-01',data_end='2017-07-31',op=2)

In [23]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [24]:
y_label = train_data.iloc[:,-1]

In [27]:
train_data = train_data.iloc[:,-8:-2]

In [28]:
train_data.head()

Unnamed: 0,open,high,close,low,price_change,p_change
0,14.41,14.6,14.42,14.35,0.02,0.14
1,14.78,14.79,14.4,14.25,-0.39,-2.64
2,14.87,14.98,14.79,14.78,-0.11,-0.74
3,14.88,14.99,14.9,14.81,0.0,0.0
4,15.02,15.1,14.9,14.88,-0.19,-1.26


In [31]:
test_data = test_data.iloc[:,-7:-1]

In [32]:
svr_rbf = SVR(kernel = 'rbf', C = 1e3, gamma = 0.1)

In [33]:
y_rbf = svr_rbf.fit(train_data,y_label).predict(test_data)

In [34]:
y_rbf

array([ 0.0498513 ,  0.0498388 ,  0.04983744, ...,  0.05021839,
        0.05022358,  0.05020119])

In [40]:
result = open('y_rbf.txt','w')

In [41]:
for i in y_rbf:
    result.write(str(i)+'\n')
result.close()