In [8]:
import pandas as pd
import numpy as np
import random
import time
import pickle
from sklearn.ensemble import RandomForestClassifier
from Statistics import Statistics

In [9]:
import os
SEED = 9
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [10]:
SP500_df = pd.read_csv('data/SPXconst.csv')
all_companies = list(set(SP500_df.values.flatten()))
all_companies.remove(np.nan)

In [11]:
constituents = {'-'.join(col.split('/')[::-1]):set(SP500_df[col].dropna()) 
                for col in SP500_df.columns}
                

In [12]:
constituents_train = {} 
for test_year in range(1993,2016):
    months = [str(t)+'-0'+str(m) if m<10 else str(t)+'-'+str(m) 
              for t in range(test_year-3,test_year) for m in range(1,13)]
    constituents_train[test_year] = [list(constituents[m]) for m in months]
    # print(len(constituents_train[test_year][0]))
    # print(len(constituents_train[test_year][-1]))
    constituents_train[test_year] = set([i for sublist in constituents_train[test_year] 
                                         for i in sublist])
    # break

In [13]:
def create_label(df_open,df_close,perc=[0.5,0.5]):
    if not np.all(df_close.iloc[:,0]==df_open.iloc[:,0]):
        print('Date Index issue')
        return
    perc = [0.]+list(np.cumsum(perc))
    label = (df_close.iloc[:,1:]/df_open.iloc[:,1:]-1).apply(
            lambda x: pd.qcut(x.rank(method='first'),perc,labels=False), axis=1)
    return label[1:]

In [27]:
def create_stock_data(df_open,df_close,st,m=240):
    '''
        df_open: 某年的开盘价
        df_close： 某年的收盘价
        st: 股票代码
    '''
    st_data = pd.DataFrame([])
    st_data['Date'] = list(df_close['Date'])
    st_data['Name'] = [st]*len(st_data)
    # 日变化率: close_price / open_price - 1
    daily_change = df_close[st]/df_open[st]-1
    for k in range(m)[::-1]:
        st_data['IntraR'+str(k)] = daily_change.shift(k)

    # close_price(t + 1) / open_price(t) - 1
    nextday_ret = (np.array(df_open[st][1:])/np.array(df_close[st][:-1])-1)
    nextday_ret = pd.Series(list(nextday_ret)+[np.nan])     
    for k in range(m)[::-1]:
        st_data['NextR'+str(k)] = nextday_ret.shift(k)

    # 收盘价的变化率：close_price(t + 1) / close_price(t) - 1
    close_change = df_close[st].pct_change()
    for k in range(m)[::-1]:
        st_data['CloseR'+str(k)] = close_change.shift(k)

    st_data['IntraR-future'] = daily_change.shift(-1)    
    st_data['label'] = list(label[st])+[np.nan] 
    st_data['Month'] = list(df_close['Date'].str[:-3])
    st_data = st_data.dropna()
    print(st_data)
    trade_year = st_data['Month'].str[:4]
    st_data = st_data.drop(columns=['Month'])
    st_train_data = st_data[trade_year<str(test_year)]
    st_test_data = st_data[trade_year==str(test_year)]
    return np.array(st_train_data),np.array(st_test_data)

In [29]:
from sklearn.preprocessing import RobustScaler
def scalar_normalize(train_data,test_data):
    scaler = RobustScaler()
    scaler.fit(train_data[:,2:-2])
    train_data[:,2:-2] = scaler.transform(train_data[:,2:-2])
    test_data[:,2:-2] = scaler.transform(test_data[:,2:-2])    

In [30]:
for test_year in range(1993,2018):
    
    print('-'*40)
    print(test_year)
    print('-'*40)
    
    filename = 'data/Open-'+str(test_year-3)+'.csv'
    df_open = pd.read_csv(filename)
    filename = 'data/Close-'+str(test_year-3)+'.csv'
    df_close = pd.read_csv(filename)
    label = create_label(df_open,df_close)
    stock_names = sorted(list(constituents[str(test_year-1)+'-12']))
    train_data,test_data = [],[]

    start = time.time()
    for st in stock_names:

        st_train_data,st_test_data = create_stock_data(df_open,df_close,st)
        train_data.append(st_train_data)
        test_data.append(st_test_data)
    train_data = np.concatenate([x for x in train_data])
    test_data = np.concatenate([x for x in test_data])
    scalar_normalize(train_data,test_data)
    print(train_data.shape,test_data.shape)
    break

  \
681    0.058893   0.014723  -0.048387  ...  0.031379  0.000000  0.000000   
682    0.014723  -0.048387   0.000000  ...  0.000000  0.000000  0.000000   
683   -0.048387   0.000000   0.000000  ...  0.000000  0.000000  0.000000   
684    0.000000   0.000000  -0.044170  ...  0.000000  0.000000  0.000000   
685    0.000000  -0.044170   0.000000  ...  0.000000  0.000000  0.000000   
...         ...        ...        ...  ...       ...       ...       ...   
1007  -0.017556  -0.052669   0.000000  ...  0.011197 -0.011073  0.011197   
1008  -0.052669   0.000000  -0.035740  ... -0.011073  0.011197  0.000000   
1009   0.000000  -0.035740   0.000000  ...  0.011197  0.000000 -0.053663   
1010  -0.035740   0.000000  -0.017870  ...  0.000000 -0.053663 -0.033303   
1011   0.000000  -0.017870   0.000000  ... -0.053663 -0.033303  0.011173   

       CloseR3   CloseR2   CloseR1   CloseR0  IntraR-future  label    Month  
681   0.000000  0.000000  0.000000  0.000000       0.000000    0.0  1992-09  
682