In [12]:
import pandas as pd
import numpy as np
import random
import time
import pickle
from sklearn.ensemble import RandomForestClassifier
from Statistics import Statistics

In [13]:
import os
SEED = 9
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [14]:
SP500_df = pd.read_csv('data/SPXconst.csv')
all_companies = list(set(SP500_df.values.flatten()))
all_companies.remove(np.nan)

In [15]:
constituents = {'-'.join(col.split('/')[::-1]):set(SP500_df[col].dropna()) 
                for col in SP500_df.columns}
                

In [16]:
constituents_train = {} 
for test_year in range(1993,2016):
    months = [str(t)+'-0'+str(m) if m<10 else str(t)+'-'+str(m) 
              for t in range(test_year-3,test_year) for m in range(1,13)]
    constituents_train[test_year] = [list(constituents[m]) for m in months]
    # print(len(constituents_train[test_year][0]))
    # print(len(constituents_train[test_year][-1]))
    constituents_train[test_year] = set([i for sublist in constituents_train[test_year] 
                                         for i in sublist])
    print(len(constituents_train[test_year]))
    # break

172
174
181
186
187
196
200
209
217
224
233
237
242
249
260
280
302
321
337
353
373
392
409


In [17]:
def create_label(df_open,df_close,perc=[0.5,0.5]):
    if not np.all(df_close.iloc[:,0]==df_open.iloc[:,0]):
        print('Date Index issue')
        return
    perc = [0.]+list(np.cumsum(perc))
    label = (df_close.iloc[:,1:]/df_open.iloc[:,1:]-1).apply(
            lambda x: pd.qcut(x.rank(method='first'),perc,labels=False), axis=1)
    return label[1:]

In [28]:
def create_stock_data(df_open,df_close,st,m=240):
    '''
        df_open: 某年的开盘价
        df_close： 某年的收盘价
        st: 股票代码
    '''
    st_data = pd.DataFrame([])
    st_data['Date'] = list(df_close['Date'])
    st_data['Name'] = [st]*len(st_data)
    # 日变化率: close_price / open_price - 1
    print(df_close.columns.values)
    print(df_close[st])
    print(df_open[st])
    return
    daily_change = df_close[st]/df_open[st]-1
    for k in range(m)[::-1]:
        st_data['IntraR'+str(k)] = daily_change.shift(k)

    # close_price(t + 1) / open_price(t) - 1
    nextday_ret = (np.array(df_open[st][1:])/np.array(df_close[st][:-1])-1)
    nextday_ret = pd.Series(list(nextday_ret)+[np.nan])     
    for k in range(m)[::-1]:
        st_data['NextR'+str(k)] = nextday_ret.shift(k)

    # 收盘价的变化率：close_price(t + 1) / close_price(t) - 1
    close_change = df_close[st].pct_change()
    for k in range(m)[::-1]:
        st_data['CloseR'+str(k)] = close_change.shift(k)

    st_data['IntraR-future'] = daily_change.shift(-1)    
    st_data['label'] = list(label[st])+[np.nan] 
    st_data['Month'] = list(df_close['Date'].str[:-3])
    st_data = st_data.dropna()
    
    trade_year = st_data['Month'].str[:4]
    st_data = st_data.drop(columns=['Month'])
    st_train_data = st_data[trade_year<str(test_year)]
    st_test_data = st_data[trade_year==str(test_year)]
    return np.array(st_train_data),np.array(st_test_data) 

In [29]:
for test_year in range(1993,2020):
    
    print('-'*40)
    print(test_year)
    print('-'*40)
    
    filename = 'data/Open-'+str(test_year-3)+'.csv'
    df_open = pd.read_csv(filename)
    filename = 'data/Close-'+str(test_year-3)+'.csv'
    df_close = pd.read_csv(filename)
    label = create_label(df_open,df_close)
    stock_names = sorted(list(constituents[str(test_year-1)+'-12']))
    train_data,test_data = [],[]

    start = time.time()
    for st in stock_names:
        st = "AEE"
        st_train_data,st_test_data = create_stock_data(df_open,df_close,st)
        train_data.append(st_train_data)
        test_data.append(st_test_data)
    print(train_data)
    print(test_data)
    break
    train_data = np.concatenate([x for x in train_data])
    test_data = np.concatenate([x for x in test_data])
    break

----------------------------------------
1993
----------------------------------------
['Date' 'HBAN' 'SPGI' 'EBAY' 'VLO' 'MKC' 'GLW' 'TSN' 'ETN' 'QCOM' 'OMC'
 'D' 'MCO' 'L' 'PNW' 'MET' 'FITB' 'USB' 'PPL' 'BEN' 'LEG' 'GE' 'PAYX' 'K'
 'TMO' 'STT' 'MAR' 'HUM' 'ED' 'NI' 'FE' 'SRE' 'MSI' 'VNO' 'XRX' 'DHR'
 'MCK' 'WHR' 'TROW' 'TPR' 'WRK' 'DRI' 'WY' 'MS' 'WAT' 'FCX' 'TXN' 'KLAC'
 'ROK' 'WM' 'NTRS' 'ES' 'SBUX' 'COP' 'CVX' 'XEL' 'KR' 'ETR' 'KMB' 'PFE'
 'PPG' 'PG' 'SO' 'PEP' 'CVS' 'IP' 'NSC' 'DE' 'F' 'CPB' 'DTE' 'AEP' 'HIG'
 'SEE' 'EXC' 'EIX' 'BA' 'BMY' 'IBM' 'CAT' 'KO' 'CL' 'MRK' 'MO' 'PEG' 'HSY'
 'XOM' 'HAL' 'GD' 'UNP' 'HON' 'ABT' 'SHW' 'SLB' 'CMI' 'EMR' 'CSX' 'GIS'
 'CLX' 'NEM' 'MCD' 'LLY' 'BDX' 'BAX' 'JNJ' 'GPC' 'HPQ' 'WMB' 'JPM' 'IFF'
 'WFC' 'AXP' 'DIS' 'CI' 'NEE' 'TAP' 'LNC' 'DUK' 'BAC' 'TGT' 'INTC' 'TXT'
 'VFC' 'WBA' 'AIG' 'FDX' 'PCAR' 'ADP' 'MAS' 'GWW' 'ADM' 'WMT' 'SNA' 'SWK'
 'BF.B' 'AAPL' 'OXY' 'CAG' 'LB' 'VZ' 'LOW' 'PHM' 'HES' 'LMT' 'HAS' 'BLL'
 'T' 'NUE' 'APD' 'PKI' 'NOC' 'CNP' 'TJX

KeyError: 'AEE'

In [1]:
import pandas as pd 
import numpy as np
import os

In [4]:
path = "./data"
open_file = []
close_flie = []
for file_item in files:
    if file_item[0:4] == "Open":
        open_file.append(file_item)
    elif file_item[0:5] == "Close":
        close_flie.append(file_item)
sorted(open_file)
sorted(close_flie)
for i in range(len(open_file) - 3):
    # Open 
    df = pd.read_csv("{}/{}".format(path,open_file[i]),index_col="Date")
    df1 = pd.read_csv("{}/{}".format(path,open_file[i + 1]),index_col="Date")
    df2 = pd.read_csv("{}/{}".format(path,open_file[i + 2]),index_col="Date")
    df3 = pd.read_csv("{}/{}".format(path,open_file[i + 3]),index_col="Date")
    df = pd.concat([df, df1, df2, df3])
    df.sort_index(inplace=True)
    df.index.name = "Date"
    df.to_csv("{}/{}".format(path,open_file[i]))
    # Close
    df = pd.read_csv("{}/{}".format(path,close_flie[i]),index_col="Date")
    df1 = pd.read_csv("{}/{}".format(path,close_flie[i + 1]),index_col="Date")
    df2 = pd.read_csv("{}/{}".format(path,close_flie[i + 2]),index_col="Date")
    df3 = pd.read_csv("{}/{}".format(path,close_flie[i + 3]),index_col="Date")
    df = pd.concat([df, df1, df2, df3])
    df.sort_index(inplace=True)
    df.index.name = "Date"
    df.to_csv("{}/{}".format(path,close_flie[i]))
    print("{} {} Finished!".format(open_file[i], close_flie[i]))
    
print("Finished!")

Open-1990.csv Close-1990.csv Finished!
Open-1991.csv Close-1991.csv Finished!
Open-1992.csv Close-1992.csv Finished!
Open-1993.csv Close-1993.csv Finished!
Open-1994.csv Close-1994.csv Finished!
Open-1995.csv Close-1995.csv Finished!
Open-1996.csv Close-1996.csv Finished!
Open-1997.csv Close-1997.csv Finished!
Open-1998.csv Close-1998.csv Finished!
Open-1999.csv Close-1999.csv Finished!
Open-2000.csv Close-2000.csv Finished!
Open-2001.csv Close-2001.csv Finished!
Open-2002.csv Close-2002.csv Finished!
Open-2003.csv Close-2003.csv Finished!
Open-2004.csv Close-2004.csv Finished!
Open-2005.csv Close-2005.csv Finished!
Open-2006.csv Close-2006.csv Finished!
Open-2007.csv Close-2007.csv Finished!
Open-2008.csv Close-2008.csv Finished!
Open-2009.csv Close-2009.csv Finished!
Open-2010.csv Close-2010.csv Finished!
Open-2011.csv Close-2011.csv Finished!
Open-2012.csv Close-2012.csv Finished!
Open-2013.csv Close-2013.csv Finished!
Open-2014.csv Close-2014.csv Finished!
Open-2015.csv Close-2015.

In [38]:
a = "SPXconst.csv"
b = (a[0:4] == "Open" or a[0:5] == "Close")
b

False