In [1]:
import random
import pandas as pd
import os
import numpy as np
import datetime

## Add the classification label (UP / DOWN / PRESERVE) to a stock for each trading date
def daily_label(stock_id, daily_path, UP_parm, DOWN_parm, cutoff_date=20201014):
    """Load one stock's daily price file and attach a movement label to each row.

    The file ``<daily_path>/<stock_id>.csv`` is read as tab-separated text. It
    must contain at least the columns ``open`` and ``trade_date`` (``YYYYMMDD``
    values that compare as integers).

    NOTE(review): rows are presumably ordered newest-first, so that the row
    above a given row is the *following* trading day -- confirm against the
    data source.

    Added columns:
      * ``rise_pct`` -- ``(open of the row above - open of this row) / open of
        this row * 100``.  The first row has no row above it and gets ``inf``
        in the numerator as a sentinel.
      * ``label``    -- ``'UP'`` when ``rise_pct > UP_parm``, ``'DOWN'`` when
        ``rise_pct < DOWN_parm`` (applied last, so it wins if both match),
        otherwise ``'PRESERVE'``.
      * ``date``     -- ``trade_date`` parsed as a datetime.

    Parameters
    ----------
    stock_id : str
        File stem of the stock, e.g. ``'sh600000'``.
    daily_path : str
        Directory holding the daily files (a trailing slash is optional).
    UP_parm, DOWN_parm : float
        Percentage thresholds for the ``'UP'`` / ``'DOWN'`` labels.
    cutoff_date : int, optional
        Only rows with ``trade_date < cutoff_date`` are returned.  Defaults to
        the previously hard-coded ``20201014``.  The sentinel first row is
        labelled ``'UP'`` for any finite ``UP_parm``; it is only removed if its
        ``trade_date`` is not below the cutoff.

    Returns
    -------
    pandas.DataFrame
        The labelled rows before ``cutoff_date``.

    Raises
    ------
    FileNotFoundError
        If the daily file does not exist.  (Previously a message was printed
        and the function then crashed on an undefined variable.)
    """
    csv_path = os.path.join(daily_path, stock_id + '.csv')
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f'daily price file not found: {csv_path}')
    daily_eg = pd.read_csv(csv_path, sep='\t')

    open_price = daily_eg['open']
    # Difference between the open of the row above and the open of this row.
    change = open_price.shift(1) - open_price
    if len(change):
        # No row above the first one: keep the original ``inf`` sentinel.
        change.iloc[0] = float('inf')
    daily_eg['rise_pct'] = change / open_price * 100

    daily_eg['label'] = 'PRESERVE'
    daily_eg.loc[daily_eg['rise_pct'] > UP_parm, 'label'] = 'UP'
    daily_eg.loc[daily_eg['rise_pct'] < DOWN_parm, 'label'] = 'DOWN'
    daily_eg['date'] = pd.to_datetime(daily_eg['trade_date'], format='%Y%m%d')
    return daily_eg.loc[daily_eg['trade_date'] < cutoff_date]


## build Dataloader for all stocks
def buildDataloader(stock_id_list, daily_path, news_path, UP_parm, DOWN_parm):
    """Assemble one sample per (stock, label date) from news files and daily stats.

    For every stock the labelled daily data is obtained from ``daily_label``.
    Its dates are reversed and cut into sliding windows of 11 consecutive
    trading days: the first 10 days are the news days, the 11th is the day
    whose label is predicted.

    Each sample row holds:
      * ``stock_id`` and ``label_date`` (the 11th day as a ``YYYYMMDD`` int);
      * ``day-1`` .. ``day-10`` -- for each news day, in window order, the path
        ``<news_path>/<stock_id>/<YYYY-MM-DD>.txt`` if that file exists,
        otherwise ``NaN``.  NOTE(review): if the daily data is newest-first,
        ``day-1`` is the oldest day of the window and ``day-10`` the day just
        before the label date -- confirm this is what consumers expect;
      * ``EMA10``, ``turnoverrate10``, ``volatilityratio10`` -- taken from the
        daily row of the last news day (the day before the label date);
      * ``label`` -- the label of the label date.

    Windows where none of the 10 news files exist are skipped and reported on
    stdout.

    Parameters
    ----------
    stock_id_list : iterable of str
        Stocks to process.
    daily_path : str
        Directory with the daily price files (passed through to ``daily_label``).
    news_path : str
        Root directory of the per-stock news folders.
    UP_parm, DOWN_parm : float
        Labelling thresholds (passed through to ``daily_label``).

    Returns
    -------
    pandas.DataFrame
        One row per kept sample with the 16 columns described above, a fresh
        0..n-1 index and object dtype.  An empty frame with the same columns
        is returned when no sample qualifies.
    """
    n_news_days = 10
    stat_columns = ['EMA10', 'turnoverrate10', 'volatilityratio10']
    columns = (['stock_id', 'label_date']
               + [f'day-{k}' for k in range(1, n_news_days + 1)]
               + stat_columns
               + ['label'])

    # Collect plain rows and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and re-copies the whole frame on every call.
    rows = []
    for stock_id in stock_id_list:
        daily_data = daily_label(stock_id, daily_path, UP_parm, DOWN_parm)
        date_list = [d.strftime("%Y-%m-%d") for d in daily_data['date']]
        date_list.reverse()

        # range() is empty when fewer than n_news_days + 1 dates are available.
        for start in range(len(date_list) - n_news_days):
            window = date_list[start:start + n_news_days + 1]
            news_days = window[:-1]
            trade_day = int(window[-1].replace('-', ''))
            last_day = int(news_days[-1].replace('-', ''))

            entity = []
            for date in news_days:
                news_file = os.path.join(news_path, stock_id, date + '.txt')
                entity.append(news_file if os.path.isfile(news_file) else float('nan'))
            # Found files are stored as str; missing ones as NaN.
            if not any(isinstance(item, str) for item in entity):
                print('stock_id: ', stock_id, 'contains no news to predict the label at date ', str(trade_day))
                continue

            # First matching row is used for both the label and the stats.
            cur_label = daily_data.loc[daily_data['trade_date'] == trade_day, 'label'].iloc[0]
            stats_10 = daily_data.loc[daily_data['trade_date'] == last_day, stat_columns]
            numeric_stats = stats_10.values[0].tolist()

            rows.append([stock_id, trade_day] + entity + numeric_stats + [cur_label])

    # dtype=object keeps the cell values as the plain Python objects the
    # original row-by-row construction produced.
    return pd.DataFrame(rows, columns=columns, dtype=object)


def write_dataloader(dataloader, Ptrain, Pcv, outputpath):
    """Split the samples chronologically and write train / cv / test CSV files.

    The samples are ordered by ``label_date`` (stable sort, so ties keep their
    input order) and cut into three consecutive slices: the first
    ``int(n * Ptrain)`` rows for training, the next ``int(n * Pcv)`` rows for
    cross-validation, and the remainder for testing.  The slices are written
    to ``train_data.csv``, ``cv_data.csv`` and ``test_data.csv`` inside
    ``outputpath`` without the index.

    Parameters
    ----------
    dataloader : pandas.DataFrame
        Samples with a ``label_date`` column.  It is not modified.
    Ptrain, Pcv : float
        Fractions of the rows assigned to the training and cv sets.
    outputpath : str
        Output directory; created (including parents) if it does not exist.

    Returns
    -------
    None
    """
    # The split below is fully deterministic and does not use ``random``; the
    # seed call is kept only because it is an existing side effect of this
    # function on the global random state.
    random.seed(50)

    # sort_values returns a new frame; the original discarded that result, so
    # the split was never actually chronological.
    ordered = dataloader.sort_values(by='label_date', kind='mergesort')

    n = ordered.shape[0]
    train_num = int(n * Ptrain)
    cv_num = int(n * Pcv)

    train = ordered.iloc[:train_num]
    cv = ordered.iloc[train_num:train_num + cv_num]
    test = ordered.iloc[train_num + cv_num:]

    # makedirs also handles nested output directories and an existing one.
    os.makedirs(outputpath, exist_ok=True)
    train.to_csv(os.path.join(outputpath, 'train_data.csv'), index=False)
    cv.to_csv(os.path.join(outputpath, 'cv_data.csv'), index=False)
    test.to_csv(os.path.join(outputpath, 'test_data.csv'), index=False)
    print('Successfully created training, cv and test dataset!')

In [2]:
## example: build the dataset for one stock and write the train / cv / test splits

# Input and output locations (relative to the notebook's working directory).
stock_id_list = ['sh600000']
daily_path = 'daily/'
news_path = 'stockNews/'
outputpath = 'dataloader/'

# Labelling thresholds, in percent: above UP_parm -> 'UP', below DOWN_parm -> 'DOWN'.
UP_parm = 0.87
DOWN_parm = -0.41

# Fractions of the samples used for training and cross-validation;
# whatever remains becomes the test set.
Ptrain = 0.67
Pcv = 0.1

df_dataloader = buildDataloader(
    stock_id_list=stock_id_list,
    daily_path=daily_path,
    news_path=news_path,
    UP_parm=UP_parm,
    DOWN_parm=DOWN_parm,
)
write_dataloader(
    dataloader=df_dataloader,
    Ptrain=Ptrain,
    Pcv=Pcv,
    outputpath=outputpath,
)

stock_id:  sh600000 contains no news to predict the label at date  20200102
stock_id:  sh600000 contains no news to predict the label at date  20200103
stock_id:  sh600000 contains no news to predict the label at date  20200106
stock_id:  sh600000 contains no news to predict the label at date  20200107
stock_id:  sh600000 contains no news to predict the label at date  20200624
stock_id:  sh600000 contains no news to predict the label at date  20200629
stock_id:  sh600000 contains no news to predict the label at date  20200630
Successfully created training, cv and test dataset!
