In [10]:
import random
import pandas as pd
import os
import numpy as np
import datetime

## add the classification label to a stock for each date
def daily_label(stock_name, daily_path, up_threshold=0.87, down_threshold=-0.41,
                cutoff_date=20201014):
    """Load one stock's daily quotes and attach a three-class label to every row.

    Reads ``<daily_path>/<stock_name>.csv`` (tab-separated; it must contain the
    ``open`` and ``trade_date`` columns) and adds three columns:

    * ``rise_pct`` -- percentage change from this row's ``open`` to the ``open``
      of the row directly above it in the file.
    * ``label`` -- ``'UP'`` when ``rise_pct > up_threshold``, ``'DOWN'`` when
      ``rise_pct < down_threshold``, otherwise ``'PRESERVE'``.
    * ``date`` -- ``trade_date`` parsed with the ``%Y%m%d`` format.

    Parameters
    ----------
    stock_name : str
        File stem of the CSV to read, e.g. ``'sh600000'``.
    daily_path : str
        Directory that holds the per-stock CSV files.
    up_threshold, down_threshold : float, optional
        Percentage bounds for the ``'UP'`` / ``'DOWN'`` labels.
    cutoff_date : int, optional
        Only rows with ``trade_date`` strictly below this value are returned;
        the comparison assumes ``trade_date`` is read as integers such as
        ``20201014``.

    Returns
    -------
    pandas.DataFrame
        The labelled quotes, restricted to ``trade_date < cutoff_date``.

    Raises
    ------
    FileNotFoundError
        If the CSV file for ``stock_name`` does not exist.
    """
    csv_path = os.path.join(daily_path, stock_name + '.csv')
    try:
        daily_eg = pd.read_csv(csv_path, sep='\t')
    except FileNotFoundError as err:
        # Fail here with the offending path instead of printing and then
        # crashing later on an undefined variable.
        raise FileNotFoundError('file not exist! (%s)' % csv_path) from err

    opens = daily_eg['open']
    # rise_pct[i] = (open[i-1] - open[i]) / open[i] * 100, i.e. the move towards
    # the row above.  NOTE(review): this is the next trading day only if the
    # file is sorted newest-first -- confirm against the data source.
    rise_pct = (opens.shift(1) - opens) / opens * 100
    if len(rise_pct) > 0:
        # The first row has no row above it; it keeps the historical +inf
        # placeholder (and is therefore labelled 'UP') unless the cutoff
        # filter below removes it.
        rise_pct.iloc[0] = float('inf')
    daily_eg['rise_pct'] = rise_pct

    daily_eg['label'] = 'PRESERVE'
    daily_eg.loc[daily_eg['rise_pct'] > up_threshold, 'label'] = 'UP'
    daily_eg.loc[daily_eg['rise_pct'] < down_threshold, 'label'] = 'DOWN'

    # Going through str makes the parse independent of the column being int or str.
    daily_eg['date'] = pd.to_datetime(daily_eg['trade_date'].astype(str), format='%Y%m%d')
    return daily_eg.loc[daily_eg['trade_date'] < cutoff_date]

def buildDataloader(stock_id, daily_path, news_path):
    """Build the sliding-window table that pairs 10 days of news with a label.

    The labelled quotes come from ``daily_label(stock_id, daily_path)``.  Their
    dates are reversed and every run of 11 consecutive dates becomes one output
    row: the first 10 dates are turned into news file paths
    ``<news_path><stock_id>/<YYYY-MM-DD>.txt`` and the 11th date supplies the
    ``label``.

    Parameters
    ----------
    stock_id : str
        Stock identifier; also used as the sub-folder name of the news files.
    daily_path : str
        Directory passed through to ``daily_label``.
    news_path : str
        Prefix of the news file paths.  It is concatenated as-is, so it has to
        end with a path separator (e.g. ``'generalNews/'``).

    Returns
    -------
    pandas.DataFrame
        Columns ``stock_id``, ``day-1`` ... ``day-10``, ``label`` with a fresh
        0..n-1 index.  ``day-1`` is the first date of the window and ``day-10``
        the last one before the labelled date.  The frame is empty (but keeps
        these columns) when fewer than 11 dates are available.
    """
    window = 10
    columns = ['stock_id'] + ['day-%d' % k for k in range(1, window + 1)] + ['label']

    daily_data = daily_label(stock_id, daily_path)
    # Reverse the row order; presumably the quotes arrive newest-first so this
    # yields oldest-to-newest windows -- verify against the data source.
    dates = [stamp.strftime("%Y-%m-%d") for stamp in daily_data['date']][::-1]
    labels = list(daily_data['label'])[::-1]

    # Collect plain rows and build the frame once: DataFrame.append was removed
    # in pandas 2.0 and appending inside a loop is quadratic anyway.
    rows = []
    for end in range(window, len(dates)):
        news_files = [news_path + f'{stock_id}/{date}.txt' for date in dates[end - window:end]]
        rows.append([stock_id] + news_files + [labels[end]])
    return pd.DataFrame(rows, columns=columns)

def write_dataloader(dataloader, Ptrain, Pcv, outputpath, seed=50):
    """Split a labelled dataloader table into train / cv / test CSV files.

    The rows are split separately for each label (``'PRESERVE'``, ``'UP'``,
    ``'DOWN'``) and the per-label pieces are concatenated, so every split keeps
    roughly the label proportions of the input.  Rows whose label is none of
    these three are not written anywhere.

    Parameters
    ----------
    dataloader : pandas.DataFrame
        Table with a ``label`` column.  Its index must be unique because the
        complement of each sample is taken with ``drop(index)``.
    Ptrain : float
        Fraction of each label group that goes to train + cv; the remainder is
        the test set.
    Pcv : float
        Fraction of the train + cv rows that is set aside as the cv set.
    outputpath : str
        Directory that receives ``train_data.csv``, ``cv_data.csv`` and
        ``test_data.csv``.  It is created when missing (one level only).
    seed : int, optional
        Seed for the sampling, so repeated calls give the same split.

    Returns
    -------
    None
        The three splits are only written to disk.
    """
    # Kept for callers that rely on the global `random` state being seeded
    # here; pandas does not read it, hence random_state below.
    random.seed(seed)

    def split(dataset):
        # DataFrame.sample draws from NumPy's generator, so the seed has to be
        # passed explicitly for the split to be reproducible.
        train_cv = dataset.sample(frac=Ptrain, random_state=seed)
        test = dataset.drop(train_cv.index)
        cv = train_cv.sample(frac=Pcv, random_state=seed)
        train = train_cv.drop(cv.index)
        return train, cv, test

    train_parts, cv_parts, test_parts = [], [], []
    for label_name in ('PRESERVE', 'UP', 'DOWN'):
        subset = dataloader.loc[dataloader['label'] == label_name]
        part_train, part_cv, part_test = split(subset)
        train_parts.append(part_train)
        cv_parts.append(part_cv)
        test_parts.append(part_test)

    train = pd.concat(train_parts)
    cv = pd.concat(cv_parts)
    test = pd.concat(test_parts)

    try:
        os.mkdir(outputpath)
    except OSError:
        print ("Creation of the directory %s failed or already existed" % outputpath)
    else:
        print ("Successfully created the directory %s " % outputpath)

    # os.path.join avoids the doubled separator when outputpath ends with '/'.
    train.to_csv(os.path.join(outputpath, 'train_data.csv'))
    cv.to_csv(os.path.join(outputpath, 'cv_data.csv'))
    test.to_csv(os.path.join(outputpath, 'test_data.csv'))
    print('Successfully created training, cv and test dataset!')

In [11]:
## example: build the dataloader table for one stock and write its train / cv / test splits
stock_id = 'sh600000'

# input and output folders, given relative to the working directory
daily_path = 'daily/'
news_path = 'generalNews/'
outputpath = 'dataloader/'

# Ptrain: fraction of each label group sampled for train + cv (the rest is test)
# Pcv: fraction of that train + cv sample set aside as the cv set
Ptrain = 0.67
Pcv = 0.1

df_dataloader = buildDataloader(stock_id=stock_id, daily_path=daily_path, news_path=news_path)
write_dataloader(dataloader=df_dataloader, Ptrain=Ptrain, Pcv=Pcv, outputpath=outputpath)

Successfully created the directory dataloader/ 
Successfully created training, cv and test dataset!
