In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
import random
import pandas as pd
import os
import numpy as np
import datetime

## add the classfication label to a stock for each date
def daily_label(stock_id, daily_path, UP_parm, DOWN_parm):
    try:
        string = daily_path+'/'+stock_id+'.csv'
        # daily_eg = pd.read_csv(string)
        daily_eg = pd.read_csv(string, sep = '\t')
    except:
        print('file not exist!')
    diff = list(daily_eg.open.diff(-1))
    diff = [float('inf')] + diff[:len(diff)-1]
    daily_eg['rise_pct'] = diff / daily_eg.open*100
    daily_eg['label'] = 'PRESERVE'
    daily_eg.loc[daily_eg['rise_pct']>UP_parm, 'label'] = 'UP'
    daily_eg.loc[daily_eg['rise_pct']<DOWN_parm, 'label'] = 'DOWN'
    daily_eg['date'] = pd.to_datetime(daily_eg['trade_date'], format='%Y%m%d')
    return daily_eg.loc[daily_eg['trade_date']<20201014]


## build Dataloader for all stocks
def buildDataloader(stock_id_list, daily_path, news_path, UP_parm, DOWN_parm):
    dataloader = pd.DataFrame()
    N = len(stock_id_list)
    for ct, stock_id in enumerate(stock_id_list):
        daily_data = daily_label(stock_id,daily_path, UP_parm, DOWN_parm)
        date_list = [i.strftime("%Y-%m-%d") for i in daily_data.date]
        date_list.reverse()
        moving_date_list = [date_list[i:i+11] for i in range(0, len(date_list)-11+1)]
        df_id = pd.DataFrame()
        for Ndayslist in moving_date_list:
            dayslist = Ndayslist[:len(Ndayslist)-1]
            day = Ndayslist[-1]
            lastday = Ndayslist[-2]
            trade_day = int(day[:4]+day[5:7]+day[8:10])
            last_day = int(lastday[:4]+lastday[5:7]+lastday[8:10])
            cur_label = list(daily_data.loc[daily_data['trade_date']==trade_day].label)
            ## 现在还没有数值特征数据
            # stats_10 = daily_data.loc[daily_data['trade_date']==last_day,['EMA10', 'turnoverrate10', 'volatilityratio10']]
            # numeric_stats = stats_10.values[0].tolist()
            numeric_stats = [float('nan')]*3
            entity = []
            for date in dayslist:
                if os.path.isfile(news_path+f'{stock_id}/{date}.txt'):
                    entity.append('stockNews/'+f'{stock_id}/{date}.txt')
                else:
                    entity.append(float('nan'))
            df_entity = pd.DataFrame(entity)
            #if int(df_entity.isna().sum()) == 10:
            #    print('stock_id: ', stock_id, 'contains no news to predict the label at date ', str(trade_day))
            #    continue
            df_id = df_id.append(pd.DataFrame([stock_id]+[trade_day]+entity+numeric_stats+cur_label).transpose())
        dataloader = dataloader.append(df_id)
        print('stock:', stock_id, ' finished!')
        print(N-ct-1, ' stocks left.')
        print('------------------------------')
    dataloader.columns = ['stock_id','label_date','day-1','day-2','day-3','day-4','day-5','day-6','day-7','day-8',
                          'day-9','day-10','EMA10','turnoverrate10', 'volatilityratio10','label']
    dataloader.reset_index(inplace = True, drop = True)
    return dataloader


def write_dataloader(dataloader, Ptrain, Pcv, outputpath):
    random.seed(50)
    dataloader.sort_values(by='label_date')
    
    n = dataloader.shape[0]
    train_num = int(n*Ptrain)
    cv_num = int(n*Pcv)

    train = dataloader[:train_num]
    cv = dataloader[train_num:train_num+cv_num]
    test = dataloader[cv_num+train_num:]
    
    if not os.path.isdir(outputpath):
        os.mkdir(outputpath)
    train.to_csv(outputpath+'/train_data.csv', index=False)
    cv.to_csv(outputpath+'/cv_data.csv', index=False)
    test.to_csv(outputpath+'/test_data.csv', index=False)
    print('Successfully created training, cv and test dataset!')
    # return train, cv, test

In [5]:
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'stockDataFromTushare545'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
print(os.listdir(GOOGLE_DRIVE_PATH+'/daily/'))

['sh600000.csv', 'sh600884.csv', 'sh600886.csv', 'sh600887.csv', 'sh600890.csv', 'sh600889.csv', 'sh600888.csv', 'sh600893.csv', 'sh600892.csv', 'sh600894.csv', 'sh600895.csv', 'sh600896.csv', 'sh600898.csv', 'sh600897.csv', 'sh600900.csv', 'sh600901.csv', 'sh600903.csv', 'sh600909.csv', 'sh600908.csv', 'sh600918.csv', 'sh600917.csv', 'sh600919.csv', 'sh600928.csv', 'sh600926.csv', 'sh600933.csv', 'sh600936.csv', 'sh600929.csv', 'sh600939.csv', 'sh600956.csv', 'sh600958.csv', 'sh600959.csv', 'sh600960.csv', 'sh600962.csv', 'sh600961.csv', 'sh600963.csv', 'sh600965.csv', 'sh600967.csv', 'sh600966.csv', 'sh600968.csv', 'sh600970.csv', 'sh600969.csv', 'sh600975.csv', 'sh600971.csv', 'sh600973.csv', 'sh600976.csv', 'sh600977.csv', 'sh600978.csv', 'sh600980.csv', 'sh600979.csv', 'sh600982.csv', 'sh600981.csv', 'sh600983.csv', 'sh600985.csv', 'sh600984.csv', 'sh600986.csv', 'sh600987.csv', 'sh600988.csv', 'sh600990.csv', 'sh600989.csv', 'sh600992.csv', 'sh600993.csv', 'sh600996.csv', 'sh6009

In [37]:
## example
UP_parm = 0.87
DOWN_parm = -0.41
## 先做300个
stock300 = pd.read_csv(GOOGLE_DRIVE_PATH+'/stockid2name.csv')
stock_id_list = list(stock300.id)
daily_path = GOOGLE_DRIVE_PATH+'/daily/'
news_path = GOOGLE_DRIVE_PATH+'/stockNews/'
outputpath = GOOGLE_DRIVE_PATH+'/dataloader/'
Ptrain = 0.67
Pcv = 0.1
df_dataloader = buildDataloader(stock_id_list, daily_path, news_path, UP_parm, DOWN_parm)
# write_dataloader(df_dataloader, Ptrain, Pcv, outputpath)

stock: sz000001  finished!
299  stocks left.
------------------------------
stock: sz000002  finished!
298  stocks left.
------------------------------
stock: sz000063  finished!
297  stocks left.
------------------------------
stock: sz000066  finished!
296  stocks left.
------------------------------
stock: sz000069  finished!
295  stocks left.
------------------------------
stock: sz000100  finished!
294  stocks left.
------------------------------
stock: sz000157  finished!
293  stocks left.
------------------------------
stock: sz000166  finished!
292  stocks left.
------------------------------
stock: sz000333  finished!
291  stocks left.
------------------------------
stock: sz000338  finished!
290  stocks left.
------------------------------
stock: sz000425  finished!
289  stocks left.
------------------------------
stock: sz000538  finished!
288  stocks left.
------------------------------
stock: sz000568  finished!
287  stocks left.
------------------------------
stock: sz000

In [38]:
write_dataloader(df_dataloader, Ptrain, Pcv, outputpath)
cvfile = pd.read_csv(GOOGLE_DRIVE_PATH+'/dataloader/cv_data.csv')
cvfile.head()

Successfully created training, cv and test dataset!


Unnamed: 0,stock_id,label_date,day-1,day-2,day-3,day-4,day-5,day-6,day-7,day-8,day-9,day-10,EMA10,turnoverrate10,volatilityratio10,label
0,sh600900,20190128,,,,stockNews/sh600900/2019-01-17.txt,,stockNews/sh600900/2019-01-21.txt,,,stockNews/sh600900/2019-01-24.txt,,,,,DOWN
1,sh600900,20190129,,,stockNews/sh600900/2019-01-17.txt,,stockNews/sh600900/2019-01-21.txt,,,stockNews/sh600900/2019-01-24.txt,,,,,,UP
2,sh600900,20190130,,stockNews/sh600900/2019-01-17.txt,,stockNews/sh600900/2019-01-21.txt,,,stockNews/sh600900/2019-01-24.txt,,,,,,,PRESERVE
3,sh600900,20190131,stockNews/sh600900/2019-01-17.txt,,stockNews/sh600900/2019-01-21.txt,,,stockNews/sh600900/2019-01-24.txt,,,,,,,,UP
4,sh600900,20190201,,stockNews/sh600900/2019-01-21.txt,,,stockNews/sh600900/2019-01-24.txt,,,,,,,,,DOWN
