In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import random
import pandas as pd
import os
import numpy as np
import datetime


def getThreshold(stock_id_list, daily_path):
    rise_pct = []
    for stk_id in stock_id_list:
      string = daily_path+'/'+stk_id+'.csv'
      daily_data = pd.read_csv(string, sep = '\t')
      diff = list(daily_data.open.diff(-1))
      stk_rise_pct = diff[:len(diff)-1] / daily_data.open[1:] * 100
      rise_pct.extend(stk_rise_pct)
      # print(f'{stk_id} finished!\n')
    UP_parm = np.quantile(rise_pct, 0.67)
    DOWN_parm = np.quantile(rise_pct, 0.33)
    return UP_parm, DOWN_parm

## add the classfication label to a stock for each date
def daily_label(stock_id, daily_path, UP_parm, DOWN_parm):
    try:
        string = daily_path+'/'+stock_id+'.csv'
        # daily_eg = pd.read_csv(string)
        daily_eg = pd.read_csv(string, sep = '\t')
    except:
        print('file not exist!')
    diff = list(daily_eg.open.diff(-1))
    diff = [float('inf')] + diff[:len(diff)-1]
    daily_eg['rise_pct'] = diff / daily_eg.open*100
    daily_eg['label'] = 'PRESERVE'
    daily_eg.loc[daily_eg['rise_pct']>UP_parm, 'label'] = 'UP'
    daily_eg.loc[daily_eg['rise_pct']<DOWN_parm, 'label'] = 'DOWN'
    daily_eg['date'] = pd.to_datetime(daily_eg['trade_date'], format='%Y%m%d')
    return daily_eg.loc[daily_eg['trade_date']<20201014]


## build Dataloader for all stocks
def buildDataloader(stock_id_list, daily_path, news_path, UP_parm, DOWN_parm):
    dataloader = pd.DataFrame()
    N = len(stock_id_list)
    for ct, stock_id in enumerate(stock_id_list):
        daily_data = daily_label(stock_id,daily_path, UP_parm, DOWN_parm)
        date_list = [i.strftime("%Y-%m-%d") for i in daily_data.date]
        date_list.reverse()
        moving_date_list = [date_list[i:i+11] for i in range(0, len(date_list)-11+1)]
        df_id = pd.DataFrame()
        for Ndayslist in moving_date_list:
            dayslist = Ndayslist[:len(Ndayslist)-1]
            day = Ndayslist[-1]
            lastday = Ndayslist[-2]
            trade_day = int(day[:4]+day[5:7]+day[8:10])
            last_day = int(lastday[:4]+lastday[5:7]+lastday[8:10])
            cur_label = list(daily_data.loc[daily_data['trade_date']==trade_day].label)
            ## 现在还没有数值特征数据
            # stats_10 = daily_data.loc[daily_data['trade_date']==last_day,['EMA10', 'turnoverrate10', 'volatilityratio10']]
            # numeric_stats = stats_10.values[0].tolist()
            numeric_stats = [float('nan')]*3
            entity = []
            for date in dayslist:
                if os.path.isfile(news_path+f'{stock_id}/{date}.txt'):
                    entity.append(f'{stock_id}/{date}.txt')
                else:
                    entity.append(float('nan'))
            df_entity = pd.DataFrame(entity)
            #if int(df_entity.isna().sum()) == 10:
            #    print('stock_id: ', stock_id, 'contains no news to predict the label at date ', str(trade_day))
            #    continue
            df_id = df_id.append(pd.DataFrame([stock_id]+[trade_day]+entity+numeric_stats+cur_label).transpose())
        dataloader = dataloader.append(df_id)
        print('stock:', stock_id, ' finished!')
        print(N-ct-1, ' stocks left.')
        print('------------------------------')
    dataloader.columns = ['stock_id','label_date','day-1','day-2','day-3','day-4','day-5','day-6','day-7','day-8',
                          'day-9','day-10','EMA10','turnoverrate10', 'volatilityratio10','label']
    dataloader.reset_index(inplace = True, drop = True)
    return dataloader


def write_dataloader(dataloader, Ptrain, Pcv, outputpath):
    random.seed(50)
    dataloader.sort_values(by='label_date')
    
    n = dataloader.shape[0]
    train_num = int(n*Ptrain)
    cv_num = int(n*Pcv)

    train = dataloader[:train_num]
    cv = dataloader[train_num:train_num+cv_num]
    test = dataloader[cv_num+train_num:]
    
    if not os.path.isdir(outputpath):
        os.mkdir(outputpath)
    train.to_csv(outputpath+'/train_data.csv', index=False)
    cv.to_csv(outputpath+'/cv_data.csv', index=False)
    test.to_csv(outputpath+'/test_data.csv', index=False)
    print('Successfully created training, cv and test dataset!')
    # return train, cv, test

In [3]:
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'stockDataFromTushare545'
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
# print(os.listdir(GOOGLE_DRIVE_PATH+'/daily/'))

In [34]:
## example
UP_parm = 0.77618351
DOWN_parm = -0.71681229
## 先做300个
stock300 = pd.read_csv(GOOGLE_DRIVE_PATH+'/stockid2name.csv')
stock_id_list = list(stock300.id)
daily_path = GOOGLE_DRIVE_PATH+'/daily/'
news_path = GOOGLE_DRIVE_PATH+'/stockNews/'
outputpath = GOOGLE_DRIVE_PATH+'/dataloader/'
Ptrain = 0.67
Pcv = 0.1
df_dataloader = buildDataloader(stock_id_list, daily_path, news_path, UP_parm, DOWN_parm)
write_dataloader(df_dataloader, Ptrain, Pcv, outputpath)

stock: sz000001  finished!
299  stocks left.
------------------------------
stock: sz000002  finished!
298  stocks left.
------------------------------
stock: sz000063  finished!
297  stocks left.
------------------------------
stock: sz000066  finished!
296  stocks left.
------------------------------
stock: sz000069  finished!
295  stocks left.
------------------------------
stock: sz000100  finished!
294  stocks left.
------------------------------
stock: sz000157  finished!
293  stocks left.
------------------------------
stock: sz000166  finished!
292  stocks left.
------------------------------
stock: sz000333  finished!
291  stocks left.
------------------------------
stock: sz000338  finished!
290  stocks left.
------------------------------
stock: sz000425  finished!
289  stocks left.
------------------------------
stock: sz000538  finished!
288  stocks left.
------------------------------
stock: sz000568  finished!
287  stocks left.
------------------------------
stock: sz000

In [35]:
# write_dataloader(df_dataloader, Ptrain, Pcv, outputpath)
cvfile = pd.read_csv(GOOGLE_DRIVE_PATH+'/dataloader/cv_data.csv')
cvfile.head()

Unnamed: 0,stock_id,label_date,day-1,day-2,day-3,day-4,day-5,day-6,day-7,day-8,day-9,day-10,EMA10,turnoverrate10,volatilityratio10,label
0,sh600900,20190128,,,,sh600900/2019-01-17.txt,,sh600900/2019-01-21.txt,,,sh600900/2019-01-24.txt,,,,,PRESERVE
1,sh600900,20190129,,,sh600900/2019-01-17.txt,,sh600900/2019-01-21.txt,,,sh600900/2019-01-24.txt,,,,,,UP
2,sh600900,20190130,,sh600900/2019-01-17.txt,,sh600900/2019-01-21.txt,,,sh600900/2019-01-24.txt,,,,,,,PRESERVE
3,sh600900,20190131,sh600900/2019-01-17.txt,,sh600900/2019-01-21.txt,,,sh600900/2019-01-24.txt,,,,,,,,UP
4,sh600900,20190201,,sh600900/2019-01-21.txt,,,sh600900/2019-01-24.txt,,,,,,,,,DOWN


In [33]:
np.quantile(pct,[0.33,0.67])

array([-0.71681229,  0.77618351])

In [None]:
import jieba,math
import jieba.analyse
import re
import pickle
import gensim

## get stopword list
f = open(GOOGLE_DRIVE_PATH + '/Stopwords/stopwords.pkl', 'rb')
stopwords = pickle.load(f)
f.close()

## get pretrained word2vec model
word2vecPath =  GOOGLE_DRIVE_PATH + '/ChineseWord2Vec/sgns.financial.word'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vecPath, binary=False)


## class dataset dataloader
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, datafile, word2vec, newspath, stopwords, max_size):
        self.dataset = pd.read_csv(datafile)
        self.stopwords = stopwords
        self.max_size = max_sizr
        self.word2vec = wordvec
        
    def __get_item__(self, index):
        news_list, label = self.dataset[index,2:-1], self.dataset[index,-1]
        
        ## 返回10天新闻文本得到的1个词向量
        def convert2vec(news_list):
            
            def preProcess(raw_data):
                raw_text = pd.read_csv(raw_data, header=None)
                news_data = []
                for i in range(len(raw_text)):
                    text = str(raw_data.iloc[i,:])
                    str_text = text.split('\n')[0]
                    tokens = jieba.lcut(str_text,cut_all=False)
                    cleanword = []
                    for wd in tokens:
                      if wd not in self.stopwords:
                        cleanword.append(wd)
                    news_data.append(cleanword)
                return news_data

            def wd2vec(news_data):
                model = self.word2vec
                vec_list = []
                for wd_list in news_data:
                  for wd in wd_list:
                    clean_vec = np.zeros(self.max_size)
                    if wd in model.vocab:
                      clean_vec += model.get_vector(wd)
                  vec_list.append(clean_vec)
                return vec_list

            def combineVec(vec_list):
              ## implement attetion layer for 10 days' news

            #------
            news_vec_list = []   ## 用来存每天的新闻词向量
            for news_path in news_list:
                if np.isnan(news_path):
                    continue
                else:
                    news_raw_data = pd.read_csv(news_path) ## 读入某天的新闻文本
                    news_data = preProcess(news_raw_data)  ## 预处理新闻文本：分词，去停用词等等
                    news_word2vec = wd2vec(news_data)    ## 对处理后的文本词向量化
                    news_vec_list.append(news_word2vec)    
            news_vec = combineVec(news_vec_list)           ## 对10天的词向量列表整合成一个词向量
            return news_vec
        
        
        news_vec = convert2vec(news_list)
        return news_vec, label
            
        
        
    def __len__():
    

In [None]:
import pickle
f = open(GOOGLE_DRIVE_PATH + '/Stopwords/stopwords.pkl', 'rb')
stopwords = pickle.load(f)
f.close()
raw_data = pd.read_csv(GOOGLE_DRIVE_PATH + '/stockNews/' + cvfile.iloc[0,5], header=None)

In [None]:
raw_data
  

0


In [None]:
import jieba,math
import jieba.analyse
import re
text = str(raw_data.iloc[2,:])
str_text = text.split('\n')[0]

#精准模式cut_all=False，默认即是
str_jing1=jieba.lcut(str_text,cut_all=False)
cleanword = []
for wd in str_jing1:
  if wd not in stopwords:
    cleanword.append(wd)
print(cleanword)

#搜索引擎模式  cut_for_search

# str_soso1=jieba.cut_for_search(str_text)
# print('搜索引擎分词：{ %d}' % len(list(str_soso1)))
# str_soso2=jieba.cut_for_search(str_text)
# print("/".join(str_soso2))




Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.911 seconds.
Prefix dict has been built successfully.


['长江', '电力', '2018', '年度', '净利润', '226.30', '亿元', '同比', '增长', '1.66%', '长江', '电力', '2018', '年度', '净']


In [None]:
import gensim
word2vecPath =  GOOGLE_DRIVE_PATH + '/ChineseWord2Vec/sgns.financial.word'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vecPath, binary=False)

In [None]:
            def preProcess(raw_data):
                # raw_text = pd.read_csv(raw_data, header=None)
                raw_text = raw_data
                news_data = []
                for i in range(len(raw_text)):
                    text = str(raw_data.iloc[i,:])
                    str_text = text.split('\n')[0]
                    tokens = jieba.lcut(str_text,cut_all=False)
                    cleanword = []
                    for wd in tokens:
                      if wd not in stopwords:
                        cleanword.append(wd)
                    news_data.append(cleanword)
                return news_data

            def wd2vec(news_data):
                model = word2vec
                max_size = 300
                vec_list = []
                for wd_list in news_data:
                  for wd in wd_list:
                    clean_vec = np.zeros(max_size)
                    if wd in model.vocab:
                      clean_vec += model.get_vector(wd)
                  vec_list.append(clean_vec)
                return vec_list

In [None]:
import gensim
news_data = preProcess(raw_data)
vec_list = wd2vec(news_data)
news_data[1]
wd2vec(news_data[1])

[array([ 1.14968002e-01, -2.40005001e-01,  3.15555990e-01, -7.40747988e-01,
        -4.42382991e-01,  1.75108001e-01, -8.20502996e-01,  1.21886998e-01,
        -1.75564006e-01, -7.63426006e-01,  3.24483007e-01,  1.39925003e-01,
         3.25177997e-01, -6.37099985e-03,  2.54424989e-01, -2.03062996e-01,
         3.56566995e-01,  5.63660003e-02,  3.56617987e-01, -2.89184988e-01,
        -7.64560029e-02, -5.46739995e-01,  1.27260000e-01, -5.47479987e-02,
        -7.63799995e-02, -9.70090032e-02,  4.46296006e-01, -5.43056011e-01,
        -4.87118006e-01,  4.84943002e-01, -1.79221004e-01,  8.20349976e-02,
        -1.34297997e-01, -1.35982007e-01,  1.51208997e-01, -3.39215994e-01,
         3.42629999e-02, -3.96270007e-02, -1.79774001e-01, -2.97420006e-02,
         1.49531007e-01, -4.35380004e-02,  3.62385005e-01, -2.95964986e-01,
        -2.26541996e-01,  5.26230991e-01, -7.32320026e-02,  2.81179994e-02,
        -6.09380007e-02, -9.07720029e-02,  9.73999966e-03, -4.79454011e-01,
         1.2