In [1]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import glob
import argparse
import os
from shutil import copyfile, move
from pathlib import Path

In [7]:
 from mplfinance.original_flavor import candlestick_ohlc,volume_overlay

In [8]:
#python run_binary_preprocessing.py 
def isnan(value):
    """Return True when ``value`` converts to a float that is NaN.

    Values that ``float()`` cannot convert (for example ``None``, a
    non-numeric string, or an integer too large for a float) are reported
    as not-NaN instead of raising.
    """
    import math

    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        # float() rejected the value, so it cannot be NaN. Only conversion
        # errors are caught so that e.g. KeyboardInterrupt still propagates.
        return False


def removeOutput(finput):
    """Delete ``finput`` when it exists as a regular file; otherwise do nothing."""
    target = Path(finput)
    if not target.is_file():
        # Missing paths and directories are left untouched.
        return
    os.remove(finput)

In [9]:
def countImage(input):
    """Print how many files and sub-directories exist underneath ``input``.

    The tree is traversed once and both totals are accumulated in the same
    pass. ``input`` keeps its original name (it shadows the builtin) so that
    existing keyword callers continue to work.
    """
    num_file = 0
    num_dir = 0
    for _root, dirs, files in os.walk(input):
        num_file += len(files)
        num_dir += len(dirs)
    print("num of files : {}\nnum of dir : {}".format(num_file, num_dir))

In [10]:
def create_label(fname,seq_len):
    """Write a binary up/down label for every sliding window of a price CSV.

    For each start index ``i`` the row that follows the ``seq_len``-row
    window (row ``i + seq_len``) is labelled ``1`` when its Close is above
    its Open and ``0`` otherwise. One line ``<stem>-<i>,<label>`` is written
    per window to ``<stem>_label_<seq_len>.txt`` in the current working
    directory, where ``<stem>`` is the CSV file name without its extension.

    Args:
        fname: Path of the CSV file; it must contain ``Open`` and ``Close``
            columns.
        seq_len: Window length (int or numeric string).
    """
    print('Creating label ...')
    window = int(seq_len)
    stem = os.path.splitext(os.path.basename(fname))[0]
    label_path = "{}_label_{}.txt".format(stem, seq_len)

    # Delete labels left over from an earlier run in case the data was updated.
    if os.path.isfile(label_path):
        os.remove(label_path)

    # Read the data of the individual ticker. Missing values become 0; the
    # result of fillna must be assigned because it returns a new frame.
    # The Date column is not needed for labelling, so it is not converted.
    df = pd.read_csv(fname).fillna(0)
    opens = df['Open']
    closes = df['Close']

    lines = []
    # A window needs seq_len rows plus one extra row that gets labelled,
    # so only start indices with a following row are visited.
    for i in range(len(df) - window):
        starting = opens.iloc[i + window]
        endvalue = closes.iloc[i + window]
        if starting == 0:
            # The return is undefined for a zero open price (e.g. a filled
            # missing value); treat it as "not up".
            label = 0
        else:
            tmp_rtn = endvalue / starting - 1
            label = 1 if tmp_rtn > 0 else 0
        lines.append('{}-{},{}\n'.format(stem, i, label))

    # Store the label of every window in the text file (written in one go).
    if lines:
        with open(label_path, 'w') as the_file:
            the_file.writelines(lines)

    print("Create label finished.")

In [14]:
def _hide_axes(ax):
    """Remove grid, ticks, labels and frame from ``ax`` so only the chart remains."""
    ax.grid(False)
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)
    ax.axis('off')


def ohcl2cs(fname,seq_len,dataset_type,dimension,use_volume):
    """Render every ``seq_len``-row window of an OHLC CSV as a candlestick PNG.

    Images are written to
    ``<cwd>/dataset/<seq_len>_<dimension>/<symbol>/<dataset_type>/<stem>-<i>.png``
    where ``<stem>`` is the CSV file name without extension, ``<symbol>`` is
    the part of the stem before the first underscore and ``<i>`` is the start
    row of the window.

    Args:
        fname: Path of the CSV file with ``Open``, ``High``, ``Low`` and
            ``Close`` columns (plus ``Volume`` when ``use_volume`` is set).
        seq_len: Number of rows per chart (int or numeric string).
        dataset_type: Name of the output sub-directory.
        dimension: Width and height of the image in pixels (int or numeric
            string).
        use_volume: When truthy, the Volume column is overlaid as bars on a
            twin axis.
    """
    print("Converting ohlc to candlestick")
    window = int(seq_len)
    stem = os.path.splitext(os.path.basename(fname))[0]
    symbol = stem.split('_')[0]
    print(symbol)

    # Create the directory that stores the candlestick images.
    out_dir = "{}/dataset/{}_{}/{}/{}".format(os.getcwd(),seq_len,dimension,symbol,dataset_type)
    os.makedirs(out_dir, exist_ok=True)

    # fillna returns a new frame, so the result has to be kept.
    df = pd.read_csv(fname).fillna(0)
    plt.style.use('dark_background')

    my_dpi = 96
    side = int(dimension) / my_dpi
    # Stop seq_len rows before the end so every chart is a full window.
    for i in range(0, len(df) - window):
        # Load the source rows (ohlc + volume) of this window.
        c = df.iloc[i:i + window]
        fig = plt.figure(figsize=(side, side), dpi=my_dpi)
        try:
            ax1 = fig.add_subplot(1,1,1)
            # candlestick_ohlc expects (x, open, high, low, close) records.
            # Row positions are used as x so the candles share the x scale
            # of the volume bars, which are drawn at positions 0..n-1.
            quotes = list(zip(range(len(c)), c['Open'], c['High'], c['Low'], c['Close']))
            candlestick_ohlc(ax1, quotes, width=1, colorup='#77d879', colordown='#db3f3f')
            _hide_axes(ax1)
            # Optionally draw the traded volume.
            if use_volume:
                ax2 = ax1.twinx()
                bc = volume_overlay(ax2, c['Open'], c['Close'], c['Volume'],
                                    colorup='#77d879', colordown='#db3f3f',
                                    width=1, alpha=0.5)
                # NOTE(review): volume_overlay may already attach the
                # collection to ax2 - confirm whether this extra add is needed.
                ax2.add_collection(bc)
                _hide_axes(ax2)
            pngfile = '{}/{}-{}.png'.format(out_dir, stem, i)
            fig.savefig(pngfile,pad_inches=0,transparent=False)
        finally:
            # Always release the figure, even when drawing or saving fails.
            plt.close(fig)

    print('Converting olhc to candlestick finished.')

In [12]:
def image_to_dataset(input,label_file):
    """Sort chart images into ``classes/1`` and ``classes/0`` by their label.

    ``label_file`` holds one ``<key>,<label>`` pair per line. Inside
    ``<cwd>/<input>`` the function works in two passes:

    1. A file whose first two underscore-separated name parts form a label
       key is renamed to ``<label>-<stem>.png``.
    2. Every file whose stem has a known label of ``"1"`` or ``"0"`` is moved
       to ``classes/<label>/``. Files without a label and sub-directories
       (including ``classes`` itself) are left where they are.

    Args:
        input: Image directory, relative to the current working directory.
        label_file: Path of the label text file.
    """
    label_dict = {}
    # Read the label data produced by create_label.
    with open(label_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            key, val = line.rsplit(',', 1)
            label_dict[key] = val.strip()

    path = "{}/{}".format(os.getcwd(),input)

    # Labels by file stem; renamed files are added so pass 2 can find them.
    stem_labels = dict(label_dict)
    for filename in os.listdir(path):
        source = "{}/{}".format(path,filename)
        if not os.path.isfile(source):
            continue
        splitname = filename.split('_')
        if len(splitname) < 2:
            # No underscore: there is no two-part key to look up.
            continue
        newname = "{}_{}".format(splitname[0],splitname[1])
        if newname in label_dict:
            v = label_dict[newname]
            f,e = os.path.splitext(filename)
            new_name = "{}-{}.png".format(v,f)
            os.rename(source,"{}/{}".format(path,new_name))
            stem_labels[os.path.splitext(new_name)[0]] = v

    # Create the directories the candlestick images are classified into.
    for folder in ('1','0'):
        os.makedirs("{}/classes/{}".format(path,folder), exist_ok=True)

    # Move every labelled chart file in path to the folder of its label.
    for filename in os.listdir(path):
        source = "{}/{}".format(path,filename)
        if not os.path.isfile(source):
            # Skips the 'classes' directory and any other sub-directory.
            continue
        f,e = os.path.splitext(filename)
        label = stem_labels.get(f)
        if label in ('1','0'):
            move(source,"{}/classes/{}/{}".format(path,label,filename))

    print('Done')

In [None]:
# python generatedata.py
# Copy the classified images into <pathdir>/<targetdir>/<train|test>/<0|1>/.
# NOTE(review): pathdir, origindir and targetdir are not assigned anywhere in
# the visible notebook - they must be defined before this cell runs.
counttest = 0
counttrain = 0
# os.walk visits the given directory and all of its sub-directories in turn.
for root,dirs,files in os.walk("{}/{}".format(pathdir,origindir)):
    # The name of the directory that holds a file is its class label.
    tmp_label = root.replace('\\','/').split('/')[-1]
    if tmp_label not in ('0','1'):
        continue
    for file in files:
        # 'test' is checked first, exactly like the original branches.
        if 'test' in file:
            subset = 'test'
        elif 'train' in file:
            subset = 'train'
        else:
            continue
        destination_dir = "{}/{}/{}/{}".format(pathdir,targetdir,subset,tmp_label)
        # copyfile does not create missing directories, so make sure it exists.
        os.makedirs(destination_dir, exist_ok=True)
        origin = "{}/{}".format(root,file)
        destination = "{}/{}".format(destination_dir,file)
        copyfile(origin,destination)
        if subset == 'test':
            counttest+=1
        else:
            counttrain+=1

In [16]:
# python myDeepCNN.py 
def build_model(SHAPE,nb_classes,bn_axis,seed=None):
    """Build the (uncompiled) CNN used to classify candlestick images.

    The network is four Conv2D(3x3, 'same', relu) + MaxPooling2D(2x2) stages
    with 32, 48, 64 and 96 filters (Dropout 0.25 after the second and fourth
    stage), followed by Flatten, Dense(256, relu), Dropout(0.5) and a softmax
    output layer with ``nb_classes`` units.

    Args:
        SHAPE: Shape of one input sample, passed to ``Input(shape=...)``.
        nb_classes: Number of units of the softmax output layer.
        bn_axis: Not used by this architecture; kept so existing callers
            keep working.
        seed: Optional seed for NumPy's global random generator. ``None``
            leaves the generator untouched; ``0`` is a valid seed.

    Returns:
        The ``Model`` connecting the input layer to the softmax output.
    """
    # Fix the seed so that prediction results can be reproduced.
    if seed is not None:
        np.random.seed(seed)

    # The layer receiving the input; the first layer always has to declare
    # the shape of the input data.
    input_layer = Input(shape=SHAPE)

    # Stage 1: convolution connected functionally to input_layer, then pooling.
    x = Conv2D(32,(3,3),kernel_initializer='glorot_uniform',padding='same',activation='relu')(input_layer)
    x = MaxPooling2D(pool_size=(2,2))(x)

    # Stage 2: convolution + pooling.
    x = Conv2D(48,(3,3),kernel_initializer='glorot_uniform',padding='same',activation='relu')(x)
    x = MaxPooling2D(pool_size=(2,2))(x)
    # Dropout layer that removes part of the connections to limit overfitting.
    x = Dropout(0.25)(x)

    # Stage 3: convolution + pooling.
    x = Conv2D(64,(3,3),kernel_initializer='glorot_uniform',padding='same',activation='relu')(x)
    x = MaxPooling2D(pool_size=(2,2))(x)

    # Stage 4: convolution + pooling.
    x = Conv2D(96,(3,3),kernel_initializer='glorot_uniform',padding='same',activation='relu')(x)
    x = MaxPooling2D(pool_size=(2,2))(x)
    x = Dropout(0.25)(x)

    # Flattening
    x = Flatten()(x)

    # Full connection
    x = Dense(units=256,activation='relu')(x)
    x = Dropout(0.5)(x)

    # One softmax unit per class instead of a hard-coded 2.
    x = Dense(units=nb_classes,activation='softmax')(x)

    # Build the model from the first input layer and the final output layer.
    model = Model(input_layer,x)

    return model

In [None]:
# Compile the network, train it on the training set and store it as an HDF5
# file whose name records the epoch count, batch size and data directory.
# NOTE(review): model, X_train, y_train, batch_size, epochs and data_directory
# are not defined in the visible notebook - confirm where they come from.
model.compile(
    optimizer=Adam(lr=1.0e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
)
model.save(
    '{}epochs_{}batch_cnn_model_{}.h5'.format(
        epochs,
        batch_size,
        data_directory.replace("/","_"),
    ),
    overwrite=True,
)

In [None]:
# Evaluate the trained model on the test set.
# NOTE(review): model, X_test, Y_test, confusion_matrix and
# classification_report are not defined/imported in the visible notebook.
predicted = model.predict(X_test)
y_pred = np.argmax(predicted,axis=1)
# Y_test is replaced by its class indices, so this cell needs the original
# 2-D array again before it can be re-run.
Y_test = np.argmax(Y_test,axis=1)
cm = confusion_matrix(Y_test,y_pred)
report = classification_report(Y_test,y_pred)
tn = cm[0][0]
fn = cm[1][0]
tp = cm[1][1]
fp = cm[0][1]
# Zero counts are replaced by 1 so the ratios below never divide by zero;
# this skews the metrics whenever a cell of the matrix is empty.
if tp == 0:
    tp=1
if tn == 0:
    tn=1
if fp == 0:
    fp=1
if fn == 0:
    fn=1

TPR = float(tp)/(float(tp)+float(fn))
FPR = float(fp)/(float(fp)+float(tn))
accracy = round((float(tp)+float(tn))/(float(tp)+float(fp)+float(fn)+float(tn)),3)
# tn / (tn + fp)
percitivity = round(float(tn)/(float(tn)+float(fp)),3)
sensitivity = round(float(tp)/(float(tp)+float(fn)),3)
# Matthews correlation coefficient; "** 0.5" is used for the square root so
# the cell does not depend on a module-level "import math".
mcc = round((float(tp)*float(tn)-float(fp)*float(fn))/((float(tp)+float(fp))*(float(tp)+float(fn))*(float(tn)+float(fp))*(float(tn)+float(fn)))**0.5,3)
# Correctly spelled aliases; the original names above are kept for any
# later code that already refers to them.
accuracy = accracy
specificity = percitivity

In [20]:
import subprocess
# python run_binary_preprocessing.py <ticker> <tradingdays> <windows> : prepare converting the price data into image data
# python generatedata.py <pathdir> <origindir> <destinationdir> : generate the images
# python myDeepCNN.py -i <datasetdir> -e <number of epoch> -d <dimensionsize> -b <batchsize> -o <outputresult report> : training
pipeline_commands = [
    'python run_binary_preprocessing.py "BBNI.JK" "20" "50"',
    'python generatedata.py "dataset" "20_50/BBNI.JK" "dataset_BBNIJK_20_50" ',
    'python myDeepCNN.py "-i" "dataset/dataset_BBNIJK_20_50" "-e" "50" "-d" "50" "-b" "8" "-o" "outputresult.txt"',
]
try:
    for command in pipeline_commands:
        print(command)
        returncode = subprocess.call(command, shell=True)
        if returncode != 0:
            # Each stage consumes the output of the previous one, so running
            # the remaining stages after a failure would be pointless.
            print('Command failed with exit code {}; skipping the remaining steps.'.format(returncode))
            break
except OSError as identifier:
    # Raised when the shell itself cannot be started.
    print(identifier)

python run_binary_preprocessing.py "BBNI.JK" "20" "50"
python generatedata.py "dataset" "20_50/BBNI.JK" "dataset_BBNIJK_20_50" 
python myDeepCNN.py "-i" "dataset/dataset_BBNIJK_20_50" "-e" "50" "-d" "50" "-b" "8" "-o" "outputresult.txt"
