In [1]:
import numpy as np
import pandas as pd
import FinanceDataReader as fdr
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [2]:
def trend_separater(x): # 트렌드 구분자
    if x > 0.005:
        return 1
    elif x < -0.004:
        return -1
    else:
        return np.nan

def updown(dataframe): # 트렌드 레이블링
    dataframe['UD_Trend'] = dataframe['Change'].map(lambda x : trend_separater(x)) 
    dataframe['UD_Trend'] = dataframe['UD_Trend'].shift(-1)
    return dataframe

def moving_average(df, n): # 종가 이동평균
    MA = pd.Series(df['Close'].rolling(window=n, min_periods=n).mean(), name='MA_'+ str(n))
    df = df.join(MA)
    return df

def volume_moving_average(df, n): # 거래량 이동평균
    VMA = pd.Series(df['Volume'].rolling(window=n, min_periods=n).mean(), name='VMA_'+ str(n))
    df = df.join(VMA)
    return df

def relative_strength_index(df, n): # 상대 강도 지수
    tdf = df.loc[:, ['High', 'Low']].copy()

    tdf['UpMove'] = tdf['High'] - tdf['High'].shift(1)
    tdf['DoMove'] = tdf['Low'].shift(1) - tdf['Low']
    
    for i in tdf.index:
        if tdf.loc[i, 'UpMove'] > tdf.loc[i, 'DoMove'] and tdf.loc[i, 'UpMove'] > 0:
            tdf.loc[i, 'UpI'] = tdf.loc[i, 'UpMove']
        else:
            tdf.loc[i, 'UpI'] = 0
        
        if tdf.loc[i, 'DoMove'] > tdf.loc[i, 'UpMove'] and tdf.loc[i, 'DoMove'] > 0:
            tdf.loc[i, 'DoI'] = tdf.loc[i, 'DoMove']
        else:
            tdf.loc[i, 'DoI'] = 0
    
    tdf['PosDI'] = tdf['UpI'].ewm(span=n, min_periods=n).mean()
    tdf['NegDI'] = tdf['DoI'].ewm(span=n, min_periods=n).mean()
    tdf['RSI'] = tdf['PosDI'] / (tdf['PosDI'] + tdf['NegDI'])
    RSI = pd.Series(tdf['RSI'], name='RSI_'+ str(n))
    
    df = df.join(RSI)
    return df

def get_confusion_matrix(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    precision = precision_score(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_score = roc_auc_score(y_test, pred)
    print('confusion matrix')
    print('accuracy:{0:.4f}, precision:{1:4f}, recall:{2:4f}, F1:{3:4f}, ROC AUC score:{4:4f}'.format(accuracy, precision, recall, f1, roc_score))

In [3]:
dataframe = fdr.DataReader('005930', '2000')     # Index is already 'Date', 데이터 불러오기
dataframe.dropna(inplace=True) # 결측치제거
dataframe

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,6000,6110,5660,6110,1483967,0.148496
2000-01-05,5800,6060,5520,5580,1493604,-0.086743
2000-01-06,5750,5780,5580,5620,1087810,0.007168
2000-01-07,5560,5670,5360,5540,806195,-0.014235
2000-01-10,5600,5770,5580,5770,937615,0.041516
...,...,...,...,...,...,...
2022-01-04,78800,79200,78300,78700,12427416,0.001272
2022-01-05,78800,79000,76400,77400,25470640,-0.016518
2022-01-06,76700,77600,76600,76900,12931954,-0.006460
2022-01-07,78100,78400,77400,78300,15163757,0.018205


In [4]:
pped_df = dataframe.copy() # preprocessed dataframe

pped_df = moving_average(pped_df, 45) # Close Moving Average
pped_df = volume_moving_average(pped_df, 45) # Volume Moving Average
pped_df = relative_strength_index(pped_df, 14) # Relative Strength Index

In [5]:
pped_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change,MA_45,VMA_45,RSI_14
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2000-01-04,6000,6110,5660,6110,1483967,0.148496,,,
2000-01-05,5800,6060,5520,5580,1493604,-0.086743,,,
2000-01-06,5750,5780,5580,5620,1087810,0.007168,,,
2000-01-07,5560,5670,5360,5540,806195,-0.014235,,,
2000-01-10,5600,5770,5580,5770,937615,0.041516,,,
...,...,...,...,...,...,...,...,...,...
2022-01-04,78800,79200,78300,78700,12427416,0.001272,74955.555556,1.492109e+07,0.547903
2022-01-05,78800,79000,76400,77400,25470640,-0.016518,75086.666667,1.511349e+07,0.339940
2022-01-06,76700,77600,76600,76900,12931954,-0.006460,75231.111111,1.511708e+07,0.339940
2022-01-07,78100,78400,77400,78300,15163757,0.018205,75402.222222,1.519141e+07,0.455743


In [6]:
labeled_df = updown(pped_df) # UpDown Trend labeled dataframe
labeled_df.dropna(inplace=True)  # 결측치 제거
total_count = labeled_df.UD_Trend.count()
labeled_df['UD_Trend'].value_counts()

-1.0    2102
 1.0    2096
Name: UD_Trend, dtype: int64

In [7]:
labeled_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change,MA_45,VMA_45,RSI_14,UD_Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-03-08,5520,6000,5500,5920,649288,0.053381,5539.555556,1.046833e+06,0.668410,-1.0
2000-03-09,5970,5980,5620,5620,1117890,-0.050676,5528.666667,1.038698e+06,0.668410,-1.0
2000-03-10,5790,5800,5520,5540,572199,-0.014235,5527.777778,1.018222e+06,0.622955,-1.0
2000-03-13,5480,5490,5160,5180,644358,-0.064982,5518.000000,1.008368e+06,0.485742,1.0
2000-03-14,5170,5380,5160,5210,601241,0.005792,5510.666667,1.003813e+06,0.485742,1.0
...,...,...,...,...,...,...,...,...,...,...
2021-12-28,80200,80400,79700,80300,18226325,0.001247,74202.222222,1.490996e+07,0.759798,-1.0
2021-12-29,80200,80200,78500,78800,19794795,-0.018680,74395.555556,1.512106e+07,0.563473,-1.0
2022-01-04,78800,79200,78300,78700,12427416,0.001272,74955.555556,1.492109e+07,0.547903,-1.0
2022-01-05,78800,79000,76400,77400,25470640,-0.016518,75086.666667,1.511349e+07,0.339940,-1.0


In [8]:
target_df = labeled_df.copy() # Final Dataset for learning
target_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change,MA_45,VMA_45,RSI_14,UD_Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2000-03-08,5520,6000,5500,5920,649288,0.053381,5539.555556,1.046833e+06,0.668410,-1.0
2000-03-09,5970,5980,5620,5620,1117890,-0.050676,5528.666667,1.038698e+06,0.668410,-1.0
2000-03-10,5790,5800,5520,5540,572199,-0.014235,5527.777778,1.018222e+06,0.622955,-1.0
2000-03-13,5480,5490,5160,5180,644358,-0.064982,5518.000000,1.008368e+06,0.485742,1.0
2000-03-14,5170,5380,5160,5210,601241,0.005792,5510.666667,1.003813e+06,0.485742,1.0
...,...,...,...,...,...,...,...,...,...,...
2021-12-28,80200,80400,79700,80300,18226325,0.001247,74202.222222,1.490996e+07,0.759798,-1.0
2021-12-29,80200,80200,78500,78800,19794795,-0.018680,74395.555556,1.512106e+07,0.563473,-1.0
2022-01-04,78800,79200,78300,78700,12427416,0.001272,74955.555556,1.492109e+07,0.547903,-1.0
2022-01-05,78800,79000,76400,77400,25470640,-0.016518,75086.666667,1.511349e+07,0.339940,-1.0


In [9]:
# 전체 세트
X_all = np.array(target_df.loc[:, target_df.drop(['UD_Trend'], axis=1).columns])
y_all = np.array(target_df.loc[:, ['UD_Trend']])

slice_index = 4000

# 훈련 세트
X_train = X_all[:slice_index]
y_train = y_all[:slice_index]

# 검증 세트
X_test = X_all[slice_index:]
y_test = y_all[slice_index:]

In [10]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train) # 교차검증 세트 생성

In [11]:
params = {
    'svc__C' : [0.01, 0.1, 1],
    'svc__gamma' : [0.1, 1, 5],
    'svc__kernel' : ['rbf']
}

In [12]:
svm_cla = Pipeline([
                    ("scaler", StandardScaler()),
                    ("svc", SVC()),
            ])
clf = GridSearchCV(svm_cla, param_grid=params, cv=my_cv, n_jobs=-1)

In [13]:
clf.fit(X_train, y_train)

  return f(**kwargs)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x7f68e80b27d0>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': [0.01, 0.1, 1], 'svc__gamma': [0.1, 1, 5],
                         'svc__kernel': ['rbf']})

In [14]:
print('best parameter:\n', clf.best_params_)
print('best prediction:{0:.4f}'.format(clf.best_score_))

best parameter:
 {'svc__C': 1, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
best prediction:0.4958


In [15]:
pred_con = clf.predict(X_test)
accuracy_con = accuracy_score(y_test, pred_con)

print('accuracy:{0:.4f}'.format(accuracy_con))
get_confusion_matrix(y_test, pred_con)

accuracy:0.5404
confusion matrix
accuracy:0.5404, precision:0.488889, recall:0.244444, F1:0.325926, ROC AUC score:0.515741


In [16]:
clf.predict(X_test)

array([ 1.,  1.,  1.,  1., -1., -1.,  1.,  1., -1.,  1., -1., -1., -1.,
       -1., -1., -1.,  1., -1., -1., -1., -1., -1., -1., -1.,  1., -1.,
       -1.,  1., -1.,  1., -1.,  1.,  1.,  1., -1., -1.,  1., -1., -1.,
       -1., -1., -1., -1.,  1.,  1.,  1., -1.,  1., -1., -1.,  1., -1.,
       -1., -1., -1.,  1., -1., -1.,  1., -1., -1.,  1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1.,  1., -1., -1., -1., -1., -1., -1.,
       -1.,  1.,  1.,  1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1., -1., -1., -1., -1., -1., -1., -1., -1., -1.,
       -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1., -1

In [17]:
y_test.reshape(1,-1)

array([[-1.,  1.,  1., -1., -1.,  1., -1.,  1., -1.,  1.,  1.,  1.,  1.,
        -1.,  1., -1.,  1.,  1., -1., -1., -1.,  1.,  1.,  1., -1.,  1.,
         1.,  1., -1.,  1.,  1.,  1., -1., -1., -1., -1.,  1.,  1., -1.,
         1., -1., -1., -1., -1.,  1.,  1., -1.,  1., -1., -1.,  1.,  1.,
        -1., -1.,  1., -1.,  1., -1.,  1., -1., -1., -1.,  1.,  1., -1.,
         1., -1.,  1., -1., -1.,  1., -1.,  1.,  1.,  1.,  1., -1., -1.,
        -1., -1.,  1., -1.,  1., -1.,  1., -1., -1., -1.,  1., -1.,  1.,
        -1., -1., -1.,  1., -1.,  1., -1.,  1.,  1., -1., -1., -1.,  1.,
        -1., -1., -1.,  1., -1., -1.,  1., -1., -1., -1.,  1., -1., -1.,
        -1.,  1., -1., -1.,  1., -1.,  1.,  1.,  1., -1., -1., -1., -1.,
        -1., -1., -1., -1., -1.,  1.,  1., -1., -1.,  1., -1.,  1.,  1.,
        -1., -1.,  1.,  1., -1.,  1.,  1., -1., -1., -1., -1., -1., -1.,
         1.,  1.,  1., -1.,  1., -1.,  1., -1.,  1., -1., -1.,  1., -1.,
        -1.,  1.,  1., -1., -1.,  1.,  1.,  1., -1.