In [1]:
import numpy as np
import pandas as pd
import FinanceDataReader as fdr
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [2]:
dataframe = fdr.DataReader('005930', '2000')     # Index is already 'Date', 데이터 불러오기
dataframe.dropna(inplace=True) # 결측치제거
dataframe

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,6000,6110,5660,6110,1483967,0.148496
2000-01-05,5800,6060,5520,5580,1493604,-0.086743
2000-01-06,5750,5780,5580,5620,1087810,0.007168
2000-01-07,5560,5670,5360,5540,806195,-0.014235
2000-01-10,5600,5770,5580,5770,937615,0.041516
...,...,...,...,...,...,...
2022-04-11,67800,68100,67400,67900,12263735,0.001475
2022-04-12,67600,67700,67000,67000,13924389,-0.013255
2022-04-13,67300,69000,67200,68700,17378619,0.025373
2022-04-14,68700,68700,67500,67500,16409494,-0.017467


In [3]:
def trend_separater(x):
    if x > 0.0016639: # 상승과 하락 트렌드의 절대적인 개수를 비슷하게 맞춰주기 위한 기준점을 선정한다.
        return 1
    elif x < -0.001:
        return -1

def updown(dataframe):
    dataframe['UD_Trend'] = dataframe['Change'].map(lambda x : trend_separater(x)) 
    dataframe['UD_Trend'] = dataframe['UD_Trend'].shift(-1)  # 다음날 트렌드를 예측해야하므로 다음날 트렌드를 앞으로 한 행 당긴다
    dataframe.dropna(inplace=True)  # 결측치 제거
    return dataframe

In [4]:
labeled_df = dataframe.copy()
labeled_df = updown(labeled_df)
total_count = labeled_df.UD_Trend.count()
labeled_df['UD_Trend'].value_counts()

-1.0    2561
 1.0    2549
Name: UD_Trend, dtype: int64

In [5]:
target_df = labeled_df.copy()
target_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change,UD_Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-04,6000,6110,5660,6110,1483967,0.148496,-1.0
2000-01-05,5800,6060,5520,5580,1493604,-0.086743,1.0
2000-01-06,5750,5780,5580,5620,1087810,0.007168,-1.0
2000-01-07,5560,5670,5360,5540,806195,-0.014235,1.0
2000-01-11,5820,6100,5770,5770,1194974,0.000000,-1.0
...,...,...,...,...,...,...,...
2022-04-07,68500,68500,68000,68000,20683327,-0.007299,-1.0
2022-04-11,67800,68100,67400,67900,12263735,0.001475,-1.0
2022-04-12,67600,67700,67000,67000,13924389,-0.013255,1.0
2022-04-13,67300,69000,67200,68700,17378619,0.025373,-1.0


In [6]:
# 전체 세트
X_all = np.array(target_df.loc[:, target_df.drop(['UD_Trend'], axis=1).columns])
y_all = np.array(target_df.loc[:, ['UD_Trend']])

# 훈련 세트
X_train = X_all[:4000]
y_train = y_all[:4000]

# 검증 세트
X_test = X_all[4000:]
y_test = y_all[4000:]

In [7]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train) # 교차검증 세트 생성

In [8]:
params = {
    'svc__C' : [0.01, 0.1, 1.0],
    'svc__gamma' : [1, 10, 100],
}

In [9]:
svm_cla = Pipeline([
                    ("scaler", StandardScaler()),
                    ("svc", SVC()),
            ])
clf = GridSearchCV(svm_cla, param_grid=params, cv=my_cv, n_jobs=-1)

In [10]:
clf.fit(X_train, y_train)

  return f(*args, **kwargs)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x00000285EDBACD60>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': [0.01, 0.1, 1.0],
                         'svc__gamma': [1, 10, 100]})

In [11]:
print('best parameter:\n', clf.best_params_)
print('best prediction:{0:.4f}'.format(clf.best_score_))

best parameter:
 {'svc__C': 1.0, 'svc__gamma': 10}
best prediction:0.5039


In [13]:
svm_cla2 = Pipeline([
                    ("scaler", StandardScaler()),
                    ("svc", SVC(C=1, gamma=10)),
            ])

In [14]:
svm_cla2.fit(X_train, y_train) # 훈련 세트로 훈련진행

  return f(*args, **kwargs)


Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC(C=1, gamma=10))])

In [15]:
print("train_set score: ", svm_cla2.score(X_train, y_train)) # 훈련세트 점수
print("test_set score : ", svm_cla2.score(X_test, y_test)) # 검증세트 점수

train_set score:  0.73
test_set score :  0.5036036036036036


In [16]:
unique, counts = np.unique(svm_cla2.predict(X_all), return_counts=True)
print(unique)
print(counts)

[-1.  1.]
[3242 1868]
