In [61]:
import numpy as np
import pandas as pd
import FinanceDataReader as fdr
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [2]:
dataframe = fdr.DataReader('005930', '2000')     # Index is already 'Date', 데이터 불러오기
dataframe.dropna(inplace=True) # 결측치제거
dataframe

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-04,6000,6110,5660,6110,1483967,0.148496
2000-01-05,5800,6060,5520,5580,1493604,-0.086743
2000-01-06,5750,5780,5580,5620,1087810,0.007168
2000-01-07,5560,5670,5360,5540,806195,-0.014235
2000-01-10,5600,5770,5580,5770,937615,0.041516
...,...,...,...,...,...,...
2021-12-06,75100,76700,74900,76300,16391250,0.009259
2021-12-07,76100,77700,75600,77400,19232453,0.014417
2021-12-08,78300,78600,77100,77400,21558340,0.000000
2021-12-09,77400,78200,77000,78200,21604528,0.010336


In [3]:
def trend_separater(x):
    if x > 0.0016639: # 상승과 하락 트렌드의 절대적인 개수를 비슷하게 맞춰주기 위한 기준점을 선정한다.
        return 1
    elif x < -0.001:
        return -1

def updown(dataframe):
    dataframe['UD_Trend'] = dataframe['Change'].map(lambda x : trend_separater(x)) 
    dataframe['UD_Trend'] = dataframe['UD_Trend'].shift(-1)  # 다음날 트렌드를 예측해야하므로 다음날 트렌드를 앞으로 한 행 당긴다
    dataframe.dropna(inplace=True)  # 결측치 제거
    return dataframe

In [4]:
labeled_df = dataframe.copy()
labeled_df = updown(labeled_df)
total_count = labeled_df.UD_Trend.count()
labeled_df['UD_Trend'].value_counts()

 1.0    2517
-1.0    2516
Name: UD_Trend, dtype: int64

In [5]:
target_df = labeled_df.copy()
target_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Change,UD_Trend
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-04,6000,6110,5660,6110,1483967,0.148496,-1.0
2000-01-05,5800,6060,5520,5580,1493604,-0.086743,1.0
2000-01-06,5750,5780,5580,5620,1087810,0.007168,-1.0
2000-01-07,5560,5670,5360,5540,806195,-0.014235,1.0
2000-01-11,5820,6100,5770,5770,1194974,0.000000,-1.0
...,...,...,...,...,...,...,...
2021-12-02,73900,75800,73800,75800,23652940,0.018817,-1.0
2021-12-03,75600,76000,74100,75600,18330240,-0.002639,1.0
2021-12-06,75100,76700,74900,76300,16391250,0.009259,1.0
2021-12-08,78300,78600,77100,77400,21558340,0.000000,1.0


In [6]:
# 전체 세트
X_all = np.array(target_df.loc[:, target_df.drop(['UD_Trend'], axis=1).columns])
y_all = np.array(target_df.loc[:, ['UD_Trend']])

# 훈련 세트
X_train = X_all[:4000]
y_train = y_all[:4000]

# 검증 세트
X_test = X_all[4000:]
y_test = y_all[4000:]

In [52]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train) # 훈련-검증 세트 분리

In [53]:
params = {
    'svc__C' : [0.01, 0.1, 1.0],
    'svc__gamma' : [1, 10, 100],
}

In [54]:
svm_cla = Pipeline([
                    ("scaler", StandardScaler()),
                    ("svc", SVC()),
            ])
clf = GridSearchCV(svm_cla, param_grid=params, cv=my_cv, n_jobs=-1)

In [55]:
# SVC(*, C=1.0, kernel='rbf', degree=3, gamma='scale',
#    coef0=0.0, shrinking=True, probability=False, tol=0.001,
#    cache_size=200, class_weight=None, verbose=False, max_iter=- 1,
#    decision_function_shape='ovr', break_ties=False, 
#    random_state=None)[source]

In [56]:
clf.fit(X_train, y_train)

  return f(**kwargs)


GridSearchCV(cv=<generator object TimeSeriesSplit.split at 0x7fcb508f6b50>,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('svc', SVC())]),
             n_jobs=-1,
             param_grid={'svc__C': [0.01, 0.1, 1.0],
                         'svc__gamma': [1, 10, 100]})

In [33]:
print('best parameter:\n', clf.best_params_)
print('best prediction:{0:.4f}'.format(clf.best_score_))

best parameter:
 {'svc__C': 1.0, 'svc__gamma': 10}
best prediction:0.5015


In [57]:
svm_cla2 = Pipeline([
                    ("scaler", StandardScaler()),
                    ("svc", SVC(C=1, gamma=10)),
            ])

In [58]:
svm_cla2.fit(X_train, y_train) # 훈련 세트로 훈련진행

  return f(**kwargs)


Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC(C=1, gamma=10))])

In [59]:
print("train_set score: ", svm_cla2.score(X_train, y_train)) # 훈련세트 점수
print("test_set score : ", svm_cla2.score(X_test, y_test)) # 검증세트 점수

train_set score:  0.73
test_set score :  0.4975798644724105


In [66]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train) # 훈련-검증 세트 분리

In [67]:
params = {
    'lsvc__C' : [0.01, 0.1, 1.0],
}
lsvm_cla = Pipeline([
                    ("scaler", StandardScaler()),
                    ("lsvc", LinearSVC()),
            ])
clf = GridSearchCV(svm_cla, param_grid=params, cv=my_cv, n_jobs=-1)

clf.fit(X_train, y_train)

print('best parameter:\n', clf.best_params_)
print('best prediction:{0:.4f}'.format(clf.best_score_))

  return f(**kwargs)


best parameter:
 {'lsvc__C': 1.0}
best prediction:0.5090




In [71]:
svm_cla2 = Pipeline([
                    ("scaler", StandardScaler()),
                    ("lsvc", LinearSVC(C=1)),
            ])
svm_cla2.fit(X_train, y_train) # 훈련 세트로 훈련진
print("train_set score: ", svm_cla2.score(X_train, y_train)) # 훈련세트 점수
print("test_set score : ", svm_cla2.score(X_test, y_test)) # 검증세트 점수

  return f(**kwargs)


train_set score:  0.54175
test_set score :  0.510164569215876


