In [1]:
# region [IMPORT]
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import time
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import os
from tqdm import tqdm
import warnings
# endregion
# region [MODEL]
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
import sklearn.metrics as metrics
import lightgbm
# endregion

# Suppress the "invalid value encountered in log1p" RuntimeWarning for the whole kernel.
# In this notebook np.log1p is only called from the target transform nested inside make_xyr.
warnings.filterwarnings("ignore", category=RuntimeWarning, message="invalid value encountered in log1p")

In [2]:
def estimate_eval(y_pred, y_test, eval_list:list):
    """Build a text report of the requested regression metrics.

    Args:
        y_pred: Predicted target values.
        y_test: Ground-truth target values.
        eval_list: Metric keys to report. "mse" and "r2" are recognised;
            any other entry is ignored.

    Returns:
        A string with one newline-terminated "<label>: <value>" line per
        recognised metric, always ordered MSE first and R^2 second. The
        string is empty when neither metric was requested.
    """
    report_lines = []

    if "mse" in eval_list:
        mse_value = metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
        report_lines.append(f"MSE: {mse_value}\n")

    if "r2" in eval_list:
        r2_value = metrics.r2_score(y_true=y_test, y_pred=y_pred)
        report_lines.append(f"R^2: {r2_value}\n")

    return "".join(report_lines)


def figure_model_result(y_pred_test, y_pred_train, y_test, y_train, r_train, r_test):
    """Scatter the test-set predictions against ``r_test`` and overlay a linear fit.

    Only ``r_test`` (x axis) and ``y_pred_test`` (y axis) are read; the other
    parameters are accepted but not used by this function.

    Returns:
        A 900x900 plotly figure with both axes fixed to [-30, 30], containing
        the marker trace, a first-degree least-squares line named
        'Regression Line', and thin red lines through x=0 and y=0.
    """
    fig = make_subplots(rows=1, cols=1)

    # Point cloud of prediction versus r_test.
    points = go.Scatter(x=r_test, y=y_pred_test, mode='markers')
    fig.add_trace(points, row=1, col=1)

    # Degree-1 least-squares fit of the predictions on r_test.
    slope, intercept = np.polyfit(r_test, y_pred_test, 1)
    trend_line = go.Scatter(
        x=r_test,
        y=slope * r_test + intercept,
        mode='lines',
        name='Regression Line',
        line=dict(width=3),
    )
    fig.add_trace(trend_line)

    fig.update_layout(
        yaxis_range=[-30, 30],
        xaxis_range=[-30, 30],
        width=900,
        height=900,
    )
    # Applied after both traces exist, so the marker size is set on every trace.
    fig.update_traces(marker=dict(size=2))

    # Reference lines through the origin.
    fig.add_hline(y=0, line_width=0.5, line_color="red", row=1, col=1)
    fig.add_vline(x=0, line_width=0.5, line_color="red", row=1, col=1)

    return fig

In [3]:
def load_data(fpath:str, arg:dict={}):
    """Read every regular file directly inside ``fpath`` with ``pd.read_csv``.

    Args:
        fpath: Directory to scan (sub-directories are skipped, not descended into).
        arg: Keyword arguments forwarded unchanged to ``pd.read_csv`` for each file.

    Returns:
        dict mapping each file name, with a trailing '.csv' removed if present,
        to the DataFrame read from that file.
    """
    file_names = [
        entry
        for entry in os.listdir(fpath)
        if os.path.isfile(os.path.join(fpath, entry))
    ]

    frames = {}
    for file_name in tqdm(file_names, desc="Loading Data"):
        frame = pd.read_csv(f"{fpath}/{file_name}", **arg)
        frames[file_name.removesuffix('.csv')] = frame
    return frames

# Load every file of the AllFeatures directory into `data` (file name without '.csv' -> DataFrame).
# Two columns are forced to object dtype and the parsed '일자' column becomes the index.
data = load_data(
    "../../FinanceData/DB/AllFeatures",
    arg={
        'dtype': {'수정주가구분': object, '수정비율': object},
        'parse_dates': ['일자'],
        'index_col': '일자',
    },
)

Loading Data: 100%|██████████| 2559/2559 [00:57<00:00, 44.44it/s]


In [4]:
# Module-level `sub`; data_selection takes its own `sub` argument, and no visible code reads this global.
sub = []
# Global settings read by data_selection: date bounds 'sd' / 'ed' and the hold-out fraction 'test_size'.
dl = {
    'sd': datetime(2023, 1, 1),
    'ed': datetime(2024, 3, 1),
    'test_size': 0.1,
}

In [7]:
# Read the '종목코드' column of temp/sichong.csv as a Series of strings (dtype=str is requested explicitly).
# It is passed later as the `sub` key list of data_selection.
# NOTE(review): the str dtype presumably preserves leading zeros in the codes - confirm against the CSV.
sichong = pd.read_csv('temp/sichong.csv', dtype={'종목코드':str})['종목코드']

In [16]:
def selection_cond(x: pd.Series):
    """Row-level sample filter; at present every row is accepted.

    Args:
        x: One row of a feature frame (passed by ``DataFrame.apply(axis=1)``).

    Returns:
        Always ``True``.
    """
    # Placeholder rule. A previously tried condition, kept here for reference:
    #   5 > rate_lag_1 > rate_lag_2 > 0  and  7 > rate > rate_lag_1  and  ma60 > ma120
    return True

def make_xyr(df:pd.DataFrame):
    """Split a feature frame into aligned features ``X``, target ``y`` and raw value ``r``.

    ``X`` is every column of ``df`` except the excluded bookkeeping / price
    columns and the target column 'y'. Rows with a missing value in any
    feature or in 'y' are dropped, and all three outputs are restricted to
    the surviving index labels. ``df`` itself is never modified.

    Args:
        df: Frame containing a 'y' column plus feature columns.

    Returns:
        Tuple ``(X, y, r)``. ``y`` and ``r`` are both taken from ``df['y']``
        (the optional log transform below is currently not applied), so they
        hold the same values.
    """
    def signed_log1p(values:pd.Series):
        # Optional target compression: sign(v) * log1p(|v|). Not applied at present.
        # Written with np.sign so log1p never sees a value <= -1 (which would
        # emit "invalid value encountered in log1p").
        return pd.Series(np.sign(values) * np.log1p(np.abs(values)), index=values.index)

    ft_exclude = ['수정주가구분', '수정비율', '일자', 'y', '시가', '고가', '저가']
    feature_cols = [ft for ft in df.columns if ft not in ft_exclude]

    # Non-inplace dropna: the original dropped rows in place on objects derived
    # from the caller's frame, which is fragile because y and r alias the same column.
    X = df.loc[:, feature_cols].dropna()
    y = df['y'].dropna()
    r = df['y']

    # Keep only the labels that survived in both the features and the target.
    common_index = X.index.intersection(y.index)
    return X.loc[common_index], y.loc[common_index], r.loc[common_index]

def data_selection(dnum, sub=None):
    """Assemble the train / test split from the globally loaded ``data`` frames.

    For every key, the frame is sliced with ``.loc[dl['ed']:dl['sd']]``, rows
    are filtered with ``selection_cond``, the index is turned into
    ``(original_label, key)`` pairs, and ``make_xyr`` extracts features,
    target and raw value. All parts are concatenated, sorted by index and
    split without shuffling.

    Reads the module-level ``data`` (key -> DataFrame) and ``dl``
    (keys 'sd', 'ed', 'test_size').

    Args:
        dnum: Currently unused; kept so existing calls keep working.
        sub: Optional iterable of keys to use. ``None`` or an empty iterable
            means "use every key in ``data``". Keys missing from ``data`` are
            reported on stdout and skipped.

    Returns:
        ``(X_train, X_test, y_train, y_test, r_train, r_test, (sel_num, whole_num))``
        where ``sel_num`` is the number of rows passing ``selection_cond`` and
        ``whole_num`` the number of rows examined.

    Raises:
        ValueError: If no rows at all were collected.
    """
    # The original tested `sub.__bool__` (the bound method, never called), which is
    # always truthy when present and missing on plain lists. Materialise the keys instead.
    keys = [] if sub is None else list(sub)
    if not keys:
        keys = list(data.keys())

    X_parts, y_parts, r_parts = [], [], []
    whole_num, sel_num = 0, 0

    for k in tqdm(keys, desc="Data Selecting"):
        if k not in data:
            print(f"{k} Excluded.")
            continue

        # NOTE(review): the slice runs from 'ed' to 'sd'; this presumably relies on
        # each frame being indexed in descending date order - confirm.
        df = data[k].loc[dl['ed']:dl['sd']]
        if df.empty:
            # Nothing in the window; row-wise apply on an empty frame is not a usable mask.
            continue

        mask = df.apply(selection_cond, axis=1).astype(bool)
        selected = df.loc[mask]
        # Tag every row with its key so labels stay unique after concatenation.
        selected.index = [(i, k) for i in selected.index]
        whole_num += len(mask)
        sel_num += int(mask.sum())

        X_part, y_part, r_part = make_xyr(selected)
        X_parts.append(X_part)
        y_parts.append(y_part)
        r_parts.append(r_part)

    if not X_parts:
        raise ValueError("data_selection: no rows were collected for the requested keys / date window.")

    X = pd.concat(X_parts, axis=0, ignore_index=False).sort_index()
    # The model target is truncated to integers; `r` keeps the untruncated values.
    y = pd.concat(y_parts, axis=0, ignore_index=False).astype(int).sort_index()
    r = pd.concat(r_parts, axis=0, ignore_index=False).sort_index()

    # shuffle=False: the last `test_size` fraction of the sorted rows becomes the test set.
    X_train, X_test, y_train, y_test, r_train, r_test = train_test_split(
        X, y, r, test_size=dl['test_size'], shuffle=False
    )
    print(sel_num, whole_num, sep='/')
    return X_train, X_test, y_train, y_test, r_train, r_test, (sel_num, whole_num)

In [10]:
def classification(X_train, X_test, y_train, y_test):
    """Fit a default-configured LightGBM regressor and predict both splits.

    Despite the name, the model is ``lightgbm.LGBMRegressor``. ``y_test`` is
    accepted but not used.

    Returns:
        ``(y_pred_test, y_pred_train)`` - predictions for ``X_test`` and
        ``X_train`` as plain Python lists.
    """
    regressor = lightgbm.LGBMRegressor()
    regressor.fit(X_train, y_train)

    test_predictions = regressor.predict(X_test)
    train_predictions = regressor.predict(X_train)

    return test_predictions.tolist(), train_predictions.tolist()

In [19]:
# Rebind the global `dl` that data_selection reads (this replaces the earlier definition),
# then build the split using only the keys listed in `sichong`.
dl = {
    'sd': datetime(2018, 5, 1),
    'ed': datetime(2023, 6, 1),
    'test_size': 0.1,
}
(X_train, X_test,
 y_train, y_test,
 r_train, r_test,
 sel_tup) = data_selection(dnum=100000, sub=sichong)

Data Selecting: 100%|██████████| 100/100 [00:01<00:00, 95.31it/s]


109171/109171


In [20]:
# Fit the model on the training split and collect predictions for both splits.
y_pred_test, y_pred_train = classification(X_train, X_test, y_train, y_test)
# Selected-row count / examined-row count as returned by data_selection.
print(f"{sel_tup[0]} / {sel_tup[1]}")
# MSE and R^2 of the test predictions against y_test.
print(estimate_eval(y_pred_test, y_test, eval_list=['mse', 'r2']))
# Last expression of the cell, so the returned figure is what the cell displays.
figure_model_result(y_pred_test, y_pred_train, y_test, y_train, r_train, r_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004859 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4335
[LightGBM] [Info] Number of data points in the train set: 98145, number of used features: 17
[LightGBM] [Info] Start training from score 0.037373
109171 / 109171
MSE: 4.235577114803928
R^2: -0.031064676785769407

