# Preamble

**I have seen some of the notebook which are using regular CV methods for time series data, I wanted to implement TS-CV technique by myself.**

**Unfortunately, I couldn't manage to find a good implementation of TS-CV library, so I have created algorithm by myself.**

# Config Parameters and Imports

In [1]:
# You can tweak the hyperparameters for different results.
# Best Model Hyperparameters (Optuna) {'num_leaves': 2818, 'n_estimators': 713, 'max_bin': 100, 'learning_rate': 0.6268164565853203} = Score: 0.537631972137107.
class CFG:
    Debug = False # Enable/Disable debug mode True = Enable, False = Disable
    
    folds = 5
    val_ratio = 20 # validation dataset to train dataset ratio in % format
    
    mean = True # if it is True, submission API uses the mean value of our folds, uses median value if it is False.
    
    seed = 1889
    LR = 0.6268164565853203
    num_leaves = 2818
    n_estimators = 713
    max_bin = 100

In [2]:
import numpy as np

import pandas as pd 
pd.options.mode.chained_assignment = None 

from lightgbm import LGBMRegressor

#from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
#from sklearn.model_selection import cross_val_score

from decimal import ROUND_HALF_UP, Decimal

#import optuna

# Import Data and Handle Invalid Values

**Be careful, do not drop NaN values before dropping Expected Dividend features, it causes a major issue.**

In [3]:
train_df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
train_df = train_df.drop(['ExpectedDividend'], axis=1) # trivial imo
#train_df = train_df.dropna() # DO NOT!
train_df.interpolate(method='linear', inplace=True) # pandas interpolation fills NaN values with the mean of two upper and lower neighbour values.
train_df.isnull().sum()

RowId               0
Date                0
SecuritiesCode      0
Open                0
High                0
Low                 0
Close               0
Volume              0
AdjustmentFactor    0
SupervisionFlag     0
Target              0
dtype: int64

**Here, we are adjusting the close price according to Adjustment Factor. It is important, because adjustment factor is not comprehensive for our model.**

In [4]:
# https://www.kaggle.com/code/smeitoma/train-demo
def adjust_price(price):
    """
    Args:
        price (pd.DataFrame)  : pd.DataFrame include stock_price
    Returns:
        price DataFrame (pd.DataFrame): stock_price with generated AdjustedClose
    """
    # transform Date column into datetime
    price.loc[: ,"Date"] = pd.to_datetime(price.loc[: ,"Date"], format="%Y-%m-%d")

    def generate_adjusted_close(df):
        """
        Args:
            df (pd.DataFrame)  : stock_price for a single SecuritiesCode
        Returns:
            df (pd.DataFrame): stock_price with AdjustedClose for a single SecuritiesCode
        """
        # sort data to generate CumulativeAdjustmentFactor
        df = df.sort_values("Date", ascending=False)
        # generate CumulativeAdjustmentFactor
        df.loc[:, "CumulativeAdjustmentFactor"] = df["AdjustmentFactor"].cumprod()
        # generate AdjustedClose
        df.loc[:, "AdjustedClose"] = (
            df["CumulativeAdjustmentFactor"] * df["Close"]
        ).map(lambda x: float(
            Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        ))
        # reverse order
        df = df.sort_values("Date")
        # to fill AdjustedClose, replace 0 into np.nan
        df.loc[df["AdjustedClose"] == 0, "AdjustedClose"] = np.nan
        # forward fill AdjustedClose
        df.loc[:, "AdjustedClose"] = df.loc[:, "AdjustedClose"].ffill()
        return df

    # generate AdjustedClose
    price = price.sort_values(["SecuritiesCode", "Date"])
    price = price.groupby("SecuritiesCode").apply(generate_adjusted_close).reset_index(drop=True)

    price.set_index("Date", inplace=True)
    return price
train_df = adjust_price(train_df)

In [5]:
train_df

Unnamed: 0_level_0,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,SupervisionFlag,Target,CumulativeAdjustmentFactor,AdjustedClose
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2017-01-04,20170104_1301,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,False,0.000730,1.0,2742.0
2017-01-05,20170105_1301,1301,2743.0,2747.0,2735.0,2738.0,17900,1.0,False,0.002920,1.0,2738.0
2017-01-06,20170106_1301,1301,2734.0,2744.0,2720.0,2740.0,19900,1.0,False,-0.001092,1.0,2740.0
2017-01-10,20170110_1301,1301,2745.0,2754.0,2735.0,2748.0,24200,1.0,False,-0.005100,1.0,2748.0
2017-01-11,20170111_1301,1301,2748.0,2752.0,2737.0,2745.0,9300,1.0,False,-0.003295,1.0,2745.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-29,20211129_9997,9997,678.0,679.0,665.0,668.0,320800,1.0,False,0.026987,1.0,668.0
2021-11-30,20211130_9997,9997,670.0,689.0,667.0,667.0,296300,1.0,False,-0.001460,1.0,667.0
2021-12-01,20211201_9997,9997,661.0,688.0,660.0,685.0,339100,1.0,False,0.017544,1.0,685.0
2021-12-02,20211202_9997,9997,681.0,692.0,680.0,684.0,342900,1.0,False,0.014368,1.0,684.0


Nice.

# LGBM Model Implementation and Time Series Cross Validation Implementation

**My Loss Metric is MSE(Mean Squared Error), it is your optional choice.**

"""# source: https://www.kaggle.com/code/swimmy/lgbm-model-fe-portfolio
X = train_df[['SecuritiesCode', 'Open', 'High', 'Low', 'Close', 'Volume', 'AdjustmentFactor']] # we include only these parameters.
y = train_df[['Target']]

def objective(trial):
    params = {
            'num_leaves': trial.suggest_int('num_leaves', 300, 4000),
            'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
            'max_bin': trial.suggest_int('max_bin', 2, 100),
            'learning_rate': trial.suggest_uniform('learning_rate',0, 1),
    }

    model = LGBMRegressor(**params)
    model.fit(X,y)
    score = model.score(X,y)
    return score

opt = optuna.create_study(direction='maximize',sampler=optuna.samplers.RandomSampler(seed=CFG.seed))
opt.optimize(objective, n_trials=20)

trial = opt.best_trial
params_best = dict(trial.params.items())
params_best['random_seed'] = CFG.seed
    
new_model = LGBMRegressor(**params_best)"""

{'num_leaves': 2818, 'n_estimators': 713, 'max_bin': 100, 'learning_rate': 0.6268164565853203} = 0.537631972137107.
{'num_leaves': 2818, 'n_estimators': 713, 'max_bin': 100, 'learning_rate': 0.6268164565853203} = 0.537631972137107.

In [6]:
def split_group(df):
    """Splits groups by giving them unique labels."""
    df['fold'] = 0
    num_split = len(df)//CFG.folds
    next_val = 0
    for i in range(1, CFG.folds+1):
        df['fold'].iloc[num_split*next_val:num_split*(i)] = (i)
        if (i) > next_val:
            next_val = (i)
    df.fold.loc[df['fold'] == 0] = CFG.folds
    return df

def LGBM_Model(X_train, y_train, X_val, y_val, i):
    if CFG.Debug:
        model = LGBMRegressor()
    else:
        model = LGBMRegressor(num_leaves= CFG.num_leaves, learning_rate = CFG.LR, n_estimators = CFG.n_estimators, max_bin=CFG.max_bin)
    
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val, preds))
    print(f'{i}. Fold MSE: {score:.5f}')
    return model, score

def perc(num, percent):
    """Rounded percent calculator"""
    return round(float(num/100)*percent)

def tscv(raw_df):
    """Creates a time series cross validation method, you can specify your model and percentage."""
    df = raw_df.copy()
    df = split_group(df)
    models = []
    errs = []
    #last_num = 0
    for i in range(1, CFG.folds+1):
        
        rest_df = df.loc[df['fold'] <= i]
        tmp_df = df.loc[df['fold'] == i]
        #print(tmp_df)
        rest_df = rest_df.sort_values("Date", ascending=True)
        tmp_df = tmp_df.sort_values("Date", ascending=True)
        #print(rest_df)
        

        X_train = rest_df[0:-perc(len(tmp_df), CFG.val_ratio)]
        y_train = rest_df[0:-perc(len(tmp_df), CFG.val_ratio)]
        

        X_val = tmp_df[-perc(len(tmp_df), CFG.val_ratio):-1] 
        y_val = tmp_df[-perc(len(tmp_df), CFG.val_ratio):-1]
        
        X_train = X_train[['Open', 'High', 'Low', 'AdjustedClose', 'Volume']] # we include only these parameters.
        y_train = y_train[['Target']]
        
        X_val = X_val[['Open', 'High', 'Low', 'AdjustedClose', 'Volume']]
        y_val = y_val[['Target']]
        
        model, err = LGBM_Model(X_train, y_train, X_val, y_val, i)
        errs.append(err)
        models.append(model)
        if len(errs) == CFG.folds:
            print('\n')
            print('-'*30)
            print(f'\nAverage MSE is: {np.mean(errs):.5f}')
            
        #if i == 3:
         #   break
    return models

            
models = tscv(train_df)

1. Fold MSE: 0.02324
2. Fold MSE: 0.02465
3. Fold MSE: 0.02220
4. Fold MSE: 0.02136
5. Fold MSE: 0.01844


------------------------------

Average MSE is: 0.02198


# Submit by Competition API

In [7]:
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

In [8]:
for (stock_prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    
    stock_prices = adjust_price(stock_prices)
    X_test = stock_prices[['Open', 'High', 'Low', 'AdjustedClose', 'Volume']]
    preds = []
    for model in models:
        preds.append(model.predict(X_test))
    preds = np.mean(preds, axis=0) if CFG.mean else np.median(preds, axis=0)

    sample_prediction["Prediction"] = preds
    sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
    sample_prediction.Rank = np.arange(0,2000)
    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
    sample_prediction.drop(["Prediction"],axis=1)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
