## References
- https://www.kaggle.com/code/unokensuke/eng-eda-and-baseline-predict-lightgbm
- https://www.kaggle.com/code/tatsuyafujii/lightgbm-baseline

# Import Packages

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import jpx_tokyo_market_prediction
from lightgbm import LGBMRegressor
import optuna.integration.lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import pickle
import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [None]:
# Root directory of the competition input files.
root_path = Path('../input/jpx-tokyo-stock-exchange-prediction/')

# Price table; the "Date" column is parsed into datetimes while reading.
prices_csv = root_path.joinpath('supplemental_files', 'stock_prices.csv')
df_stock_prices = pd.read_csv(prices_csv, parse_dates=["Date"])

# Stock list table (inspected below for sector and section columns).
stock_list_csv = root_path.joinpath('stock_list.csv')
df_stock_list = pd.read_csv(stock_list_csv)

In [None]:
# Preview the first five rows of the price table.
df_stock_prices.head(5)

## Merge section data

In [None]:
# Work on a copy so the loaded price table itself is left unmodified.
df = df_stock_prices.copy()

In [None]:
# Number of stock-list rows per distinct '33SectorName' value.
df_stock_list['33SectorName'].value_counts()

In [None]:
# Preview the first five rows of the stock list.
df_stock_list.head(5)

In [None]:
# Count missing values in the 'Section/Products' column.
df_stock_list['Section/Products'].isnull().sum()

In [None]:
# Distinct security codes present in the stock list.
df_stock_list['SecuritiesCode'].unique()

In [None]:
# df = pd.merge(df, df_stock_list[['SecuritiesCode', 'Section/Products', '33SectorName', '33SectorCode', '17SectorName', '17SectorCode']], on='SecuritiesCode')

In [None]:
# Display the working frame.
df

In [None]:
# Preview the first five rows of the working frame.
df.head(5)

In [None]:
# Column labels of the working frame.
df.columns

In [None]:
# Number of rows in the working frame.
len(df)

In [None]:
# Missing-value count for every column.
df.isnull().sum()
# Author's observation: almost all values of ExpectedDividend
# (projected dividend amount) are null, so this column may be dropped.

In [None]:
# Boolean mask of rows whose Volume is zero, followed by how many such rows exist.
not_traded = (df["Volume"]==0)
not_traded.sum()

In [None]:
# For each OHLC column, print whether its null mask equals the zero-volume
# mask on every row.
print((df["Open"].isnull() == not_traded).all())
print((df["High"].isnull() == not_traded).all())
print((df["Low"].isnull() == not_traded).all())
print((df["Close"].isnull() == not_traded).all())
# If all four checks print True, the price values are null exactly on the
# rows where the stock was not traded.

In [None]:
# Summary statistics of the working frame via DataFrame.describe().
df.describe()

In [None]:
# Number of rows per security code, sorted by that count in ascending order.
df["SecuritiesCode"].value_counts().sort_values()

In [None]:
# 'Open' shifted down by one row within each security code group
# (i.e. the preceding row's value for the same code).
df.groupby("SecuritiesCode")['Open'].shift(1)

# FeatureBlocks

## AbstractBaseBlock

In [None]:
class AbstractBaseBlock:
    """Common interface for feature blocks.

    A block turns an input frame into a frame of features via ``transform``.
    Subclasses must override ``transform``; ``fit`` may be overridden when
    state has to be learned first.
    """

    def transform(self, input_df: pd.DataFrame) -> pd.DataFrame:
        """Build features for ``input_df``. Not implemented on the base class."""
        raise NotImplementedError()

    def fit(self, input_df: pd.DataFrame, y=None):
        """Default fit: learn nothing and return ``self.transform(input_df)``.

        ``y`` is accepted for interface compatibility and is not used here.
        """
        features = self.transform(input_df)
        return features

## NumericFeatBlock

In [None]:
class NumericFeatBlock(AbstractBaseBlock):
    """Pass-through block: selects ``col`` from the input without changes."""

    def __init__(self, col: str):
        # Forwarded as-is to ``.loc`` as the column selector.
        self.col = col

    def fit(self, input_df, y=None):
        """Nothing to learn; returns ``None``."""
        return None

    def transform(self, input_df):
        """Return ``input_df.loc[:, self.col]``."""
        selected = input_df.loc[:, self.col]
        return selected

## DateFeatBlock

In [None]:
class DateFeatBlock(AbstractBaseBlock):
    """Calendar features computed from the ``Date`` column.

    Each boolean flag switches one output column on or off:

    * ``consecutive_year_month``: ``year * 12 + month``
    * ``consecutive_month_day``: ``month * 30 + day``
    * ``consecutive_week``: ``(year * 365 + month * 30 + day)`` floor-divided
      by ``consecutive_week_denominator``
    * ``weekday``: ``Date.dt.weekday``
    """

    def __init__(self,
                 consecutive_year_month=True,
                 consecutive_month_day=True,
                 consecutive_week=True,
                 consecutive_week_denominator=7,
                 weekday=True):
        self.consecutive_year_month = consecutive_year_month
        self.consecutive_month_day = consecutive_month_day
        self.consecutive_week = consecutive_week
        self.consecutive_week_denominator = consecutive_week_denominator
        self.weekday = weekday

    def fit(self, input_df, y=None):
        """Nothing to learn; returns ``None``."""
        return None

    def transform(self, input_df):
        """Return a frame holding the enabled calendar columns."""
        dates = input_df.Date
        yr = dates.dt.year
        mon = dates.dt.month
        dom = dates.dt.day

        features = pd.DataFrame()
        if self.consecutive_year_month:
            features['consecutive_year_month'] = yr * 12 + mon
        if self.consecutive_month_day:
            features['consecutive_month_day'] = mon * 30 + dom
        if self.consecutive_week:
            approx_day_number = yr * 365 + mon * 30 + dom
            features["consecutive_week"] = approx_day_number // self.consecutive_week_denominator
        if self.weekday:
            features['weekday'] = dates.dt.weekday

        return features

## CategoricalFeatBlock

In [None]:
class CategoricalFeatBlock(AbstractBaseBlock):
    """Frequency-ranked label encoding (optionally one-hot) of one column.

    During ``fit`` the categories are ordered by descending relative
    frequency. Categories whose frequency is at least ``threshold`` receive
    the labels 0, 1, 2, ... in that order. Every rarer category, and every
    value not seen during ``fit``, shares one extra "other" label equal to
    the number of frequent categories, so it never collides with a real one.

    Parameters
    ----------
    col : str
        Column to encode.
    whole_df : pd.DataFrame or None
        If given, frequencies are computed on this frame instead of the frame
        passed to ``fit``.
    threshold : float
        Minimum relative frequency for a category to get its own label.
    is_label : bool
        Emit the integer label column ``{col}_label_enc``.
    is_dummy : bool
        Emit one-hot columns derived from the label column.
    """

    def __init__(self, col: str, whole_df = None, threshold=0.001, is_label=True, is_dummy=False):
        self.col = col
        self.whole_df = whole_df
        self.threshold = threshold
        self.is_label = is_label
        self.is_dummy = is_dummy

    def fit(self, input_df, y=None):
        """Learn the category -> label mapping and return ``transform(input_df)``."""
        # Identity test: ``whole_df == None`` is element-wise on a DataFrame
        # and raises when used as an ``if`` condition.
        source = input_df if self.whole_df is None else self.whole_df

        # value_counts() sorts by descending frequency. Working on the Series
        # itself avoids relying on reset_index() column names, which differ
        # between pandas versions.
        freq = source.loc[:, self.col].value_counts(normalize=True)
        is_frequent = (freq.values >= self.threshold).astype(int)
        # Running count of frequent categories seen so far, excluding the
        # current one: 0..k-1 for frequent categories, k for all rare ones.
        labels = is_frequent.cumsum() - is_frequent

        self.label_dict_ = dict(zip(freq.index, labels.tolist()))
        self.label_other_ = int(is_frequent.sum())

        return self.transform(input_df)

    def transform(self, input_df):
        """Encode ``input_df[self.col]``; the result keeps the input's index."""
        enc_name = f'{self.col}_label_enc'
        codes = [self.label_dict_.get(value, self.label_other_)
                 for value in input_df[self.col].values]
        # Reuse the input index so the result aligns with other blocks'
        # outputs when they are concatenated column-wise.
        label_df = pd.DataFrame(
            {enc_name: pd.Series(codes, index=input_df.index, dtype="int64")}
        )

        out_df = pd.DataFrame()
        if self.is_label:
            out_df = pd.concat([out_df, label_df], axis=1)

        if self.is_dummy:
            # get_dummies only expands object/categorical columns, so cast first.
            dummy_source = label_df.astype({enc_name: object})
            out_df = pd.concat([out_df, pd.get_dummies(dummy_source)], axis=1)

        return out_df

## ManualFeatBlock
### Adjusted close and close/open spread features

In [None]:
class ManualFeatBlock(AbstractBaseBlock):
    """Hand-written price features computed per ``key_col`` group.

    Output columns:

    * ``CloseAdjustment``: ``Close`` divided by the cumulative product of
      ``AdjustmentFactor`` over the *preceding* rows of the same group
      (1 for the first row of a group).
    * ``CloseOpenDiffDivByMean``: ``(Close - Open)`` divided by the mean of
      ``Close`` and ``Open``.

    Rows are processed in the order they appear in the input.
    # NOTE(review): presumably the input is expected to be in chronological
    # order within each group - confirm with the caller.
    """

    def __init__(self, key_col="SecuritiesCode"):
        self.key_col = key_col

    def fit(self, input_df, y=None):
        """Nothing to learn; returns ``None``."""
        pass

    def transform(self, input_df):
        """Return the feature frame, indexed and ordered like ``input_df``."""
        out_df = pd.DataFrame(index=input_df.index)
        keys = input_df[self.key_col]

        # Group-wise cumprod/shift keep the input's index and row order.
        # groupby.apply with a Series-returning lambda instead yields rows in
        # group order and changes shape when there is only one group.
        cum_factor = input_df.groupby(self.key_col)["AdjustmentFactor"].cumprod()
        prior_factor = cum_factor.groupby(keys).shift().fillna(1)
        out_df["CloseAdjustment"] = input_df["Close"] / prior_factor

        out_df["CloseOpenDiffDivByMean"] = (input_df["Close"] - input_df["Open"]) / input_df[["Close","Open"]].mean(axis=1)

        return out_df

In [None]:
class MAFeatBlock(AbstractBaseBlock):
    """Rolling-window aggregates of ``target_cols`` within each ``key_col`` group.

    For every target column and every aggregation in ``agg_list`` one output
    column is produced, named
    ``MA_{target}_groupby_{key_col}_{window}{agg}`` (no separator between the
    window size and the aggregation name).

    Parameters
    ----------
    target_cols : list
        Columns to aggregate.
    key_col
        Column to group by.
    agg_list
        Aggregations passed to ``Rolling.agg``.
    window : int
        Rolling window length in rows.
    DMA : bool
        Accepted for backward compatibility; not used by this class.
    """

    def __init__(self, target_cols:list, key_col, agg_list, window=3, DMA=True):
        self.target_cols = target_cols
        self.key_col = key_col
        self.agg_list = agg_list
        self.window = window

    def fit(self, input_df, y=None):
        """Nothing to learn; returns ``None``."""
        pass

    def transform(self, input_df):
        """Return the rolling aggregates, indexed by the input's row labels."""
        out_df = pd.DataFrame()
        # group_keys=False keeps each row's original index label. With the
        # default, pandas >= 2.0 prepends the group key as an extra index
        # level, so the result no longer aligns with the other blocks.
        input_groupby = input_df.groupby(self.key_col, group_keys=False)

        for target_col in self.target_cols:
            prefix = f'MA_{target_col}_groupby_{self.key_col}_{self.window}'
            df_tmp = input_groupby[target_col].apply(
                lambda x: x.rolling(self.window).agg(self.agg_list).add_prefix(prefix)
            )
            out_df = pd.concat([out_df, df_tmp], axis=1)

        return out_df

In [None]:
class MACrossFeatBlock(AbstractBaseBlock):
    """Moving-average crossover signal per ``key_col`` group.

    For each target column, the difference between the short and the long
    rolling mean is computed within each group. The output is

    * ``1`` on rows where the difference is positive and was negative on the
      previous row of the same group,
    * ``-1`` on rows where it is negative and was positive on the previous row,
    * ``0`` otherwise (including rows where either value is missing).

    Output column name:
    ``MA_Cross_{target}_groupby_{key_col}_window_{short}_{long}``.
    """

    def __init__(self, target_cols:list, key_col, window_short=5, window_long=25):
        self.target_cols = target_cols
        self.key_col = key_col
        self.window_short = window_short
        self.window_long = window_long

    def fit(self, input_df, y=None):
        """Nothing to learn; returns ``None``."""
        pass

    def transform(self, input_df):
        """Return the crossover signals, indexed and ordered like ``input_df``."""
        # Keep the input index: rebuilding the result with a fresh RangeIndex
        # in group-sorted order would attach signals to the wrong rows once
        # the block outputs are concatenated column-wise.
        out_df = pd.DataFrame(index=input_df.index)
        keys = input_df[self.key_col]

        for target_col in self.target_cols:
            grouped = input_df.groupby(self.key_col)[target_col]
            short_ma = grouped.transform(lambda s: s.rolling(self.window_short).mean())
            long_ma = grouped.transform(lambda s: s.rolling(self.window_long).mean())
            diff = short_ma - long_ma
            # Shift inside each group so a group's first row never looks at
            # the last row of another group. Comparisons against NaN are
            # False, matching the previous ``fillna(False)`` behaviour.
            prev_diff = diff.groupby(keys).shift()

            new_column_name = f'MA_Cross_{target_col}_groupby_{self.key_col}_window_{self.window_short}_{self.window_long}'
            crossed_up = (diff > 0) & (prev_diff < 0)
            crossed_down = (diff < 0) & (prev_diff > 0)
            out_df[new_column_name] = np.where(crossed_up, 1, np.where(crossed_down, -1, 0))

        return out_df

In [None]:
class LogFeatBlock(AbstractBaseBlock):
    """Emit ``log1p`` of one column as ``{col}_log``."""

    def __init__(self, col):
        self.col = col

    def fit(self, input_df, y=None):
        """Nothing to learn; returns ``None``."""
        return None

    def transform(self, input_df):
        """Return a single-column frame with ``np.log1p(input_df[col])``."""
        log_values = np.log1p(input_df[self.col])
        return pd.DataFrame({f'{self.col}_log': log_values})

In [None]:
# Columns passed through unchanged by NumericFeatBlock. Despite the name, the
# list also carries 'Date', 'Target' and the grouping key.
numeric_columns = ['Date', 'Volume', 'Target',
                   'SecuritiesCode'] #, '17SectorCode', '33SectorCode', ] # for groupby
# No categorical columns are encoded at the moment.
categorical_columns = []
# Moving-average features: source columns, aggregations and window lengths.
MA_target_cols = ["CloseAdjustment"]
MA_agg_list = ['mean'] #, 'skew', max, min, lambda x: max(x) - min(x)]
windows = [5, 15, 25]
# Columns for LogFeatBlock (that block is commented out in the block list).
log_columns = ["CloseAdjustment", "Volume"]
# Short/long windows and source column for the moving-average crossover signal.
window_short = 5
window_long = 25
cross_target_col = "CloseAdjustment"

In [None]:
# First-stage blocks: applied to the price frame.
run_blocks = [
    # One block selecting all of ``numeric_columns`` at once.
    NumericFeatBlock(numeric_columns),
    # Disabled: CategoricalFeatBlock(c) for c in categorical_columns
    ManualFeatBlock(key_col="SecuritiesCode"),
    DateFeatBlock(
        consecutive_year_month=False,
        consecutive_month_day=False,
        consecutive_week=False,
        consecutive_week_denominator=7,
        weekday=True,
    ),
]

# Second-stage blocks: applied to the output frame of the first stage.
# Disabled alternatives: MAFeatBlock grouped by "17SectorCode" / "33SectorCode"
# for each window, and LogFeatBlock(c) for c in log_columns.
latter_run_blocks = [
    MAFeatBlock(MA_target_cols, "SecuritiesCode", MA_agg_list, w) for w in windows
] + [
    MACrossFeatBlock([cross_target_col], "SecuritiesCode", window_short, window_long),
]

In [None]:
def get_train_data(input_df, feat_blocks, latter_feat_blocks=None, y=None, fit_df=None):
    """Fit the feature blocks and build the training feature frame.

    Parameters
    ----------
    input_df : pd.DataFrame
        Frame the first-stage blocks are transformed on.
    feat_blocks : list
        First-stage blocks; each is fitted on ``fit_df`` and then transformed
        on ``input_df``.
    latter_feat_blocks : list or None
        Optional second-stage blocks, fitted and transformed on the
        concatenated first-stage output.
    y : optional
        Passed through to every ``fit`` call.
    fit_df : pd.DataFrame or None
        Frame used for fitting; defaults to a copy of ``input_df``.

    Returns
    -------
    pd.DataFrame
        First-stage features, followed by second-stage features when
        ``latter_feat_blocks`` is given, concatenated column-wise.
    """
    fit_source = input_df.copy() if fit_df is None else fit_df

    # Fit every first-stage block before transforming any of them.
    for blk in feat_blocks:
        blk.fit(fit_source, y)
    stage_one = pd.concat([blk.transform(input_df) for blk in feat_blocks], axis=1)

    if latter_feat_blocks is None:
        return stage_one

    for blk in latter_feat_blocks:
        blk.fit(stage_one, y)
    stage_two = pd.concat([blk.transform(stage_one) for blk in latter_feat_blocks], axis=1)

    return pd.concat([stage_one, stage_two], axis=1)

def get_test_data(input_df, feat_blocks, latter_feat_blocks=None):
    """Build the feature frame with blocks that need no further fitting.

    Every block in ``feat_blocks`` is transformed on ``input_df`` and the
    results are concatenated column-wise. When ``latter_feat_blocks`` is
    given, those blocks are transformed on that concatenated frame and their
    output is appended as further columns. ``fit`` is never called.
    """
    stage_one = pd.concat([blk.transform(input_df) for blk in feat_blocks], axis=1)
    if latter_feat_blocks is None:
        return stage_one

    stage_two = pd.concat([blk.transform(stage_one) for blk in latter_feat_blocks], axis=1)
    return pd.concat([stage_one, stage_two], axis=1)

### functional

In [None]:
def MA(series, window=25):
    """Rolling mean over ``window`` rows; partial windows are allowed (min_periods=1)."""
    roller = series.rolling(window, min_periods=1)
    return roller.mean()

def DMA(series, window=25):
    """Relative deviation of ``series`` from its moving average: ``series / MA - 1``."""
    ratio_to_average = series / MA(series, window)
    return ratio_to_average - 1

def divergence(series, window=25):
    """Rolling z-score: ``(series - rolling mean) / rolling std``.

    Both statistics use ``min_periods=1``, so partial windows are included.
    """
    roller = series.rolling(window, min_periods=1)
    centered = series - roller.mean()
    return centered / roller.std()

def rsi(series, n=14):
    """Share of upward movement among the last ``n`` row-to-row changes.

    For each window: sum of the positive changes divided by the sum of the
    absolute changes.
    """
    def _gain_share(changes):
        gains = changes[changes > 0].sum()
        total_movement = abs(changes).sum()
        return gains / total_movement

    changes = series - series.shift(1)
    return changes.rolling(n).apply(_gain_share)

def stochastic(series, k=14, n=3, m=3):
    """Stochastic oscillator columns for ``series``.

    Returns a frame indexed like ``series`` with:

    * ``%K``: position of the value inside the rolling ``k``-row min/max range
    * ``FAST-%D``: rolling ``n``-row mean of ``%K``
    * ``SLOW-%D``: rolling ``m``-row mean of ``FAST-%D``
    """
    window = series.rolling(k)
    lowest = window.min()
    highest = window.max()

    pct_k = (series - lowest) / (highest - lowest)
    fast_d = pct_k.rolling(n).mean()
    slow_d = fast_d.rolling(m).mean()

    columns = {"%K": pct_k, "FAST-%D": fast_d, "SLOW-%D": slow_d}
    return pd.DataFrame(columns, index=series.index)

def psy(series, n=14):
    """Fraction of the last ``n`` row-to-row changes that are non-negative."""
    def _non_negative_share(changes):
        return (changes >= 0).mean()

    changes = series - series.shift(1)
    return changes.rolling(n).apply(_non_negative_share)


In [None]:
# Replace the working frame with the output of the feature blocks:
# first-stage blocks, then the second-stage blocks run on their output.
df = get_train_data(df, run_blocks, latter_feat_blocks=latter_run_blocks)

## Other Manual Features
### TODO: Feature Class

In [None]:
# other manual features
# Gap between the adjusted close and each per-security moving average.
for window in windows:
    df[f'Diff_MA_CloseAdjustment_{window}mean'] = df['CloseAdjustment'] - df[f'MA_CloseAdjustment_groupby_SecuritiesCode_{window}mean']

# Crossover signal lagged by 1 and 2 rows within each security.
ma_cross_col = f"MA_Cross_{cross_target_col}_groupby_SecuritiesCode_window_{window_short}_{window_long}"
for i in range(1, 3):
    df[f"MA_Cross_lag_{i}"] = df.groupby("SecuritiesCode")[ma_cross_col].shift(i)

# Manual
# group_keys=False keeps every row's original index label in the apply()
# result. With the default, pandas >= 2.0 prepends the group key as an extra
# index level, which breaks the column assignments and the join below.
df["DivMA"] = df.groupby("SecuritiesCode", group_keys=False)["CloseAdjustment"].apply(DMA)
df["Div"] = df.groupby("SecuritiesCode", group_keys=False)["CloseAdjustment"].apply(divergence)
df["Rsi"] = df.groupby("SecuritiesCode", group_keys=False)["CloseAdjustment"].apply(rsi)
# stochastic() returns a frame with the columns %K, FAST-%D and SLOW-%D.
df = df.join(df.groupby("SecuritiesCode", group_keys=False)["CloseAdjustment"].apply(stochastic))

# Train

In [None]:
def train_model(X, y):
    """Fit an ``LGBMRegressor`` with fixed hyper-parameters on ``(X, y)``.

    Returns the fitted regressor.
    """
    params = dict(
        boosting_type="gbdt",
        objective='rmse',
        num_leaves=50,
        max_depth=12,
        learning_rate=0.1,
        n_estimators=1000,
        random_state=42,
    )
    regressor = LGBMRegressor(**params)
    regressor.fit(X, y)
    return regressor

## Train models groupby "SecuritiesCode"

In [None]:
# True: train the models and pickle them; False: load the pickled models instead.
training = True

In [None]:
%%time
if not training:
    # Reuse the models written by an earlier training run.
    # NOTE: pickle.load can execute arbitrary code; only load a file that
    # this notebook produced itself.
    with open("lgb_models_groupby_SecuritiesCode.pkl", "rb") as f:
        models = pickle.load(f)

else:
    # One model per security code, trained on the rows that have a target.
    models = {}
    for code, df_group in df.groupby("SecuritiesCode"):
        df_group = df_group[df_group['Target'].notnull()]
        y = df_group['Target']
        # Date, the target and the grouping key are not used as features.
        X = df_group.drop(columns=['Date', 'Target', 'SecuritiesCode'])
        models[code] = train_model(X, y)

    # Persist all models so that a later run can skip training.
    with open("lgb_models_groupby_SecuritiesCode.pkl", "wb") as f:
        pickle.dump(models, f)


# Test

In [None]:
# Create the competition environment object used for iteration and submission.
env = jpx_tokyo_market_prediction.make_env()

In [None]:
# Iterator over the test batches consumed by the prediction loop below.
iter_test = env.iter_test()

In [None]:
# Start the inference-time history from a copy of the feature frame.
data = df.copy()

In [None]:
# List the columns that have object dtype.
# NOTE: object-dtype columns in the frame may cause errors later on.
data.dtypes[data.dtypes == object]


In [None]:
# Remember the feature-frame columns so each test batch can be reduced to them.
input_columns = list(data.columns)

In [None]:
# For each test batch: extend the history, predict with the per-security
# models, rank the securities by prediction and submit the ranking.
for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    prices["Date"] = pd.to_datetime(prices["Date"])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    # On duplicate (SecuritiesCode, Date) pairs the newest row wins.
    data = (
        pd.concat([data, prices])
        .drop_duplicates(["SecuritiesCode", "Date"], keep="last")
        .sort_values(["SecuritiesCode", "Date"])
        .reset_index(drop=True)
    )
    # NOTE(review): nothing in this loop recomputes the engineered feature
    # columns for the rows appended from `prices`, so those columns are
    # presumably NaN for the rows being predicted - confirm this is intended.

    # sample_prediction["Avg"] = sample_prediction["SecuritiesCode"].apply(get_avg)
    sample_prediction["Date"] = pd.to_datetime(sample_prediction["Date"])
    d = sample_prediction[["Date","SecuritiesCode"]].merge(data, on=["Date","SecuritiesCode"])
    d = d[input_columns]
    d_groupby = d.groupby("SecuritiesCode")

    # Predict each security with its own model (KeyError if a code has none).
    for code, _d in d_groupby:
        _d = _d.drop(['Date', 'Target', 'SecuritiesCode'], axis=1)
        d.loc[_d.index, "Pred"] = models[code].predict(_d)

    # Rank 0 goes to the highest prediction. The rank range follows the
    # actual number of rows instead of assuming exactly 2000 securities.
    sample_prediction = d.sort_values(by="Pred", ascending=False)
    sample_prediction["Rank"] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
    # sample_prediction.drop(["Prediction"],axis=1)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    env.predict(submission)

In [None]:
# Display the submission frame built in the last loop iteration.
submission