# IMPORTS

In [105]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# CONSTANTS

In [106]:
TRAIN_FILES_PATH = 'jpx-tokyo-stock-exchange-prediction/train_files'
NUMBER_OF_STABLE_STOCKS = 370

In [107]:
stock_prices = pd.read_csv(f'{TRAIN_FILES_PATH}/stock_prices.csv')
stock_prices.columns = [re.sub(r'(?<!^)(?=[A-Z])', '_', x).lower() for x in stock_prices.columns]
stock_prices.head()

Unnamed: 0,row_id,date,securities_code,open,high,low,close,volume,adjustment_factor,expected_dividend,supervision_flag,target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026


In [108]:
delisted_stocks = stock_prices[stock_prices['supervision_flag'] == True]['securities_code'].values

In [109]:
def get_stocks_traded_every_day(stock_prices_df):
    """Return stocks that are traded in every trading day in data and remove others"""
    traded_stock_per_date = stock_prices_df['date'].value_counts()
    stocks_by_descending_trade_count = stock_prices_df['securities_code'].value_counts()
    return stocks_by_descending_trade_count[stocks_by_descending_trade_count == len(traded_stock_per_date)].index.values


daily_traded_stocks = get_stocks_traded_every_day(stock_prices)

In [110]:
daily_traded_stocks = [x for x in daily_traded_stocks if (x not in delisted_stocks)]
print(
    f'We now have {len(daily_traded_stocks)} stocks which have available information in every trading day and which are NOT delisted.\n'
    'These remaining stocks are the ones we are going to analyze and "play" with from now on.')

We now have 1850 stocks which have available information in every trading day and which are NOT delisted.
These remaining stocks are the ones we are going to analyze and "play" with from now on.


In [111]:
listed_stock_prices = stock_prices.loc[
    stock_prices['securities_code'].isin(daily_traded_stocks), ['date', 'securities_code', 'open', 'close', 'volume',
                                                                'adjustment_factor', 'target']]

# 1850 daily-traded & listed stocks * 1202 trading days == 2223700 rows
assert len(listed_stock_prices) == len(daily_traded_stocks) * len(stock_prices['date'].unique())

In [112]:
def adjust_price(price):
    def calculate_adjusted(df):
        """apply AdjustmentFactor on columns"""
        new = df.sort_index(ascending=False)
        split_coef = new['adjustment_factor'].shift(1).fillna(1).cumprod()
        new['adj_open'] = new['open'] / split_coef
        new['adj_close'] = new['close'] / split_coef
        new['adj_volume'] = split_coef * new['volume']
        return new.sort_index(ascending=True)

    price = price.groupby('securities_code').apply(calculate_adjusted).reset_index(drop=True)
    price.set_index("date", inplace=True)

    return price

In [114]:
adj_prices_df = adjust_price(listed_stock_prices)
adj_prices_df['daily_change'] = (adj_prices_df['adj_close'] - adj_prices_df['adj_open']) / adj_prices_df['adj_close']
adj_prices_df

Unnamed: 0_level_0,securities_code,open,close,volume,adjustment_factor,target,adj_open,adj_close,adj_volume,daily_change
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-04,1301,2734.0,2742.0,31400,1.0,0.000730,2734.0,2742.0,31400.0,0.002918
2017-01-04,1332,568.0,571.0,2798500,1.0,0.012324,568.0,571.0,2798500.0,0.005254
2017-01-04,1333,3150.0,3210.0,270800,1.0,0.006154,3150.0,3210.0,270800.0,0.018692
2017-01-04,1376,1510.0,1550.0,11300,1.0,0.011053,1510.0,1550.0,11300.0,0.025806
2017-01-04,1377,3270.0,3330.0,150800,1.0,0.003026,3270.0,3330.0,150800.0,0.018018
...,...,...,...,...,...,...,...,...,...,...
2021-12-03,9990,514.0,528.0,44200,1.0,0.034816,514.0,528.0,44200.0,0.026515
2021-12-03,9991,782.0,794.0,35900,1.0,0.025478,782.0,794.0,35900.0,0.015113
2021-12-03,9993,1690.0,1645.0,7200,1.0,-0.004302,1690.0,1645.0,7200.0,-0.027356
2021-12-03,9994,2388.0,2389.0,6500,1.0,0.009098,2388.0,2389.0,6500.0,0.000419


In [104]:
grouped_df = adj_prices_df.groupby(adj_prices_df.index, observed=True)['daily_change'].mean()
grouped_df = pd.DataFrame(grouped_df)
grouped_df.columns = ['market_change_mean']
grouped_df.reset_index()

Unnamed: 0,date,market_change_mean
0,2017-01-04,0.013817
1,2017-01-05,0.001241
2,2017-01-06,0.006505
3,2017-01-10,-0.001976
4,2017-01-11,-0.001404
...,...,...
1197,2021-11-29,-0.008025
1198,2021-11-30,-0.017962
1199,2021-12-01,0.005028
1200,2021-12-02,-0.001521


In [88]:
all_stocks_df = adj_prices_df.join(grouped_df, on='date').sort_index(inplace=True)
all_stocks_df['market_gap'] = all_stocks_df['daily_change'] - all_stocks_df['market_change_mean']
all_stocks_df.head()

TypeError: 'NoneType' object is not subscriptable

In [None]:
all_stocks_df[all_stocks_df['securities_code'] == 1301]['market_gap'].std()

In [None]:
all_stocks_df[all_stocks_df['securities_code'] == 1301].reset_index().plot(x='date',
                                                                           y=["daily_change", "market_change_mean"],
                                                                           kind="line", figsize=(9, 18))

In [None]:
std_df = all_stocks_df.groupby(by='securities_code')['market_gap'].agg(['mean', 'median', 'std'])

In [None]:
std_df['std'].hist()

In [None]:
std_df.sort_values(by='std', inplace=True)

In [None]:
stable_stocks_df = std_df[:NUMBER_OF_STABLE_STOCKS]
stable_stocks_df

Get all non-stable stocks:
These are the stocks from which we want to get the ones which are better/worse than the market

In [None]:
extreme_stocks = std_df[(NUMBER_OF_STABLE_STOCKS + 1):]

In [None]:
top_df = extreme_stocks[extreme_stocks['mean'] > 0].sort_values(by=['std'])
bottom_df = extreme_stocks[extreme_stocks['mean'] < 0].sort_values(by=['std'])

In [None]:
top_stocks = top_df.index.unique()
bottom_stocks = bottom_df.index.unique()

In [None]:
top_df = all_stocks_df[all_stocks_df['securities_code'].isin(top_stocks)]
top_df

In [None]:
bottom_df = all_stocks_df[all_stocks_df['securities_code'].isin(bottom_stocks)]
bottom_df

In [None]:
import sys
!{sys.executable} -m pip install pandas_ta

In [None]:
import pandas_ta as ta


def get_features_for_prediction(df, code):
    """
    Args:
        df (pd.DataFrame)  : pd.DataFrame include stock_price
        code (int)  : A local code for a listed company
    Returns:
        feats DataFrame (pd.DataFrame)
    """
    # close_col = "adj_close"
    feats = df.loc[df['securities_code'] == code, ['securities_code', 'target', 'adj_close', 'adj_volume']].copy()

    periods = [10, 21, 63]

    for period in periods:
        feats[f'return_{period}_days'] = feats['adj_close'].pct_change(period)
        feats[f'volume_{period}_days'] = feats['adj_volume'].pct_change(period)
        feats[f'ema_{period}_days'] = ta.ema(feats['adj_close'], length=period)

    # drop Close column
    #feats = feats.drop(['adj_close'], axis=1)

    # filling data for nan and inf
    feats['adj_volume'] = feats['adj_volume'].fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)

    return feats

In [None]:
# generate features
feature_list = []

for code in top_df['securities_code'].unique():
    top_feature = get_features_for_prediction(top_df, code)
    feature_list.append(top_feature)

top_features = pd.concat(feature_list)

In [None]:
top_features

In [None]:
# generate features
feature_list = []

for code in bottom_df['securities_code'].unique():
    bottom_feature = get_features_for_prediction(bottom_df, code)
    feature_list.append(bottom_feature)

bottom_features = pd.concat(feature_list)

In [None]:
bottom_features['securities_code'].nunique()

In [None]:
for i in [top_features, bottom_features]:
    i.reset_index().sort_values(by=['date', 'securities_code'], inplace=True)

In [None]:
top_features

In [None]:
bottom_features

In [None]:
# split data into TRAIN and TEST
TRAIN_END = "2019-12-31"
# We put a week gap between TRAIN_END and TEST_START
# to avoid leakage of test data information from label
TEST_START = "2020-01-06"


def get_features_and_label(df):
    """
    Args:
        price (pd.DataFrame): loaded price data
        codes  (array) : target codes
        feature (pd.DataFrame): features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    # to store splited data
    trains_X, tests_X = [], []
    trains_y, tests_y = [], []

    # generate feature one by one
    for code in df['securities_code'].unique():
        feats = df[df['securities_code'] == code].dropna().drop(columns='target')
        labels = df.loc[df['securities_code'] == code, ['securities_code', 'target']].dropna()

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # align label and feature indexes
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]

            assert (labels.loc[:, 'securities_code'] == feats.loc[:, 'securities_code']).all()
            labels = labels["target"]

            # split data into TRAIN and TEST
            _train_X = feats[: TRAIN_END]
            _test_X = feats[TEST_START:]

            _train_y = labels[: TRAIN_END]
            _test_y = labels[TEST_START:]

            assert len(_train_X) == len(_train_y)
            assert len(_test_X) == len(_test_y)

            # store features
            trains_X.append(_train_X)
            tests_X.append(_test_X)
            # store labels
            trains_y.append(_train_y)
            tests_y.append(_test_y)

    # combine features for each codes
    train_X = pd.concat(trains_X)
    test_X = pd.concat(tests_X)
    # combine label for each codes
    train_y = pd.concat(trains_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, test_X, test_y

In [None]:
# generate feature/label
train_X, train_y, test_X, test_y = get_features_and_label(top_features)

In [None]:
train_X.tail(2)

In [None]:
test_X.head(2)

In [None]:
lgbm_params = {
    'seed': 42,
    'n_jobs': -1,
}

feat_cols = [
    'return_10_days',
    'volume_10_days',
    'return_21_days',
    'volume_21_days',
    'return_63_days',
    'volume_63_days'
]

In [None]:
import sys
!{sys.executable} -m pip install lightgbm

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from ipywidgets import HTML
from io import BytesIO
import base64
from lightgbm import LGBMRegressor

In [None]:
# initialize model
pred_model = LGBMRegressor(**lgbm_params)
# train
pred_model.fit(train_X[feat_cols].values, train_y)
# prepare result data
result = test_X[['securities_code']].copy()
# predict
result.loc[:, "predict"] = pred_model.predict(test_X[feat_cols])
# actual result
result.loc[:, "target"] = test_y.values

In [None]:
result

In [None]:
def set_rank(df):
    """
    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df with rank
    """

    # set rank starting from 0
    df.loc[:, "rank"] = np.arange(len(df["predict"]))
    return df


result = result.reset_index().sort_values(["date", "predict"], ascending=[True, False])

In [None]:
result2 = result.reset_index().groupby("date").apply(set_rank)

In [None]:
result2 = result2.reset_index(drop=True).drop(columns=['index', 'predict', 'target'])

In [None]:
top_15 = result2.head(15)

In [None]:
top_15

In [None]:
# Add LGBM for bottom_features

In [None]:
# Create bottom_15 df

In [None]:
# Split Stable randomly to 2 groups each with 185 stocks

In [None]:
# Concat top15 + stable half
# Concat bottom15 + stable half

In [None]:
# Add artifical ranking