# IMPORTS

In [1]:
!pip install lightgbm
!pip install pandas_ta
# !pip install pandas_ta --no-index --find-links=file:///kaggle/input/pandas-ta/



In [2]:
import pandas as pd
import numpy as np
import random
from lightgbm import LGBMRegressor
import pandas_ta as ta

# CONSTANTS

In [3]:
NUMBER_OF_STABLE_STOCKS = 370
NUMBER_OF_DAILY_CHOSEN_STOCKS = 15
DAYS_PERIODS = [10, 21, 63]

# PROCESS DATA

Submission format: our end goal

In [4]:
# sample_submission = pd.read_csv(
#     r'../input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv')
# sample_submission

In [5]:
train_stock_prices = pd.read_csv(f'kaggle\input\jpx-tokyo-stock-exchange-prediction\\train_files\stock_prices.csv')
supplemental_stock_prices = pd.read_csv(
    f'kaggle\input\jpx-tokyo-stock-exchange-prediction\supplemental_files\stock_prices.csv')

In [6]:
stock_prices = pd.concat([train_stock_prices, supplemental_stock_prices])

In [8]:
delisted_stocks = stock_prices[stock_prices['SupervisionFlag'] == True]['SecuritiesCode'].values

In [9]:
def get_stocks_traded_every_day(stock_prices_df):
    """Return stocks that are traded in every trading day in data and remove others"""
    traded_stock_per_date = stock_prices_df['Date'].value_counts()
    stocks_by_descending_trade_count = stock_prices_df['SecuritiesCode'].value_counts()
    return stocks_by_descending_trade_count[stocks_by_descending_trade_count == len(traded_stock_per_date)].index.values


daily_traded_stocks = get_stocks_traded_every_day(stock_prices)

In [10]:
daily_traded_stocks = [x for x in daily_traded_stocks if (x not in delisted_stocks)]
print(
    f'We now have {len(daily_traded_stocks)} stocks which have available information in every trading day and which are NOT delisted.\n'
    'These remaining stocks are the ones we are going to analyze and "play" with from now on.')

We now have 1845 stocks which have available information in every trading day and which are NOT delisted.
These remaining stocks are the ones we are going to analyze and "play" with from now on.


In [11]:
listed_stock_prices = stock_prices.loc[
    stock_prices['SecuritiesCode'].isin(daily_traded_stocks), ['Date', 'SecuritiesCode', 'Open', 'Close', 'Volume',
                                                               'AdjustmentFactor', 'Target']]

# 1850 daily-traded & listed stocks * 1202 trading days == 2223700 rows
assert len(listed_stock_prices) == len(daily_traded_stocks) * len(stock_prices['Date'].unique())

In [12]:
def adjust_price(price):
    def calculate_adjusted(df):
        """apply AdjustmentFactor on columns"""
        new = df.sort_index(ascending=False)
        split_coef = new['AdjustmentFactor'].shift(1).fillna(1).cumprod()
        new['adj_open'] = new['Open'] / split_coef
        new['adj_close'] = new['Close'] / split_coef
        new['adj_volume'] = split_coef * new['Volume']
        return new.sort_index(ascending=True)

    price = price.groupby('SecuritiesCode').apply(calculate_adjusted).reset_index(drop=True)
    price.set_index('Date', inplace=True)

    return price

In [13]:
adj_prices_df = adjust_price(listed_stock_prices)
adj_prices_df['daily_change'] = (adj_prices_df['adj_close'] - adj_prices_df['adj_open']) / adj_prices_df['adj_close']
# adj_prices_df

In [14]:
grouped_df = adj_prices_df.groupby(adj_prices_df.index, observed=True)['daily_change'].mean()
grouped_df = pd.DataFrame(grouped_df)
grouped_df.columns = ['market_change_mean']
# grouped_df

In [15]:
all_stocks_df = adj_prices_df.join(grouped_df, on='Date')
all_stocks_df['market_gap'] = all_stocks_df['daily_change'] - all_stocks_df['market_change_mean']
all_stocks_df.sort_index(inplace=True)
# all_stocks_df.head()

In [16]:
# all_stocks_df[all_stocks_df['SecuritiesCode'] == 1301]['market_gap'].std()

In [17]:
# all_stocks_df[all_stocks_df['SecuritiesCode'] == 1301].reset_index().plot(x='Date',
#                                                                            y=["daily_change", "market_change_mean"],
#                                                                            kind="line", figsize=(20, 10))

In [18]:
statistics_df = all_stocks_df.groupby(by='SecuritiesCode')['market_gap'].agg(['mean', 'median', 'std'])

In [19]:
# statistics_df['std'].hist()

The stocks with std around 0 (+-) are those that behave LIKE THE MARKET.
We want them to be part of our prediction because they are the least-noisy stocks.

In [20]:
statistics_df.sort_values(by='std', ascending=True, inplace=True)
# statistics_df

In [21]:
stable_stocks_statistics_df = statistics_df[:NUMBER_OF_STABLE_STOCKS]

In [22]:
stable_stocks_df = all_stocks_df[all_stocks_df['SecuritiesCode'].isin(stable_stocks_statistics_df.index)]
# stable_stocks_df

In [23]:
assert stable_stocks_df.SecuritiesCode.nunique() == NUMBER_OF_STABLE_STOCKS

Get all non-stable stocks:
These are the stocks from which we want to get the ones which are better/worse than the market

In [24]:
extreme_stocks = statistics_df[(NUMBER_OF_STABLE_STOCKS + 1):]

"Top": the stocks which are usually "better" then the market in terms of daily_change (on average)
"Bottom": the stocks which are usually "worse" then the market in terms of daily_change (on average)

In [25]:
top_stocks = extreme_stocks[extreme_stocks['mean'] > 0].sort_values(by=['std'], ascending=True).index.unique()
bottom_stocks = extreme_stocks[extreme_stocks['mean'] < 0].sort_values(by=['std'], ascending=True).index.unique()
top_stocks_df = all_stocks_df[all_stocks_df['SecuritiesCode'].isin(top_stocks)]
bottom_stocks_df = all_stocks_df[all_stocks_df['SecuritiesCode'].isin(bottom_stocks)]

In [26]:
print(f'There are {len(top_stocks)} top_stocks and {len(bottom_stocks)} bottom_stocks')

There are 746 top_stocks and 728 bottom_stocks


In [27]:
# top_stocks_df

In [28]:
def get_features_for_prediction(df, code):
    """
    Args:
        df (pd.DataFrame)  : pd.DataFrame include stock_price
        code (int)  : A local code for a listed company
    Returns:
        feats DataFrame (pd.DataFrame)
    """
    # close_col = "adj_close"
    feats = df.loc[df['SecuritiesCode'] == code, ['SecuritiesCode', 'Target', 'adj_close', 'adj_volume']].copy()

    for period in DAYS_PERIODS:
        feats[f'return_{period}_days'] = feats['adj_close'].pct_change(period)
        feats[f'volume_{period}_days'] = feats['adj_volume'].pct_change(period)
        feats[f'ema_{period}_days'] = ta.ema(feats['adj_close'], length=period)

    # drop Close column
    #feats = feats.drop(['adj_close'], axis=1)

    # filling data for nan and inf
    feats['adj_volume'] = feats['adj_volume'].fillna(0)
    feats = feats.replace([np.inf, -np.inf], 0)

    return feats

## Generate features:

In [29]:
features = []

for i in [top_stocks_df, bottom_stocks_df]:
    # .reset_index().sort_values(by=['Date', 'SecuritiesCode'], inplace=True)
    result = pd.concat([get_features_for_prediction(i, code) for code in i['SecuritiesCode'].unique()])
    result.reset_index().sort_values(by=['Date', 'SecuritiesCode'], inplace=True)
    features.append(result)

top_features_df = features[0]
bottom_features_df = features[1]

In [30]:
# top_features_df

# MODEL

In [31]:
LGBM_PARAMS = {
    'seed': 42,
    'n_jobs': -1,
}

FEAT_COLS = [
    'return_10_days',
    'volume_10_days',
    'ema_10_days',
    'return_21_days',
    'volume_21_days',
    'ema_21_days',
    'return_63_days',
    'volume_63_days',
    'ema_63_days'
]

In [32]:
def get_features_and_label(df):
    """
    Args:
        df (pd.DataFrame): loaded price data with features
    Returns:
        train_X (pd.DataFrame): training data
        train_y (pd.DataFrame): label for train_X
        test_X (pd.DataFrame): test data
        test_y (pd.DataFrame): label for test_X
    """
    date_list = list(df.index.unique())

    # split data into TRAIN and TEST
    # TRAIN_END = date_list[int(len(date_list) * 0.8)]
    TRAIN_END = "2021-12-03"
    # We put a week gap between TRAIN_END and TEST_START
    # to avoid leakage of test data information from label
    # TEST_START = str(pd.to_datetime(TRAIN_END).date() + timedelta(days=7))
    TEST_START = "2021-12-06"

    # to store splited data
    trains_X, tests_X = [], []
    trains_y, tests_y = [], []

    # generate feature one by one
    for code in df['SecuritiesCode'].unique():
        feats = df[df['SecuritiesCode'] == code].dropna().drop(columns='Target')
        labels = df.loc[df['SecuritiesCode'] == code, ['SecuritiesCode', 'Target']].dropna()

        if feats.shape[0] > 0 and labels.shape[0] > 0:
            # align label and feature indexes
            labels = labels.loc[labels.index.isin(feats.index)]
            feats = feats.loc[feats.index.isin(labels.index)]

            assert (labels.loc[:, 'SecuritiesCode'] == feats.loc[:, 'SecuritiesCode']).all()
            labels = labels['Target']

            # split data into TRAIN and TEST
            _train_X = feats[:TRAIN_END]
            _test_X = feats[TEST_START:]

            _train_y = labels[:TRAIN_END]
            _test_y = labels[TEST_START:]

            assert len(_train_X) == len(_train_y)
            assert len(_test_X) == len(_test_y)

            # store features
            trains_X.append(_train_X)
            tests_X.append(_test_X)
            # store labels
            trains_y.append(_train_y)
            tests_y.append(_test_y)

    # combine features for each codes
    train_X = pd.concat(trains_X)
    test_X = pd.concat(tests_X)
    # combine label for each codes
    train_y = pd.concat(trains_y)
    test_y = pd.concat(tests_y)

    return train_X, train_y, test_X, test_y

In [33]:
def set_rank(df):
    """
    Args:
        df (pd.DataFrame): including predict column
    Returns:
        df (pd.DataFrame): df with rank
    """
    # set 'Rank' starting from 0
    df.loc[:, 'Rank'] = np.arange(len(df))
    return df

In [34]:
def get_daily_ranked_results(df: pd.DataFrame, predict_sort_ascending: bool) -> pd.DataFrame:
    # generate feature/label
    train_X, train_y, test_X, test_y = get_features_and_label(df)
    # initialize model
    pred_model = LGBMRegressor(**LGBM_PARAMS)
    # train
    pred_model.fit(train_X[FEAT_COLS].values, train_y)
    # prepare result data
    result = test_X[['SecuritiesCode']].copy()
    # predict
    result.loc[:, 'predict'] = pred_model.predict(test_X[FEAT_COLS])
    # actual result
    result.loc[:, 'Target'] = test_y.values
    # sort results by date and ascending/descending "predict", depending on which group we want:
    # the higher the "predict" the more likely to BUY the stock
    # the lower the "predict" the more likely to SELL the stock
    result = result.reset_index().sort_values(['Date', 'predict'], ascending=[True, predict_sort_ascending])
    # set_rank
    ranked_results = result.groupby('Date').apply(set_rank)
    # Drop unnecessary columns. Keep only those which are relevant for submission:
    ranked_results_no_labels_df = ranked_results.reset_index(drop=True).drop(columns=['predict'])

    return ranked_results_no_labels_df.groupby('Date').head(NUMBER_OF_DAILY_CHOSEN_STOCKS)

In [35]:
top_daily_chosen_stocks_df = get_daily_ranked_results(top_features_df, predict_sort_ascending=False)
bottom_daily_chosen_stocks_df = get_daily_ranked_results(bottom_features_df, predict_sort_ascending=True)

In [36]:
# top_daily_chosen_stocks_df

Unnamed: 0,Date,SecuritiesCode,Target,Rank
0,2021-12-06,9010,-0.005875,0
1,2021-12-06,5713,0.006755,1
2,2021-12-06,1939,0.003014,2
3,2021-12-06,5192,-0.000971,3
4,2021-12-06,9470,-0.013540,4
...,...,...,...,...
84606,2022-05-27,6920,0.008322,10
84607,2022-05-27,3677,-0.014563,11
84608,2022-05-27,2930,-0.032432,12
84609,2022-05-27,2782,0.014286,13


In [37]:
# bottom_daily_chosen_stocks_df

## Split stable_stocks_statistics_df randomly to 2 groups each with (NUMBER_OF_STABLE_STOCKS / 2) stocks

In [38]:
def random_split_of_list_into_halves(l: list) -> [list, list]:
    random.shuffle(l)
    return l[:int(len(l) / 2)], l[int(len(l) / 2):]

## Concat top_daily_chosen_stocks_df + stable half + stable half + bottom_daily_chosen_stocks_df

In [39]:
# stable_stocks_df

In [40]:
def get_final_ranks_df(top_df: pd.DataFrame, stable_stock_list: list, stable_stocks_df: pd.DataFrame,
                       bottom_df: pd.DataFrame) -> pd.DataFrame:
    result_df = pd.DataFrame()

    for d in top_df['Date'].unique():
        first_half, second_half = random_split_of_list_into_halves(stable_stock_list)
        result_df = pd.concat([result_df,
                               top_df[top_df['Date'] == d],
                               stable_stocks_df.loc[(stable_stocks_df.index == d) & (
                                   stable_stocks_df['SecuritiesCode'].isin(first_half)), ['SecuritiesCode',
                                                                                          'Target']].reset_index(),
                               stable_stocks_df.loc[(stable_stocks_df.index == d) & (
                                   stable_stocks_df['SecuritiesCode'].isin(second_half)), ['SecuritiesCode',
                                                                                           'Target']].reset_index(),
                               bottom_df[bottom_df['Date'] == d][::-1]])
        # need to reverse bottom stocks

    return result_df


final_ranks_df = get_final_ranks_df(top_daily_chosen_stocks_df, stable_stocks_statistics_df.index.values,
                                    stable_stocks_df,
                                    bottom_daily_chosen_stocks_df)

## re-rank concatenated daily

In [41]:
final_ranks_df = final_ranks_df.groupby('Date').apply(set_rank).reset_index(drop=True)
# final_ranks_df.Date.value_counts().unique()  # 400 every day
# final_ranks_df

# Calculate score

In [42]:
def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """

    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()

    return sharpe_ratio

In [43]:
score = calc_spread_return_sharpe(final_ranks_df, portfolio_size=200, toprank_weight_ratio=2)
score

-0.03108937275281503

In [44]:
final_ranks_df = final_ranks_df.drop(columns='Target')
final_ranks_df.to_csv('final_ranks_df.csv', index=False)

# SUBMIT
Will not work on local run!
Must be run through Kaggle's Kernel - open a notebook there and copy this notebook

In [45]:
import jpx_tokyo_market_prediction

env = jpx_tokyo_market_prediction.make_env()  # initialize the environment
iter_test = env.iter_test()  # an iterator which loops over the test files

ModuleNotFoundError: No module named 'jpx_tokyo_market_prediction'

In [None]:
for prices, _, _, _, _, sample_prediction in iter_test:
    ff = final_ranks_df[final_ranks_df['Date'] == prices['Date'].iloc[0]]
    ff = ff[ff['SecuritiesCode'].isin(final_ranks_df['SecuritiesCode'].unique())]
    mp = ff.set_index('SecuritiesCode')['Rank']
    sample_prediction = sample_prediction[sample_prediction['SecuritiesCode'].isin(ff['SecuritiesCode'].unique())]
    sample_prediction['Rank'] = sample_prediction.SecuritiesCode.map(mp)
    sample_prediction = sample_prediction.sort_values(by='Rank')
    env.predict(sample_prediction)

In [None]:
! head submission.csv