# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
# import matplotlib.pyplot as plt
# from ipywidgets import HTML
# from io import BytesIO
# import base64
# from lightgbm import LGBMRegressor
# from tqdm import tqdm

# # Plotly
# import plotly.express as px
# from plotly.subplots import make_subplots
# import plotly.figure_factory as ff
# import plotly.offline as offline
# import plotly.graph_objs as go

In [None]:
# Load the raw daily price table (path is relative to the notebook's working directory).
stock_prices = pd.read_csv(
    'jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv'
)

In [None]:
# Show the first three rows as a quick look at the loaded columns.
stock_prices.head(3)

In [None]:
# Codes of every security that has at least one row with SupervisionFlag set.
# A single .loc selection replaces the chained `df[mask][col]` lookup, and
# .unique() keeps each code once (a security may be flagged on several rows),
# which keeps later membership tests against this array small.
delisted_stocks = stock_prices.loc[
    stock_prices['SupervisionFlag'].eq(True), 'SecuritiesCode'
].unique()

In [None]:
def get_stocks_traded_every_day(stock_prices_df):
    """Return the codes of securities that have a row for every date in the data.

    A security qualifies when its number of rows equals the number of distinct
    values in the ``Date`` column. The input frame is not modified; only the
    qualifying ``SecuritiesCode`` values are returned, as a NumPy array.
    """
    n_trading_days = stock_prices_df['Date'].nunique()
    rows_per_stock = stock_prices_df['SecuritiesCode'].value_counts()
    # Boolean mask over the value_counts index: True where the row count
    # matches the number of distinct dates.
    has_every_day = (rows_per_stock == n_trading_days).to_numpy()
    return rows_per_stock.index[has_every_day].values


# Codes whose row count equals the number of distinct dates in stock_prices.
daily_traded_stocks = get_stocks_traded_every_day(stock_prices)

In [None]:
# Drop every code that appears in `delisted_stocks`. The codes are put into a
# set once so each membership test is a hash lookup rather than a scan over the
# whole array for every candidate. The result is a plain list, as before.
delisted_lookup = set(delisted_stocks)
daily_traded_stocks = [x for x in daily_traded_stocks if x not in delisted_lookup]
print(
    f'We now have {len(daily_traded_stocks)} stocks which have available information in every trading day and which are not delisted.\n'
    'These remaining stocks are the ones we are going to analyze and "play" with from now on.')

In [None]:
# Keep only the rows of the retained securities and the columns used downstream.
kept_columns = ['Date', 'SecuritiesCode', 'Close', 'Volume', 'AdjustmentFactor', 'Target']
is_kept_stock = stock_prices['SecuritiesCode'].isin(daily_traded_stocks)
clean_df = stock_prices.loc[is_kept_stock, kept_columns]

# Sanity check: every retained security contributes exactly one row per date.
# 1850 daily-traded & listed stocks * 1202 trading days == 2223700 rows
expected_row_count = len(daily_traded_stocks) * stock_prices['Date'].nunique(dropna=False)
assert len(clean_df) == expected_row_count

In [None]:
def adjust_price(price):
    """Add adjusted close and volume columns and index the result by ``Date``.

    For every security the rows are walked from the most recent date back into
    the past. A row's coefficient is the running product of the
    ``AdjustmentFactor`` values of all strictly later rows of the same security
    (missing factors count as 1), so the most recent row always has
    coefficient 1. Then ``adj_Close = Close / coefficient`` and
    ``adj_Volume = coefficient * Volume``.

    Parameters
    ----------
    price : pandas.DataFrame
        Needs the columns ``Date``, ``SecuritiesCode``, ``Close``, ``Volume``
        and ``AdjustmentFactor``. ``Date`` values must sort chronologically.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``price`` with ``adj_Close`` and ``adj_Volume`` appended,
        rows in the same order as the input, indexed by ``Date``.
    """
    # NOTE(review): the arithmetic (divide Close, multiply Volume) is kept exactly
    # as it was; confirm against the data specification that this is the intended
    # direction for AdjustmentFactor.

    # The incoming index is discarded in the result anyway, so start from a fresh
    # unique RangeIndex; it lets the per-row coefficients be aligned back safely.
    adjusted = price.reset_index(drop=True)

    # Newest rows first. Sorting on Date (instead of relying on the row index
    # happening to be chronological) makes the walk independent of input order;
    # mergesort is stable.
    newest_first = adjusted.sort_values("Date", ascending=False, kind="mergesort")

    # Vectorised per-security computation. This avoids groupby.apply, whose
    # handling of frame-returning functions (row order, presence of the grouping
    # column) differs between pandas releases.
    later_factor = (
        newest_first.groupby("SecuritiesCode")["AdjustmentFactor"].shift(1).fillna(1)
    )
    split_coef = (
        later_factor.groupby(newest_first["SecuritiesCode"])
        .cumprod()
        .reindex(adjusted.index)  # back to input row order
    )

    adjusted["adj_Close"] = adjusted["Close"] / split_coef
    adjusted["adj_Volume"] = split_coef * adjusted["Volume"]
    return adjusted.set_index("Date")

In [57]:
# Add the adj_Close / adj_Volume columns to the filtered frame and index it by Date.
df_adj = adjust_price(clean_df)

In [58]:
# Display the adjusted frame as the cell output.
df_adj

Unnamed: 0_level_0,SecuritiesCode,Close,Volume,AdjustmentFactor,Target,adj_Close,adj_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-04,1301,2742.0,31400,1.0,0.000730,2742.0,31400.0
2017-01-04,1332,571.0,2798500,1.0,0.012324,571.0,2798500.0
2017-01-04,1333,3210.0,270800,1.0,0.006154,3210.0,270800.0
2017-01-04,1376,1550.0,11300,1.0,0.011053,1550.0,11300.0
2017-01-04,1377,3330.0,150800,1.0,0.003026,3330.0,150800.0
...,...,...,...,...,...,...,...
2021-12-03,9990,528.0,44200,1.0,0.034816,528.0,44200.0
2021-12-03,9991,794.0,35900,1.0,0.025478,794.0,35900.0
2021-12-03,9993,1645.0,7200,1.0,-0.004302,1645.0,7200.0
2021-12-03,9994,2389.0,6500,1.0,0.009098,2389.0,6500.0
