In [None]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for Statistical Models
import statsmodels.api as sm

# Time series Models
from statsmodels.tsa.arima.model import ARIMA
#from statsmodels.tsa.statespace.sarimax import SARIMAX

# Error Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Plotting 
from pandas.plotting import scatter_matrix
from statsmodels.graphics.tsaplots import plot_acf
import plotly.express as px

# Misc Utils
import glob
from decimal import ROUND_HALF_UP, Decimal
from pathlib import Path
from tqdm.notebook import tqdm

# Disable the warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
working_dir = '/kaggle/input/jpx-tokyo-stock-exchange-prediction/'

# glob.glob() returns its matches in arbitrary (filesystem-dependent) order, so
# unpacking the result positionally could silently swap the directories.
# Resolve each directory by a keyword in its name instead.
_files_dirs = glob.glob(working_dir+'*files*/')


def _find_files_dir(keyword):
    """Return the single matched '*files*' directory whose name contains `keyword`.

    Raises:
        FileNotFoundError: if zero or several matched directories contain `keyword`.
    """
    matches = [d for d in _files_dirs if keyword in Path(d).name]
    if len(matches) != 1:
        raise FileNotFoundError(
            f"Expected exactly one '*files*' directory containing {keyword!r} "
            f"under {working_dir}, found {matches}"
        )
    return matches[0]


example_test_files_dir = _find_files_dir('example_test')
train_files_dir = _find_files_dir('train')
supplemental_files_dir = _find_files_dir('supplemental')

# NOTE: without recursive=True, '**' behaves like '*', so this lists the CSV
# files located exactly one directory below working_dir.
csv_files = glob.glob(working_dir+'**/*.csv')
print(example_test_files_dir, train_files_dir, supplemental_files_dir, sep='\n')

In [None]:
# Load the stock price tables (indexed by RowId) from the train and the
# supplemental directories; both use the same file name.
df_train, df_supp = (
    pd.read_csv(Path(files_dir, 'stock_prices.csv'), index_col='RowId')
    for files_dir in (train_files_dir, supplemental_files_dir)
)

In [None]:
# Row counts of the train and supplemental price tables, one per line.
print(len(df_train), len(df_supp), sep='\n')

In [None]:
# Training rows for security 9990.
df_train.loc[df_train['SecuritiesCode'] == 9990]

In [None]:
# Supplemental rows for security 9990.
df_supp.loc[df_supp['SecuritiesCode'] == 9990]

In [None]:
def generate_adjusted_close(df):
    """Add CumulativeAdjustmentFactor and AdjustedClose columns to one security's prices.

    Args:
        df (pd.DataFrame): stock_price rows for a single SecuritiesCode; must
            contain the "Date", "AdjustmentFactor" and "Close" columns.
    Returns:
        pd.DataFrame: a new frame sorted by ascending "Date" with the two extra
            columns. The input frame is not modified.
    """
    def _round_half_up(value):
        # Round to one decimal place, halves away from zero; go through str()
        # so the Decimal sees the short repr of the float.
        return float(
            Decimal(str(value)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        )

    # Walk from the newest row to the oldest so the running product of the
    # adjustment factors accumulates backwards in time.
    newest_first = df.sort_values("Date", ascending=False)
    cumulative_factor = newest_first["AdjustmentFactor"].cumprod()
    newest_first.loc[:, "CumulativeAdjustmentFactor"] = cumulative_factor
    newest_first.loc[:, "AdjustedClose"] = (
        cumulative_factor * newest_first["Close"]
    ).map(_round_half_up)

    # Back to chronological order.
    oldest_first = newest_first.sort_values("Date")
    # A price of exactly 0 is treated as missing and replaced by the last
    # preceding non-zero adjusted close.
    adjusted = oldest_first["AdjustedClose"]
    oldest_first.loc[:, "AdjustedClose"] = adjusted.mask(adjusted == 0).ffill()

    return oldest_first

def adjust_price(price):
    """Parse dates and add AdjustedClose for every security in a price table.

    Args:
        price (pd.DataFrame): stock prices with at least the "Date"
            ('%Y-%m-%d' strings or datetimes) and "SecuritiesCode" columns,
            plus whatever `generate_adjusted_close` needs.
    Returns:
        pd.DataFrame: a new frame sorted by SecuritiesCode then Date, with a
            fresh 0..n-1 index, a datetime "Date" column and the columns added
            by `generate_adjusted_close`. The input frame is left untouched.
    """
    # Build a new frame rather than writing through `price.loc[:, "Date"]`:
    # the .loc write mutated the caller's DataFrame, and on recent pandas an
    # in-place .loc write may keep the column's original object dtype.
    price = price.assign(Date=pd.to_datetime(price["Date"], format="%Y-%m-%d"))

    # generate AdjustedClose one security at a time
    price = price.sort_values(["SecuritiesCode", "Date"])
    # Iterate over the groups explicitly (instead of groupby.apply) so each
    # group keeps its SecuritiesCode column regardless of how the installed
    # pandas treats grouping columns inside apply().
    adjusted_groups = [
        generate_adjusted_close(group)
        for _, group in price.groupby("SecuritiesCode")
    ]
    if not adjusted_groups:
        # No rows/groups: still return a frame carrying the added columns.
        return generate_adjusted_close(price).reset_index(drop=True)

    return pd.concat(adjusted_groups).reset_index(drop=True)

In [None]:
def update_features_ARIMA(df, quote):
    """Build the daily log-return series of one security.

    Args:
        df (pd.DataFrame): stock prices for any number of securities.
        quote: the SecuritiesCode to keep.
    Returns:
        pd.DataFrame: single column "DailyLogReturn", indexed by "Date". The
            first row is NaN because it has no previous close to diff against.
    """
    # Keep only the requested security, then derive its adjusted close.
    single_stock = generate_adjusted_close(df.loc[df['SecuritiesCode'] == quote])

    # Log return = first difference of the log adjusted close.
    log_close = np.log(single_stock.set_index('Date')['AdjustedClose'])
    return log_close.diff().to_frame('DailyLogReturn')

quote = 9990
# Daily log returns of the chosen security: y_train from the training prices,
# y_test from the supplemental prices.
y_train, y_test = (
    update_features_ARIMA(price_table, quote) for price_table in (df_train, df_supp)
)
y_train.head(10)

In [None]:
def evaluate_ARIMA_model(y_train, order):
    """Walk-forward validation of one ARIMA order on a 2-step-ahead forecast.

    The non-NaN observations are split 70/30. For every evaluated day the
    model is refit on all observations up to and including that day and asked
    for the value two steps ahead, which is then compared with the value
    actually observed two rows later.

    Args:
        y_train (pd.DataFrame): single-column frame holding the series.
        order (tuple): ARIMA (p, d, q) order.
    Returns:
        float: mean squared error between the forecasts and the actual values.
    Raises:
        ValueError: if the hold-out part is too short to score any forecast.
    """
    horizon = 2  # number of steps ahead that is forecast and scored

    X = y_train.dropna().copy()
    train_size = int(len(X) * 0.7)
    test = X.iloc[train_size:]
    if len(test) <= horizon:
        raise ValueError(
            f"Need more than {horizon} hold-out observations to evaluate, got {len(test)}"
        )

    predictions = []
    # The last `horizon` hold-out days have no actual value `horizon` rows
    # later, so no model is fit for them.
    for t in range(len(test) - horizon):
        # History includes hold-out day t, so the horizon-step forecast targets
        # test[t + horizon] and lines up with `actual` below. (Slicing replaces
        # DataFrame.append, which pandas 2.0 removed.)
        history = X.iloc[:train_size + t + 1]
        model_fit = ARIMA(history, order=order).fit()
        yhat = model_fit.forecast(horizon)
        predictions.append(round(yhat.values[-1], 8))

    # calculate out of sample error: predictions[t] <-> test[t + horizon]
    actual = test.iloc[horizon:, 0]
    return mean_squared_error(actual, predictions)
    
# evaluate combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
    """Grid-search ARIMA orders and return the one with the lowest MSE.

    Args:
        dataset: series passed unchanged to `evaluate_ARIMA_model`.
        p_values, d_values, q_values: iterables of candidate p, d and q values.
    Returns:
        tuple or None: the best (p, d, q); None if every candidate failed.
    """
    best_score, best_cfg = float("inf"), None
    for p in tqdm(p_values):
        for d in d_values:
            for q in q_values:
                order = (p, d, q)
                try:
                    mse = evaluate_ARIMA_model(dataset, order)
                except Exception as exc:
                    # Best effort: an order that cannot be fit is skipped, but
                    # the reason is reported so that a systematic failure does
                    # not masquerade as "no good order found". Catching
                    # Exception (not a bare except) lets KeyboardInterrupt
                    # stop the search.
                    print(f"Skipping ARIMA order {order}: {exc!r}")
                    continue
                if mse < best_score:
                    best_score, best_cfg = mse, order
    return best_cfg
    
#best_order = evaluate_models(update_features_ARIMA(df_supp, 9990), [1, 2, 3, 5, 7, 11], [0], [0, 3, 5, 7])
#best_order = evaluate_models(update_features_ARIMA(df_supp, 9990), [1,2,3,4], [0,1], [0,1,2])
#print(best_order)

In [None]:
# Utilities 
def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """Weighted long-minus-short return of one day's ranking.

    Args:
        df (pd.DataFrame): one day's rows with "Rank" and "Target" columns.
        portfolio_size (int): number of rows taken from each end of the ranking.
        toprank_weight_ratio (float): weight of the most extreme rank; weights
            fall linearly from this value to 1 across the portfolio.
    Returns:
        float: weighted return of the top-ranked rows minus that of the
            bottom-ranked rows, each normalised by the mean weight.
    """
    rank_weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    normaliser = rank_weights.mean()

    ranked_targets = df.sort_values(by='Rank')['Target']
    # Best ranks get the heaviest weights on the long side ...
    long_leg = ranked_targets.iloc[:portfolio_size].mul(rank_weights).sum() / normaliser
    # ... and the worst ranks get the heaviest weights on the short side.
    short_leg = ranked_targets.iloc[-portfolio_size:].mul(rank_weights[::-1]).sum() / normaliser
    return long_leg - short_leg

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
    """Sharpe ratio (mean / std) of the daily spread returns.

    Args:
        df (pd.DataFrame): rows with "Date", "Rank" and "Target" columns.
        portfolio_size (int): rows taken from each end of a day's ranking;
            shrunk to half of the smallest day when a day has too few targets.
        toprank_weight_ratio (float): forwarded to `calc_spread_return_per_day`.
    Returns:
        tuple: (sharpe ratio, per-day spread returns), or (0, None) when the
            smallest day cannot supply at least one row per side.
    """
    by_date = df.groupby('Date')
    smallest_day = by_date["Target"].count().min()
    if smallest_day < 2 * portfolio_size:
        # Not enough rows for two disjoint portfolios: shrink to what fits.
        portfolio_size = smallest_day // 2
        if portfolio_size < 1:
            return 0, None
    daily_spread = by_date.apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std(), daily_spread

def add_rank(df, col_name="pred"):
    """Add a zero-based integer "Rank" column, ranking `col_name` within each Date.

    The highest value of a day gets rank 0; ties are broken by order of
    appearance. The frame is modified in place and also returned.
    """
    per_day_rank = df.groupby("Date")[col_name].rank(method="first", ascending=False)
    # rank() starts at 1.0; shift to start at 0 and store as integers.
    df["Rank"] = per_day_rank.sub(1).astype("int")
    return df

In [None]:
#parameters_choice = {}
def generate_prediction_ARIMA(df, quote, order=(1, 0, 3)):
    """Forecast the daily log return of one security two steps ahead.

    Args:
        df (pd.DataFrame): stock prices for any number of securities.
        quote: the SecuritiesCode to forecast.
        order (tuple): ARIMA (p, d, q) order. Defaults to the previously
            hard-coded (1, 0, 3); pass e.g. the result of `evaluate_models`
            to use a tuned order.
    Returns:
        float: the second step of a 2-step forecast, rounded to 8 decimals.
    """
    # Generate Series containing Daily Log Returns for specific quote
    y_train = update_features_ARIMA(df, quote)

    # Fit the model with a linear time trend ('t').
    model = ARIMA(
        y_train,
        order=order,
        trend='t'
    ).fit()

    # Only the last of the two forecast steps is used as the prediction.
    y_pred = model.forecast(2)

    return round(y_pred.values[-1], 8)

In [None]:
%%time
import jpx_tokyo_market_prediction
from datetime import datetime, timedelta

# Create the competition environment and the iterator that yields the test batches.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

# Append new data (later using the API) to the train & supplementary data
# Only rows dated after the cutoff are kept as the starting history.
date_cutoff = '2021-11-05'
df_train_new = df_train[df_train['Date'] > date_cutoff].copy()

# '%m' is the month directive. The previous '%M' means *minutes*: parsing and
# re-formatting with it carried the month digits along unchanged, so adding a
# day at the end of a month wrapped back to day 01 of the same month.
date_format='%Y-%m-%d'

In [None]:
for (prices, _, _, _, _, sample_prediction) in iter_test:
    # 0) Move the history cutoff forward by one calendar day per batch so the
    #    window each ARIMA model is fit on does not keep growing.
    #    Real date arithmetic is used so the cutoff rolls over month ends
    #    correctly, independently of any strptime/strftime format string.
    date_cutoff = (pd.Timestamp(date_cutoff) + pd.Timedelta(days=1)).strftime('%Y-%m-%d')

    # 1) Wrangle data
    quotes = prices.SecuritiesCode.unique()
    # ignore_index gives every row a unique label, which the aligned column
    # assignment below relies on.
    df_train_new = pd.concat([df_train_new, prices], ignore_index=True)
    df_train_new = df_train_new.sort_values(['SecuritiesCode', 'Date'])
    # Forward-fill within each security only: a frame-wide ffill on data
    # sorted by SecuritiesCode would copy the last values of one security into
    # the leading gaps of the next one.
    fill_cols = df_train_new.columns.difference(['SecuritiesCode'])
    df_train_new[fill_cols] = df_train_new.groupby('SecuritiesCode')[fill_cols].ffill()
    df_train_new = df_train_new[df_train_new['Date'] > date_cutoff]

    # 2) Fit Model & Make predictions (one ARIMA fit per security)
    predictions = {
        quote: generate_prediction_ARIMA(df_train_new, quote)
        for quote in tqdm(quotes)
    }
    # Rows of the day being predicted; their Target is replaced by the forecast.
    tr = df_train_new[df_train_new.Date==prices.Date.iat[0]].copy()
    tr["Target"] = tr["SecuritiesCode"].map(predictions)
    tr = add_rank(tr, "Target")

    # NOTE: Target holds the model's own predictions here, so this value is the
    # spread of the predicted returns, not a score against realised returns.
    score = calc_spread_return_per_day(tr,200,2)
    print(f"Score: {score}")
    pred = tr.set_index("SecuritiesCode")["Rank"]
    sample_prediction['Rank'] = sample_prediction["SecuritiesCode"].map(pred)
    env.predict(sample_prediction)