In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory lazily and print one full path per file.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 # **Tổng quan dự án:**
 - Thành công trong bất kỳ thị trường tài chính nào cũng đòi hỏi người ta phải xác định được các khoản đầu tư vững chắc.
 - Khi một cổ phiếu hoặc chứng khoán phái sinh bị định giá thấp, bạn nên mua. Ngược lại, nếu nó được đánh giá quá cao, có lẽ đã đến lúc bán.
 - Trong khi các nhận định tài chính trước đây được các chuyên gia đưa ra một cách thủ công, công nghệ đã mở ra cơ hội cho các nhà đầu tư. Đặc biệt, các nhà khoa học dữ liệu có thể quan tâm đến việc khám phá giao dịch định lượng, nơi các quyết định được thực hiện dựa theo các dự đoán từ các mô hình được đào tạo.

# **Các bước cơ bản:**
- Load dữ liệu và xây dựng mô hình cơ sở ARIMA.
- Xử lý dữ liệu và xây dựng các mô hình LightGBMRegressor. So sánh và đánh giá chất lượng các mô hình.
- Kiểm tra và đánh giá dựa trên API bắt buộc của cuộc thi.

# **Các chỉ số:**
- Chỉ số chất lượng: mean_squared_error
- Chỉ số RSI
- Chỉ số VWAP và EMA

# **1 Load dữ liệu**

In [None]:
import warnings, gc

import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm
# Silence library warnings so the notebook output stays readable.
warnings.simplefilter('ignore')
# Use the fully-qualified option name: the short pattern 'max_column' is resolved by
# regex and becomes ambiguous (OptionError) once pandas also defines
# 'styler.render.max_columns'.
pd.set_option('display.max_columns', None)
sns.set_style("darkgrid")
colors = sns.color_palette('Set2')

# Load plotly's JS from the CDN so figures render inline.
init_notebook_mode(connected=True)
temp = dict(layout=go.Layout(font=dict(family="Franklin Gothic", size=12), width=800))

# **1.1 Viết các hàm cần thiết để tính toán và xếp hạng cổ phiếu**

In [None]:
def calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
    """Long/short spread return for the rows of a single date.

    Parameters:
        df: pd.DataFrame, rows of one date with 'Rank' and 'Target' columns
        portfolio_size: int, number of stocks taken from each end of the ranking
        toprank_weight_ratio: float, weight of the most extreme stock on each side
            (weights fall linearly to 1 towards the middle of the ranking)

    Returns:
        Weighted return of the best-ranked stocks minus that of the worst-ranked ones.
    """
    side_weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
    norm = side_weights.mean()
    ranked_target = df.sort_values(by='Rank')['Target']
    long_leg = ranked_target[:portfolio_size]
    short_leg = ranked_target[-portfolio_size:]
    long_return = (long_leg * side_weights).sum() / norm
    # The short side is weighted in reverse so the very last rank gets the largest weight.
    short_return = (short_leg * side_weights[::-1]).sum() / norm
    return long_return - short_return

def calc_spread_return_sharpe(df, portfolio_size=200, toprank_weight_ratio=2):
    """Sharpe ratio (mean / std) of the daily spread returns.

    Parameters:
        df: pd.DataFrame with 'Date', 'Rank' and 'Target' columns
        portfolio_size: int, stocks per side; shrunk to half of the smallest day when
            some date has fewer than 2 * portfolio_size rows
        toprank_weight_ratio: float, forwarded to calc_spread_return_per_day

    Returns:
        The Sharpe ratio, or the tuple (0, None) when the smallest day has fewer than
        two rows.
    """
    by_date = df.groupby('Date')
    smallest_day = by_date["Target"].count().min()
    if smallest_day < 2 * portfolio_size:
        portfolio_size = smallest_day // 2
        if portfolio_size < 1:
            return 0, None
    daily_spread = by_date.apply(calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    return daily_spread.mean() / daily_spread.std()

def add_rank(df, col_name="pred"):
    """Add a zero-based per-date 'Rank' column (highest value of col_name gets 0).

    Ties are broken by row order. The frame is modified in place and also returned.
    """
    per_date_values = df.groupby("Date")[col_name]
    zero_based = per_date_values.rank(ascending=False, method="first") - 1
    df["Rank"] = zero_based
    df["Rank"] = df["Rank"].astype("int")
    return df

# **1.2 Load dữ liệu cần thiết**

In [None]:
# Root folder of the competition dataset; every CSV below is read from it.
DATA_DIR = "../input/jpx-tokyo-stock-exchange-prediction"

train_df = pd.read_csv(f"{DATA_DIR}/train_files/stock_prices.csv")
df2 = pd.read_csv(f"{DATA_DIR}/train_files/secondary_stock_prices.csv")
supplemental_df = pd.read_csv(f"{DATA_DIR}/supplemental_files/stock_prices.csv")
supplemental_df2 = pd.read_csv(f"{DATA_DIR}/supplemental_files/secondary_stock_prices.csv")
testprices = pd.read_csv(f"{DATA_DIR}/example_test_files/sample_submission.csv")
teststockprices = pd.read_csv(f"{DATA_DIR}/example_test_files/stock_prices.csv")
stock_list = pd.read_csv(f"{DATA_DIR}/stock_list.csv")

# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on every version.
train_df = pd.concat([train_df, supplemental_df], ignore_index=True)

# Normalise the free-text labels: strip trailing whitespace, then "Capitalised lower-case".
stock_list['SectorName'] = stock_list['17SectorName'].str.rstrip().str.lower().str.capitalize()
stock_list['Name'] = stock_list['Name'].str.rstrip().str.lower().str.capitalize()

# Attach company and sector names to every price row.
df = train_df.merge(stock_list[['SecuritiesCode','Name','SectorName']], on='SecuritiesCode', how='left')

# **1.3 Hàm tính toán các chỉ số cần thiết và xử lý dữ liệu bị khuyết**

In [None]:
def getadvance(x):
    """Return 1 when x is strictly positive, otherwise 0."""
    return 1 if x > 0 else 0
def get_month(dt):
    """Return the month of dt as a zero-padded two-character string ("01".."12")."""
    return dt.strftime("%m")
def RSI(series, period):
    """Relative Strength Index of a price series.

    Parameters:
        series: pd.Series of prices
        period: int, look-back length

    Returns:
        pd.Series of RSI values (0..100). The first `period` positions of the input have
        no value, so the result is shorter than `series`.
    """
    delta = series.diff().dropna()
    zeros = delta * 0
    gains = zeros.mask(delta > 0, delta)
    losses = zeros.mask(delta < 0, -delta)

    seed_label = delta.index[period - 1]
    warmup_labels = delta.index[:(period - 1)]

    def _smoothed(moves):
        # Seed the recursion with the plain average of the first `period` moves, drop
        # the rows consumed by that seed, then smooth exponentially.
        moves[seed_label] = np.mean(moves[:period])
        trimmed = moves.drop(warmup_labels)
        return trimmed.ewm(com=period - 1, adjust=False).mean()

    rs = _smoothed(gains) / _smoothed(losses)
    return 100 - 100 / (1 + rs)
def rsi_class(x):
    """Bucket an RSI value: "hi" above 70, "med" above 50, otherwise "low".

    Values that compare False against both thresholds (exactly 50, or NaN) fall into "low".
    """
    if x > 70:
        return "hi"
    if x > 50:
        return "med"
    return "low"
def prep_prices(price, test = False):
    """Adjust prices by the cumulative adjustment factor and fill gaps, per security.

    Parameters:
        price: pd.DataFrame with RowId, Date, SecuritiesCode, Open, High, Low, Close,
            Volume, AdjustmentFactor and ExpectedDividend columns (and Target unless
            ``test`` is True)
        test: bool, True for inference data; Target is then left untouched

    Returns:
        pd.DataFrame sorted by RowId with an extra CumAdjust column. A new frame is
        returned; the frame passed in is not modified.
    """
    from decimal import ROUND_HALF_UP, Decimal
    pcols = ["Open","High","Low","Close"]

    def qround(x):
        # Round half up to one decimal place (Python's round() rounds half to even).
        return float(Decimal(str(x)).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP))

    def adjust_prices(df_):
        # Newest row first, so the running product accumulates every adjustment that
        # happened on or after a given date.
        df_ = df_.sort_values("Date", ascending=False)
        df_["CumAdjust"] = df_["AdjustmentFactor"].cumprod()
        # generate adjusted prices
        for p in pcols:
            df_[p] = (df_["CumAdjust"] * df_[p]).apply(qround)
        df_["Volume"] = df_["Volume"] / df_["CumAdjust"]
        # NOTE(review): with newest-first ordering, ffill copies values from later dates
        # into earlier ones -- confirm this is intended.
        df_ = df_.ffill().bfill()
        # Targets that are still missing after filling are set to 0 (training data only).
        if not test:
            df_["Target"] = df_["Target"].fillna(0)
        return df_

    # Work on a sorted copy and assign results explicitly: chained `inplace=True` calls
    # through attribute access mutate the caller's frame and do nothing under
    # pandas copy-on-write.
    price = price.sort_values(["SecuritiesCode", "Date"])
    price["ExpectedDividend"] = price["ExpectedDividend"].fillna(0)

    # Iterate the groups explicitly so SecuritiesCode stays in every piece regardless of
    # how groupby.apply treats grouping columns in the installed pandas version.
    pieces = [adjust_prices(group) for _, group in price.groupby("SecuritiesCode")]
    adjusted = pd.concat(pieces, ignore_index=True) if pieces else adjust_prices(price)
    return adjusted.sort_values("RowId")

# **1.4 Trực quan hóa dữ liệu**

In [None]:
# Sorted Target values plotted against their position: a quick view of tails and outliers.
sns.set(rc={'figure.figsize': (14.7, 6)})
sns.set_style("whitegrid")
sorted_targets = np.sort(df['Target'].values)
plt.figure(figsize=(8, 6))
plt.scatter(range(len(sorted_targets)), sorted_targets)
plt.xlabel('index', fontsize=12)
plt.ylabel('Target', fontsize=12)
plt.show()

In [None]:
def summarize(df_, file_name, n_rows_to_show=5):
    """Simply summarize the given DataFrame.

    Prints the shape, displays the per-column NaN percentage (largest first) and the
    first rows of the frame.

    Parameters:
        df_: pd.DataFrame, raw DataFrame
        file_name: str, name of the file
        n_rows_to_show: int, number of rows to show
    """
    print(f"=====Summary of {file_name}=====")
    # Report the frame that was passed in, not the notebook-global `df`.
    print(f"Shape: {df_.shape}")

    nan_ratio = (
        (pd.isna(df_).sum() / len(df_) * 100)
        .sort_values(ascending=False)
        .to_frame(name='NaN Ratio')
        .T
    )
    print("NaN ratio:")
    display(nan_ratio)

    display(df_.head(n_rows_to_show))
# Overview of the merged training frame (price rows joined with stock names and sectors).
summarize(df, "stock_prices.csv")

In [None]:
# Basic coverage figures: how many trading dates and how many securities df contains.
date_col = df['Date']
n_dates = date_col.nunique()
date_min = date_col.min()
date_max = date_col.max()
n_stocks = df['SecuritiesCode'].nunique()

print(f"Number of unique dates: {n_dates} ({date_min} ~ {date_max})")
print(f"Number of unique stocks: {n_stocks}")

In [None]:
# Non-null row counts: dates available per security, and securities available per date.
n_dates_per_stock = df.groupby('SecuritiesCode')['Date'].count()
n_stocks_per_date = df.groupby('Date')['SecuritiesCode'].count()

**Missing Values**

In [None]:
# Rows whose Close price is missing.
missing_close = df['Close'].isna()
df_no_prices = df[missing_close]
print(f"Number of samples without prices: {len(df_no_prices)}")

In [None]:
def plot_candle_with_target(stock_code):
    """Plot OHLCV plot with target series.

    Draws three stacked panels sharing the date axis: candlesticks, volume bars and the
    Target series. Rows are read from the notebook-global `df`.

    Parameters:
        stock_code: int, code of the stock
    """
    # Filter directly: boolean indexing already returns a new frame, so copying the
    # whole of `df` first only costs time and memory.
    df_ = df[df['SecuritiesCode'] == stock_code]
    dates = df_['Date'].values
    ohlc = {
        'open': df_['Open'].values,
        'high': df_['High'].values,
        'low': df_['Low'].values,
        'close': df_['Close'].values
    }
    vol = df_['Volume'].values
    target = df_['Target'].values

    fig = make_subplots(rows=3, cols=1, shared_xaxes=True, x_title='Date')
    fig.add_trace(go.Candlestick(x=dates, name='OHLC', **ohlc),
                  row=1, col=1)
    fig.add_trace(go.Bar(x=dates, y=vol, name='Volume'),
                  row=2, col=1)
    fig.add_trace(go.Scatter(x=dates, y=target, name='Target'),
                  row=3, col=1)
    fig.update_layout(
        title=f"OHLCV Chart with Target Series (Stock {stock_code})",
    )
    # Hide the range slider that candlestick traces add by default.
    fig.update(layout_xaxis_rangeslider_visible=False)
    fig.show()
# Chart the first three securities in the order they appear in df.
for stock_code in df['SecuritiesCode'].unique()[:3]:
    plot_candle_with_target(stock_code)

In [None]:
# Distribution of each security's average Target, with the overall mean marked.
target_mean_per_stock = df.groupby(['SecuritiesCode'])['Target'].mean()
target_mean_mean = target_mean_per_stock.mean()

mean_stats = " | ".join([
    f"Min {round(target_mean_per_stock.min(), 4)}",
    f"Max {round(target_mean_per_stock.max(), 4)}",
    f"Skewness {round(target_mean_per_stock.skew(), 2)}",
    f"Kurtosis {round(target_mean_per_stock.kurtosis(), 2)}",
])

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=target_mean_per_stock.values, bins=100, palette=colors, kde=True, ax=ax)
ax.axvline(x=target_mean_mean, color='orange', linestyle='dotted', linewidth=2, label='Mean')
ax.set_title("Target Mean Distibution\n" + mean_stats)
ax.set_xlabel("Target Mean")
ax.set_ylabel("Stock Count")
ax.legend()
plt.show()

In [None]:
# Distribution of each security's Target standard deviation, with the overall mean marked.
target_std_per_stock = df.groupby(['SecuritiesCode'])['Target'].std()
target_std_mean = target_std_per_stock.mean()

std_stats = " | ".join([
    f"Min {round(target_std_per_stock.min(), 4)}",
    f"Max {round(target_std_per_stock.max(), 4)}",
    f"Skewness {round(target_std_per_stock.skew(), 2)}",
    f"Kurtosis {round(target_std_per_stock.kurtosis(), 2)}",
])

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=target_std_per_stock.values, bins=100, palette=colors, kde=True, ax=ax)
ax.axvline(x=target_std_mean, color='orange', linestyle='dotted', linewidth=2, label='Mean')
ax.set_title("Target Std Distibution\n" + std_stats)
ax.set_xlabel("Target Std")
ax.set_ylabel("Stock Count")
ax.legend()
plt.show()

In [None]:
# Show only the object-dtype columns of df.
df.select_dtypes(include=['object'])

In [None]:
 # Show only the float columns of df.
 df.select_dtypes(include=['float'])

In [None]:
# Replace df with its adjusted, gap-filled version.
# NOTE: not idempotent -- re-running this cell multiplies the already adjusted prices by
# the cumulative AdjustmentFactor again; re-run the loading cell first.
df = prep_prices(df)


In [None]:
# First rows of df after prep_prices (now includes the CumAdjust column).
df.head()

# **1.5 Xây dựng mô hình cơ sở**

In [None]:
# Baseline: score a 50-row rolling mean of Target against Target itself, per security.
# NOTE(review): the rolling window includes the row being scored, so this baseline sees
# the value it is compared with -- confirm whether a shifted window was intended.
seclist = df['SecuritiesCode'].unique()
rmse_base = []
mae_base = []
for SECURITY in seclist:
    df_base_i = df[df['SecuritiesCode'] == SECURITY]
    rolling_mean = df_base_i.groupby("SecuritiesCode")['Target'].rolling(window=50, min_periods=1).mean()
    df_base_i.loc[:, "MovingAvg_{}Day".format(50)] = rolling_mean.values
    actual = df_base_i['Target'].values
    baseline = df_base_i['MovingAvg_50Day'].values
    rmse_base.append(np.sqrt(mean_squared_error(actual, baseline)))
    mae_base.append(mean_absolute_error(actual, baseline))

In [None]:
# df_base_i still holds the last security processed by the baseline loop.
df_base_i.head()

In [None]:
 # Average of the per-security baseline errors.
 print("\n root mean squared error = {:.4f}, mean absolute error = {:.4f}.".format(np.mean(rmse_base),np.mean(mae_base)))

# **1.6 Mô hình ARIMA(4,1,4)**

In [None]:
# Fit one ARIMA(4,1,4) per security on the first 90% of its Target series and score the
# forecast on the remaining 10%.
seclist = df["SecuritiesCode"].unique()
rmses1 = []
maes1 = []
list_high_error1 = []
for SECURITY in seclist:
    df_stock = df[df['SecuritiesCode'] == SECURITY]
    values_data = df_stock['Target'].values
    # Positional split -- assumes the rows of df are in date order for each security.
    split = int(0.9 * len(values_data))
    train = values_data[0:split]
    valid = values_data[split:]
    model = sm.tsa.arima.ARIMA(train, order=(4,1,4))
    model_fit = model.fit()
    # The model is fitted without exogenous regressors, so none belong in the forecast
    # call; in particular the hold-out targets must never be handed to the model.
    predictions = model_fit.forecast(len(valid))
    rmse = np.sqrt(mean_squared_error(valid, predictions))
    mae = mean_absolute_error(valid, predictions)
    rmses1.append(rmse)
    maes1.append(mae)
    # Remember securities with a validation RMSE above 0.05 for the later comparison.
    if rmse > 0.05:
        list_high_error1.append(SECURITY)
    print("\n{} root mean squared error = {:.4f}, mean absolute error = {:.4f}.".format(SECURITY,rmse,mae))
    

In [None]:
# Average of the per-security ARIMA validation errors (label corrected from "ASIMA").
print("\nARIMA root mean squared error = {:.4f}, mean absolute error = {:.4f}.".format(np.mean(rmses1),np.mean(maes1)))

# **2 Xử lý dữ liệu. Xây dựng các mô hình LightGBMRegressor**

In [None]:
def get_model_data(df_, SECURITY_CODE):
    """Build the feature table of one security for training or inference.

    Parameters:
        df_: pd.DataFrame with Date, Open, High, Low, Close, Volume, Target, typ, RowId
            and SecuritiesCode columns
        SECURITY_CODE: value of SecuritiesCode whose rows are selected

    Returns:
        pd.DataFrame in ascending Date order holding the price, EMA, RSI and day-of-week
        features followed by Target, typ, RowId, Date and SecuritiesCode.
    """
    # Select first, then sort: sort_values/reset_index return a new frame, so the caller's
    # data is never modified and the full input is not copied once per security.
    df_stock = (
        df_[df_['SecuritiesCode'] == SECURITY_CODE]
        .sort_values(by="Date", ascending=False)
        .reset_index(drop=True)
    )
    df_stock['average'] = (df_stock['High'] + df_stock['Low'] + df_stock['Close'])/3
    # NOTE(review): Close * Volume / Volume is just Close (NaN when Volume is 0), not a
    # volume-weighted price -- confirm whether a real VWAP was intended.
    df_stock['vwap'] = (df_stock['Close'] * df_stock['Volume'])/ df_stock['Volume']
    # NOTE(review): rows are newest-first at this point, so pct_change and the EMAs below
    # run from the latest date backwards -- confirm this direction is intended.
    df_stock['vwap_pct_ret'] = df_stock['vwap'].pct_change()
    df_stock['pvwap'] = df_stock['vwap_pct_ret'].shift(-1)
    for span in (20, 50, 100):
        df_stock[f'{span}D-EMA'] = df_stock['Close'].ewm(span=span, adjust=False).mean()

    # Back to chronological order for the RSI and calendar features.
    df_stock = df_stock.sort_values(by="Date").reset_index(drop=True)
    df_stock['rsi'] = RSI(df_stock['Close'], 5)
    # Rows without an RSI value (NaN) are classified as "low" by rsi_class.
    df_stock['rsicat'] = df_stock['rsi'].map(rsi_class).astype("category")
    df_stock['dt'] = pd.to_datetime(df_stock['Date'], format="%Y-%m-%d")
    df_stock['dayofweek'] = df_stock['dt'].dt.dayofweek

    df_model = df_stock[['Open', 'High', 'Low', 'Close'
                        , 'average', 'vwap', 'rsi', 'pvwap'
                        , '20D-EMA' , '50D-EMA', '100D-EMA'
                        , 'rsicat', 'dayofweek', 'Target'
                        , 'typ', 'RowId', 'Date', 'SecuritiesCode']]
    return df_model

In [None]:
def get_model_blueprint():
    """Return a fresh, unfitted LGBMRegressor configured with the shared hyper-parameters."""
    params = dict(
        objective="rmse",
        metric="rmse",
        learning_rate=0.005,
        n_estimators=50000,
        device="cpu",
        random_state=999,
        extra_trees=True,
    )
    return LGBMRegressor(**params)

In [None]:
# Peek at df before the 'typ' column is added in the training cell.
df.head()

In [None]:
# Train one LightGBM model per security on the first 90% of its rows and validate on
# the remaining 10%; fitted models are kept in dict_models for the submission loop.
sharpe_ratio = []
rmses3 = []
maes3 = []
# Mark every existing row as training data; rows appended at inference time get 'test'.
df = df.assign(typ = 'train')
dict_models = {}
seclist = df['SecuritiesCode'].unique()
list_high_error2 = []
for SECURITY in seclist:
    df_model = get_model_data(df, SECURITY).dropna().reset_index(drop=True)
    # Bookkeeping columns are not features.
    df_model = df_model.drop(columns=['typ', 'RowId', 'Date', 'SecuritiesCode'])
    train_size = round(len(df_model) * .9)
    features = df_model.drop(columns=['Target'])
    target = df_model['Target'].values
    X_train, X_valid = features[:train_size], features[train_size:]
    y_train, y_valid = target[:train_size], target[train_size:]
    model = get_model_blueprint()
    # Early stopping and logging are passed as callbacks: the `early_stopping_rounds`
    # and `verbose` arguments of fit() were removed in LightGBM 4.0.
    model.fit(
           X_train, y_train,
           eval_set=[(X_valid, y_valid)],
           callbacks=[early_stopping(50), log_evaluation(1000)])
    valid_preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
    mae = mean_absolute_error(y_valid, valid_preds)
    valid_score = rmse
    # Remember securities with a validation RMSE above 0.05 for the later comparison.
    if rmse > 0.05:
        list_high_error2.append(SECURITY)
    rmses3.append(rmse)
    maes3.append(mae)
    print(SECURITY, valid_score)
    dict_models[SECURITY] = model

In [None]:
# Average of the per-security LightGBM validation errors.
print("\n root mean squared error = {:.4f}, mean absolute error = {:.4f}.".format(np.mean(rmses3),np.mean(maes3)))

In [None]:
# Number of securities with validation RMSE above 0.05: ARIMA first, then LightGBM.
print(len(list_high_error1))
print(len(list_high_error2))

In [None]:
# The securities themselves: ARIMA first, then LightGBM.
print(list_high_error1)
print(list_high_error2)

In [None]:
# Securities flagged as high-error by both ARIMA and LightGBM.
list_high_error = set(list_high_error1).intersection(list_high_error2)
print(list_high_error)

In [None]:
 for SECURITY in list_high_error:
     # Select the Target series once and reuse it for the preview and the plot.
     target_series = df[df['SecuritiesCode']==SECURITY]['Target']
     print(target_series.head())
     plt.plot(target_series, marker='.')
     plt.show()
       

In [None]:
# Refit the securities that both model families struggled with, using a larger
# learning rate and tree budget, and report the new validation errors.
# NOTE(review): the refitted models are not written back to dict_models -- confirm
# whether the submission loop is meant to use them.
for SECURITY in list_high_error:
    df_model = get_model_data(df, SECURITY).dropna().reset_index(drop=True)
    # Bookkeeping columns are not features.
    df_model = df_model.drop(columns=['typ', 'RowId', 'Date', 'SecuritiesCode'])
    train_size = round(len(df_model) * .9)
    features = df_model.drop(columns=['Target'])
    target = df_model['Target'].values
    X_train, X_valid = features[:train_size], features[train_size:]
    y_train, y_valid = target[:train_size], target[train_size:]
    model = LGBMRegressor(
             objective="rmse",
             metric="rmse",
             learning_rate=0.01,
             n_estimators=100000,
             device="cpu",
             random_state=999,
             extra_trees=True,
        )
    # Early stopping and logging are passed as callbacks: the `early_stopping_rounds`
    # and `verbose` arguments of fit() were removed in LightGBM 4.0.
    model.fit(
           X_train, y_train,
           eval_set=[(X_valid, y_valid)],
           callbacks=[early_stopping(50), log_evaluation(1000)])
    valid_preds = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, valid_preds))
    mae = mean_absolute_error(y_valid, valid_preds)
    valid_score = rmse
    print("\n{} root mean squared error = {:.4f}, mean absolute error = {:.4f}.".format(SECURITY,rmse,mae))

# **3 Kiểm tra mô hình với API của cuộc thi**

In [None]:
import jpx_tokyo_market_prediction
# Create the competition environment and the test iterator consumed by the next cell.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()
from tqdm import tqdm

In [None]:

for i, (prices, options, financials, trades, secondary_prices, sample_prediction) in enumerate(tqdm(iter_test)):
    # DataFrame.assign has no `inplace` flag (the previous call only built a throw-away
    # frame with an extra 'inplace' column), so create the prediction column explicitly.
    sample_prediction['Prediction'] = np.nan
    prices = prep_prices(prices, True)
    # DataFrame.append was removed in pandas 2.0; concat returns a new frame, so the
    # training frame `df` is left untouched without copying it first.
    df_all = pd.concat([df, prices])
    # Rows that came from `prices` have no 'typ' yet: mark them as test rows.
    df_all['typ'] = df_all['typ'].fillna('test')
    # Parse the submission dates once per batch instead of once per security.
    sample_dates = pd.to_datetime(sample_prediction['Date'], format="%Y-%m-%d")
    seclist = prices["SecuritiesCode"].unique()
    for SECURITY in seclist:
        df_model = get_model_data(df_all, SECURITY)
        df_test = df_model[df_model['typ'] == 'test']
        # Only the first test row of the security is predicted and submitted.
        date = df_test['Date'].values[0]
        seccode = df_test['SecuritiesCode'].values[0]
        features = df_test.drop(columns=['RowId', 'typ', 'Date', 'Target', 'SecuritiesCode'])
        pred = dict_models[SECURITY].predict(features)
        is_target_row = (sample_dates == date) & (sample_prediction['SecuritiesCode'] == seccode)
        index = sample_prediction.index[is_target_row][0]
        sample_prediction.at[index, 'Prediction'] = pred[0]
    # Highest prediction gets rank 0; rows without a prediction sort last.
    sample_prediction = sample_prediction.sort_values(by = "Prediction", ascending=False)
    # Rank every row that is present instead of assuming exactly 2000 securities.
    sample_prediction['Rank'] = np.arange(len(sample_prediction))
    sample_prediction = sample_prediction.sort_values(by = "SecuritiesCode", ascending=True)
    submission = sample_prediction[["Date","SecuritiesCode","Rank"]]
    print("Submitting:", i)
    env.predict(submission)
%time