**Apply our methodology to the Chinese CSI 300 index**

In [None]:
# prepare data

import pandas as pd
pd.options.mode.chained_assignment = None

# Load the prepared CSI300 train / test tables from the "golden" folder.
train_csi, test_csi = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './drive/MyDrive/WQU/690/golden/csi300_train.parquet',
        './drive/MyDrive/WQU/690/golden/csi300_test.parquet',
    )
)

# TODO: stocks listed during or after the data-collection window show up as NaN;
# rows containing any NaN are dropped here.
train_sp, test_sp = train_csi.dropna(), test_csi.dropna()

# 'GB' is split off as the target; every remaining column stays in the X frame.
y_train, X_train = train_sp['GB'], train_sp.drop(columns=['GB'])
y_test, X_test = test_sp['GB'], test_sp.drop(columns=['GB'])

In [None]:
# # algorithms
# dnn_model = train_dnn(X_train.drop(columns=['Sticker', 'Date']), y_train)
# raf_model = train_random_forest(X_train.drop(columns=['Sticker', 'Date']), y_train)
# xgb_model = train_xgboost(X_train.drop(columns=['Sticker', 'Date']), y_train)

In [None]:
# import pickle

# with open('./drive/MyDrive/WQU/690/files/cs_dnn_model.pkl', 'wb') as f:
#     pickle.dump(dnn_model, f)

# with open('./drive/MyDrive/WQU/690/files/cs_raf_model.pkl', 'wb') as f:
#     pickle.dump(raf_model, f)

# with open('./drive/MyDrive/WQU/690/files/cs_xgb_model.pkl', 'wb') as f:
#     pickle.dump(xgb_model, f)

In [None]:
# Year markers; only m (first year kept) and e (first year excluded) are used below.
s, m, e = 2017, 2020, 2021

# Adjusted close prices of the raw CSI300 download.
outsample = pd.read_parquet('./drive/MyDrive/WQU/690/raw/csi300.parquet')['Adj Close']

# TODO check again
# NOTE(review): pct_change() runs with its defaults - confirm how gaps in the
# price history should be treated.
# Returns are computed on the full history first, then restricted to years in [m, e).
daily_returns = outsample.pct_change().loc[
    lambda returns: (returns.index.year >= m) & (returns.index.year < e)
]


import numpy as np
import pandas as pd
import pickle

class Inference():
    """Long/short back-test driven by three pre-trained classifiers.

    On construction the pickled models are loaded and registered under the
    keys 'DNN', 'RAF' and 'XGB'.  ``run`` scores a feature frame with every
    model, goes long the ``n`` highest-scored and short the ``n``
    lowest-scored stocks on each date, and summarises the resulting daily
    return series.

    Attributes:
        models: mapping of model name -> loaded model object.
        full: per-model Date x Sticker score tables produced by the latest
            ``run`` call (``None`` before the first call).
        fee: whether ``run`` subtracts the transaction cost from every
            daily portfolio return.
    """

    def __init__(self, fee=False, model_dir='./drive/MyDrive/WQU/690/files'):
        """Load the three pickled models.

        Args:
            fee: boolean flag; when true, ``run`` charges the transaction
                cost on every day.
            model_dir: folder holding ``cs_dnn_model.pkl``,
                ``cs_xgb_model.pkl`` and ``cs_raf_model.pkl``.
        """
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising - only point model_dir at files you trust.
        self.dnn_model = self._load_pickle(f'{model_dir}/cs_dnn_model.pkl')
        self.xgb_model = self._load_pickle(f'{model_dir}/cs_xgb_model.pkl')
        self.raf_model = self._load_pickle(f'{model_dir}/cs_raf_model.pkl')

        self.models = {
            'DNN': self.dnn_model,
            'RAF': self.raf_model,
            'XGB': self.xgb_model
        }

        self.full = None
        self.fee = fee

    @staticmethod
    def _load_pickle(path):
        """Return the object stored in the pickle file at ``path``."""
        with open(path, 'rb') as f:
            return pickle.load(f)

    def predict_proba(self, frame):
        """Score ``frame`` with every registered model.

        The 'Sticker' column is dropped before the frame is handed to the
        models.  The model registered as 'DNN' is queried through
        ``predict``; the others through ``predict_proba``, keeping column 1.

        Returns:
            dict mapping model name -> array of scores, one per row.
        """
        r = {}
        tmp = frame.drop(columns=['Sticker'])
        for name, model in self.models.items():
            if name == 'DNN':
                r[name] = model.predict(tmp)
            else:
                r[name] = model.predict_proba(tmp)[:, 1]
        return r

    def select_bottom_stocks(self, dataframe, n):
        """For each row, list the column labels of the ``n`` smallest values."""
        bottom_stocks = dataframe.apply(lambda row: row.nsmallest(n).index.tolist(), axis=1)
        return bottom_stocks

    def select_top_stocks(self, dataframe, n):
        """For each row, list the column labels of the ``n`` largest values."""
        top_stocks = dataframe.apply(lambda row: row.nlargest(n).index.tolist(), axis=1)
        return top_stocks

    def trigger(self, frame):
        """Score ``frame`` and reshape each model's scores to Date x Sticker.

        Args:
            frame: feature frame with a 'Sticker' column and 'Date' available
                for pivoting (callers in this notebook pass a frame indexed
                by 'Date').

        Returns:
            dict mapping model name -> pivot table (index 'Date', columns
            'Sticker') of that model's scores.
        """
        r = self.predict_proba(frame)
        rr = frame.copy()
        for name, prediction in r.items():
            rr[name] = prediction

        return {
            name: rr.pivot_table(columns='Sticker', index='Date', values=name) for name in r.keys()
        }

    def calculate_metrics(self, daily_portfolio_return):
        """Summarise a series of daily portfolio returns.

        Returns:
            Tuple ``(annualized_return, std_dev, sharpe_ratio)`` where the
            return is compounded and scaled to 252 periods, the standard
            deviation is ``np.std`` (population form) times ``sqrt(252)``,
            and the Sharpe ratio is their quotient (no risk-free rate is
            subtracted).  All three are NaN when the series is empty.
        """
        daily_portfolio_return = np.array(daily_portfolio_return)

        # An empty series used to raise ZeroDivisionError in the exponent.
        if len(daily_portfolio_return) == 0:
            return np.nan, np.nan, np.nan

        # Calculate Annualized Return
        annualized_return = np.prod(1 + daily_portfolio_return) ** (252 / len(daily_portfolio_return)) - 1

        # Calculate Standard Deviation
        std_dev = np.std(daily_portfolio_return) * np.sqrt(252)

        # Calculate Sharpe Ratio
        sharpe_ratio = annualized_return / std_dev

        return annualized_return, std_dev, sharpe_ratio

    def run(self, frame, daily_returns, n_values, transaction_cost=0.0005):
        """Back-test every model for every portfolio size in ``n_values``.

        Args:
            frame: feature frame passed to ``trigger``.
            daily_returns: table of daily returns addressed as
                ``daily_returns.loc[date, list_of_stickers]``.
            n_values: iterable of leg sizes (number of long and of short
                stocks per day).
            transaction_cost: amount subtracted from each daily portfolio
                return when ``self.fee`` is set.

        Returns:
            ``{model_name: {n: {'Annualized Return', 'Standard Deviation',
            'Sharpe Ratio'}}}``.  The score tables are also stored on
            ``self.full``.
        """
        results = {}
        self.full = self.trigger(frame)
        # The charge does not depend on model, n or date, so resolve it once.
        fee_per_day = transaction_cost if self.fee else 0.0

        for n in n_values:
            bottom_ = {k: self.select_bottom_stocks(v, n) for k, v in self.full.items()}
            top_ = {k: self.select_top_stocks(v, n) for k, v in self.full.items()}
            for name in self.models.keys():
                daily_portfolio_return = []

                # The last date is skipped here and the first computed return
                # is dropped below; kept as in the original back-test.
                # NOTE(review): confirm this trimming is intended.
                for date in daily_returns.index[:-1]:
                    top_stocks = top_[name].loc[date]
                    bottom_stocks = bottom_[name].loc[date]
                    long_returns = daily_returns.loc[date, top_stocks].mean()
                    short_returns = daily_returns.loc[date, bottom_stocks].mean()

                    portfolio_return = long_returns - short_returns - fee_per_day
                    daily_portfolio_return.append(portfolio_return)

                ann_return, std_dev, sharpe_ratio = self.calculate_metrics(daily_portfolio_return[1:])
                results.setdefault(name, {})[n] = {
                    'Annualized Return': ann_return,
                    'Standard Deviation': std_dev,
                    'Sharpe Ratio': sharpe_ratio
                }
        return results


# Back-test all three models on the test set for 10/20/50-stock legs,
# first charging the fee and then without it.
results_fee = Inference(fee=True).run(X_test.set_index('Date'), daily_returns, [10, 20, 50])

inference = Inference(fee=False)
results_nofee = inference.run(X_test.set_index('Date'), daily_returns, [10, 20, 50])


numbers = [10, 20, 50]
pip = {}


def _metrics_table(results, n_stocks):
    """Build a metrics-by-model table (one column per algorithm) for ``n_stocks``."""
    per_model = []
    for alg, by_n in results.items():
        column = pd.DataFrame([by_n[n_stocks]]).T
        column.columns = [alg]
        per_model.append(column)
    return pd.concat(per_model, axis=1)


for number in numbers:
    tpm2 = _metrics_table(results_nofee, number)  # before fees
    tpm3 = _metrics_table(results_fee, number)    # after fees

    # Side-by-side table with a two-level header: fee regime, then model.
    combined_df = pd.concat([tpm2, tpm3], axis=1)
    combined_df.columns = pd.MultiIndex.from_product(
        [['Before Fees', 'After Fees'], tpm2.columns]
    )
    print(f'Number of stocks: {number}')
    display(combined_df)
    pip[number] = combined_df


Number of stocks: 10


Unnamed: 0_level_0,Before Fees,Before Fees,Before Fees,After Fees,After Fees,After Fees
Unnamed: 0_level_1,DNN,RAF,XGB,DNN,RAF,XGB
Annualized Return,0.12063,0.645569,1.233663,-0.012029,0.451047,0.969931
Standard Deviation,0.270878,0.272099,0.237933,0.270878,0.272099,0.237933
Sharpe Ratio,0.44533,2.372552,5.18492,-0.044407,1.657657,4.076492


Number of stocks: 20


Unnamed: 0_level_0,Before Fees,Before Fees,Before Fees,After Fees,After Fees,After Fees
Unnamed: 0_level_1,DNN,RAF,XGB,DNN,RAF,XGB
Annualized Return,0.099326,0.549944,0.651342,-0.030814,0.366693,0.456155
Standard Deviation,0.219945,0.227887,0.178822,0.219945,0.227887,0.178822
Sharpe Ratio,0.451593,2.413232,3.642413,-0.1401,1.609098,2.550895


Number of stocks: 50


Unnamed: 0_level_0,Before Fees,Before Fees,Before Fees,After Fees,After Fees,After Fees
Unnamed: 0_level_1,DNN,RAF,XGB,DNN,RAF,XGB
Annualized Return,0.155537,0.662798,0.471132,0.018773,0.466262,0.297176
Standard Deviation,0.169617,0.178995,0.126216,0.169617,0.178995,0.126216
Sharpe Ratio,0.916991,3.702881,3.732754,0.110679,2.604886,2.354511


In [None]:
# NOTE(review): in the visible notebook `tmp_1` is first assigned in a later cell
# (tmp_1 = s.full['XGB']), so this cell only works after out-of-order execution.
tmp_1.head()

Sticker,000001.SZ,000002.SZ,000063.SZ,000069.SZ,000100.SZ,000157.SZ,000166.SZ,000301.SZ,000333.SZ,000338.SZ,...,603806.SS,603833.SS,603899.SS,603986.SS,603993.SS,688008.SS,688012.SS,688036.SS,688111.SS,688363.SS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,0.397099,0.370827,0.347489,0.541849,0.450022,0.462827,0.522399,0.372806,0.427751,0.519268,...,0.43461,0.389732,0.499458,0.509144,0.410796,,,,,
2020-01-03,0.487132,0.408185,0.462883,0.483024,0.47994,0.320624,0.415966,0.252808,0.459712,0.366618,...,0.426829,0.40007,0.4494,0.501862,0.341466,,,,,
2020-01-06,0.344314,0.392436,0.44389,0.447624,0.574316,0.45772,0.460729,0.349818,0.408893,0.251746,...,0.427967,0.537718,0.453402,0.465626,0.335929,,,,,
2020-01-07,0.366811,0.623332,0.42264,0.480778,0.442209,0.49433,0.551082,0.312016,0.480395,0.446621,...,0.595581,0.728336,0.501724,0.33737,0.383948,,,,,
2020-01-08,0.472438,0.521407,0.384682,0.507317,0.478663,0.461282,0.446664,0.352133,0.464913,0.511805,...,0.375839,0.518323,0.478585,0.291195,0.411276,,,,,


In [None]:
n_chr = 20

# 'GB' values of the test set reshaped to a Date x Sticker table.
mm = test_sp.pivot_table(index='Date', columns='Sticker', values='GB')
mm.head()

# Running the back-test fills s.full with the per-model score tables.
s = Inference(fee=True)
_ = s.run(X_test.set_index('Date'), daily_returns, [n_chr])

tmp_1 = s.full['XGB']

# Per stock: the XGB score, the scores of the two preceding rows, and the 'GB' value.
l = []
for sticker in tmp_1.columns:
    pp = tmp_1[[sticker]].rename(columns={sticker: 'Stock'}).assign(Sticker=sticker)
    for day in (1, 2):
        pp[f'Lag_{day}'] = pp['Stock'].shift(day)
    gb_column = mm[[sticker]].rename(columns={sticker: 'GB'})
    pp = pp.merge(gb_column, left_index=True, right_index=True)
    l.append(pp)


# Rows with any missing value are discarded, then the score is blended with
# weights 0.6 / 0.3 / 0.1 over the current row and its two lags.
# (alternative kept from the original: Prediction = Stock, i.e. no blending)
final_c = (
    pd.concat(l)
    .dropna()
    .assign(
        Prediction=lambda scores: scores['Stock'] * 0.6 + scores['Lag_1'] * 0.3 + scores['Lag_2'] * 0.1
    )
    [['Prediction', 'Sticker']]
    .reset_index('Date')
)

# Per date: the n_chr lowest / highest blended scores.
prediction_wide = final_c.pivot(index='Date', columns='Sticker', values='Prediction')

output_short = prediction_wide.apply(lambda row: row.nsmallest(n_chr).index.tolist(), axis=1)

output_long = prediction_wide.apply(lambda row: row.nlargest(n_chr).index.tolist(), axis=1)

def calculate_metrics(daily_portfolio_return):
    """Summarise a series of daily portfolio returns.

    Args:
        daily_portfolio_return: sequence of daily returns.

    Returns:
        dict with keys 'Annualized Return' (compounded return scaled to 252
        periods), 'Standard Deviation' (``np.std`` times ``sqrt(252)``) and
        'Sharpe Ratio' (their quotient; no risk-free rate is subtracted).
        All three values are NaN when the series is empty.
    """
    daily_portfolio_return = np.array(daily_portfolio_return)

    # An empty series used to raise ZeroDivisionError in the exponent below.
    if len(daily_portfolio_return) == 0:
        return {
            'Annualized Return': np.nan,
            'Standard Deviation': np.nan,
            'Sharpe Ratio': np.nan
        }

    annualized_return = np.prod(1 + daily_portfolio_return) ** (252 / len(daily_portfolio_return)) - 1

    std_dev = np.std(daily_portfolio_return) * np.sqrt(252)

    sharpe_ratio = annualized_return / std_dev

    return {
        'Annualized Return': annualized_return,
        'Standard Deviation': std_dev,
        'Sharpe Ratio': sharpe_ratio
    }

transaction_cost = 0.0005
aaa = []

# Starts at the third date of daily_returns - presumably because the two lag
# columns leave the first two dates without a blended prediction (TODO confirm).
for date in daily_returns.index[2:]:
    day_returns = daily_returns.loc[date]
    long_returns = day_returns[output_long.loc[date]].mean()
    short_returns = day_returns[output_short.loc[date]].mean()
    # long leg minus short leg, with the fee charged every day
    aaa.append(long_returns - short_returns - transaction_cost)

pd.DataFrame([calculate_metrics(aaa)])



Unnamed: 0,Annualized Return,Standard Deviation,Sharpe Ratio
0,0.565466,0.191344,2.95524


In [None]:
# Reference row: XGB metrics from Inference(fee=True).run for the same n_chr.
pd.DataFrame([results_fee['XGB'][n_chr]])

Unnamed: 0,Annualized Return,Standard Deviation,Sharpe Ratio
0,0.456155,0.178822,2.550895


In [None]:
n_chr = 20

# 'GB' values of the test set reshaped to a Date x Sticker table.
mm = test_sp.pivot_table(index='Date', columns='Sticker', values='GB')
mm.head()

# Running the back-test fills s.full with the per-model score tables.
s = Inference(fee=False)
_ = s.run(X_test.set_index('Date'), daily_returns, [n_chr])

tmp_1 = s.full['XGB']

# Per stock: the XGB score, the scores of the two preceding rows, and the 'GB' value.
l = []
for sticker in tmp_1.columns:
    pp = tmp_1[[sticker]].rename(columns={sticker: 'Stock'}).assign(Sticker=sticker)
    for day in (1, 2):
        pp[f'Lag_{day}'] = pp['Stock'].shift(day)
    gb_column = mm[[sticker]].rename(columns={sticker: 'GB'})
    pp = pp.merge(gb_column, left_index=True, right_index=True)
    l.append(pp)


# Rows with any missing value are discarded, then the score is blended with
# weights 0.6 / 0.3 / 0.1 over the current row and its two lags.
# (alternative kept from the original: Prediction = Stock, i.e. no blending)
final_c = (
    pd.concat(l)
    .dropna()
    .assign(
        Prediction=lambda scores: scores['Stock'] * 0.6 + scores['Lag_1'] * 0.3 + scores['Lag_2'] * 0.1
    )
    [['Prediction', 'Sticker']]
    .reset_index('Date')
)

# Per date: the n_chr lowest / highest blended scores.
prediction_wide = final_c.pivot(index='Date', columns='Sticker', values='Prediction')

output_short = prediction_wide.apply(lambda row: row.nsmallest(n_chr).index.tolist(), axis=1)

output_long = prediction_wide.apply(lambda row: row.nlargest(n_chr).index.tolist(), axis=1)

def calculate_metrics(daily_portfolio_return):
    """Summarise a series of daily portfolio returns.

    Re-definition of a helper that an earlier cell also defines; kept so
    this cell can run on its own.

    Args:
        daily_portfolio_return: sequence of daily returns.

    Returns:
        dict with keys 'Annualized Return' (compounded return scaled to 252
        periods), 'Standard Deviation' (``np.std`` times ``sqrt(252)``) and
        'Sharpe Ratio' (their quotient; no risk-free rate is subtracted).
        All three values are NaN when the series is empty.
    """
    daily_portfolio_return = np.array(daily_portfolio_return)

    # An empty series used to raise ZeroDivisionError in the exponent below.
    if len(daily_portfolio_return) == 0:
        return {
            'Annualized Return': np.nan,
            'Standard Deviation': np.nan,
            'Sharpe Ratio': np.nan
        }

    annualized_return = np.prod(1 + daily_portfolio_return) ** (252 / len(daily_portfolio_return)) - 1

    std_dev = np.std(daily_portfolio_return) * np.sqrt(252)

    sharpe_ratio = annualized_return / std_dev

    return {
        'Annualized Return': annualized_return,
        'Standard Deviation': std_dev,
        'Sharpe Ratio': sharpe_ratio
    }

transaction_cost = 0.0005  # assigned, but not charged in this cell
aaa = []

# Starts at the third date of daily_returns - presumably because the two lag
# columns leave the first two dates without a blended prediction (TODO confirm).
for date in daily_returns.index[2:]:
    day_returns = daily_returns.loc[date]
    long_returns = day_returns[output_long.loc[date]].mean()
    short_returns = day_returns[output_short.loc[date]].mean()
    # long leg minus short leg, no fee  (# - transaction_cost)
    aaa.append(long_returns - short_returns)

pd.DataFrame([calculate_metrics(aaa)])



Unnamed: 0,Annualized Return,Standard Deviation,Sharpe Ratio
0,0.775243,0.191344,4.051577


In [None]:
# Reference row: XGB metrics from Inference(fee=False).run for the same n_chr.
pd.DataFrame([results_nofee['XGB'][n_chr]])

Unnamed: 0,Annualized Return,Standard Deviation,Sharpe Ratio
0,0.651342,0.178822,3.642413


In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
import tensorflow as tf

# Fix the NumPy and TensorFlow random generators.
np.random.seed(101)
tf.random.set_seed(101)

sp = pd.read_parquet('./drive/MyDrive/WQU/690/golden/csi300_golden.parquet')

# Regression target: each stock's following-row 'return_1'.
sp['Label'] = sp.groupby('Sticker')['return_1'].shift(-1)

# First-stage candidates: 10 lowest / 10 highest blended scores per date.
prediction_wide = final_c.pivot(index='Date', columns='Sticker', values='Prediction')

output_short = prediction_wide.apply(lambda row: row.nsmallest(10).index.tolist(), axis=1)

output_long = prediction_wide.apply(lambda row: row.nlargest(10).index.tolist(), axis=1)

# Collect the X_test rows of the selected stocks, long picks first, then short picks.
data_for_reg = []

for date in sorted(X_test['Date'].unique())[2:]:
    on_this_date = X_test['Date'].eq(date)
    for picks in (output_long.loc[date], output_short.loc[date]):
        data_for_reg.append(X_test[on_this_date & X_test['Sticker'].isin(picks)])


frame_for_lstm = pd.concat(data_for_reg).merge(
    sp[['Date', 'Sticker', 'Label']], on=['Date', 'Sticker'], how='left'
)

# Chronological split at 2020-10-30 (inclusive on the training side).
split_date = datetime(2020, 10, 30)
train_reg = frame_for_lstm[frame_for_lstm['Date'] <= split_date]
test_reg = frame_for_lstm[frame_for_lstm['Date'] > split_date]

y_train_reg, X_train_reg = train_reg['Label'], train_reg.drop(columns=['Label'])

y_test_reg, X_test_reg = test_reg['Label'], test_reg.drop(columns=['Label'])

from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Scale features to [0, 1]; the scaler is fitted on the training rows only and
# then applied to both splits.
# .iloc[:, 2:] skips the first two columns - presumably the 'Sticker' and 'Date'
# identifiers; TODO confirm the column order of X_test.
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(X_train_reg.iloc[:, 2:])

X_train_reg_scaled = mms.transform(X_train_reg.iloc[:, 2:])
X_test_reg_scaled  = mms.transform(X_test_reg.iloc[:, 2:])

# Two stacked 50-unit LSTM layers with 20% dropout and a single-unit output.
# The declared input shape is (number of scaled feature columns, 1).
# NOTE(review): the arrays passed to fit/predict are the 2-D outputs of
# mms.transform - confirm Keras adds the trailing axis as intended.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train_reg_scaled.shape[1], 1)))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# Regress the 'Label' column on the scaled features.
model.fit(X_train_reg_scaled, y_train_reg, epochs=20, batch_size=32)


# Despite the variable name, the model was fitted on 'Label', so these are
# predictions of that target for the held-out rows.
predicted_stock_price = model.predict(X_test_reg_scaled)

# Date x Sticker table of the LSTM predictions on the held-out rows.
mn = (
    X_test_reg[['Sticker', 'Date']]
    .assign(Predicted=predicted_stock_price[:, 0])
    .pivot_table(columns='Sticker', index='Date', values='Predicted')
)

# Second-stage pick: among each date's 10 first-stage candidates keep the
# 5 lowest (short) / 5 highest (long) LSTM predictions.
output_short_2 = {
    date: [mn.loc[date, output_short.loc[date]].nsmallest(5).index.tolist()]
    for date in mn.index
}
output_long_2 = {
    date: [mn.loc[date, output_long.loc[date]].nlargest(5).index.tolist()]
    for date in mn.index
}


output_short_2 = pd.DataFrame(output_short_2).T[0]
output_long_2 = pd.DataFrame(output_long_2).T[0]

transaction_cost = 0.0005
aaa = []

# This loop uses the 10-stock first-stage legs (output_long / output_short),
# evaluated only on the dates covered by output_short_2.
for date in output_short_2.index:
    day_returns = daily_returns.loc[date]
    long_returns = day_returns[output_long.loc[date]].mean()
    short_returns = day_returns[output_short.loc[date]].mean()
    aaa.append(long_returns - short_returns - transaction_cost)

pd.DataFrame([calculate_metrics(aaa)])

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 31, 50)            10400     
                                                                 
 dropout_4 (Dropout)         (None, 31, 50)            0         
                                                                 
 lstm_5 (LSTM)               (None, 50)                20200     
                                                                 
 dropout_5 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 30651 (119.73 KB)
Trainable params: 30651 (119.73 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/20
Epoch

Unnamed: 0,Annualized Return,Standard Deviation,Sharpe Ratio
0,1.012027,0.239416,4.227062


In [None]:
transaction_cost = 0.0005
aaa = []

# Same back-test as the previous cell, but on the 5-stock second-stage legs.
for date in output_short_2.index:
    day_returns = daily_returns.loc[date]
    long_returns = day_returns[output_long_2.loc[date]].mean()
    short_returns = day_returns[output_short_2.loc[date]].mean()
    aaa.append(long_returns - short_returns - transaction_cost)

pd.DataFrame([calculate_metrics(aaa)])

Unnamed: 0,Annualized Return,Standard Deviation,Sharpe Ratio
0,1.108106,0.312408,3.546978


In [None]:
# First-stage long candidates (10 highest blended scores) for 2020-12-01.
output_long.loc[datetime(2020, 12, 1)]

['601919.SS',
 '300142.SZ',
 '603486.SS',
 '300751.SZ',
 '603899.SS',
 '000877.SZ',
 '002938.SZ',
 '002007.SZ',
 '300223.SZ',
 '002709.SZ']

In [None]:
# Second-stage long leg for 2020-12-01: the 5 candidates with the highest LSTM prediction.
output_long_2.loc[datetime(2020, 12, 1)]

['300223.SZ', '601919.SS', '002938.SZ', '000877.SZ', '002007.SZ']