# 롱-숏 전략, 5부: 표본외 예측 생성

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In this section, we'll start designing, implementing, and evaluating a trading strategy for US equities driven by daily return forecasts produced by gradient boosting models.

As in the previous examples, we'll lay out a framework and build a specific example that you can adapt to run your own experiments. There are numerous aspects that you can vary, from the asset class and investment universe to more granular aspects like the features, holding period, or trading rules. See, for example, the **Alpha Factor Library** in the [Appendix](../24_alpha_factor_library) for numerous additional features.

We'll keep the trading strategy simple and only use a single ML signal; a real-life application will likely use multiple signals from different sources, such as complementary ML models trained on different datasets or with different lookahead or lookback periods. It would also use sophisticated risk management, from simple stop-loss to value-at-risk analysis.

**Six notebooks** cover our workflow sequence:

1. [preparing_the_model_data](04_preparing_the_model_data.ipyny): we engineer a few simple features from the Quandl Wiki data 
2. [trading_signals_with_lightgbm_and_catboost](05_trading_signals_with_lightgbm_and_catboost.ipynb): we tune hyperparameters for LightGBM and CatBoost to select a model, using 2015/16 as our validation period. 
3. [evaluate_trading_signals](06_evaluate_trading_signals): we compare the cross-validation performance using various metrics to select the best model. 
4. [model_interpretation](07_model_interpretation.ipynb): we take a closer look at the drivers behind the best model's predictions.
5. `making_out_of_sample_predictions` (this noteboook): we predict returns for our out-of-sample period 2017.
6. [backtesting_with_zipline](09_backtesting_with_zipline.ipynb): evaluate the historical performance of a long-short strategy based on our predictive signals using Zipline.

## Imports & Settings

In [2]:
  !pip install --upgrade catboost tables

Collecting catboost
  Downloading catboost-1.0.3-cp37-none-manylinux1_x86_64.whl (76.3 MB)
[K     |████████████████████████████████| 76.3 MB 1.2 MB/s 
Collecting tables
  Downloading tables-3.6.1-cp37-cp37m-manylinux1_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 26.9 MB/s 
Installing collected packages: tables, catboost
  Attempting uninstall: tables
    Found existing installation: tables 3.4.4
    Uninstalling tables-3.4.4:
      Successfully uninstalled tables-3.4.4
Successfully installed catboost-1.0.3 tables-3.6.1


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
%matplotlib inline

from time import time
import sys, os
from pathlib import Path

import pandas as pd
from scipy.stats import spearmanr

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
os.chdir('/content/drive/MyDrive/금융특화 AI 역량강화교육 - 증권 (심화) -/12_gradient_boosting_machines')

In [7]:
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from utils import MultipleTimeSeriesCV

In [8]:
sns.set_style('whitegrid')

In [9]:
YEAR = 252
idx = pd.IndexSlice

In [10]:
scope_params = ['lookahead', 'train_length', 'test_length']
daily_ic_metrics = ['daily_ic_mean', 'daily_ic_mean_n', 'daily_ic_median', 'daily_ic_median_n']
lgb_train_params = ['learning_rate', 'num_leaves', 'feature_fraction', 'min_data_in_leaf']
catboost_train_params = ['max_depth', 'min_child_samples']

## LightGBM 예측 생성

### Model 설정

In [11]:
base_params = dict(boosting='gbdt',
                   objective='regression',
                   verbose=-1)

categoricals = ['year', 'month', 'sector', 'weekday']

In [12]:
lookahead = 1
store = Path('data/predictions.h5')

### 데이터 수집

In [13]:
data = pd.read_hdf('data.h5', 'model_data').sort_index()

In [14]:
labels = sorted(data.filter(like='_fwd').columns)
features = data.columns.difference(labels).tolist()
label = f'r{lookahead:02}_fwd'

In [15]:
data = data.loc[idx[:, '2010':], features + [label]].dropna()

In [16]:
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

In [17]:
lgb_data = lgb.Dataset(data=data[features],
                       label=data[label],
                       categorical_feature=categoricals,
                       free_raw_data=False)

### 예측 생성

In [18]:
lgb_ic = pd.read_hdf('data/model_tuning.h5', 'lgb/ic')
lgb_daily_ic = pd.read_hdf('data/model_tuning.h5', 'lgb/daily_ic')

In [19]:
def get_lgb_params(data, t=5, best=0):
    param_cols = scope_params[1:] + lgb_train_params + ['boost_rounds']
    df = data[data.lookahead==t].sort_values('ic', ascending=False).iloc[best]
    return df.loc[param_cols]

In [20]:
for position in range(10):
    params = get_lgb_params(lgb_daily_ic,
                            t=lookahead,
                            best=position)

    params = params.to_dict()

    for p in ['min_data_in_leaf', 'num_leaves']:
        params[p] = int(params[p])
    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    params.update(base_params)

    print(f'\nPosition: {position:02}')

    # 1-year out-of-sample period
    n_splits = int(YEAR / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              test_period_length=test_length,
                              lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    start = time()
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)
        lgb_train = lgb_data.subset(used_indices=train_idx.tolist(),
                                    params=params).construct()

        model = lgb.train(params=params,
                          train_set=lgb_train,
                          num_boost_round=num_boost_round,
                          verbose_eval=False)

        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')
        y_pred = model.predict(test_set.loc[:, model.feature_name()])  #테스트 세트에 대해 예측
        predictions.append(y_test.assign(prediction=y_pred))

    if position == 0:
        test_predictions = (pd.concat(predictions)
                            .rename(columns={'prediction': position}))
    else:
        test_predictions[position] = pd.concat(predictions).prediction

by_day = test_predictions.groupby(level='date')
for position in range(10):
    if position == 0:
        ic_by_day = by_day.apply(lambda x: spearmanr(
            x.y_test, x[position])[0]).to_frame()
    else:
        ic_by_day[position] = by_day.apply(
            lambda x: spearmanr(x.y_test, x[position])[0])
print(ic_by_day.describe())
test_predictions.to_hdf(store, f'lgb/test/{lookahead:02}')


Position: 00
1 2 3 4 
Position: 01
1 2 3 4 
Position: 02
1 2 3 4 
Position: 03
1 2 3 4 
Position: 04
1 2 3 4 
Position: 05
1 2 3 4 
Position: 06
1 2 3 4 
Position: 07
1 2 3 4 
Position: 08
1 2 3 4 
Position: 09
1 2 3 4                 0           1           2  ...           7           8           9
count  252.000000  252.000000  252.000000  ...  252.000000  252.000000  252.000000
mean     0.010198    0.010488    0.010391  ...    0.006679    0.007663    0.007577
std      0.116918    0.119606    0.117120  ...    0.111273    0.109976    0.109862
min     -0.456584   -0.458844   -0.459468  ...   -0.435198   -0.429153   -0.430865
25%     -0.053297   -0.055344   -0.053129  ...   -0.060188   -0.055278   -0.054031
50%      0.018046    0.018239    0.017159  ...    0.009048    0.011481    0.016402
75%      0.084004    0.087733    0.081369  ...    0.080200    0.076819    0.081074
max      0.287681    0.288186    0.292582  ...    0.311119    0.309319    0.319478

[8 rows x 10 columns]


## CatBoost 예측 생성

In [21]:
# lookaheads = [1, 5, 21]
lookaheads = [1]

In [22]:
label_dict = dict(zip(lookaheads, labels))

### Model 설정

In [23]:
lookahead = 1
store = Path('data/predictions.h5')

### 데이터 수집

In [24]:
data = pd.read_hdf('data.h5', 'model_data').sort_index()

In [25]:
labels = sorted(data.filter(like='_fwd').columns)
features = data.columns.difference(labels).tolist()
label = f'r{lookahead:02}_fwd'

In [26]:
data = data.loc[idx[:, '2010':], features + [label]].dropna()

In [27]:
for feature in categoricals:
    data[feature] = pd.factorize(data[feature], sort=True)[0]

In [28]:
cat_cols_idx = [data.columns.get_loc(c) for c in categoricals]

In [29]:
catboost_data = Pool(label=data[label],
                     data=data.drop(label, axis=1),
                     cat_features=cat_cols_idx)

### 예측 생성

In [30]:
catboost_ic = pd.read_hdf('data/model_tuning.h5', 'catboost/ic')
catboost_ic_avg = pd.read_hdf('data/model_tuning.h5', 'catboost/daily_ic')

In [31]:
def get_cb_params(data, t=1, best=0):
    param_cols = scope_params[1:] + catboost_train_params + ['boost_rounds']
    df = data[data.lookahead==t].sort_values('ic', ascending=False).iloc[best]
    return df.loc[param_cols]

In [32]:
# for position in range(10):
for position in range(3):
    params = get_cb_params(catboost_ic_avg,
                    t=lookahead,
                    best=position)
    
    params = params.to_dict()
    
    for p in ['max_depth', 'min_child_samples']:
        params[p] = int(params[p])
    train_length = int(params.pop('train_length'))
    test_length = int(params.pop('test_length'))
    num_boost_round = int(params.pop('boost_rounds'))
    # params['task_type'] = 'GPU'

    print(f'\nPosition: {position:02}')
    
    # 1-year out-of-sample period
    n_splits = int(YEAR / test_length)
    cv = MultipleTimeSeriesCV(n_splits=n_splits,
                              test_period_length=test_length,
                              lookahead=lookahead,
                              train_period_length=train_length)

    predictions = []
    start = time()
    for i, (train_idx, test_idx) in enumerate(cv.split(X=data), 1):
        print(i, end=' ', flush=True)
        train_set = catboost_data.slice(train_idx.tolist())

        model = CatBoostRegressor(**params)
        model.fit(X=train_set,
                  verbose_eval=False)

        test_set = data.iloc[test_idx, :]
        y_test = test_set.loc[:, label].to_frame('y_test')
        print(model)
#         print(model.feature_names_)
        
        y_pred = model.predict(test_set.loc[:, model.feature_names_])   #테스트 세트에 대해 예측
        predictions.append(y_test.assign(prediction=y_pred))

    if position == 0:
        test_predictions = (pd.concat(predictions)
                            .rename(columns={'prediction': position}))
    else:
        test_predictions[position] = pd.concat(predictions).prediction

by_day = test_predictions.groupby(level='date')
for position in range(10):
  try:
      if position == 0:
          ic_by_day = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0]).to_frame()
      else:
          ic_by_day[position] = by_day.apply(lambda x: spearmanr(x.y_test, x[position])[0])
  except Exception as e:
    print(e)
print(ic_by_day.describe())
test_predictions.to_hdf(store, f'catboost/test/{lookahead:02}')


Position: 00
1 <catboost.core.CatBoostRegressor object at 0x7fb932ae1fd0>
2 <catboost.core.CatBoostRegressor object at 0x7fb937518510>
3 <catboost.core.CatBoostRegressor object at 0x7fb933edecd0>
4 <catboost.core.CatBoostRegressor object at 0x7fb933e6ba50>

Position: 01
1 <catboost.core.CatBoostRegressor object at 0x7fb932ae0850>
2 <catboost.core.CatBoostRegressor object at 0x7fb933f73bd0>
3 <catboost.core.CatBoostRegressor object at 0x7fb933eab910>
4 <catboost.core.CatBoostRegressor object at 0x7fb933b21fd0>

Position: 02
1 <catboost.core.CatBoostRegressor object at 0x7fb933ae2110>
2 <catboost.core.CatBoostRegressor object at 0x7fb93476f810>
3 <catboost.core.CatBoostRegressor object at 0x7fb933e6be90>
4 <catboost.core.CatBoostRegressor object at 0x7fb937518510>
3
4
5
6
7
8
9
                0           1           2
count  252.000000  252.000000  252.000000
mean     0.003013    0.003013    0.003013
std      0.130248    0.130248    0.130248
min     -0.406199   -0.406199   -0.406199
25