In [1]:
import numpy as np
import pandas as pd
from IPython.core.display import display

import yfinance as yf
from ta import *

from utils.ta_utils import *

from bokeh.plotting import figure, show
from bokeh.io import output_notebook
output_notebook()

from tqdm import tqdm_notebook

# Obtenemos el histórico desde Yahoo! Finance

In [2]:
df_NVDA = get_dataset_for_stock('NVDA')

In [3]:
show_stock_analysis(df_NVDA)

Unnamed: 0,open,high,low,close,volume,dividends,stock_splits
count,5230.0,5230.0,5230.0,5230.0,5230.0,5231.0,5231.0
mean,37.803795,38.416831,37.142735,37.792016,15988840.0,0.000624,0.001816
std,61.585468,62.441628,60.608007,61.541794,11606880.0,0.008819,0.059044
min,1.286413,1.310413,1.228813,1.257613,492000.0,0.0,0.0
25%,8.531796,8.765273,8.294485,8.556375,8811600.0,0.0,0.0
50%,13.647702,13.861773,13.409417,13.649036,13381600.0,0.0,0.0
75%,21.759197,22.172385,21.412041,21.838721,19692600.0,0.0,0.0
max,288.118825,291.544545,284.394331,288.158661,230771400.0,0.16,2.0


Timestamp('1999-01-22 00:00:00')

Timestamp('2019-11-01 00:00:00')

-------------------------


# Definición del escenario de inversión:

- Se dispone de capital inicial para comprar una única acción y siempre se opera "all-in"
- Sólo se puede operar con un único activo
- 6 meses de horizonte (01/04 al 30/09)
- Se trabajará con el precio de cierre como referencia
- No existen costos o fees por operación

### Filtro para backtesting

In [4]:
backtesting_min = pd.datetime(2019, 4, 1)
backtesting_max = pd.datetime(2019, 9, 30)
test_filter = (df_NVDA.index >= backtesting_min) & (df_NVDA.index <= backtesting_max)

### Cálculo de rendimiento

In [5]:
def get_pnl(df):
    return df[df.order == 'sell'].close.sum() - df[df.order == 'buy'].close.sum()

------------------

# 1. Estrategia "buy and hold"

In [6]:
def buy_and_hold(df):
    df['order'] = 'hold'
    
    # Enviar opción de compra el día 1
    df.iloc[0, df.columns.get_loc('order')] = 'buy'
    
    # Enviar opción de venta el último día
    df.iloc[-1, df.columns.get_loc('order')] = 'sell'
    
    return df

In [7]:
df_NVDA_bah = buy_and_hold(df_NVDA[test_filter].copy())
pnl_bah = get_pnl(df_NVDA_bah)
pnl_bah

-7.822113037109375

In [8]:
df_NVDA_bah[df_NVDA_bah.order != 'hold']

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock_splits,order
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2019-04-01,182.870031,183.109526,179.986181,181.89212,12095600.0,0.0,0.0,buy
2019-09-30,172.800003,174.479996,171.539993,174.070007,4654600.0,0.0,0.0,sell


In [9]:
show_strategy_actions(df_NVDA_bah)

------------------

# 2. Estrategia TA (medias móviles)

In [10]:
df_NVDA_with_ti = add_technical_indicators(df_NVDA.copy())

In [11]:
df_NVDA_with_ti.sample(2)

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock_splits,ti_macd,ti_macd_diff,ti_macd_dm1,...,ti_rsi_oversold,ti_cmf,ti_cmf_dm1,ti_cmf_dm2,ti_vpi,ti_bb,ti_atr,ti_ic,ti_mfi,ti_cr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-08-23,2.035221,2.035221,1.872019,1.93922,12699600.0,0.0,0.0,0.111494,1.827726,0.105751,...,False,0.281796,0.307199,0.290394,-849241.026471,0.0,0.109016,1.855219,79.504389,-6.23705
2018-10-19,240.756294,241.543023,226.754669,228.218567,15340200.0,0.0,0.0,-9.085014,237.30358,-7.904408,...,False,-0.18975,-0.166595,-0.139714,-853744.749079,0.0,0.0,253.174527,30.850912,-4.421464


In [12]:
def moving_average_convergence_divergence(df):
    open_position = False
    
    df['order'] = 'hold'
    
    for _ix, (index, row) in enumerate(df.iloc[3:].iterrows()):
        # MACD > 0, señal de compra.
        if((df.iloc[_ix - 3].ti_macd < 0) & (df.iloc[_ix - 2].ti_macd < 0) & 
           (df.iloc[_ix - 1].ti_macd > 0) & (df.iloc[_ix].ti_macd > 0) & 
           (not open_position)):
            open_position = True
            df.loc[index, 'order'] = 'buy'
        
        # MACD < 0, señal de compra.
        if((df.iloc[_ix - 3].ti_macd > 0) & (df.iloc[_ix - 2].ti_macd > 0) & 
           (df.iloc[_ix - 1].ti_macd < 0) & (df.iloc[_ix].ti_macd < 0) & 
           (open_position)):
            open_position = False
            df.loc[index, 'order'] = 'sell'
    
    if(open_position):
        df.iloc[-1, df.columns.get_loc('order')] = 'sell'
        
    return df

In [13]:
df_NVDA_macd = moving_average_convergence_divergence(df_NVDA_with_ti[test_filter].copy())
pnl_macd = get_pnl(df_NVDA_macd)
pnl_macd

1.1411895751953125

In [14]:
df_NVDA_macd[df_NVDA_macd.order != 'hold']

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock_splits,ti_macd,ti_macd_diff,ti_macd_dm1,...,ti_cmf,ti_cmf_dm1,ti_cmf_dm2,ti_vpi,ti_bb,ti_atr,ti_ic,ti_mfi,ti_cr,order
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-03,163.008633,163.238401,160.18143,162.58905,6165900.0,0.0,0.0,2.576859,160.012192,2.337754,...,-0.114359,-0.161624,-0.048793,-217478.483333,0.0,0.0,157.648948,69.892719,0.32002,buy
2019-08-12,151.929614,154.027528,150.421113,151.300232,6980000.0,0.0,0.0,-2.491427,153.791659,-1.894627,...,0.089075,0.099399,0.043065,-343894.801764,0.0,0.0,162.566576,31.232052,-1.786518,sell
2019-08-27,166.824869,166.934761,160.461158,161.639999,7274200.0,0.0,0.0,0.663825,160.976174,0.869555,...,-0.057091,-0.02042,-0.006572,-13066.89729,0.0,0.0,161.620011,56.326262,-2.230795,buy
2019-09-30,172.800003,174.479996,171.539993,174.070007,4654600.0,0.0,0.0,1.724645,172.345362,2.018918,...,0.055966,0.005328,0.043525,-222202.183872,0.0,0.0,174.535,47.876541,1.335944,sell


In [15]:
show_strategy_actions(df_NVDA_macd)

------------------

# 3. Estrategia basada en ML

In [16]:
from sklearn.model_selection import TimeSeriesSplit

### Calculamos la variable que vamos a querer predecir

In [17]:
def add_binary_target(df, days_window=-5, price_threshold=5):
    df["target"] = 0
    df["price_diff"] = (df.close.shift(days_window) - df.close).fillna(0)
    # Si el precio va a aumentar más de un valor determinado, comprar (1).
    df.loc[df["price_diff"] > price_threshold, 'target'] = 1
    # Si el precio va a bajar más de un valor determinado, vender (-1).
    df.loc[df["price_diff"] < -price_threshold, 'target'] = -1
    
    return df

In [19]:
df_NVDA_with_ti_and_target = add_binary_target(df_NVDA_with_ti.copy())
df_NVDA_with_ti_and_target.tail(8)[['close', 'target', 'price_diff']]

Unnamed: 0_level_0,close,target,price_diff
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-23,195.089996,1,7.910004
2019-10-24,196.860001,0,4.160004
2019-10-25,204.539993,0,-2.119995
2019-10-28,206.789993,0,0.0
2019-10-29,202.919998,0,0.0
2019-10-30,203.0,0,0.0
2019-10-31,201.020004,0,0.0
2019-11-01,202.419998,0,0.0


In [20]:
del df_NVDA_with_ti_and_target['price_diff']

### Mostramos la cantidad de ejemplos en cada clase

In [21]:
df_NVDA_with_ti_and_target.target.value_counts()

 0    4756
 1     278
-1     197
Name: target, dtype: int64

### Desarrollo estrategia y generación de predicciones

In [22]:
def model_based(df, prob_threshold=0.9):
    open_position = False
    
    df['order'] = 'hold'
    
    for _ix, (index, row) in enumerate(df.iterrows()):
        # Si el modelo nos da una probabilidad de compra fuerte, "comprar".
        if(row.pred_proba_buy >= prob_threshold) & (open_position == False):
            open_position = True
            df.loc[index, 'order'] = 'buy'
        else:
            # Si el modelo nos da una probabilidad de venta fuerte, "vender".
            if(row.pred_proba_sell >= prob_threshold) & (open_position == True):
                open_position = False
                df.loc[index, 'order'] = 'sell'
        
    if(open_position):
        df.iloc[-1, df.columns.get_loc('order')] = 'sell'
    
    return df

In [23]:
def add_model_predictions(df, model, target_column_name='target', use_scaler=False, max_train_size=1000):
    tscv = TimeSeriesSplit(n_splits=int(max_train_size/5), max_train_size=max_train_size)
    df_experimentation = df.copy()
    
    X = df_experimentation[[c for c in df_experimentation.columns if c != target_column_name]]
    y = df_experimentation[target_column_name]
    
    df['pred_proba_sell'] = 0
    df['pred_proba_buy'] = 0
    
    for _ix, (train_index, test_index) in enumerate(tscv.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        if(y_train.nunique() > 1):       
            if(use_scaler):
                from sklearn.preprocessing import StandardScaler
                scaler = StandardScaler()
                scaler.fit(X_train)
                X_train = scaler.transform(X_train)
                X_test = scaler.transform(X_test)

            model.fit(X_train, y_train)

            y_pred = model.predict_proba(X_test)

            sell_class_index = np.where(-1 == model.classes_)[0]

            if(len(sell_class_index) > 0):
                df.iloc[test_index, df.columns.get_loc('pred_proba_sell')] = y_pred[:, sell_class_index]

            buy_class_index = np.where(1 == model.classes_)[0]

            if(len(buy_class_index) > 0):
                df.iloc[test_index, df.columns.get_loc('pred_proba_buy')] = y_pred[:, buy_class_index]
    
    return df

In [41]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=5)
df_with_preds = add_model_predictions(df_NVDA_with_ti_and_target.tail(3000).copy(), 
                                      model,
                                      max_train_size=2500)
df_with_preds.tail(3)

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock_splits,ti_macd,ti_macd_diff,ti_macd_dm1,...,ti_cmf_dm2,ti_vpi,ti_bb,ti_atr,ti_ic,ti_mfi,ti_cr,target,pred_proba_sell,pred_proba_buy
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-10-30,204.0,204.229996,200.259995,203.0,5139900.0,0.0,0.0,6.932818,196.067182,6.939527,...,-0.067875,-96234.66491,0.0,0.0,193.654999,68.18348,0.039418,0,0.376013,0.519015
2019-10-31,202.160004,203.0,197.809998,201.020004,5239600.0,0.0,0.0,6.690607,194.329398,6.932818,...,-0.038742,-49078.926528,0.0,0.0,194.537498,61.743817,-0.980155,0,0.34716,0.550026
2019-11-01,199.600006,203.889999,198.610001,202.419998,2934750.0,0.0,0.0,6.536274,195.883724,6.690607,...,-0.056895,-30666.42635,0.0,0.0,194.537498,65.539932,0.694031,0,0.271752,0.596937


In [59]:
filter_ = (df_with_preds.index >= pd.datetime(2019, 4, 1)) & (df_with_preds.index <= pd.datetime(2019, 9, 30))
df_with_strategy = model_based(df_with_preds[filter_].copy(), prob_threshold=0.4)
df_with_strategy[df_with_strategy.order != 'hold'][['close', 'order']]

Unnamed: 0_level_0,close,order
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-04-01,181.892120,buy
2019-04-24,190.763214,sell
2019-04-25,186.512268,buy
2019-04-26,177.711029,sell
2019-04-29,178.948410,buy
...,...,...
2019-09-24,172.529999,sell
2019-09-25,178.229996,buy
2019-09-26,177.339996,sell
2019-09-27,171.759995,buy


In [60]:
pnl_ml = get_pnl(df_with_strategy)
pnl_ml

15.059860229492188

In [61]:
show_strategy_actions(df_with_strategy)

### Análisis por variable (último modelo)

In [62]:
plot_features_relevance(df_NVDA_with_ti.columns.values, model.feature_importances_)

## Calibración de probabilidades

In [63]:
hist_values, edges = np.histogram(df_with_preds[filter_].pred_proba_buy, bins=50)
p = figure(plot_width=650, plot_height=350, title="Probabilidades de comprar")
p.quad(top=hist_values, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="green", line_color="white", alpha=0.5)
show(p)

In [64]:
hist_values, edges = np.histogram(df_with_preds[filter_].pred_proba_sell, bins=50)
p = figure(plot_width=650, plot_height=350, title="Probabilidades de vender")
p.quad(top=hist_values, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="red", line_color="white", alpha=0.5)
show(p)

# Estabilidad de resultados

### Repetimos el experimento para 100 iteraciones

In [34]:
filter_ = (df_with_preds.index >= pd.datetime(2019, 4, 1)) & (df_with_preds.index <= pd.datetime(2019, 9, 30))

In [None]:
pnls = []

for i in range(100):
    model = RandomForestClassifier(class_weight='balanced', n_estimators=100)
    df_with_preds = add_model_predictions(df_NVDA_with_ti_and_target.tail(3000).copy(),
                                          model=model,
                                          max_train_size=2500)
    df_with_strategy = model_based(df_with_preds[filter_].copy(), prob_threshold=0.5)
    pnls.append(get_pnl(df_with_strategy))

### Obtentemos el histograma con el PNL de cada iteración

In [37]:
hist_values, edges = np.histogram(pnls, density=False, bins=25)
p = figure(plot_width=650, plot_height=350)
        
p.quad(top=hist_values, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="navy", line_color="white", alpha=0.5)

show(p)

### Medimos resultados ponderados por frecuencias

In [38]:
edges_means = (edges[1:] + edges[:-1]) / 2
results = hist_values / 100 * edges_means
results_pos = results[edges_means > 0].sum()
results_neg = results[edges_means < 0].sum()

In [39]:
(results_neg, results_pos)

(-0.1589466064453125, 21.92019715576172)

In [40]:
results_pos / results_neg * (-1)

137.90918627321327

# Comparamos vs utilizar un único entrenamiento

In [112]:
def add_model_predictions_train_test_split(df, 
                                           model, 
                                           target_column_name='target', 
                                           use_scaler=False, 
                                           train_max_date=None):
    
    df_experimentation = df.copy()
    
    X = df_experimentation[[c for c in df_experimentation.columns if c != target_column_name]]
    y = df_experimentation[target_column_name]
    
    df['pred_proba_sell'] = 0
    df['pred_proba_buy'] = 0
    
    X_train, X_test = X[X.index <= train_max_date], X[X.index > train_max_date]
    y_train, y_test = y[X.index <= train_max_date], y[X.index > train_max_date]
       
    if(y_train.nunique() > 1):       
        if(use_scaler):
            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)

        model.fit(X_train, y_train)

        y_pred = model.predict_proba(X_test)

        sell_class_index = np.where(-1 == model.classes_)[0]

        if(len(sell_class_index) > 0):
            df.loc[X_test.index, 'pred_proba_sell'] = y_pred[:, sell_class_index]

        buy_class_index = np.where(1 == model.classes_)[0]

        if(len(buy_class_index) > 0):
            df.loc[X_test.index, 'pred_proba_buy'] = y_pred[:, buy_class_index]
    
    return df

### Volvemos a repetir el experimento para 100 iteraciones

In [113]:
pnls_train_test = []

for i in range(100):
    model = RandomForestClassifier(class_weight='balanced', n_estimators=10)
    df_with_preds = add_model_predictions_train_test_split(df_NVDA_with_ti_and_target.tail(1250).copy(), model, train_max_date='2019-03-31')
    df_with_strategy = model_based(df_with_preds[filter_].copy())
    pnls_train_test.append(get_pnl(df_with_strategy))

### Obtentemos el histograma con el PNL de cada iteración

In [114]:
hist_values, edges = np.histogram(pnls_train_test, density=False, bins=25)
p = figure(plot_width=650, plot_height=350)
        
p.quad(top=hist_values, bottom=0, left=edges[:-1], right=edges[1:],
       fill_color="blue", line_color="white", alpha=0.5)

show(p)

# Utilizando otras ventanas para el dataset de entrenamiento

In [129]:
model = RandomForestClassifier(class_weight='balanced', n_estimators=10)
pnls_time_windows = []

time_windows = [250, 500, 1500, 2000]

for time_window in time_windows:
    df_with_preds = add_model_predictions(df_NVDA_with_ti_and_target.tail(2500).copy(),
                                          model, 
                                          max_train_size=time_window)
    df_with_strategy = model_based(df_with_preds[(df_with_preds.index >= backtesting_min) & 
                                                 (df_with_preds.index <= backtesting_max)].copy())
    pnls_time_windows.append(get_pnl(df_with_strategy))

In [130]:
plot_time_windows_pnls(pnls_time_windows, time_windows)