# 0. Introduction

This notebook compares the performance of several supervised models, measured by training time and out-of-sample accuracy.

In [72]:
# If you'd like to install packages that are not installed by default, uncomment the last two lines of this cell and replace  with a list of your packages.
# This will ensure your notebook has all the dependencies and works everywhere
import sys
!{sys.executable} -m pip install xgboost sklearn pandas numpy seaborn scipy matplotlib yfinance pandas_ta 

Defaulting to user installation because normal site-packages is not writeable


In [73]:
# import libraries
import timeit
#
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as ta
import yfinance as yf
#
from datetime import datetime
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.ensemble import AdaBoostClassifier , GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier 
from xgboost import XGBClassifier

In [74]:
%%javascript
// Make the output area's `_should_scroll` hook always answer false
// (presumably so long outputs are never collapsed into a scroll box).
// The `IPython` global is checked for rather than assumed: on a front end that
// does not expose it, dereferencing it would raise a ReferenceError in this cell.
if (typeof IPython !== "undefined" && IPython.OutputArea) {
    IPython.OutputArea.prototype._should_scroll = function(lines) {
        return false;
    };
}

<IPython.core.display.Javascript object>

# 1. Dataset construction

To keep things simple, we use 6 technical indicators (each computed with multiple parameter values) and construct the target label as the direction (UP or DOWN) of the next day's return.

In [75]:
#Function to generate the dataset
def dataset(ticker, days, periods=range(14, 50)):
    """Build the feature/label table for one instrument.

    Downloads daily prices with yfinance, appends six pandas_ta indicators
    (CCI, RSI, STOCH, WILLR, ADX, MACD) for every value in ``periods`` and adds
    the next-day return together with its UP/DOWN label.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``yf.download``.
    days : int
        Number of calendar days of history to request, counted back from today.
    periods : iterable of int, optional
        Parameter values used for each indicator. Defaults to 14..49.

    Returns
    -------
    pandas.DataFrame
        ``Close``, the indicator columns, ``Return`` (next-day percentage change
        of ``Close``) and ``Label`` ("UP" when ``Return`` > 0, otherwise "DOWN").
        Rows containing any NaN are removed.

    Raises
    ------
    ValueError
        If the download returns no rows.
    """
    # Imported locally so the function does not depend on the notebook-level
    # alias `dt`, which a later cell rebinds to a results table.
    from datetime import timedelta

    # Define time to download data
    end_date = datetime.today()
    start_date = end_date - timedelta(days=days)
    #Download data with YFINANCE
    data = yf.download(
        ticker,
        start=start_date.strftime('%Y-%m-%d'),
        end=end_date.strftime('%Y-%m-%d'),
        progress=False,
    )
    if data is None or data.empty:
        raise ValueError(f"No price data downloaded for {ticker!r}")
    # NOTE(review): some yfinance releases appear to return two-level columns
    # (field, ticker) even for a single symbol; keep only the field level so
    # 'High'/'Low'/'Close' can be selected by name - confirm against the
    # installed version.
    if isinstance(data.columns, pd.MultiIndex):
        data = data.copy()
        data.columns = data.columns.get_level_values(0)
    # dataframe creation with the Technical Indicators; an explicit copy so that
    # appending columns never writes into `data`
    df = data[['High', 'Low', 'Close']].copy()
    for n in periods:
        #CCI - Commodity Channel Index
        df.ta.cci(length=n, append=True)
        #RSI - Relative Strength Index
        df.ta.rsi(length=n, append=True)
        #STOCH - Stochastic
        df.ta.stoch(k=n, append=True)
        #WILLR - Williams' %R
        df.ta.willr(length=n, append=True)
        #ADX
        df.ta.adx(length=n, append=True)
        #MACD - Moving Average Convergence/Divergence
        df.ta.macd(slow=n, append=True)
    # 1-day return, shifted back one row so each row holds the NEXT day's return
    df["Return"] = df["Close"].pct_change().shift(-1)
    # Target variable label (a return of exactly zero counts as DOWN)
    df["Label"] = np.where(df["Return"]>0,"UP","DOWN")
    # drop rows with any NaN; this includes the last row, whose next-day return
    # does not exist yet
    df = df.dropna()
    df = df.drop(columns=['High', 'Low'])
    return df

# 2. Out-of-sample (OOS) & In-sample (IS) datasets

We assume that 1 year corresponds with 250 observations.

We keep the last year as an out of sample period (OOS) and we define multiple in sample periods (IS).

We use EURUSD as the instrument, since it suffers less from the bullish bias seen in the stock market.

In [76]:
# Symbol of the instrument under study, passed on to yf.download by dataset()
instrument = "EURUSD=X"
# Request 4000 calendar days of history counted back from today
data = dataset(instrument, 4000)
# Last expression of the cell: display the resulting table
data

Unnamed: 0_level_0,Close,CCI_14_0.015,RSI_14,STOCHk_14_3_3,STOCHd_14_3_3,WILLR_14,ADX_14,DMP_14,DMN_14,MACD_12_14_9,...,STOCHd_49_3_3,WILLR_49,ADX_49,DMP_49,DMN_49,MACD_12_49_9,MACDh_12_49_9,MACDs_12_49_9,Return,Label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-04-27,1.319087,98.478489,51.038945,81.712149,80.730070,-28.649670,19.276641,17.866089,21.199062,0.000062,...,41.295740,-60.355674,49.622792,15.343039,36.281616,0.000703,0.000773,-0.000071,0.003840,UP
2012-04-30,1.324152,127.422385,54.601184,83.716661,82.018227,-10.017628,18.507741,16.972117,20.138317,0.000193,...,43.336723,-50.003167,49.335275,15.165505,35.861803,0.001449,0.001216,0.000233,-0.000132,DOWN
2012-05-01,1.323977,115.498837,54.453574,82.281899,82.570236,-14.487005,17.553797,17.214907,19.094008,0.000282,...,45.116149,-50.361327,49.037676,15.242374,35.436954,0.002027,0.001435,0.000592,0.000000,DOWN
2012-05-02,1.323977,50.088132,54.453574,87.002787,84.333783,-14.487005,17.581050,15.894581,22.841856,0.000341,...,47.377939,-50.361327,48.796498,14.973029,36.011659,0.002484,0.001514,0.000970,-0.006968,DOWN
2012-05-03,1.314752,-42.210902,46.738667,74.685202,81.323296,-46.970384,17.932568,14.397787,22.755423,0.000189,...,46.512672,-69.217312,48.582694,14.640618,35.698904,0.001789,0.000655,0.001134,0.000658,UP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-18,1.036173,85.819574,67.410973,86.033421,85.577773,-15.284101,25.304350,26.385322,9.847386,0.002756,...,88.536146,-12.161970,14.541093,20.520709,15.585239,0.017970,0.008588,0.009382,-0.002975,DOWN
2022-11-21,1.033090,54.709338,65.119551,84.825723,84.997108,-19.426472,25.407215,24.611691,14.225020,0.002673,...,88.061795,-15.458166,14.424605,20.090752,16.829510,0.018830,0.007558,0.011272,-0.008248,DOWN
2022-11-22,1.024569,35.254031,59.136547,78.138112,82.999085,-30.875092,25.502733,23.506028,13.585971,0.002382,...,86.471915,-24.568141,14.310495,19.815362,16.598823,0.018442,0.005737,0.012706,0.006383,UP
2022-11-23,1.031109,54.579860,62.020620,75.695111,79.552982,-22.613104,26.221633,26.581242,12.633576,0.002255,...,83.776138,-17.576438,14.270861,20.830194,16.244630,0.018758,0.004842,0.013916,0.010198,UP


In [77]:
# Number of observations treated as one year
OBS_PER_YEAR = 250

# OOS period: the most recent year of data
oos = data.tail(OBS_PER_YEAR)

#IS period, remove oos data
in_sample = data.drop(oos.index)


def _is_window(years):
    """Return the most recent ``years`` years of the in-sample data.

    ``DataFrame.tail`` silently returns fewer rows when more are requested than
    exist, so a warning is emitted when the window cannot be filled; otherwise
    the longest windows would hold the same rows under different "years" labels.
    """
    import warnings

    wanted = years * OBS_PER_YEAR
    available = len(in_sample)
    if wanted > available:
        warnings.warn(
            f"{years}-year in-sample window needs {wanted} rows but only "
            f"{available} are available; the window is truncated.",
            stacklevel=2,
        )
    return in_sample.tail(wanted)


# 1-year to 12-year IS windows, each ending right before the OOS period
(is_1A, is_2A, is_3A, is_4A, is_5A, is_6A,
 is_7A, is_8A, is_9A, is_10A, is_11A, is_12A) = (
    _is_window(years) for years in range(1, 13)
)

# 3. Function to generate the OOS accuracy Score for all different models

In [78]:
def _resolve_model(model):
  """Turn a model name into a fresh estimator; non-string inputs are returned unchanged."""
  if not isinstance(model, str):
    return model

  def _svc():
    # Imported here instead of going through the notebook-level name `svm`,
    # which a later cell rebinds to a results table.
    from sklearn.svm import SVC
    return SVC()

  # Factories are lazy so that only the requested estimator is constructed.
  factories = {
    "Decision Tree": lambda: DecisionTreeClassifier(),
    "Random Forest": lambda: RandomForestClassifier(),
    "SVM": _svc,
    "KNN": lambda: KNeighborsClassifier(),
    "XGBoost": lambda: XGBClassifier(),
    "AdaBoost": lambda: AdaBoostClassifier(),
    "Gaussian Naive Bayes": lambda: GaussianNB(),
    "Bernoulli Naive Bayes": lambda: BernoulliNB(),
  }
  if model not in factories:
    raise ValueError(
      f"Unknown model name {model!r}; expected one of {sorted(factories)}"
    )
  return factories[model]()


def _features_and_labels(frame):
  """Split a dataset frame into its feature columns and its raw ``Label`` column."""
  # Select by name rather than by position: a positional slice such as
  # iloc[:, :-3] also throws away the last indicator column, and would let
  # `Return` leak into the features if the column order ever changed.
  features = frame.drop(columns=["Return", "Label"], errors="ignore")
  return features, frame["Label"]


def score(model, in_sample, out_of_sample, n_runs=10):
  """Train a model in-sample and measure its out-of-sample accuracy.

  Parameters
  ----------
  model : str or estimator
      One of the supported model names ("Decision Tree", "Random Forest", "SVM",
      "KNN", "XGBoost", "AdaBoost", "Gaussian Naive Bayes",
      "Bernoulli Naive Bayes"), or an object exposing ``fit`` and ``predict``.
  in_sample : pandas.DataFrame
      Training data; must contain a ``Label`` column.
  out_of_sample : pandas.DataFrame
      Evaluation data with the same columns as ``in_sample``.
  n_runs : int, optional
      Number of fit/evaluate repetitions that are averaged. Defaults to 10.

  Returns
  -------
  list
      ``[mean accuracy, mean fit time in seconds]``, both rounded to 2 decimals.

  Raises
  ------
  ValueError
      If ``model`` is a string that is not a supported model name.
  """
  model = _resolve_model(model)

  #IS features and target label
  X, labels = _features_and_labels(in_sample)
  #OOS features and target label (taken from the argument, not from a global)
  X_oos, labels_oos = _features_and_labels(out_of_sample)

  le = LabelEncoder()
  Y = le.fit_transform(labels)
  # Reuse the encoding learned in-sample; re-fitting on the OOS labels could map
  # the same label to a different integer when the label sets differ.
  Y_oos = le.transform(labels_oos)

  # store OOS score and timing of the training
  scores = []
  times = []
  for _ in range(n_runs):
    # training
    start = timeit.default_timer()
    model.fit(X, Y)
    elapsed = timeit.default_timer() - start
    #OOS ACCURACY
    scores.append(accuracy_score(Y_oos, model.predict(X_oos)))
    times.append(elapsed)

  # Return the average result over the repeated runs
  return [round(np.mean(scores),2), round(np.mean(times),2)]

# 4. Function to construct a results table for each model

In [79]:
# Model names to evaluate; each has to match a name that score() recognises exactly
models = ["Decision Tree", "Random Forest", "SVM", "KNN", "XGBoost", "AdaBoost", "Gaussian Naive Bayes", "Bernoulli Naive Bayes"]
# In-sample windows ordered from 1 to 12 years; tabla_scores() labels each one by its position
in_samples = [is_1A, is_2A, is_3A, is_4A, is_5A, is_6A, is_7A, is_8A, is_9A, is_10A, is_11A, is_12A]

def tabla_scores(model, in_samples, oos):
  """Score one model on every in-sample window and tabulate the results.

  The window at position ``i`` of ``in_samples`` is labelled as ``i + 1`` years.
  Returns a DataFrame indexed by "years in sample" with the columns
  "accuracy oos" and "training time", filled from what ``score`` returns.
  """
  # One [accuracy, time] pair per window, in the order the windows were given
  outcomes = [score(model, window, oos) for window in in_samples]

  table = pd.DataFrame({
    "years in sample": list(range(1, len(outcomes) + 1)),
    "accuracy oos": [pair[0] for pair in outcomes],
    "training time": [pair[1] for pair in outcomes],
  })
  return table.set_index("years in sample")


# 5. Display results for each model

In [80]:
# Decision Tree: score every in-sample window and display the table.
# NOTE(review): this rebinds `dt`, which the imports cell bound to the `datetime`
# module; code that uses `dt` as the module will break once this cell has run.
dt = tabla_scores("Decision Tree", in_samples, oos)
dt

NameError: name 'le' is not defined

In [None]:
# Random Forest: score every in-sample window and display the table.
rf = tabla_scores("Random Forest", in_samples, oos)
rf

In [None]:
# SVM: score every in-sample window and display the table.
# NOTE(review): this rebinds `svm`, which the imports cell bound to the
# `sklearn.svm` module; code that uses `svm` as the module will break once this
# cell has run.
svm = tabla_scores("SVM", in_samples, oos)
svm

In [None]:
# KNN: score every in-sample window and display the table.
knn = tabla_scores("KNN", in_samples, oos)
knn

In [None]:
# XGBoost: score every in-sample window and display the table.
xgb = tabla_scores("XGBoost", in_samples, oos)
xgb

In [None]:
# AdaBoost: score every in-sample window and display the table.
ab = tabla_scores("AdaBoost", in_samples, oos)
ab

In [None]:
# Gaussian Naive Bayes: score every in-sample window and display the table.
gnb = tabla_scores("Gaussian Naive Bayes", in_samples, oos)
gnb

In [None]:
# Bernoulli Naive Bayes: score every in-sample window and display the table.
bnb = tabla_scores("Bernoulli Naive Bayes", in_samples, oos)
bnb

# 6. Display results altogether

In [None]:
# Gather the out-of-sample accuracy of every model into a single table:
# one column per model, indexed like the per-model tables.
accuracy_by_model = {
    "Decision Tree": dt,
    "Random Forest": rf,
    "SVM": svm,
    "KNN": knn,
    "XGBoost": xgb,
    "AdaBoost": ab,
    "Gaussian Naive Bayes": gnb,
    "Bernoulli Naive Bayes": bnb,
}
results = pd.DataFrame(index=dt.index)
for model_name, table in accuracy_by_model.items():
    results[model_name] = table["accuracy oos"]

results

In [None]:
# Plot the out-of-sample accuracy of every model against the in-sample length.
# The table is called `results`; the name `resultados` used before is not defined
# anywhere in the notebook and raised a NameError.
fig, ax = plt.subplots(figsize=(10, 5))
for model_name in results.columns:
    # Column names double as legend labels, so the two cannot drift apart
    ax.plot(results[model_name], label=model_name)
ax.legend(bbox_to_anchor=(1, 0.5))
ax.set_ylabel('% accuracy')
ax.set_xlabel('years of in-sample training')
ax.set_title(f'Out-of-Sample accuracy score, {instrument}')
plt.show()