In [229]:
import pandas as pd
import os


DATA_FILEPATH = "data/CAC40.csv"

def fetch_data():
    """Import the data from csv to pd dataframe"""
    relaviteFilepath = os.path.join(os.path.abspath(''), DATA_FILEPATH)
    return pd.read_csv(relaviteFilepath)

# df stands for dataframe. This is the object that we will manipulate throughouht the notebook
cac40df = fetch_data()
cac40df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,1832.000000,0
1,1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,1860.000000,0
2,1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,1874.000000,0
3,1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,1872.000000,0
4,1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,1880.000000,0
...,...,...,...,...,...,...,...
7999,2021-09-15,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200
8000,2021-09-16,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500
8001,2021-09-17,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500
8002,2021-09-20,6450.390137,6471.089844,6389.620117,6455.810059,6455.810059,120069400


# Build a Decision tree model.  determine  the  number  attributes  that  is  capable  of  giving the best prediction of ‘daily returns’.
* (a)  Find  the  best  tree  depth  for  the  model  with  the  selected attributes

# Preprocessing
We can remove the date col as it is useless for predictions

In [230]:
cac40df.drop(columns=['Date'], axis=1, inplace=True)

MA signal - we don't want them numeric, so we will signal with the MA50 and ma200 collide with the closing price as classes (0=clash, 1=below, 2=above

In [231]:
import numpy as np

def isColliding(row, wind):
    if row[f"maPosition{wind}"] == 0 or (np.sign(row[f"maPosition{maWindow}"]) != np.sign(row[f"maPositionPrev{wind}"])):
        return 0
    if row[f"maPosition{wind}"] < 0:
        return 1
    if row[f"maPosition{wind}"] > 0:
        return 2
    return np.NaN

maWindows = [50, 200]

cols = ["Close"]

for maWindow in maWindows:
    cols += [f"ma{maWindow}", f"maPosition{maWindow}", f"maPositionPrev{maWindow}", f"maSignal{maWindow}"]
    cac40df[f"ma{maWindow}"] = cac40df["Close"].rolling(maWindow).mean()
    cac40df[f"maPosition{maWindow}"] = cac40df["Close"] - cac40df[f"ma{maWindow}"]
    cac40df[f"maPositionPrev{maWindow}"] = cac40df[f"maPosition{maWindow}"].shift(-1)
    cac40df[f"maSignal{maWindow}"] = cac40df.apply(lambda row : isColliding(row, maWindow), axis = 1)

cac40df[cols]

Unnamed: 0,Close,ma50,maPosition50,maPositionPrev50,maSignal50,ma200,maPosition200,maPositionPrev200,maSignal200
0,1832.000000,,,,0,,,,0
1,1860.000000,,,,0,,,,0
2,1874.000000,,,,0,,,,0
3,1872.000000,,,,0,,,,0
4,1880.000000,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...
7999,6583.620117,6654.120215,-70.500098,-36.047568,1,6180.638843,402.981274,436.883801,2
8000,6622.589844,6658.637412,-36.047568,-89.262871,1,6185.706042,436.883801,379.499849,2
8001,6570.189941,6659.452812,-89.262871,-201.573955,1,6190.690093,379.499849,260.644265,2
8002,6455.810059,6657.384014,-201.573955,-104.539229,1,6195.165793,260.644265,352.534636,2


Drop the construction columns

In [232]:
for maWindow in maWindows:
    cols = [f"ma{maWindow}", f"maPosition{maWindow}", f"maPositionPrev{maWindow}"]
    cac40df.drop(columns=cols, axis=1, inplace=True)

cac40df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,maSignal50,maSignal200
0,1836.000000,1838.000000,1827.000000,1832.000000,1832.000000,0,0,0
1,1831.000000,1860.000000,1831.000000,1860.000000,1860.000000,0,0,0
2,1866.000000,1874.000000,1862.000000,1874.000000,1874.000000,0,0,0
3,1869.000000,1875.000000,1866.000000,1872.000000,1872.000000,0,0,0
4,1874.000000,1881.000000,1874.000000,1880.000000,1880.000000,0,0,0
...,...,...,...,...,...,...,...,...
7999,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200,1,2
8000,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500,1,2
8001,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500,1,2
8002,6450.390137,6471.089844,6389.620117,6455.810059,6455.810059,120069400,1,2


Now we have the signal for as classes instead of the moving average. Similarly, we add the Bollinger Bands signals

In [233]:
x = np.linspace(0, len(cac40df), len(cac40df))

meanPrice = cac40df["Close"].mean()
threshold = meanPrice * 0.0005

maWindows = 200
# calculate mean rolling
mean = cac40df["Close"].rolling(maWindow).mean()
# calculate std rolling
std = cac40df["Close"].rolling(maWindow).std()
# calculate high and low band
cac40df["Boll_bands_high_200"] = mean + 2*std
cac40df["Boll_bands_low_200"] = mean - 2*std

cac40df["boll_low_signal"] = cac40df.apply(lambda row: 1 if abs(row["Close"] - row["Boll_bands_low_200"]) < threshold else 0, axis=1) 
cac40df["boll_high_signal"] = cac40df.apply(lambda row: 1 if abs(row["Boll_bands_high_200"] - row["Close"]) < threshold else 0, axis=1)

#Drop construction cols
cac40df.drop(columns=["Boll_bands_high_200", "Boll_bands_low_200"], axis=1, inplace=True)

print(f"Buy/sell signals : {len(cac40df[cac40df['boll_high_signal'] == 1])} / {len(cac40df[cac40df['boll_low_signal'] == 1])}")

Buy/sell signals : 42 / 13


Sochastic oscillator, rate=50

In [234]:
mw = 50
cac40df[f"min{mw}"] = cac40df["Close"].rolling(mw).min()
cac40df[f"max{mw}"] = cac40df["Close"].rolling(mw).max()
cac40df["Stochastic"] = 100 * ((cac40df["Close"] - cac40df[f"min{mw}"]) / (cac40df[f"max{mw}"] - cac40df[f"min{mw}"]))
#Drop construction cols
cac40df.drop(columns=[f"min{mw}", f"max{mw}"], axis=1, inplace=True)


Now we add the labels, also as a class

In [235]:
from math import isnan

def binaryPrediction(number):
    #Up
    if isnan(number):
        return float('NaN')
    if number > 0:
        return 1 #Up
    return 2 #Down

#Daily return on closing price
cac40df["Daily_return"] = (cac40df["Close"] - cac40df["Close"].shift(1)) / cac40df["Close"].shift(1)

#Label
cac40df["Daily_return"] = cac40df["Daily_return"].apply(binaryPrediction)
#Shift to have next day prediction
cac40df["Daily_return"] = cac40df["Daily_return"].shift(-1)

#Drop all the cols with Nana (MA, shifts)
cac40df.dropna(inplace=True)
cac40df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,maSignal50,maSignal200,boll_low_signal,boll_high_signal,Stochastic,Daily_return
49,2088.000000,2088.000000,2064.000000,2070.000000,2070.000000,0,2,0,0,0,80.134680,1.0
50,2068.000000,2080.000000,2057.000000,2071.000000,2071.000000,0,2,0,0,0,78.438662,1.0
51,2069.000000,2105.000000,2069.000000,2104.000000,2104.000000,0,2,0,0,0,90.272374,2.0
52,2110.000000,2120.000000,2096.000000,2100.000000,2100.000000,0,2,0,0,0,88.715953,2.0
53,2083.000000,2094.000000,2075.000000,2092.000000,2092.000000,0,2,0,0,0,85.140562,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
7998,6672.189941,6677.069824,6613.520020,6652.970215,6652.970215,63626300,1,2,0,0,59.493077,2.0
7999,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200,1,2,0,0,47.936072,1.0
8000,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500,1,2,0,0,54.430271,2.0
8001,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500,1,2,0,0,45.697970,2.0


Now we apply the same treatment as the Titanic to the data - once with only the signals, and once with every feature, to see if the tree can manage to predict something :-)

In [236]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree

* First with a restricted number of attrbibutes (signals)

In [237]:
X,y = cac40df[['maSignal50', 'maSignal200', 'boll_low_signal', 'boll_high_signal', 'Stochastic']], cac40df["Daily_return"]

treeClf = tree.DecisionTreeClassifier()

#GridSearch will apply cross-validation and test every possible parameter to determine the best combination. The max depth is our main target.
Depths = range(1,15)
parameters = {
    "criterion": ["gini", "entropy"],
    "max_depth": Depths,
}
gridTreeClf = GridSearchCV(treeClf, parameters, n_jobs=-1, return_train_score=True)
 
gridTreeClf.fit(X,y)

print(f"Best parameter found : {gridTreeClf.best_params_}, score: {round(100*gridTreeClf.best_score_, 1)}%")

print(f"Mean test/train scored :")
for i in Depths:
 print(f"Max depth {i}, test={round(100*gridTreeClf.cv_results_['mean_test_score'][i], 1)}%, train={round(100*gridTreeClf.cv_results_['mean_train_score'][i], 1)}%")

Best parameter found : {'criterion': 'entropy', 'max_depth': 7}, score: 56.6%
Mean test/train scored :
Max depth 1, test=53.4%, train=53.9%
Max depth 2, test=55.3%, train=56.7%
Max depth 3, test=54.8%, train=57.5%
Max depth 4, test=56.5%, train=58.7%
Max depth 5, test=56.1%, train=59.3%
Max depth 6, test=56.4%, train=60.1%
Max depth 7, test=56.1%, train=60.9%
Max depth 8, test=56.0%, train=62.0%
Max depth 9, test=55.7%, train=63.0%
Max depth 10, test=55.9%, train=64.3%
Max depth 11, test=55.5%, train=65.5%
Max depth 12, test=54.9%, train=66.8%
Max depth 13, test=55.0%, train=68.1%
Max depth 14, test=53.9%, train=53.9%


* Now with all the attributes, even if they are not restricted into a class for the close/open/etc to test if the signal are relevant

In [238]:
X,y = cac40df.drop(columns=['Daily_return'], axis=1, inplace=False), cac40df["Daily_return"]

treeClf = tree.DecisionTreeClassifier()

#GridSearch will apply cross-validation and test every possible parameter to determine the best combination. The max depth is our main target.
Depths = range(1,15)
parameters = {
    "criterion": ["gini", "entropy"],
    "max_depth": Depths,
}
gridTreeClf = GridSearchCV(treeClf, parameters, n_jobs=-1, return_train_score=True)
 
gridTreeClf.fit(X,y)

print(f"Best parameter found : {gridTreeClf.best_params_}, score: {round(100*gridTreeClf.best_score_, 1)}%")

print(f"Mean test/train scored :")
for i in Depths:
 print(f"Max depth {i}, test={round(100*gridTreeClf.cv_results_['mean_test_score'][i], 1)}%, train={round(100*gridTreeClf.cv_results_['mean_train_score'][i], 1)}%")

Best parameter found : {'criterion': 'gini', 'max_depth': 11}, score: 54.4%
Mean test/train scored :
Max depth 1, test=52.4%, train=54.0%
Max depth 2, test=53.7%, train=56.4%
Max depth 3, test=53.0%, train=57.3%
Max depth 4, test=54.0%, train=58.8%
Max depth 5, test=53.4%, train=59.8%
Max depth 6, test=53.3%, train=60.9%
Max depth 7, test=53.5%, train=62.3%
Max depth 8, test=54.2%, train=63.8%
Max depth 9, test=54.2%, train=65.5%
Max depth 10, test=54.4%, train=67.4%
Max depth 11, test=53.1%, train=69.9%
Max depth 12, test=52.9%, train=71.9%
Max depth 13, test=52.8%, train=74.0%
Max depth 14, test=52.7%, train=53.6%


Test without any signal to once again verify our work is fruitfull

In [239]:
cac40df.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'maSignal50',
       'maSignal200', 'boll_low_signal', 'boll_high_signal', 'Stochastic',
       'Daily_return'],
      dtype='object')

In [240]:
X,y = cac40df.drop(columns=['Daily_return', 'maSignal50', 'maSignal200', 'boll_low_signal', 'boll_high_signal'], axis=1, inplace=False), cac40df["Daily_return"]

treeClf = tree.DecisionTreeClassifier()

#GridSearch will apply cross-validation and test every possible parameter to determine the best combination. The max depth is our main target.
Depths = range(1,15)
parameters = {
    "criterion": ["gini", "entropy"],
    "max_depth": Depths,
}
gridTreeClf = GridSearchCV(treeClf, parameters, n_jobs=-1, return_train_score=True)
 
gridTreeClf.fit(X,y)

print(f"Best parameter found : {gridTreeClf.best_params_}, score: {round(100*gridTreeClf.best_score_, 1)}%")

print(f"Mean test/train scored :")
for i in Depths:
 print(f"Max depth {i}, test={round(100*gridTreeClf.cv_results_['mean_test_score'][i], 1)}%, train={round(100*gridTreeClf.cv_results_['mean_train_score'][i], 1)}%")

Best parameter found : {'criterion': 'gini', 'max_depth': 4}, score: 50.9%
Mean test/train scored :
Max depth 1, test=50.5%, train=52.6%
Max depth 2, test=50.5%, train=52.8%
Max depth 3, test=50.9%, train=53.2%
Max depth 4, test=50.9%, train=53.7%
Max depth 5, test=50.8%, train=54.2%
Max depth 6, test=50.5%, train=55.1%
Max depth 7, test=50.5%, train=56.0%
Max depth 8, test=50.7%, train=56.8%
Max depth 9, test=50.5%, train=57.9%
Max depth 10, test=50.2%, train=59.2%
Max depth 11, test=50.4%, train=60.5%
Max depth 12, test=49.9%, train=62.4%
Max depth 13, test=50.2%, train=63.7%
Max depth 14, test=50.5%, train=52.3%


We have the results:
* Signals -> accuracy=56.6% with a max depth of 7
* All -> accuracy of 54.4% with a max depth of 11
* Raw -> accuracy of 50.9% with a max depth of 4

The accuracy seems to be consistently best considering only the signal and not the raw attributes (open, close, etc):\
The best accuracy is achieved only taking the signals (stochastic oscillator, moving average 50/200 touching the closing price, boiller bands touching the closing price). Adding the raw attributes consistently reduces the accuracy to a max of 54.4%, and taking only the raw attributes yields an accuracy of 50.9%, which is very close to a random choice with our binary classification. The best depth is also lower when taking the signals in account. This could indicate a better resistance to overfitting as the pre-prunning is more aggressive than with all the parameters.