In [398]:
import pandas as pd
import os


DATA_FILEPATH = "data/CAC40.csv"

def fetch_data():
    """Import the data from csv to pd dataframe"""
    relaviteFilepath = os.path.join(os.path.abspath(''), DATA_FILEPATH)
    return pd.read_csv(relaviteFilepath)

# df stands for dataframe. This is the object that we will manipulate throughouht the notebook
cac40df = fetch_data()
cac40df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,1990-03-01,1836.000000,1838.000000,1827.000000,1832.000000,1832.000000,0
1,1990-03-02,1831.000000,1860.000000,1831.000000,1860.000000,1860.000000,0
2,1990-03-05,1866.000000,1874.000000,1862.000000,1874.000000,1874.000000,0
3,1990-03-06,1869.000000,1875.000000,1866.000000,1872.000000,1872.000000,0
4,1990-03-07,1874.000000,1881.000000,1874.000000,1880.000000,1880.000000,0
...,...,...,...,...,...,...,...
7999,2021-09-15,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200
8000,2021-09-16,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500
8001,2021-09-17,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500
8002,2021-09-20,6450.390137,6471.089844,6389.620117,6455.810059,6455.810059,120069400


# 2.  Build  Decision  tree  model  with  your  selected  stock  /  marketindex  

* (a)  Choose  Three  feature  selection  methods  to  evaluate  themodel
* (b)  Compare the feature selection methods and explain the dif-ferences observed;

# Methodology

We will keep the pre-processing from the past series and build a decision tree so we can use the three feature selection methods from the ex1.

# Preprocessing
We preprocess the data to have only clean signals, as is way better, just like s6 showed

In [399]:
cac40df.drop(columns=['Date'], axis=1, inplace=True)

MA signal - we don't want them numeric, so we will signal with the MA50 and ma200 collide with the closing price as classes (0=clash, 1=below, 2=above

In [400]:
import numpy as np

def isColliding(row, wind):
    if row[f"maPosition{wind}"] == 0 or (np.sign(row[f"maPosition{maWindow}"]) != np.sign(row[f"maPositionPrev{wind}"])):
        return 0
    if row[f"maPosition{wind}"] < 0:
        return 1
    if row[f"maPosition{wind}"] > 0:
        return 2
    return np.NaN

maWindows = [50, 200]

cols = ["Close"]

for maWindow in maWindows:
    cols += [f"ma{maWindow}", f"maPosition{maWindow}", f"maPositionPrev{maWindow}", f"maSignal{maWindow}"]
    cac40df[f"ma{maWindow}"] = cac40df["Close"].rolling(maWindow).mean()
    cac40df[f"maPosition{maWindow}"] = cac40df["Close"] - cac40df[f"ma{maWindow}"]
    cac40df[f"maPositionPrev{maWindow}"] = cac40df[f"maPosition{maWindow}"].shift(-1)
    cac40df[f"maSignal{maWindow}"] = cac40df.apply(lambda row : isColliding(row, maWindow), axis = 1)

cac40df[cols]

Unnamed: 0,Close,ma50,maPosition50,maPositionPrev50,maSignal50,ma200,maPosition200,maPositionPrev200,maSignal200
0,1832.000000,,,,0,,,,0
1,1860.000000,,,,0,,,,0
2,1874.000000,,,,0,,,,0
3,1872.000000,,,,0,,,,0
4,1880.000000,,,,0,,,,0
...,...,...,...,...,...,...,...,...,...
7999,6583.620117,6654.120215,-70.500098,-36.047568,1,6180.638843,402.981274,436.883801,2
8000,6622.589844,6658.637412,-36.047568,-89.262871,1,6185.706042,436.883801,379.499849,2
8001,6570.189941,6659.452812,-89.262871,-201.573955,1,6190.690093,379.499849,260.644265,2
8002,6455.810059,6657.384014,-201.573955,-104.539229,1,6195.165793,260.644265,352.534636,2


Drop the construction columns

In [401]:
for maWindow in maWindows:
    cols = [f"ma{maWindow}", f"maPosition{maWindow}", f"maPositionPrev{maWindow}"]
    cac40df.drop(columns=cols, axis=1, inplace=True)

cac40df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,maSignal50,maSignal200
0,1836.000000,1838.000000,1827.000000,1832.000000,1832.000000,0,0,0
1,1831.000000,1860.000000,1831.000000,1860.000000,1860.000000,0,0,0
2,1866.000000,1874.000000,1862.000000,1874.000000,1874.000000,0,0,0
3,1869.000000,1875.000000,1866.000000,1872.000000,1872.000000,0,0,0
4,1874.000000,1881.000000,1874.000000,1880.000000,1880.000000,0,0,0
...,...,...,...,...,...,...,...,...
7999,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200,1,2
8000,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500,1,2
8001,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500,1,2
8002,6450.390137,6471.089844,6389.620117,6455.810059,6455.810059,120069400,1,2


Now we have the signal for as classes instead of the moving average. Similarly, we add the Bollinger Bands signals

In [402]:
x = np.linspace(0, len(cac40df), len(cac40df))

meanPrice = cac40df["Close"].mean()
threshold = meanPrice * 0.0005

maWindows = 200
# calculate mean rolling
mean = cac40df["Close"].rolling(maWindow).mean()
# calculate std rolling
std = cac40df["Close"].rolling(maWindow).std()
# calculate high and low band
cac40df["Boll_bands_high_200"] = mean + 2*std
cac40df["Boll_bands_low_200"] = mean - 2*std

cac40df["boll_low_signal"] = cac40df.apply(lambda row: 1 if abs(row["Close"] - row["Boll_bands_low_200"]) < threshold else 0, axis=1) 
cac40df["boll_high_signal"] = cac40df.apply(lambda row: 1 if abs(row["Boll_bands_high_200"] - row["Close"]) < threshold else 0, axis=1)

#Drop construction cols
cac40df.drop(columns=["Boll_bands_high_200", "Boll_bands_low_200"], axis=1, inplace=True)

print(f"Buy/sell signals : {len(cac40df[cac40df['boll_high_signal'] == 1])} / {len(cac40df[cac40df['boll_low_signal'] == 1])}")

Buy/sell signals : 42 / 13


Sochastic oscillator, rate=50

In [403]:
mw = 50
cac40df[f"min{mw}"] = cac40df["Close"].rolling(mw).min()
cac40df[f"max{mw}"] = cac40df["Close"].rolling(mw).max()
cac40df["Stochastic"] = 100 * ((cac40df["Close"] - cac40df[f"min{mw}"]) / (cac40df[f"max{mw}"] - cac40df[f"min{mw}"]))
#Drop construction cols
cac40df.drop(columns=[f"min{mw}", f"max{mw}"], axis=1, inplace=True)


add some trend as the classifiers can not retain memory of the last value of the close

Now we add the labels, also as a class

In [404]:
from math import isnan

def binaryPrediction(number):
    #Up
    if isnan(number):
        return float('NaN')
    if number > 0:
        return 1 #Up
    return 2 #Down

#Daily return on closing price
cac40df["Daily_return"] = (cac40df["Close"] - cac40df["Close"].shift(1)) / cac40df["Close"].shift(1)

#Shift to have next day prediction
cac40df["prediction"] = cac40df["Daily_return"].shift(-1)
#Label
cac40df["prediction"] = cac40df["prediction"].apply(binaryPrediction)


#Drop all the cols with Nana (MA, shifts)
cac40df.dropna(inplace=True)
cac40df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,maSignal50,maSignal200,boll_low_signal,boll_high_signal,Stochastic,Daily_return,prediction
49,2088.000000,2088.000000,2064.000000,2070.000000,2070.000000,0,2,0,0,0,80.134680,-0.009569,1.0
50,2068.000000,2080.000000,2057.000000,2071.000000,2071.000000,0,2,0,0,0,78.438662,0.000483,1.0
51,2069.000000,2105.000000,2069.000000,2104.000000,2104.000000,0,2,0,0,0,90.272374,0.015934,2.0
52,2110.000000,2120.000000,2096.000000,2100.000000,2100.000000,0,2,0,0,0,88.715953,-0.001901,2.0
53,2083.000000,2094.000000,2075.000000,2092.000000,2092.000000,0,2,0,0,0,85.140562,-0.003810,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7998,6672.189941,6677.069824,6613.520020,6652.970215,6652.970215,63626300,1,2,0,0,59.493077,-0.003588,2.0
7999,6654.830078,6659.270020,6577.020020,6583.620117,6583.620117,81904200,1,2,0,0,47.936072,-0.010424,1.0
8000,6613.709961,6663.410156,6612.160156,6622.589844,6622.589844,79574500,1,2,0,0,54.430271,0.005919,2.0
8001,6679.450195,6697.080078,6551.620117,6570.189941,6570.189941,214025500,1,2,0,0,45.697970,-0.007912,2.0


And finally, import the usefull kfold functions

In [405]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def get_acc_metrics(Model, xTest, yTest):
    ''' Returns accuracy, precision, recall and f1-score of the given model '''
    yPred = Model.predict(xTest)

    acc = accuracy_score(yTest, yPred)
    prec = precision_score(yTest, yPred)
    recall = recall_score(yTest, yPred)
    f1 = f1_score(yTest, yPred)

    return [acc, prec, recall, f1]

In [406]:
from sklearn.model_selection import train_test_split
from sklearn.base import clone

def compare_models(model, x1 : pd.DataFrame, y1 : pd.Series, x2: pd.DataFrame, y2 : pd.Series) -> None:
    '''
    Train the model with the given [x1,y1] dataset and compare it to the model trained with the [x2,y2] dataset.
    '''

    metrics = []
    passes = ["original", "selected"]

    print(f"Performing model analysis")
    for X,y in [(x1,y1), (x2, y2)]:
        currentModel = clone(model)

        #Prepare basis train-test
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

        #Fit
        currentModel.fit(X_train, y_train)

        #Add metrics to list
        modelMetrics = get_acc_metrics(currentModel, X_test, y_test)

        print(f"Score on {passes.pop(0)} data : \n\tAccuracy = {100*modelMetrics[0]:.2f}%\n\tPrecision = {modelMetrics[1]:.2f}\n\tRecall = {modelMetrics[2]:.2f}\n\tF1 = {modelMetrics[3]:.2f}")

        #Add in list for final comparison
        metrics.append(modelMetrics)

    diff = []
    for i in range(4):
        diff.append(metrics[1][i] - metrics[0][i])
    print(f"Gain on model by using the second dataset : \n\tAccuracy = {100*diff[0]:.2f}\n\tPrecision = {diff[1]:.2f}\n\tRecall = {diff[2]:.2f}\n\tF1 = {diff[3]:.2f}")

## Feature selections

In [407]:
from sklearn.decomposition import PCA

def comparePCA(model, x: pd.DataFrame, y: pd.Series, n: int):
    pca = PCA(n_components=n)

    xTransformed = pca.fit(x).transform(x)

    compare_models(model, x, y, xTransformed, y)

In [408]:
from sklearn.feature_selection import RFE

def compareRFE(model, x: pd.DataFrame, y: pd.Series, n: int):
    rfe = RFE(clone(model), n_features_to_select=n)

    rfeFitted = rfe.fit(x, y)
    print(f"Features: {rfeFitted.feature_names_in_}.\nRanking : {rfeFitted.ranking_}.\nSelected features : {rfeFitted.feature_names_in_[rfeFitted.get_support(True)]}")

    xTransformed = rfeFitted.transform(x)

    compare_models(model, x, y, xTransformed, y)

In [409]:
from sklearn.feature_selection import SelectKBest, chi2

def compareChi2(model, x: pd.DataFrame, y: pd.Series, k: int):
    selector = SelectKBest(score_func=chi2, k=k)

    selectorFitted = selector.fit(x, y)
    print(f"Features: {selectorFitted.feature_names_in_}.\nSelected features : {selectorFitted.feature_names_in_[selectorFitted.get_support(True)]}")

    xTransformed = selectorFitted.transform(x)

    compare_models(model, x, y, xTransformed, y)

Now we simply compare the whole dataset with the truncated dataset using feature selection.\
It is interesting to know if the signals created will be considered better predictors than raw values.\
Finally, we try for different values of k to evaluate its impact.

In [410]:
X,y = cac40df.drop(columns=['prediction'], axis=1, inplace=False), cac40df["prediction"]
k=6

# Decision tree

In [411]:
from sklearn import preprocessing
from sklearn import tree

k = 6
print(f"{len(X.columns)} initial columns : {X.columns}")

12 initial columns : Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'maSignal50',
       'maSignal200', 'boll_low_signal', 'boll_high_signal', 'Stochastic',
       'Daily_return'],
      dtype='object')


Objective: going from 11 features to 6 features, all the while trying to have the most accuracy possible

## PCA

In [412]:
comparePCA(tree.DecisionTreeClassifier(), X, y, k)

Performing model analysis
Score on original data : 
	Accuracy = 52.83%
	Precision = 0.54
	Recall = 0.53
	F1 = 0.54
Score on selected data : 
	Accuracy = 49.81%
	Precision = 0.51
	Recall = 0.51
	F1 = 0.51
Gain on model by using the second dataset : 
	Accuracy = -3.02
	Precision = -0.03
	Recall = -0.02
	F1 = -0.02


### Results
The PCA achieves a -4% accuracy going from 11 to 5 features. This puts the classifier just under 50% and is thus bad for the prediction (worse than flipping a coin). This means that our feature selection actually did the opposite of the desired effect: it messed with the features.

## PCA with normalization

In [413]:
#Normalize data for PCA
XNormalizedArray = preprocessing.MinMaxScaler().fit_transform(X)
#Go back to DF to feed in the fct
XNormalizedDf = pd.DataFrame(XNormalizedArray, index=X.index, columns=X.columns)
comparePCA(tree.DecisionTreeClassifier(), XNormalizedDf, y, k)

Performing model analysis
Score on original data : 
	Accuracy = 52.07%
	Precision = 0.54
	Recall = 0.53
	F1 = 0.53
Score on selected data : 
	Accuracy = 53.96%
	Precision = 0.55
	Recall = 0.56
	F1 = 0.56
Gain on model by using the second dataset : 
	Accuracy = 1.89
	Precision = 0.02
	Recall = 0.02
	F1 = 0.02


The normalization once again did wonder to the PCA.\
Going from 11 to 5 features, the classifier went from 52.62% to 53.96%. The scaler used is the MinMax one, so that we stay between [0,1] for each feature.\
The final model is very performant using only 5 features aglomerated from the 11 ones.

## RFE

In [414]:
compareRFE(tree.DecisionTreeClassifier(), X, y, k)

Features: ['Open' 'High' 'Low' 'Close' 'Adj Close' 'Volume' 'maSignal50'
 'maSignal200' 'boll_low_signal' 'boll_high_signal' 'Stochastic'
 'Daily_return'].
Ranking : [2 1 1 1 3 1 4 5 7 6 1 1].
Selected features : ['High' 'Low' 'Close' 'Volume' 'Stochastic' 'Daily_return']
Performing model analysis
Score on original data : 
	Accuracy = 53.25%
	Precision = 0.55
	Recall = 0.53
	F1 = 0.54
Score on selected data : 
	Accuracy = 50.94%
	Precision = 0.52
	Recall = 0.53
	F1 = 0.53
Gain on model by using the second dataset : 
	Accuracy = -2.30
	Precision = -0.02
	Recall = -0.00
	F1 = -0.01


## Chi2

In [415]:
compareChi2(tree.DecisionTreeClassifier(), XNormalizedDf, y, k)

Features: ['Open' 'High' 'Low' 'Close' 'Adj Close' 'Volume' 'maSignal50'
 'maSignal200' 'boll_low_signal' 'boll_high_signal' 'Stochastic'
 'Daily_return'].
Selected features : ['Adj Close' 'maSignal50' 'maSignal200' 'boll_low_signal'
 'boll_high_signal' 'Stochastic']
Performing model analysis
Score on original data : 
	Accuracy = 52.16%
	Precision = 0.54
	Recall = 0.52
	F1 = 0.53
Score on selected data : 
	Accuracy = 52.66%
	Precision = 0.54
	Recall = 0.53
	F1 = 0.54
Gain on model by using the second dataset : 
	Accuracy = 0.50
	Precision = 0.00
	Recall = 0.01
	F1 = 0.01


## Bonus

In [416]:
compareRFE(tree.DecisionTreeClassifier(), XNormalizedDf, y, 1)


Features: ['Open' 'High' 'Low' 'Close' 'Adj Close' 'Volume' 'maSignal50'
 'maSignal200' 'boll_low_signal' 'boll_high_signal' 'Stochastic'
 'Daily_return'].
Ranking : [ 7  4  2  6  8  5 10  9 12 11  3  1].
Selected features : ['Daily_return']
Performing model analysis
Score on original data : 
	Accuracy = 52.37%
	Precision = 0.54
	Recall = 0.53
	F1 = 0.53
Score on selected data : 
	Accuracy = 51.07%
	Precision = 0.53
	Recall = 0.52
	F1 = 0.52
Gain on model by using the second dataset : 
	Accuracy = -1.30
	Precision = -0.01
	Recall = -0.01
	F1 = -0.01


In [417]:
compareChi2(tree.DecisionTreeClassifier(), XNormalizedDf, y, 1)

Features: ['Open' 'High' 'Low' 'Close' 'Adj Close' 'Volume' 'maSignal50'
 'maSignal200' 'boll_low_signal' 'boll_high_signal' 'Stochastic'
 'Daily_return'].
Ranking : [ 5  4  2  7  8  6 10  9 12 11  3  1].
Selected features : ['Daily_return']
Performing model analysis
Score on original data : 
	Accuracy = 53.12%
	Precision = 0.55
	Recall = 0.54
	F1 = 0.54
Score on selected data : 
	Accuracy = 51.11%
	Precision = 0.53
	Recall = 0.51
	F1 = 0.52
Gain on model by using the second dataset : 
	Accuracy = -2.01
	Precision = -0.02
	Recall = -0.02
	F1 = -0.02
