In [81]:
import yfinance as yf
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import requests
import time
import os
import joblib
from io import StringIO
from dash import dcc, html
from dash.dependencies import Input, Output
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [82]:
# ---------------------------------------------------------------------------
# S&P 500: prepare the folder layout, the training CSVs and the daily cache.
# ---------------------------------------------------------------------------
# Wall-clock start of the run; also reused by the other index-loading cells to
# decide whether their cache was already refreshed today.
start_time = time.time()

# exist_ok=True makes the cell safe to re-run and removes the exists/makedirs pairs.
for directory in (
    "data",
    "data/history",
    "data/data",
    "data/training_data",
    "data/model",
    "data/training_data_x_classifier",
):
    os.makedirs(directory, exist_ok=True)

# Training history (2001-2024). Each file is guarded on its own so that a
# partially populated folder is completed instead of crashing the reads below.
training_sources_sp = {
    "data/training_data/training_sp.csv": ["AAPL", "MSFT", "NVDA","^TNX"],
    "data/training_data/training_indice_sp.csv": "^GSPC",
    "data/training_data/training_indice_sp_vol.csv": "^VIX",
}
for path, symbols in training_sources_sp.items():
    if not os.path.exists(path):
        yf.download(symbols,start="2001-01-01",end="2024-12-31",interval="1d")[["Open","Close"]].to_csv(path)

# The history file stores one Unix timestamp (seconds) per refresh; it is
# seeded with 2000-01-01 so that the very first run always refreshes.
if not os.path.exists("data/history/sp500.txt"):
    with open("data/history/sp500.txt","w") as sp:
        initialisation = "2000-01-01 00:00:00"
        initialisation = np.datetime64(initialisation,'s')
        initialisation = str(initialisation.astype("int64"))
        sp.write(initialisation)
with open("data/history/sp500.txt","r") as sp:
    row = sp.readlines()
# Ignore blank lines; an empty file is treated as "never refreshed".
stamps = [line.strip() for line in row if line.strip()]
last_line = int(stamps[-1]) if stamps else 0

# Refresh when the last stamp falls on another day (86400 s buckets of the
# epoch) or when one of the cached CSVs read by the else-branch is missing.
cache_files_sp = (
    "data/data/sp_data.csv",
    "data/data/sp_indice_data.csv",
    "data/data/sp_indice_vol_data.csv",
)
cache_is_stale = last_line//86400 != start_time//86400
cache_is_missing = not all(os.path.exists(path) for path in cache_files_sp)
if cache_is_stale or cache_is_missing:
    url = "https://stockanalysis.com/list/sp-500-stocks/"
    headers = {"user-agent":"Mozilla/5.0"}
    # A timeout prevents the cell from hanging forever, and raise_for_status
    # surfaces HTTP errors before pd.read_html tries to parse an error page.
    reponse = requests.get(url, headers=headers, timeout=30)
    reponse.raise_for_status()
    tickers = pd.read_html(StringIO(reponse.text))
    tickers = tickers[0]["Symbol"]
    tickers = tickers.to_list()
    if "GOOG" in tickers:
        tickers.remove("GOOG")
    # Only the first three symbols of the scraped list are downloaded.
    tickers = tickers[0:3]
    # Replace "." with "-" in the symbols before passing them to yfinance.
    tickers = [ticker.replace(".","-") for ticker in tickers]
    sp_data = yf.download(tickers,period="5d",interval="1d")[["Open","Close","Volume"]]
    sp_indice_data = yf.download("^GSPC", period="5d",interval="1d")[["Open","Close","Volume"]]
    sp_indice_vol_data = yf.download("^VIX",period="5d",interval="1d")[["Open","Close"]]
    sp_data.to_csv("data/data/sp_data.csv")
    sp_indice_data.to_csv("data/data/sp_indice_data.csv")
    sp_indice_vol_data.to_csv("data/data/sp_indice_vol_data.csv")
    # Record this refresh so later runs on the same day use the cache.
    with open("data/history/sp500.txt","a") as sp:
        note = str(np.int64(start_time))
        sp.write(f"\n{note}")
    print("Données actualisées depuis Yfinance")
else:
    # Cached CSVs carry a two-row column header and the dates in column 0.
    sp_data = pd.read_csv("data/data/sp_data.csv",index_col=[0],header=[0,1])
    sp_indice_data = pd.read_csv("data/data/sp_indice_data.csv",index_col=[0],header=[0,1])
    sp_indice_vol_data = pd.read_csv("data/data/sp_indice_vol_data.csv",index_col=[0],header=[0,1])
    sp_data.index = pd.to_datetime(sp_data.index)
    sp_indice_data.index = pd.to_datetime(sp_indice_data.index)
    sp_indice_vol_data.index = pd.to_datetime(sp_indice_vol_data.index)
    print("Données importées depuis ./data/data/sp_data.csv et ./data/data/sp_indice_data.csv")

# The training frames are always reloaded from disk, whichever branch ran above.
training_sp = pd.read_csv("data/training_data/training_sp.csv",index_col=[0],header=[0,1])
training_indice_sp = pd.read_csv("data/training_data/training_indice_sp.csv",index_col=[0],header=[0,1])
training_indice_sp_vol = pd.read_csv("data/training_data/training_indice_sp_vol.csv",index_col=[0],header=[0,1])
training_sp.index = pd.to_datetime(training_sp.index)
training_indice_sp.index = pd.to_datetime(training_indice_sp.index)
training_indice_sp_vol.index = pd.to_datetime(training_indice_sp_vol.index)
print("Données d'entrainement importé depuis ./data/training_data/training_sp.csv,"
" ./data/training_data/training_indice_sp.csv et ./data/training_data/training_indice_sp_vol.csv")

Données importées depuis ./data/data/sp_data.csv et ./data/data/sp_indice_data.csv
Données d'entrainement importé depuis ./data/training_data/training_sp.csv, ./data/training_data/training_indice_sp.csv et ./data/training_data/training_indice_sp_vol.csv


In [83]:
# ---------------------------------------------------------------------------
# CAC 40: training CSVs and daily cache (relies on start_time and on the
# data/ folders created by the S&P 500 loading cell).
# ---------------------------------------------------------------------------
# Each training file is guarded on its own so that a partially populated
# folder is completed instead of crashing the reads at the end of the cell.
training_sources_cac = {
    "data/training_data/training_cac.csv": ["MC.PA","TTE.PA","SAN.PA"],
    "data/training_data/training_indice_cac.csv": "^FCHI",
}
for path, symbols in training_sources_cac.items():
    if not os.path.exists(path):
        yf.download(symbols,start="2001-01-01",end="2024-12-31",interval="1d")[["Open","Close"]].to_csv(path)

# One Unix timestamp (seconds) per refresh; seeded with 2000-01-01 so that the
# first run always refreshes.
if not os.path.exists("data/history/cac40.txt"):
    with open("data/history/cac40.txt","w") as cac:
        initialisation = "2000-01-01 00:00:00"
        initialisation = np.datetime64(initialisation,'s')
        initialisation = str(initialisation.astype("int64"))
        cac.write(initialisation)
with open("data/history/cac40.txt","r") as cac:
    row = cac.readlines()
# Ignore blank lines; an empty file is treated as "never refreshed".
stamps = [line.strip() for line in row if line.strip()]
last_line = int(stamps[-1]) if stamps else 0

# Refresh when the last stamp falls on another day or a cached CSV is missing.
cache_files_cac = ("data/data/cac_data.csv", "data/data/cac_indice_data.csv")
cache_is_stale = last_line//86400 != start_time//86400
cache_is_missing = not all(os.path.exists(path) for path in cache_files_cac)
if cache_is_stale or cache_is_missing:
    url = "https://fr.finance.yahoo.com/quote/%5EFCHI/components/"
    headers = {"user-agent":"Mozilla/5.0"}
    # Bounded wait and explicit HTTP error instead of parsing an error page.
    reponse = requests.get(url, headers=headers, timeout=30)
    reponse.raise_for_status()
    tickers = pd.read_html(StringIO(reponse.text))
    tickers = tickers[0]["Symbole"]
    # Only the first ten listed components are downloaded.
    tickers = tickers[0:10]
    tickers = tickers.to_list()
    cac_data = yf.download(tickers,period="5d",interval="1d")[["Open","Close","Volume"]]
    cac_indice_data = yf.download("^FCHI", period="5d",interval="1d")[["Open","Close","Volume"]]
    cac_data.to_csv("data/data/cac_data.csv")
    cac_indice_data.to_csv("data/data/cac_indice_data.csv")
    # Record this refresh so later runs on the same day use the cache.
    with open("data/history/cac40.txt","a") as cac:
        note = str(np.int64(start_time))
        cac.write(f"\n{note}")
    print("Données actualisées depuis Yfinance")
else:
    # Cached CSVs carry a two-row column header and the dates in column 0.
    cac_data = pd.read_csv("data/data/cac_data.csv",index_col=[0],header=[0,1])
    cac_indice_data = pd.read_csv("data/data/cac_indice_data.csv",index_col=[0],header=[0,1])
    cac_data.index = pd.to_datetime(cac_data.index)
    cac_indice_data.index = pd.to_datetime(cac_indice_data.index)
    print("Données importées depuis ./data/data/cac_data.csv et ./data/data/cac_indice_data.csv")

# The training frames are always reloaded from disk, whichever branch ran above.
training_cac = pd.read_csv("data/training_data/training_cac.csv",index_col=[0],header=[0,1])
training_indice_cac = pd.read_csv("data/training_data/training_indice_cac.csv",index_col=[0],header=[0,1])
training_cac.index = pd.to_datetime(training_cac.index)
training_indice_cac.index = pd.to_datetime(training_indice_cac.index)
print("Données d'entrainement importé depuis ./data/training_data/training_cac.csv et ./data/training_data/training_indice_cac.csv")

Données importées depuis ./data/data/cac_data.csv et ./data/data/cac_indice_data.csv
Données d'entrainement importé depuis ./data/training_data/training_cac.csv et ./data/training_data/training_indice_cac.csv


In [84]:
# ---------------------------------------------------------------------------
# FTSE 100: training CSVs and daily cache (relies on start_time and on the
# data/ folders created by the S&P 500 loading cell).
# ---------------------------------------------------------------------------
# Each training file is guarded on its own so that a partially populated
# folder is completed instead of crashing the reads at the end of the cell.
training_sources_ftse = {
    "data/training_data/training_ftse.csv": ["AZN.L","HSBA.L","ULVR.L"],
    "data/training_data/training_indice_ftse.csv": "^FTSE",
}
for path, symbols in training_sources_ftse.items():
    if not os.path.exists(path):
        yf.download(symbols,start="2001-01-01",end="2024-12-31",interval="1d")[["Open","Close"]].to_csv(path)

# One Unix timestamp (seconds) per refresh; seeded with 2000-01-01 so that the
# first run always refreshes.
if not os.path.exists("data/history/ftse100.txt"):
    with open("data/history/ftse100.txt","w") as ftse:
        initialisation = "2000-01-01 00:00:00"
        initialisation = np.datetime64(initialisation,'s')
        initialisation = str(initialisation.astype("int64"))
        ftse.write(initialisation)
with open("data/history/ftse100.txt","r") as ftse:
    row = ftse.readlines()
# Ignore blank lines; an empty file is treated as "never refreshed".
stamps = [line.strip() for line in row if line.strip()]
last_line = int(stamps[-1]) if stamps else 0

# Refresh when the last stamp falls on another day or a cached CSV is missing.
cache_files_ftse = ("data/data/ftse_data.csv", "data/data/ftse_indice_data.csv")
cache_is_stale = last_line//86400 != start_time//86400
cache_is_missing = not all(os.path.exists(path) for path in cache_files_ftse)
if cache_is_stale or cache_is_missing:
    url = "https://uk.finance.yahoo.com/quote/%5EFTSE/components/"
    headers = {"user-agent":"Mozilla/5.0"}
    # Bounded wait and explicit HTTP error instead of parsing an error page.
    reponse = requests.get(url, headers=headers, timeout=30)
    reponse.raise_for_status()
    tickers = pd.read_html(StringIO(reponse.text))
    tickers = tickers[0]["Symbol"]
    tickers = tickers.to_list()
    # Only the first ten listed components are downloaded.
    tickers = tickers[0:10]
    ftse_data = yf.download(tickers,period="5d",interval="1d")[["Open","Close","Volume"]]
    ftse_indice_data = yf.download("^FTSE", period="5d",interval="1d")[["Open","Close","Volume"]]
    ftse_data.to_csv("data/data/ftse_data.csv")
    ftse_indice_data.to_csv("data/data/ftse_indice_data.csv")
    # Record this refresh so later runs on the same day use the cache.
    with open("data/history/ftse100.txt","a") as ftse:
        note = str(np.int64(start_time))
        ftse.write(f"\n{note}")
    print("Données actualisées depuis Yfinance")
else:
    # Cached CSVs carry a two-row column header and the dates in column 0.
    ftse_data = pd.read_csv("data/data/ftse_data.csv",index_col=[0],header=[0,1])
    ftse_indice_data = pd.read_csv("data/data/ftse_indice_data.csv",index_col=[0],header=[0,1])
    ftse_data.index = pd.to_datetime(ftse_data.index)
    ftse_indice_data.index = pd.to_datetime(ftse_indice_data.index)
    print("Données importées depuis ./data/data/ftse_data.csv et ./data/data/ftse_indice_data.csv")

# The training frames are always reloaded from disk, whichever branch ran above.
training_ftse = pd.read_csv("data/training_data/training_ftse.csv",index_col=[0],header=[0,1])
training_indice_ftse = pd.read_csv("data/training_data/training_indice_ftse.csv",index_col=[0],header=[0,1])
training_ftse.index = pd.to_datetime(training_ftse.index)
training_indice_ftse.index = pd.to_datetime(training_indice_ftse.index)
print("Données d'entrainement importé depuis ./data/training_data/training_ftse.csv et ./data/training_data/training_indice_ftse.csv")

Données importées depuis ./data/data/ftse_data.csv et ./data/data/ftse_indice_data.csv
Données d'entrainement importé depuis ./data/training_data/training_ftse.csv et ./data/training_data/training_indice_ftse.csv


In [85]:
# ---------------------------------------------------------------------------
# DAX: training CSVs and daily cache (relies on start_time and on the data/
# folders created by the S&P 500 loading cell).
# ---------------------------------------------------------------------------
# Each training file is guarded on its own so that a partially populated
# folder is completed instead of crashing the reads at the end of the cell.
training_sources_dax = {
    "data/training_data/training_dax.csv": ["SAP.DE","SIE.DE","ALV.DE"],
    "data/training_data/training_indice_dax.csv": "^GDAXI",
}
for path, symbols in training_sources_dax.items():
    if not os.path.exists(path):
        yf.download(symbols,start="2001-01-01",end="2024-12-31",interval="1d")[["Open","Close"]].to_csv(path)

# One Unix timestamp (seconds) per refresh; seeded with 2000-01-01 so that the
# first run always refreshes.
if not os.path.exists("data/history/dax40.txt"):
    with open("data/history/dax40.txt","w") as dax:
        initialisation = "2000-01-01 00:00:00"
        initialisation = np.datetime64(initialisation,'s')
        initialisation = str(initialisation.astype("int64"))
        dax.write(initialisation)
with open("data/history/dax40.txt","r") as dax:
    row = dax.readlines()
# Ignore blank lines; an empty file is treated as "never refreshed".
stamps = [line.strip() for line in row if line.strip()]
last_line = int(stamps[-1]) if stamps else 0

# Refresh when the last stamp falls on another day or a cached CSV is missing.
cache_files_dax = ("data/data/dax_data.csv", "data/data/dax_indice_data.csv")
cache_is_stale = last_line//86400 != start_time//86400
cache_is_missing = not all(os.path.exists(path) for path in cache_files_dax)
if cache_is_stale or cache_is_missing:
    url = "https://finance.yahoo.com/quote/%5EGDAXI/components/"
    headers = {"user-agent":"Mozilla/5.0"}
    # Bounded wait and explicit HTTP error instead of parsing an error page.
    reponse = requests.get(url, headers=headers, timeout=30)
    reponse.raise_for_status()
    tickers = pd.read_html(StringIO(reponse.text))
    tickers = tickers[0]["Symbol"]
    tickers = tickers.to_list()
    # Only the first ten listed components are downloaded.
    tickers = tickers[0:10]
    dax_data = yf.download(tickers,period="5d",interval="1d")[["Open","Close","Volume"]]
    dax_indice_data = yf.download("^GDAXI", period="5d",interval="1d")[["Open","Close","Volume"]]
    dax_data.to_csv("data/data/dax_data.csv")
    dax_indice_data.to_csv("data/data/dax_indice_data.csv")
    # Record this refresh so later runs on the same day use the cache.
    with open("data/history/dax40.txt","a") as dax:
        note = str(np.int64(start_time))
        dax.write(f"\n{note}")
    print("Données actualisées depuis Yfinance")
else:
    # Cached CSVs carry a two-row column header and the dates in column 0.
    dax_data = pd.read_csv("data/data/dax_data.csv",index_col=[0],header=[0,1])
    dax_indice_data = pd.read_csv("data/data/dax_indice_data.csv",index_col=[0],header=[0,1])
    dax_data.index = pd.to_datetime(dax_data.index)
    dax_indice_data.index = pd.to_datetime(dax_indice_data.index)
    print("Données importées depuis ./data/data/dax_data.csv et ./data/data/dax_indice_data.csv")

# The training frames are always reloaded from disk, whichever branch ran above.
training_dax = pd.read_csv("data/training_data/training_dax.csv",index_col=[0],header=[0,1])
training_indice_dax = pd.read_csv("data/training_data/training_indice_dax.csv",index_col=[0],header=[0,1])
training_dax.index = pd.to_datetime(training_dax.index)
training_indice_dax.index = pd.to_datetime(training_indice_dax.index)
print("Données d'entrainement importé depuis ./data/training_data/training_dax.csv et ./data/training_data/training_indice_dax.csv")

Données importées depuis ./data/data/dax_data.csv et ./data/data/dax_indice_data.csv
Données d'entrainement importé depuis ./data/training_data/training_dax.csv et ./data/training_data/training_indice_dax.csv


In [86]:
# ---------------------------------------------------------------------------
# S&P 500: build the feature matrix x_classifier_sp and labels y_classifier_sp.
# NOTE: this cell overwrites training_sp / training_indice_sp_vol with their
# feature-only versions, so re-running it without first re-running the loading
# cell fails (the Open/Close columns are gone).
# ---------------------------------------------------------------------------
# Number of index log-return columns (lag 0 .. N_LAGS-1). The first
# N_LAGS-1 rows contain NaN for the deepest lag and are trimmed below.
N_LAGS = 5
WARMUP_ROWS = N_LAGS - 1
# Rows removed at the end of the sample. The final row has no next-day values:
# its shift(-1) comparisons are made against NaN and evaluate to 0.
TAIL_ROWS = 2

# Restrict the three frames to their common dates, as the CAC/DAX/FTSE feature
# cells do. The features and the index columns are glued together positionally
# with np.concatenate further down, which is only valid if every frame has
# exactly the same rows.
training_sp, training_indice_sp_vol = training_sp.align(training_indice_sp_vol, join="inner", axis=0)
training_sp, training_indice_sp = training_sp.align(training_indice_sp, join="inner", axis=0)

tickers = sorted(set(training_sp.columns.get_level_values(1)))
# ^TNX gets a log return but no gap flags.
equity_tickers = [ticker for ticker in tickers if ticker != "^TNX"]

# The three loops are kept separate on purpose: they fix the column order
# (all log returns, then all gap-up flags, then all gap-down flags).
for ticker in tickers:
    # Intraday log return: ln(Close / Open) of the same row.
    training_sp[("log_return",ticker)] = np.log(training_sp[("Close",ticker)]/
                                                training_sp[("Open",ticker)])
for ticker in equity_tickers:
    # 1 when the next row's Open is more than 1 % above this row's Close.
    training_sp[("open_gap_up",ticker)] = np.where(
        training_sp[("Open",ticker)].shift(-1)>training_sp[("Close",ticker)]*1.01,1,0
        )
for ticker in equity_tickers:
    # 1 when the next row's Open is more than 1 % below this row's Close.
    training_sp[("open_gap_down",ticker)] = np.where(
        training_sp[("Open",ticker)].shift(-1)<training_sp[("Close",ticker)]*0.99,1,0
        )
# Keep only the columns appended above; the count is derived from the tickers
# instead of being hard-coded (4 log returns + 3 gap-up + 3 gap-down = 10).
n_feature_cols = len(tickers) + 2*len(equity_tickers)
training_sp = training_sp.iloc[:,-n_feature_cols:]

# VIX intraday log return, reduced to a single Series (the last column).
training_indice_sp_vol[("log_return","^VIX")] = np.log(training_indice_sp_vol[("Close","^VIX")]/
                                                training_indice_sp_vol[("Open","^VIX")])
training_indice_sp_vol = training_indice_sp_vol.iloc[:,-1]

x_classifier_sp = training_sp.merge(training_indice_sp_vol,how="inner",left_index=True,right_index=True)

# Index-level features: lagged intraday log returns and next-open gap flags.
previous_indice_day = {}
for i in range(0,N_LAGS):
    previous_indice_day[("shift",f"shift {str(i)}")] = np.log(
        training_indice_sp[("Close","^GSPC")].shift(i)/training_indice_sp[("Open","^GSPC")].shift(i)
        )
previous_indice_day[("gap","open_gap_up")] = np.where(
    training_indice_sp[("Open","^GSPC")].shift(-1)>training_indice_sp[("Close","^GSPC")]*1.01,1,0
    )
previous_indice_day[("gap","open_gap_down")] = np.where(
    training_indice_sp[("Open","^GSPC")].shift(-1)<training_indice_sp[("Close","^GSPC")]*0.99,1,0
    )
previous_indice_day  = pd.DataFrame(previous_indice_day, index=training_indice_sp.index)

# Untrimmed, index-joined copy of every feature, saved for inspection.
x_classifier_sp_csv = x_classifier_sp.merge(previous_indice_day,how="inner",left_index=True,right_index=True)
x_classifier_sp_csv.to_csv("data/training_data_x_classifier/x_classifier_sp.csv")

# Trim the warm-up and tail rows, then assemble the final NumPy design matrix.
previous_indice_day = previous_indice_day.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_sp = x_classifier_sp.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_sp = x_classifier_sp.to_numpy()
previous_indice_day = previous_indice_day.to_numpy()
x_classifier_sp = np.concatenate((x_classifier_sp,previous_indice_day), axis=1)

# Label: 1 when the next row's Close is above the next row's Open.
y_classifier_sp = np.where(training_indice_sp[("Close", "^GSPC")].shift(-1)>
training_indice_sp[("Open", "^GSPC")].shift(-1),1,0)
y_classifier_sp = y_classifier_sp[WARMUP_ROWS:-TAIL_ROWS]

In [87]:
# Manual switch: set to False to (re)train the S&P 500 models and overwrite the
# joblib files in data/model; when True the whole cell is skipped.
trained_sp = True
if not trained_sp:
    # Three class-weighting schemes are trained and compared side by side.
    class_weight_dict = {   "standard":None,
                            "balanced":"balanced",
                            "signal":{0:2.0,1:1.0}  }
    classifier_model_sp = {}
    model_accuracy_sp = {}
    classification_report_sp = {}
    # shuffle=False keeps the row order: the first 70 % of the rows train the
    # model and the last 30 % are held out for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x_classifier_sp, y_classifier_sp, test_size=0.3, shuffle=False)
    for name,weight in class_weight_dict.items():
        # random_state pins the bootstrap samples and feature draws so that the
        # saved models and the printed metrics are reproducible across runs.
        model = RandomForestClassifier( n_estimators=4000,
                                        max_depth=10,
                                        min_samples_split=10,
                                        min_samples_leaf=4,
                                        max_features='sqrt',
                                        class_weight=weight,
                                        n_jobs=-1,
                                        random_state=42 )
        model = model.fit(x_train,y_train)
        classifier_model_sp[name] = model
        y_pred = model.predict(x_test)
        model_accuracy_sp[name] = accuracy_score(y_test, y_pred)
        print(f"La précision du modèle {name} est de {round(model_accuracy_sp[name]*100,3)} %")
        classification_report_sp[name] = classification_report(y_test, y_pred)
        print(classification_report_sp[name])
    # Persist the fitted models together with their evaluation results.
    joblib.dump(classifier_model_sp, "data/model/classifier_model_sp.joblib")
    joblib.dump(model_accuracy_sp, "data/model/model_accuracy_sp.joblib")
    joblib.dump(classification_report_sp, "data/model/classification_report_sp.joblib")

In [88]:
# ---------------------------------------------------------------------------
# CAC 40: build the feature matrix x_classifier_cac and labels y_classifier_cac.
# NOTE: this cell overwrites training_cac with its feature-only version, so
# re-running it without first re-running the CAC loading cell fails (the
# Open/Close columns are gone).
# ---------------------------------------------------------------------------
# Number of index log-return columns (lag 0 .. N_LAGS-1). The first
# N_LAGS-1 rows contain NaN for the deepest lag and are trimmed below.
N_LAGS = 5
WARMUP_ROWS = N_LAGS - 1
# Rows removed at the end of the sample. The final row has no next-day values:
# its shift(-1) comparisons are made against NaN and evaluate to 0.
TAIL_ROWS = 2

# The VIX frame is reloaded from disk because the S&P 500 feature cell reduces
# training_indice_sp_vol to a single Series.
training_indice_sp_vol = pd.read_csv(
    "data/training_data/training_indice_sp_vol.csv",index_col=[0],header=[0,1])
training_indice_sp_vol.index = pd.to_datetime(training_indice_sp_vol.index)

# Keep only the dates shared by the stocks, the VIX and the CAC index so the
# positional np.concatenate below pairs rows of the same date.
training_cac, training_indice_sp_vol = training_cac.align(training_indice_sp_vol, join="inner", axis=0)
training_cac, training_indice_cac = training_cac.align(training_indice_cac, join="inner",axis=0)

tickers = sorted(set(training_cac.columns.get_level_values(1)))
# The three loops are kept separate on purpose: they fix the column order
# (all log returns, then all gap-up flags, then all gap-down flags).
for ticker in tickers:
    # Intraday log return: ln(Close / Open) of the same row.
    training_cac[("log_return",ticker)] = np.log(training_cac[("Close",ticker)]/
                                                training_cac[("Open",ticker)])
for ticker in tickers:
    # 1 when the next row's Open is more than 1 % above this row's Close.
    training_cac[("open_gap_up",ticker)] = np.where(
        training_cac[("Open",ticker)].shift(-1)>training_cac[("Close",ticker)]*1.01,1,0
        )
for ticker in tickers:
    # 1 when the next row's Open is more than 1 % below this row's Close.
    training_cac[("open_gap_down",ticker)] = np.where(
        training_cac[("Open",ticker)].shift(-1)<training_cac[("Close",ticker)]*0.99,1,0
        )
# Keep only the columns appended above; three per ticker (9 for 3 tickers),
# derived from the ticker list instead of being hard-coded.
n_feature_cols = 3*len(tickers)
training_cac = training_cac.iloc[:,-n_feature_cols:]

# VIX intraday log return, reduced to a single Series (the last column).
training_indice_sp_vol[("log_return","^VIX")] = np.log(training_indice_sp_vol[("Close","^VIX")]/
                                                training_indice_sp_vol[("Open","^VIX")])
training_indice_sp_vol = training_indice_sp_vol.iloc[:,-1]

x_classifier_cac = training_cac.merge(training_indice_sp_vol,how="inner",left_index=True,right_index=True)

# Index-level features: lagged intraday log returns and next-open gap flags.
previous_indice_day = {}
for i in range(0,N_LAGS):
    previous_indice_day[("shift",f"shift {str(i)}")] = np.log(
        training_indice_cac[("Close","^FCHI")].shift(i)/training_indice_cac[("Open","^FCHI")].shift(i)
        )
previous_indice_day[("gap","open_gap_up")] = np.where(
    training_indice_cac[("Open","^FCHI")].shift(-1)>training_indice_cac[("Close","^FCHI")]*1.01,1,0
    )
previous_indice_day[("gap","open_gap_down")] = np.where(
    training_indice_cac[("Open","^FCHI")].shift(-1)<training_indice_cac[("Close","^FCHI")]*0.99,1,0
    )
previous_indice_day = pd.DataFrame(previous_indice_day, index=training_indice_cac.index)

# Untrimmed, index-joined copy of every feature, saved for inspection.
x_classifier_cac_csv = x_classifier_cac.merge(previous_indice_day,how="inner",left_index=True,right_index=True)
x_classifier_cac_csv.to_csv("data/training_data_x_classifier/x_classifier_cac.csv")

# Trim the warm-up and tail rows, then assemble the final NumPy design matrix.
previous_indice_day = previous_indice_day.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_cac = x_classifier_cac.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_cac = x_classifier_cac.to_numpy()
previous_indice_day = previous_indice_day.to_numpy()
x_classifier_cac = np.concatenate((x_classifier_cac,previous_indice_day), axis=1)

# Label: 1 when the next row's Close is above the next row's Open.
y_classifier_cac = np.where(training_indice_cac[("Close", "^FCHI")].shift(-1)>
training_indice_cac[("Open", "^FCHI")].shift(-1),1,0)
y_classifier_cac = y_classifier_cac[WARMUP_ROWS:-TAIL_ROWS]

In [89]:
# Manual switch: set to False to (re)train the CAC 40 models and overwrite the
# joblib files in data/model; when True the whole cell is skipped.
trained_cac = True
if not trained_cac:
    # Three class-weighting schemes are trained and compared side by side.
    class_weight_dict = {   "standard":None,
                            "balanced":"balanced",
                            "signal":{0:1.2,1:1.0}  }
    classifier_model_cac = {}
    model_accuracy_cac = {}
    classification_report_cac = {}
    # shuffle=False keeps the row order: the first 90 % of the rows train the
    # model and the last 10 % are held out for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x_classifier_cac, y_classifier_cac, test_size=0.1, shuffle=False)
    for name,weight in class_weight_dict.items():
        # random_state pins the bootstrap samples and feature draws so that the
        # saved models and the printed metrics are reproducible across runs.
        model = RandomForestClassifier( n_estimators=4000,
                                        max_depth=10,
                                        min_samples_split=10,
                                        min_samples_leaf=4,
                                        max_features='sqrt',
                                        class_weight=weight,
                                        n_jobs=-1,
                                        random_state=42 )
        model = model.fit(x_train,y_train)
        classifier_model_cac[name] = model
        y_pred = model.predict(x_test)
        model_accuracy_cac[name] = accuracy_score(y_test, y_pred)
        print(f"La précision du modèle {name} est de {round(model_accuracy_cac[name]*100,3)} %")
        classification_report_cac[name] = classification_report(y_test, y_pred)
        print(classification_report_cac[name])
    # Persist the fitted models together with their evaluation results.
    joblib.dump(classifier_model_cac, "data/model/classifier_model_cac.joblib")
    joblib.dump(model_accuracy_cac, "data/model/model_accuracy_cac.joblib")
    joblib.dump(classification_report_cac, "data/model/classification_report_cac.joblib")

In [90]:
# ---------------------------------------------------------------------------
# DAX: build the feature matrix x_classifier_dax and labels y_classifier_dax.
# NOTE: this cell overwrites training_dax with its feature-only version, so
# re-running it without first re-running the DAX loading cell fails (the
# Open/Close columns are gone).
# ---------------------------------------------------------------------------
# Number of index log-return columns (lag 0 .. N_LAGS-1). The first
# N_LAGS-1 rows contain NaN for the deepest lag and are trimmed below.
N_LAGS = 5
WARMUP_ROWS = N_LAGS - 1
# Rows removed at the end of the sample. The final row has no next-day values:
# its shift(-1) comparisons are made against NaN and evaluate to 0.
TAIL_ROWS = 2

# The VIX frame is reloaded from disk because the preceding feature cells
# reduce training_indice_sp_vol to a single Series.
training_indice_sp_vol = pd.read_csv(
    "data/training_data/training_indice_sp_vol.csv",index_col=[0],header=[0,1])
training_indice_sp_vol.index = pd.to_datetime(training_indice_sp_vol.index)

# Keep only the dates shared by the stocks, the VIX and the DAX index so the
# positional np.concatenate below pairs rows of the same date.
training_dax, training_indice_sp_vol = training_dax.align(training_indice_sp_vol, join="inner", axis=0)
training_dax, training_indice_dax = training_dax.align(training_indice_dax, join="inner",axis=0)

tickers = sorted(set(training_dax.columns.get_level_values(1)))
# The three loops are kept separate on purpose: they fix the column order
# (all log returns, then all gap-up flags, then all gap-down flags).
for ticker in tickers:
    # Intraday log return: ln(Close / Open) of the same row.
    training_dax[("log_return",ticker)] = np.log(training_dax[("Close",ticker)]/
                                                training_dax[("Open",ticker)])
for ticker in tickers:
    # 1 when the next row's Open is more than 1 % above this row's Close.
    training_dax[("open_gap_up",ticker)] = np.where(
        training_dax[("Open",ticker)].shift(-1)>training_dax[("Close",ticker)]*1.01,1,0
        )
for ticker in tickers:
    # 1 when the next row's Open is more than 1 % below this row's Close.
    training_dax[("open_gap_down",ticker)] = np.where(
        training_dax[("Open",ticker)].shift(-1)<training_dax[("Close",ticker)]*0.99,1,0
        )
# Keep only the columns appended above; three per ticker (9 for 3 tickers),
# derived from the ticker list instead of being hard-coded.
n_feature_cols = 3*len(tickers)
training_dax = training_dax.iloc[:,-n_feature_cols:]

# VIX intraday log return, reduced to a single Series (the last column).
training_indice_sp_vol[("log_return","^VIX")] = np.log(training_indice_sp_vol[("Close","^VIX")]/
                                                training_indice_sp_vol[("Open","^VIX")])
training_indice_sp_vol = training_indice_sp_vol.iloc[:,-1]

x_classifier_dax = training_dax.merge(training_indice_sp_vol,how="inner",left_index=True,right_index=True)

# Index-level features: lagged intraday log returns and next-open gap flags.
previous_indice_day = {}
for i in range(0,N_LAGS):
    previous_indice_day[("shift",f"shift {str(i)}")] = np.log(
        training_indice_dax[("Close","^GDAXI")].shift(i)/training_indice_dax[("Open","^GDAXI")].shift(i)
        )
previous_indice_day[("gap","open_gap_up")] = np.where(
    training_indice_dax[("Open","^GDAXI")].shift(-1)>training_indice_dax[("Close","^GDAXI")]*1.01,1,0
    )
previous_indice_day[("gap","open_gap_down")] = np.where(
    training_indice_dax[("Open","^GDAXI")].shift(-1)<training_indice_dax[("Close","^GDAXI")]*0.99,1,0
    )
previous_indice_day = pd.DataFrame(previous_indice_day, index=training_indice_dax.index)

# Untrimmed, index-joined copy of every feature, saved for inspection.
x_classifier_dax_csv = x_classifier_dax.merge(previous_indice_day,how="inner",left_index=True,right_index=True)
x_classifier_dax_csv.to_csv("data/training_data_x_classifier/x_classifier_dax.csv")

# Trim the warm-up and tail rows, then assemble the final NumPy design matrix.
previous_indice_day = previous_indice_day.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_dax = x_classifier_dax.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_dax = x_classifier_dax.to_numpy()
previous_indice_day = previous_indice_day.to_numpy()
x_classifier_dax = np.concatenate((x_classifier_dax,previous_indice_day), axis=1)

# Label: 1 when the next row's Close is above the next row's Open.
y_classifier_dax = np.where(training_indice_dax[("Close", "^GDAXI")].shift(-1)>
training_indice_dax[("Open", "^GDAXI")].shift(-1),1,0)
y_classifier_dax = y_classifier_dax[WARMUP_ROWS:-TAIL_ROWS]

In [91]:
# Manual switch: set to False to (re)train the DAX models and overwrite the
# joblib files in data/model; when True the whole cell is skipped.
trained_dax = True
if not trained_dax:
    # Three class-weighting schemes are trained and compared side by side.
    class_weight_dict = {   "standard":None,
                            "balanced":"balanced",
                            "signal":{0:1.3,1:1.0}  }
    classifier_model_dax = {}
    model_accuracy_dax = {}
    classification_report_dax = {}
    # shuffle=False keeps the row order: the first 90 % of the rows train the
    # model and the last 10 % are held out for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x_classifier_dax, y_classifier_dax, test_size=0.1, shuffle=False)
    for name,weight in class_weight_dict.items():
        # random_state pins the bootstrap samples and feature draws so that the
        # saved models and the printed metrics are reproducible across runs.
        model = RandomForestClassifier( n_estimators=4000,
                                        max_depth=10,
                                        min_samples_split=10,
                                        min_samples_leaf=4,
                                        max_features='sqrt',
                                        class_weight=weight,
                                        n_jobs=-1,
                                        random_state=42 )
        model = model.fit(x_train,y_train)
        classifier_model_dax[name] = model
        y_pred = model.predict(x_test)
        model_accuracy_dax[name] = accuracy_score(y_test, y_pred)
        print(f"La précision du modèle {name} est de {round(model_accuracy_dax[name]*100,3)} %")
        classification_report_dax[name] = classification_report(y_test, y_pred)
        print(classification_report_dax[name])
    # Persist the fitted models together with their evaluation results.
    joblib.dump(classifier_model_dax, "data/model/classifier_model_dax.joblib")
    joblib.dump(model_accuracy_dax, "data/model/model_accuracy_dax.joblib")
    joblib.dump(classification_report_dax, "data/model/classification_report_dax.joblib")

In [92]:
# ---------------------------------------------------------------------------
# FTSE 100: build the feature matrix x_classifier_ftse and labels
# y_classifier_ftse.
# NOTE: this cell overwrites training_ftse with its feature-only version, so
# re-running it without first re-running the FTSE loading cell fails (the
# Open/Close columns are gone).
# ---------------------------------------------------------------------------
# Number of index log-return columns (lag 0 .. N_LAGS-1). The first
# N_LAGS-1 rows contain NaN for the deepest lag and are trimmed below.
N_LAGS = 5
WARMUP_ROWS = N_LAGS - 1
# Rows removed at the end of the sample. The final row has no next-day values:
# its shift(-1) comparisons are made against NaN and evaluate to 0.
TAIL_ROWS = 2

# Download the VIX training history if it is not on disk yet, then (re)load it:
# the preceding feature cells reduce training_indice_sp_vol to a single Series.
if not os.path.exists("data/training_data/training_indice_sp_vol.csv"):
    training_indice_sp_vol = yf.download("^VIX",start="2001-01-01",end="2024-12-31",interval="1d")[["Open","Close"]]
    training_indice_sp_vol.to_csv("data/training_data/training_indice_sp_vol.csv")
training_indice_sp_vol = pd.read_csv(
    "data/training_data/training_indice_sp_vol.csv",index_col=[0],header=[0,1])
training_indice_sp_vol.index = pd.to_datetime(training_indice_sp_vol.index)

# Keep only the dates shared by the stocks, the VIX and the FTSE index so the
# positional np.concatenate below pairs rows of the same date.
training_ftse, training_indice_sp_vol = training_ftse.align(training_indice_sp_vol, join="inner", axis=0)
training_ftse, training_indice_ftse = training_ftse.align(training_indice_ftse, join="inner",axis=0)

tickers = sorted(set(training_ftse.columns.get_level_values(1)))
# The three loops are kept separate on purpose: they fix the column order
# (all log returns, then all gap-up flags, then all gap-down flags).
for ticker in tickers:
    # Intraday log return: ln(Close / Open) of the same row.
    training_ftse[("log_return",ticker)] = np.log(training_ftse[("Close",ticker)]/
                                                training_ftse[("Open",ticker)])
for ticker in tickers:
    # 1 when the next row's Open is more than 1 % above this row's Close.
    training_ftse[("open_gap_up",ticker)] = np.where(
        training_ftse[("Open",ticker)].shift(-1)>training_ftse[("Close",ticker)]*1.01,1,0
        )
for ticker in tickers:
    # 1 when the next row's Open is more than 1 % below this row's Close.
    training_ftse[("open_gap_down",ticker)] = np.where(
        training_ftse[("Open",ticker)].shift(-1)<training_ftse[("Close",ticker)]*0.99,1,0
        )
# Keep only the columns appended above; three per ticker (9 for 3 tickers),
# derived from the ticker list instead of being hard-coded.
n_feature_cols = 3*len(tickers)
training_ftse = training_ftse.iloc[:,-n_feature_cols:]

# VIX intraday log return, reduced to a single Series (the last column).
training_indice_sp_vol[("log_return","^VIX")] = np.log(training_indice_sp_vol[("Close","^VIX")]/
                                                training_indice_sp_vol[("Open","^VIX")])
training_indice_sp_vol = training_indice_sp_vol.iloc[:,-1]

x_classifier_ftse = training_ftse.merge(training_indice_sp_vol,how="inner",left_index=True,right_index=True)

# Index-level features: lagged intraday log returns and next-open gap flags.
previous_indice_day = {}
for i in range(0,N_LAGS):
    previous_indice_day[("shift",f"shift {str(i)}")] = np.log(
        training_indice_ftse[("Close","^FTSE")].shift(i)/training_indice_ftse[("Open","^FTSE")].shift(i)
        )
previous_indice_day[("gap","open_gap_up")] = np.where(
    training_indice_ftse[("Open","^FTSE")].shift(-1)>training_indice_ftse[("Close","^FTSE")]*1.01,1,0
    )
previous_indice_day[("gap","open_gap_down")] = np.where(
    training_indice_ftse[("Open","^FTSE")].shift(-1)<training_indice_ftse[("Close","^FTSE")]*0.99,1,0
    )
previous_indice_day = pd.DataFrame(previous_indice_day, index=training_indice_ftse.index)

# Untrimmed, index-joined copy of every feature, saved for inspection.
x_classifier_ftse_csv = x_classifier_ftse.merge(previous_indice_day,how="inner",left_index=True,right_index=True)
x_classifier_ftse_csv.to_csv("data/training_data_x_classifier/x_classifier_ftse.csv")

# Trim the warm-up and tail rows, then assemble the final NumPy design matrix.
previous_indice_day = previous_indice_day.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_ftse = x_classifier_ftse.iloc[WARMUP_ROWS:-TAIL_ROWS,:]
x_classifier_ftse = x_classifier_ftse.to_numpy()
previous_indice_day = previous_indice_day.to_numpy()
x_classifier_ftse = np.concatenate((x_classifier_ftse,previous_indice_day), axis=1)

# Label: 1 when the next row's Close is above the next row's Open.
y_classifier_ftse = np.where(training_indice_ftse[("Close", "^FTSE")].shift(-1)>
training_indice_ftse[("Open", "^FTSE")].shift(-1),1,0)
y_classifier_ftse = y_classifier_ftse[WARMUP_ROWS:-TAIL_ROWS]

In [93]:
# Manual switch: while False, the FTSE 100 models are (re)trained on every run
# and the joblib files in data/model are overwritten; set to True to skip.
trained_ftse = False
if not trained_ftse:
    # Three class-weighting schemes are trained and compared side by side.
    class_weight_dict = {   "standard":None,
                            "balanced":"balanced",
                            "signal":{0:1.7,1:1.0}  }
    classifier_model_ftse = {}
    model_accuracy_ftse = {}
    classification_report_ftse = {}
    # shuffle=False keeps the row order: the first 70 % of the rows train the
    # model and the last 30 % are held out for evaluation.
    x_train, x_test, y_train, y_test = train_test_split(x_classifier_ftse, y_classifier_ftse, test_size=0.3, shuffle=False)
    for name,weight in class_weight_dict.items():
        # random_state pins the bootstrap samples and feature draws so that the
        # saved models and the printed metrics are reproducible across runs.
        model = RandomForestClassifier( n_estimators=4000,
                                        max_depth=10,
                                        min_samples_split=10,
                                        min_samples_leaf=4,
                                        max_features='sqrt',
                                        class_weight=weight,
                                        n_jobs=-1,
                                        random_state=42 )
        model = model.fit(x_train,y_train)
        classifier_model_ftse[name] = model
        y_pred = model.predict(x_test)
        model_accuracy_ftse[name] = accuracy_score(y_test, y_pred)
        print(f"La précision du modèle {name} est de {round(model_accuracy_ftse[name]*100,3)} %")
        classification_report_ftse[name] = classification_report(y_test, y_pred)
        print(classification_report_ftse[name])
    # Persist the fitted models together with their evaluation results.
    joblib.dump(classifier_model_ftse, "data/model/classifier_model_ftse.joblib")
    joblib.dump(model_accuracy_ftse, "data/model/model_accuracy_ftse.joblib")
    joblib.dump(classification_report_ftse, "data/model/classification_report_ftse.joblib")

La précision du modèle standard est de 60.698 %
              precision    recall  f1-score   support

           0       0.59      0.52      0.55       835
           1       0.62      0.69      0.65       941

    accuracy                           0.61      1776
   macro avg       0.61      0.60      0.60      1776
weighted avg       0.61      0.61      0.60      1776

La précision du modèle balanced est de 61.092 %
              precision    recall  f1-score   support

           0       0.59      0.55      0.57       835
           1       0.63      0.66      0.64       941

    accuracy                           0.61      1776
   macro avg       0.61      0.61      0.61      1776
weighted avg       0.61      0.61      0.61      1776

La précision du modèle signal est de 54.561 %
              precision    recall  f1-score   support

           0       0.51      0.83      0.63       835
           1       0.66      0.29      0.40       941

    accuracy                           0