In [1]:
import tensorflow as tf
from tensorflow.keras.layers import GRU, Dense, Input, Bidirectional, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import pandas as pd
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix




In [2]:
# LOAD data
X_train =pd.read_csv('C:\\Users\\Dell\\Desktop\\CFM\\data\\X_train.csv')
Y_train =pd.read_csv("C:\\Users\\Dell\\Desktop\\CFM\\data\\Y_train.csv")

X_test =pd.read_csv("C:\\Users\\Dell\\Desktop\\CFM\\data\\X_test.csv")
Y_test =pd.read_csv("C:\\Users\\Dell\\Desktop\\CFM\\data\\Y_test.csv")

In [3]:
X_train.head(5)

Unnamed: 0,obs_id,venue,order_id,action,side,price,bid,ask,bid_size,ask_size,trade,flux
0,0,4,0,A,A,0.3,0.0,0.01,100,1,False,100
1,0,4,1,A,B,-0.17,0.0,0.01,100,1,False,100
2,0,4,2,D,A,0.28,0.0,0.01,100,1,False,-100
3,0,4,3,A,A,0.3,0.0,0.01,100,1,False,100
4,0,4,4,D,A,0.37,0.0,0.01,100,1,False,-100


# <span style="color:red;">Data Preparation </span>

## <span style="color:orange;">1)Embedding training data </span>

In [4]:
def one_hot_encode(df, column, num_categories=8):
    # Encodage one-hot avec pd.get_dummies
    encoded_df = pd.get_dummies(df[column], prefix=column)
    
    # Générer les noms des colonnes attendues
    expected_columns = [f"{column}_{i}" for i in range(num_categories)]
    
    # Ajouter des colonnes manquantes avec des zéros
    for col in expected_columns:
        if col not in encoded_df.columns:
            encoded_df[col] = 0
    
    # Réorganiser les colonnes dans l'ordre attendu
    encoded_df = encoded_df[expected_columns]
    
    return encoded_df

In [5]:
df1 = one_hot_encode(X_train, 'venue', 8)
df3 = one_hot_encode(X_train, 'trade', 8)

In [6]:
df1.head(5)

Unnamed: 0,venue_0,venue_1,venue_2,venue_3,venue_4,venue_5,venue_6,venue_7
0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0


In [7]:
# Encodage one-hot avec 8 colonnes
encoded_columns = pd.get_dummies(X_train['action'], prefix='action')

# Création d'une colonne 'action_0' remplie de zéros
encoded_columns['action_0'] = 0

# Réorganiser les colonnes pour avoir 'action_0' en première position
encoded_columns = encoded_columns[['action_0'] + [col for col in encoded_columns.columns if col != 'action_0']]

# Ajout des colonnes manquantes avec des zéros si nécessaire
num_of_new_columns = 8
for i in range(num_of_new_columns - encoded_columns.shape[1]):
    encoded_columns[f'action_{encoded_columns.shape[1]}'] = 0

In [8]:
df2=pd.DataFrame(encoded_columns)

In [9]:
df2.head(5)

Unnamed: 0,action_0,action_A,action_D,action_U,action_4,action_5,action_6,action_7
0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0


In [10]:
df3.head(5)

Unnamed: 0,trade_0,trade_1,trade_2,trade_3,trade_4,trade_5,trade_6,trade_7
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0


In [11]:
# Log Transform
X_train['flux'] = X_train['flux'] - X_train['flux'].min() + 1
data = {
    'log(bid_size+1)': np.sign(X_train['bid_size']) * np.log(np.abs(X_train['bid_size']) + 1),
    'log(ask_size+1)': np.log(X_train['ask_size'] + 1),
    'log(flux)': np.log(X_train['flux'])
}
dfa = pd.DataFrame(data)
X_selected = X_train[['bid', 'ask', 'price']]

In [12]:
# Combiner les features
df_combined = pd.concat([df1, df2, df3, X_selected, dfa], axis=1)

# Reshape the data to create sequences of (100, 30)
# We need to ensure we have enough rows to create full sequences
num_sequences = len(df_combined) // 100
X_reshaped = df_combined.iloc[:num_sequences * 100].values.reshape(num_sequences, 100, 30)

# Convertir les labels en tenseurs
Y_labels = pd.get_dummies(Y_train['eqt_code_cat']).values
Y_reshaped = Y_labels[:num_sequences]

# Conversion en tenseurs
X_tensor = tf.convert_to_tensor(X_reshaped, dtype=tf.float32)
Y_tensor = tf.convert_to_tensor(Y_reshaped, dtype=tf.float32)

## <span style="color:orange;">2)Embedding test Data </span>

In [13]:
# Appliquer le one-hot encoding aux colonnes 'venue', 'action', et 'trade' pour X_test
df1_test = one_hot_encode(X_test, 'venue', 8)
df3_test = one_hot_encode(X_test, 'trade', 8)

In [14]:
# Encodage one-hot avec 8 colonnes
encoded_columns = pd.get_dummies(X_test['action'], prefix='action')

# Création d'une colonne 'action_0' remplie de zéros
encoded_columns['action_0'] = 0

# Réorganiser les colonnes pour avoir 'action_0' en première position
encoded_columns = encoded_columns[['action_0'] + [col for col in encoded_columns.columns if col != 'action_0']]

# Ajout des colonnes manquantes avec des zéros si nécessaire
num_of_new_columns = 8
for i in range(num_of_new_columns - encoded_columns.shape[1]):
    encoded_columns[f'action_{encoded_columns.shape[1]}'] = 0

In [15]:
df2_test=pd.DataFrame(encoded_columns)

In [16]:
#Log Transform pour X_test
X_test['flux'] = X_test['flux'] - X_test['flux'].min() + 1
data_test = {
    'log(bid_size+1)':np.sign(X_test['bid_size']) * np.log(np.abs(X_test['bid_size']) + 1),
    'log(ask_size+1)': np.log(X_test['ask_size'] + 1),
    'log(flux)': np.log(X_test['flux'])
}
dfa_test = pd.DataFrame(data_test)
X_selected_test = X_test[['bid', 'ask', 'price']]

In [17]:
# Combiner les features pour X_test
df_combined_test = pd.concat([df1_test, df2_test, df3_test, X_selected_test, dfa_test], axis=1)

# Reshape the data to create sequences of (100, 30)
# We need to ensure we have enough rows to create full sequences for X_test
num_sequences_test = len(df_combined_test) // 100

# Only take enough rows to form complete sequences
X_combined_test = df_combined_test.iloc[:num_sequences_test * 100]

# Reshape the data to the required shape
X_reshaped_test = X_combined_test.values.reshape(num_sequences_test, 100, 30)

# Convertir les labels en tenseurs pour Y_test
Y_labels_test = pd.get_dummies(Y_test['eqt_code_cat']).values

# Ensure Y_reshaped_test has the same number of sequences as X_reshaped_test
Y_reshaped_test = Y_labels_test[:num_sequences_test]

# Conversion en tenseurs pour X_test et Y_test
X_tensor_test = tf.convert_to_tensor(X_reshaped_test, dtype=tf.float32)
Y_tensor_test = tf.convert_to_tensor(Y_reshaped_test, dtype=tf.float32)

# <span style="color:red;">Creation de modele </span>

In [None]:

# Définir la taille d'une observation
sequence_length = 100
feature_dim = 30

# Définir l'entrée du modèle
input_layer = Input(shape=(sequence_length, feature_dim))

# Ajouter des couches GRU bidirectionnelles avec Dropout et Batch Normalization
gru_1 = Bidirectional(GRU(64, return_sequences=True))(input_layer)
dropout_1 = Dropout(0.3)(gru_1)
batch_norm_1 = BatchNormalization()(dropout_1)
gru_2 = Bidirectional(GRU(64))(batch_norm_1)
dropout_2 = Dropout(0.3)(gru_2)
batch_norm_2 = BatchNormalization()(dropout_2)

# Ajouter des couches Denses
dense_1 = Dense(128, activation='selu')(batch_norm_2)
dropout_3 = Dropout(0.3)(dense_1)
dense_2 = Dense(64, activation='selu')(dropout_3)
output_layer = Dense(24, activation='softmax')(dense_2)

# Créer le modèle
model = Model(inputs=input_layer, outputs=output_layer


Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 30)]         0         
                                                                 
 bidirectional (Bidirection  (None, 100, 128)          36864     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 100, 128)          0         
                                                                 
 batch_normalization (Batch  (None, 100, 128)          512       
 Normalization)                                                  
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               74496     
 onal)                                                           
                                                            

In [None]:
# Compiler le modèle
optimizer = Adam(learning_rate=3e-3)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Afficher un résumé du modèle
model.summary()

# <span style="color:red;">Entrainement de modele </span>

In [None]:
# Définir le callback EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Entraîner le modèle avec EarlyStopping
history = model.fit(
    X_tensor, Y_tensor,
    validation_split=0.2,
    batch_size=1000,
    epochs=50,
    callbacks=[early_stopping]
)

In [None]:
param=model.save_weights('model_weights.h5')

# <span style="color:red;">Evaluation de modele </span>

In [None]:
# Évaluer le modèle sur l'ensemble de test
loss, accuracy = model.evaluate(X_tensor_test, Y_tensor_test, verbose=0)
print(f'Précision sur l\'ensemble de test: {accuracy}')

# Faire des prédictions et évaluer les performances
y_pred = model.predict(X_tensor_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(Y_reshaped_test, axis=1)

print('Rapport de classification:')
print(classification_report(y_true, y_pred_classes))
print('Matrice de confusion:')
print(confusion_matrix(y_true, y_pred_classes))