In [1]:
import os
import s3fs
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from tensorflow.keras import models, layers
from tensorflow.keras.callbacks import EarlyStopping
from keras_tuner import HyperModel, RandomSearch

2025-01-21 13:11:53.899264: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-21 13:11:53.901383: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-21 13:11:53.906738: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-21 13:11:53.921926: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737465113.948005 2052160 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737465113.95

# 1. Data

In [2]:
S3_ENDPOINT_URL = "https://" + os.environ['AWS_S3_ENDPOINT']

fs = s3fs.S3FileSystem(client_kwargs = {'endpoint_url' : S3_ENDPOINT_URL})

BUCKET = "malcouffe1/Module_1"
FILE_KEY_S3 = "/readmission_avc.parquet"
FILE_PATH_S3 = BUCKET + FILE_KEY_S3

with fs.open(FILE_PATH_S3, 'rb') as file_in:
    df = pd.read_parquet(file_in)


In [3]:
df = df.dropna(subset="id_D", axis=0)
df.loc[:, 'rea'] = df.loc[:, 'id_D'].apply(lambda x : 1 if x != '' else 0)

# On convertit les colonnes str dans le bon format :

str_col = ['modeEntree', 'modeSortie', 'sexe']

df.loc[:, str_col] = df.loc[:, str_col].map(str)

df = df.replace('nan', np.nan)

df['nbda'] = df['nbda'].fillna(0)

df = df[df["modeSortie"] != '9'].drop(['id', 'id_D'], axis=1)

  df.loc[:, str_col] = df.loc[:, str_col].map(str)
  df.loc[:, str_col] = df.loc[:, str_col].map(str)
  df.loc[:, str_col] = df.loc[:, str_col].map(str)


# 2. Pre-processing

In [4]:
features = df.drop('rea', axis=1)
label = df['rea']

In [5]:
num_features = features.select_dtypes(include=["int32", "float64"]).columns
cat_features = features.select_dtypes(include=["object"]).columns

In [6]:
# On instancie les pipelines pour les colonnes numériques et catégorielles
steps = [
    ('imp', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
]
num_transformer = Pipeline(steps=steps)

steps = [
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
]
cat_transformer = Pipeline(steps=steps)

# On instancie le ColumnTransformers pour les features numériques et catégorielles
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('ncatum', cat_transformer, cat_features)
])

In [7]:
# Première étape : séparer un jeu de test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, label, test_size=0.1, stratify=label, shuffle=True, random_state=42
)

# Affichage des dimensions
print(f"X_train shape: {X_train_val.shape}")
print(f"y_train shape: {y_train_val.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_test shape:  {y_test.shape}")

X_train shape: (1189, 10)
y_train shape: (1189,)
X_test shape:  (133, 10)
y_test shape:  (133,)


# 3. Modeling

In [8]:
from keras_tuner import RandomSearch
from sklearn.model_selection import StratifiedKFold
import numpy as np
import tensorflow as tf

# Définir le modèle pour Keras Tuner
def build_model(hp):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(
            units=hp.Int('units', min_value=8, max_value=128, step=8),
            activation=hp.Choice('activation', values=['relu', 'tanh']),
            input_shape=(X_train_val.shape[1],)
        ),
        tf.keras.layers.Dropout(rate=hp.Float('dropout', min_value=0.0, max_value=0.5, step=0.1)),
        tf.keras.layers.Dense(1, activation='sigmoid')  # Classification binaire
    ])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='log')),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Configuration du tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=1,
    directory='random_search',
    project_name='tuning_project'
)

# Validation croisée stratifiée
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sauvegarder les scores pour chaque fold
all_scores = []

X_train_val_np = X_train_val.to_numpy()  # Convertir en tableau NumPy
y_train_val_np = y_train_val.to_numpy()

# Validation croisée stratifiée
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_idx, valid_idx in kf.split(X_train_val_np, y_train_val_np):
    # Utilisation des indices pour NumPy
    X_fold_train, X_fold_valid = X_train_val_np[train_idx], X_train_val_np[valid_idx]
    y_fold_train, y_fold_valid = y_train_val_np[train_idx], y_train_val_np[valid_idx]

    # Recherche d'hyperparamètres sur chaque fold
    tuner.search(
        X_fold_train, y_fold_train,
        validation_data=(X_fold_valid, y_fold_valid),
        epochs=50,
        batch_size=32,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        ],
        verbose=1
    )

    # Évaluation du meilleur modèle pour ce fold
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    best_model = tuner.hypermodel.build(best_hps)

    history = best_model.fit(
        X_fold_train, y_fold_train,
        validation_data=(X_fold_valid, y_fold_valid),
        epochs=50,
        batch_size=32,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
        ],
        verbose=0
    )

    score = best_model.evaluate(X_fold_valid, y_fold_valid, verbose=0)
    all_scores.append(score[1])  # Ajoute l'accuracy

print(f"Validation Cross-Validation Scores: {all_scores}")
print(f"Mean Validation Accuracy: {np.mean(all_scores)}")


Trial 2 Complete [00h 00m 01s]

Best val_accuracy So Far: None
Total elapsed time: 00h 00m 01s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
72                |72                |units
relu              |relu              |activation
0.1               |0.2               |dropout
0.0041277         |0.0058159         |lr



Traceback (most recent call last):
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 274, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 239, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 233, in _build_and_fit_model
    results = self.hypermodel.fit(hp, model, *args, **kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/hypermodel.py", line 149, in fit
    return model.fit(*args, **kwargs)
  File "/home/onyxia/work/.ven

RuntimeError: Number of consecutive failures exceeded the limit of 3.
Traceback (most recent call last):
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 274, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/base_tuner.py", line 239, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 314, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/tuner.py", line 233, in _build_and_fit_model
    results = self.hypermodel.fit(hp, model, *args, **kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras_tuner/src/engine/hypermodel.py", line 149, in fit
    return model.fit(*args, **kwargs)
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/onyxia/work/.venv/lib/python3.10/site-packages/optree/ops.py", line 766, in tree_map
    return treespec.unflatten(map(func, *flat_args))
ValueError: Invalid dtype: object
