In [1]:
import os
import s3fs
import pandas as pd
import numpy as np

In [2]:
S3_ENDPOINT_URL = "https://" + os.environ['AWS_S3_ENDPOINT']

fs = s3fs.S3FileSystem(client_kwargs = {'endpoint_url' : S3_ENDPOINT_URL})

BUCKET = "malcouffe1/Module_1"
FILE_KEY_S3 = "/readmission_avc.parquet"
FILE_PATH_S3 = BUCKET + FILE_KEY_S3

In [3]:
with fs.open(FILE_PATH_S3, 'rb') as file_in:
    df = pd.read_parquet(file_in)

In [4]:
df

Unnamed: 0,modeEntree,modeSortie,duree,ghm2,dp,sexe,age,nbActe,nbRum,nbda,id,id_D
0,8,9,0,01M37E,I671,2.0,76.0,4,1,,l19,
1,8,8,3,01C061,I652,2.0,77.0,4,1,1.0,s7e,
2,8,7,13,01M303,I634,,,4,1,7.0,23f,
3,8,8,11,01M301,I639,1.0,83.0,4,2,2.0,8oi,
4,8,6,8,01M303,I635,1.0,71.0,4,1,9.0,otz,ld
...,...,...,...,...,...,...,...,...,...,...,...,...
1695,8,7,1,01M30T,I614,1.0,88.0,4,1,4.0,kjg,
1696,8,6,10,01M303,I635,1.0,81.0,10,3,7.0,gie,my
1697,8,8,8,01M301,I639,1.0,68.0,5,3,6.0,6bl,
1698,8,8,11,01M301,I676,2.0,28.0,16,5,7.0,7m8,


## 1. Data pre-processing

In [5]:
df.isna().sum()

modeEntree      0
modeSortie      0
duree           0
ghm2            0
dp              0
sexe           20
age            20
nbActe          0
nbRum           0
nbda          134
id              0
id_D          200
dtype: int64

On peut supprimer les lignes ou id_D est nul car on ne peut pas préduire dans ce cas la (id_D est la target).

In [6]:
df = df.dropna(subset="id_D", axis=0)

In [7]:
df.loc[:, 'rea'] = df.loc[:, 'id_D'].apply(lambda x : 1 if x != '' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'rea'] = df.loc[:, 'id_D'].apply(lambda x : 1 if x != '' else 0)


In [8]:
df

Unnamed: 0,modeEntree,modeSortie,duree,ghm2,dp,sexe,age,nbActe,nbRum,nbda,id,id_D,rea
0,8,9,0,01M37E,I671,2.0,76.0,4,1,,l19,,0
1,8,8,3,01C061,I652,2.0,77.0,4,1,1.0,s7e,,0
2,8,7,13,01M303,I634,,,4,1,7.0,23f,,0
4,8,6,8,01M303,I635,1.0,71.0,4,1,9.0,otz,ld,1
6,8,7,8,01M302,I639,1.0,92.0,7,2,4.0,np7,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,8,7,1,01M30T,I614,1.0,88.0,4,1,4.0,kjg,,0
1696,8,6,10,01M303,I635,1.0,81.0,10,3,7.0,gie,my,1
1697,8,8,8,01M301,I639,1.0,68.0,5,3,6.0,6bl,,0
1698,8,8,11,01M301,I676,2.0,28.0,16,5,7.0,7m8,,0


In [9]:
# On convertit les colonnes str dans le bon format :

str_col = ['modeEntree', 'modeSortie', 'sexe']

df.loc[:, str_col] = df.loc[:, str_col].map(str)

df = df.replace('nan', np.nan)

  df.loc[:, str_col] = df.loc[:, str_col].map(str)
  df.loc[:, str_col] = df.loc[:, str_col].map(str)
  df.loc[:, str_col] = df.loc[:, str_col].map(str)


In [10]:
df.isna().sum()

modeEntree      0
modeSortie      0
duree           0
ghm2            0
dp              0
sexe           19
age            19
nbActe          0
nbRum           0
nbda          121
id              0
id_D            0
rea             0
dtype: int64

On voit que la colonne nbda pose problème. IL s'agit du nombre de comorbidité et on ne sait pas si les valeurs nulles sont 0 ou inconnues. Dans ce cas après consultation avec le métier, les NaN sont des 0 donc on peut les remplacer par des 0.

In [11]:
df['nbda'] = df['nbda'].fillna(0)

Il reste à supprimer les colonnes id et id_D et supprimer les lignes des clients qui sont des décédés puisque ceux la ne seront pas re-admis...

In [12]:
df = df[df["modeSortie"] != '9'].drop(['id', 'id_D'], axis=1)

In [13]:
file_path = os.path.join(BUCKET, "dataset.parquet")
with fs.open(file_path, 'wb') as file_out:
    df.to_parquet(file_out)

# 2. Features Engineering
## 2.1 Principes

In [14]:
from sklearn.impute import SimpleImputer

In [15]:
pd.DataFrame(SimpleImputer(strategy='most_frequent').fit_transform(df[["sexe"]]))

Unnamed: 0,0
0,2.0
1,1.0
2,1.0
3,1.0
4,2.0
...,...
1317,1.0
1318,1.0
1319,1.0
1320,2.0


## 2.2 Preprocessing Pipeline

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.ensemble import RandomForestClassifier

from pprint import pprint

On sépare les données en features et label. ATTENTION il ne faudra pas fit la Pipeline sur tout le dataset que sur le training set.

In [17]:
features = df.drop('rea', axis=1)
label = df['rea']

In [18]:
features.dtypes

modeEntree     object
modeSortie     object
duree           int32
ghm2           object
dp             object
sexe           object
age           float64
nbActe          int32
nbRum           int32
nbda          float64
dtype: object

In [19]:
num_features = features.select_dtypes(include=["int32", "float64"]).columns
cat_features = features.select_dtypes(include=["object"]).columns

In [20]:
# On instancie les pipelines pour les colonnes numériques et catégorielles
steps = [
    ('imp', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
]
num_transformer = Pipeline(steps=steps)

steps = [
    ('imp', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
]
cat_transformer = Pipeline(steps=steps)

In [21]:
# On instancie le ColumnTransformers pour les features numériques et catégorielles
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('ncatum', cat_transformer, cat_features)
])

On sépare le dataset en training et test sets.

In [22]:
# Première étape : séparer un jeu de test
X_train_val, X_test, y_train_val, y_test = train_test_split(
    features, label, test_size=0.1, stratify=label, shuffle=True, random_state=42
)

# Affichage des dimensions
print(f"X_train shape: {X_train_val.shape}")
print(f"y_train shape: {y_train_val.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_test shape:  {y_test.shape}")

X_train shape: (1189, 10)
y_train shape: (1189,)
X_test shape:  (133, 10)
y_test shape:  (133,)


# 3. Model

## 3.1. LogisticRegression

In [23]:
reg_pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression())
])

reg_pipe_fitted = reg_pipe.fit(X_train_val, y_train_val)

In [24]:
y_pred = reg_pipe_fitted.predict(X_test)

print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC  : {roc_auc_score(y_test, y_pred):.4f}")
print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")

Accuracy : 0.9323
ROC AUC  : 0.7955
F1 Score : 0.7429


## 3.2. RandomForest

In [25]:
# Pipeline
reg_pipe = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Grille de paramètres
param_grid = {
    'classifier__n_estimators': [50, 100, 300, 500],
    'classifier__max_depth': [5, 10, 20, None],
    'classifier__max_features': ['sqrt', 'log2']
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Création et fit du GridSearch
grid_search = GridSearchCV(
    estimator=reg_pipe,
    param_grid=param_grid,
    cv=cv,
    n_jobs=-1
)

grid_search.fit(X_train_val, y_train_val)

# Récupération du meilleur modèle
reg_pipe_fitted = grid_search.best_estimator_

# Prédictions sur le test
y_pred = reg_pipe_fitted.predict(X_test)

# Affichage des métriques
print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC  : {roc_auc_score(y_test, y_pred):.4f}")
print(f"F1 Score : {f1_score(y_test, y_pred):.4f}")

Accuracy : 0.9248
ROC AUC  : 0.7727
F1 Score : 0.7059
