In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split


In [3]:

# Directory holding the three input MTPL datasets. It is defined once so the
# location can be changed in a single place instead of in every read_csv call.
DATA_DIR = "/home/onyxia/work/Federated_Learning_Milliman/data"

# One frame per dataset: French, Belgian and European.
df_freMTPL = pd.read_csv(os.path.join(DATA_DIR, "french_data.csv"))
df_beMTPL = pd.read_csv(os.path.join(DATA_DIR, "belgium_data.csv"))
df_euMTPL = pd.read_csv(os.path.join(DATA_DIR, "european_data.csv"))

In [4]:

# Positional indices, within the feature matrix X, of the categorical columns.
# Per the original note, index 3 is Fuel_type and index 5 is Sex -- TODO confirm
# against the CSV column order, since the indices are not derived from names.
cat_features = [3, 5]

# Response vector and feature matrix (every column except 'Sinistre').
y = df_freMTPL['Sinistre']
X = df_freMTPL.drop(columns=['Sinistre'])

# Oversample with SMOTENC. A float sampling_strategy is the minority/majority
# ratio after resampling, so 0.43 yields a share of 0.43 / 1.43, i.e. about
# 30 % of rows with Sinistre == 1 (see the proportions printed below).
smote_nc = SMOTENC(
    categorical_features=cat_features,
    sampling_strategy=0.43,
    random_state=42,
)
X_res, y_res = smote_nc.fit_resample(X, y)

# Rebuild the full table with the target appended as the last column.
# NOTE: this rebinds df_freMTPL to the resampled data.
df_freMTPL = X_res.assign(Sinistre=y_res)

# Sanity check: class shares after resampling.
proportions = df_freMTPL['Sinistre'].value_counts(normalize=True)
print(proportions)


Sinistre
0    0.699301
1    0.300699
Name: proportion, dtype: float64


In [5]:
# Write the resampled French dataset to disk. The target directory is derived
# from the output path so the two cannot drift apart; `os` is already imported
# in the notebook's first cell, so it is not re-imported here.
fre_output_path = '/home/onyxia/work/Federated_Learning_Milliman/data_augmentation/french_data.csv'
os.makedirs(os.path.dirname(fre_output_path), exist_ok=True)
df_freMTPL.to_csv(fre_output_path, index=False)

In [6]:
# Positional indices, within the feature matrix X, of the categorical columns.
# Per the original note, index 3 is Fuel_type and index 5 is Sex -- TODO confirm
# against the CSV column order, since the indices are not derived from names.
cat_features = [3, 5]

# Response vector and feature matrix (every column except 'Sinistre').
y = df_beMTPL['Sinistre']
X = df_beMTPL.drop(columns=['Sinistre'])

# Oversample with SMOTENC. A float sampling_strategy is the minority/majority
# ratio after resampling, so 0.43 yields a share of 0.43 / 1.43, i.e. about
# 30 % of rows with Sinistre == 1 (see the proportions printed below).
smote_nc = SMOTENC(
    categorical_features=cat_features,
    sampling_strategy=0.43,
    random_state=42,
)
X_res, y_res = smote_nc.fit_resample(X, y)

# Rebuild the full table with the target appended as the last column.
# NOTE: this rebinds df_beMTPL to the resampled data.
df_beMTPL = X_res.assign(Sinistre=y_res)

# Sanity check: class shares after resampling.
proportions = df_beMTPL['Sinistre'].value_counts(normalize=True)
print(proportions)


Sinistre
0    0.699302
1    0.300698
Name: proportion, dtype: float64


In [7]:
# Write the resampled Belgian dataset to disk. The target directory is derived
# from the output path so the two cannot drift apart; `os` is already imported
# in the notebook's first cell, so it is not re-imported here.
be_output_path = '/home/onyxia/work/Federated_Learning_Milliman/data_augmentation/belgium_data.csv'
os.makedirs(os.path.dirname(be_output_path), exist_ok=True)
df_beMTPL.to_csv(be_output_path, index=False)

In [8]:
# Discard every row that contains a missing value before resampling.
df_euMTPL = df_euMTPL.dropna()

# Positional indices, within the feature matrix X, of the categorical columns.
# Per the original note, index 3 is Fuel_type and index 5 is Sex -- TODO confirm
# against the CSV column order, since the indices are not derived from names.
cat_features = [3, 5]

# Response vector and feature matrix (every column except 'Sinistre').
y = df_euMTPL['Sinistre']
X = df_euMTPL.drop(columns=['Sinistre'])

# Oversample with SMOTENC. A float sampling_strategy is the minority/majority
# ratio after resampling, so 0.43 yields a share of 0.43 / 1.43, i.e. about
# 30 % of rows with Sinistre == 1 (see the proportions printed below).
smote_nc = SMOTENC(
    categorical_features=cat_features,
    sampling_strategy=0.43,
    random_state=42,
)
X_res, y_res = smote_nc.fit_resample(X, y)

# Rebuild the full table with the target appended as the last column.
# NOTE: this rebinds df_euMTPL to the resampled data.
df_euMTPL = X_res.assign(Sinistre=y_res)

# Sanity check: class shares after resampling.
proportions = df_euMTPL['Sinistre'].value_counts(normalize=True)
print(proportions)


Sinistre
0    0.699301
1    0.300699
Name: proportion, dtype: float64


In [9]:
# Write the resampled European dataset to disk. The target directory is derived
# from the output path so the two cannot drift apart; `os` is already imported
# in the notebook's first cell, so it is not re-imported here.
eu_output_path = '/home/onyxia/work/Federated_Learning_Milliman/data_augmentation/european_data.csv'
os.makedirs(os.path.dirname(eu_output_path), exist_ok=True)
df_euMTPL.to_csv(eu_output_path, index=False)

### Logistic regression on the SMOTENC-augmented datasets

In [11]:
import statsmodels.api as sm

def logistic_regression_summary(df, target='Sinistre', exclude=('Exposure',)):
    """Fit a statsmodels Logit of ``target`` on the remaining columns and print its summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the response column, the excluded columns and the
        predictors. Every column that is neither ``target`` nor listed in
        ``exclude`` is passed to the model unchanged. ``df`` is not modified.
    target : str, default 'Sinistre'
        Name of the response column.
    exclude : str, iterable of str or None, default ('Exposure',)
        Column(s) left out of the predictors in addition to ``target``.
        A single name may be given as a plain string; ``None`` excludes
        nothing besides ``target``.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(...).fit()``.

    Raises
    ------
    KeyError
        If ``target`` or an excluded column is missing from ``df``.
    """
    if exclude is None:
        exclude = ()
    elif isinstance(exclude, str):
        # A bare string would otherwise be unpacked character by character.
        exclude = (exclude,)

    # Build a fresh list so any iterable works and the caller's object is untouched.
    dropped_columns = [*exclude, target]

    # add_constant supplies the intercept column; Logit does not add one itself.
    predictors = sm.add_constant(df.drop(columns=dropped_columns))
    response = df[target]

    model = sm.Logit(response, predictors).fit()
    print(model.summary())
    return model

In [12]:
# Keep the fitted results for later inspection. Binding the return value also
# stops the cell from echoing the results object's bare repr after the summary.
logit_beMTPL = logistic_regression_summary(df_beMTPL)

Optimization terminated successfully.
         Current function value: 0.604790
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               Sinistre   No. Observations:               207258
Model:                          Logit   Df Residuals:                   207252
Method:                           MLE   Df Model:                            5
Date:                Thu, 08 May 2025   Pseudo R-squ.:                 0.01090
Time:                        22:57:15   Log-Likelihood:            -1.2535e+05
converged:                       True   LL-Null:                   -1.2673e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.6425      0.017    -37.625      0.000      -0.676      -0.609
Power          0.5369      0.

<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f8c88133050>

In [13]:
# Keep the fitted results for later inspection. Binding the return value also
# stops the cell from echoing the results object's bare repr after the summary.
logit_freMTPL = logistic_regression_summary(df_freMTPL)


Optimization terminated successfully.
         Current function value: 0.609694
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               Sinistre   No. Observations:              1487420
Model:                          Logit   Df Residuals:                  1487414
Method:                           MLE   Df Model:                            5
Date:                Thu, 08 May 2025   Pseudo R-squ.:                0.002880
Time:                        22:58:08   Log-Likelihood:            -9.0687e+05
converged:                       True   LL-Null:                   -9.0949e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8083      0.005   -152.488      0.000      -0.819      -0.798
Power         -0.8252      0.

<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f8adc9ebb00>

In [14]:
# Keep the fitted results for later inspection. Binding the return value also
# stops the cell from echoing the results object's bare repr after the summary.
logit_euMTPL = logistic_regression_summary(df_euMTPL)


Optimization terminated successfully.
         Current function value: 0.609409
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               Sinistre   No. Observations:              3130005
Model:                          Logit   Df Residuals:                  3129999
Method:                           MLE   Df Model:                            5
Date:                Thu, 08 May 2025   Pseudo R-squ.:                0.003347
Time:                        22:58:28   Log-Likelihood:            -1.9075e+06
converged:                       True   LL-Null:                   -1.9139e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.8672      0.006   -149.357      0.000      -0.879      -0.856
Power          1.9634      0.

<statsmodels.discrete.discrete_model.BinaryResultsWrapper at 0x7f8b0eb4d670>