In [1]:
import pandas as pd
import pyreadr
from rapidfuzz import process, fuzz

In [2]:
def load_rda(file_paths):
    """Read several R data files and gather their objects into one dict.

    Parameters
    ----------
    file_paths : iterable of str
        Paths passed one at a time to ``pyreadr.read_r``.

    Returns
    -------
    dict
        Maps each object name found in the files to the object returned by
        ``pyreadr.read_r``. If two files contain the same object name, the
        one read last overwrites the earlier one.
    """
    collected = {}
    for path in file_paths:
        # read_r yields a mapping of object name -> data; fold it in wholesale.
        collected.update(pyreadr.read_r(path))
    return collected

In [3]:
def missing_values(df):
    """Return, for each column of ``df``, the number of missing entries."""
    null_flags = df.isna()
    return null_flags.sum()

In [4]:
def outlier_values(df):
    """Percentage of IQR outliers in every numeric column of ``df``.

    A value counts as an outlier when it lies more than 1.5 * IQR below the
    first quartile or more than 1.5 * IQR above the third quartile.

    Returns
    -------
    pandas.Series
        Indexed by numeric column name; each value is the share of outlying
        rows expressed in percent (0-100).
    """
    pct_by_column = pd.Series(dtype="float64")
    numeric_part = df.select_dtypes(include=["number"])

    for name in numeric_part.columns:
        values = df[name]
        lower_q, upper_q = values.quantile(0.25), values.quantile(0.75)
        spread = upper_q - lower_q
        low_fence = lower_q - 1.5 * spread
        high_fence = upper_q + 1.5 * spread

        is_outlier = (values < low_fence) | (values > high_fence)
        # The mean of a boolean mask is the fraction of True values.
        pct_by_column[name] = is_outlier.mean() * 100
    return pct_by_column

In [5]:
def clean_data(data_dict, date_columns=('RecordBeg', 'RecordEnd')):
    """Drop duplicate rows and parse date columns in every table of a dict.

    Parameters
    ----------
    data_dict : dict
        Maps a table name to a pandas DataFrame.
    date_columns : iterable of str, optional
        Columns to convert with ``pd.to_datetime`` when they exist in a table.
        Values that cannot be parsed become ``NaT`` (``errors='coerce'``).

    Returns
    -------
    dict
        Same keys as ``data_dict``; each value is a new, de-duplicated
        DataFrame. The input frames are left untouched.
    """
    cleaned_dict = {}
    for key, df in data_dict.items():
        # drop_duplicates returns a new frame, so the caller's data is not modified.
        df_cleaned = df.drop_duplicates()

        parsed = {
            col: pd.to_datetime(df_cleaned[col], errors='coerce')
            for col in date_columns
            if col in df_cleaned.columns
        }
        if parsed:
            # assign() swaps in whole new columns. Writing through
            # ``.loc[:, col] = ...`` instead sets values in place on recent
            # pandas versions, which leaves an object column as object dtype
            # and can emit SettingWithCopyWarning on the de-duplicated frame.
            df_cleaned = df_cleaned.assign(**parsed)

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

## beMTPL

In [6]:
# Load the .rda file. load_rda returns a dict keyed by the object names stored
# in the file, so `df` is a dict here, not a DataFrame.
raw_data_path = ['../raw_data/beMTPL97.rda']
df = load_rda(raw_data_path)
# clean_data removes duplicate rows from every table in the dict (and handles
# the RecordBeg / RecordEnd columns when a table has them).
df_1=clean_data(df)
# Pick the table stored under the key 'beMTPL97'.
df_beMTPL=df_1['beMTPL97']
df_beMTPL

Unnamed: 0,id,expo,claim,nclaims,amount,average,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,postcode,long,lat
0,1,1.000000,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386
1,2,1.000000,0,0,0.000000,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386
2,3,1.000000,0,0,0.000000,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386
3,4,1.000000,0,0,0.000000,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163207,163208,1.000000,0,0,0.000000,,TPL,37,male,5,40,10,diesel,work,0,9990,3.421256,51.199975
163208,163209,1.000000,0,0,0.000000,,TPL,44,male,0,55,6,diesel,private,0,9990,3.421256,51.199975
163209,163210,1.000000,0,0,0.000000,,TPL,50,male,0,40,10,diesel,private,0,9990,3.421256,51.199975
163210,163211,1.000000,0,0,0.000000,,TPL,43,male,0,66,7,gasoline,private,0,9990,3.421256,51.199975


In [7]:
# Postcode reference table, stored as a semicolon-separated CSV.
# NOTE(review): this is an absolute, machine-specific path, whereas the .rda
# file is loaded through a relative path; consider a shared base directory.
df_belge=pd.read_csv('/home/onyxia/work/Federated_Learning_Milliman/clean_data/code-postaux-belge.csv', sep=';')

In [8]:
# Keep only the postcode ('Code') and locality name ('Localite') columns.
df_code_postaux_belge=df_belge[['Code','Localite']]
df_code_postaux_belge

Unnamed: 0,Code,Localite
0,1000,Bruxelles
1,1020,Laeken
2,1040,Etterbeek
3,1070,Anderlecht
4,1140,Evere
...,...,...
2756,9880,Aalter
2757,9890,Baaigem
2758,9940,Ertvelde
2759,9968,Bassevelde


In [9]:
# Align the key name with df_beMTPL so both tables can be joined on 'postcode'.
df_code_postaux_belge = df_code_postaux_belge.rename(columns={'Code': 'postcode'})

# If a postcode appears more than once in the reference table, keep the first
# row so that each postcode maps to exactly one locality name.
df_code_postaux_belge_clean = df_code_postaux_belge.drop_duplicates(subset='postcode', keep='first')

# Left join: every policy row is kept, and 'Localite' is missing when the
# postcode is absent from the reference table.
# - Dropping a pre-existing 'Localite' column makes the cell safe to re-run;
#   without it a second run produces 'Localite_x' / 'Localite_y'.
# - validate='many_to_one' makes pandas raise if the lookup side ever contains
#   duplicate postcodes, which would otherwise silently duplicate policy rows.
df_beMTPL = df_beMTPL.drop(columns='Localite', errors='ignore').merge(
    df_code_postaux_belge_clean,
    on='postcode',
    how='left',
    validate='many_to_one',
)


In [10]:
# Display the table to confirm that the merge added the 'Localite' column.
df_beMTPL

Unnamed: 0,id,expo,claim,nclaims,amount,average,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,postcode,long,lat,Localite
0,1,1.000000,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
1,2,1.000000,0,0,0.000000,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
2,3,1.000000,0,0,0.000000,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386,Bruxelles
3,4,1.000000,0,0,0.000000,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163207,163208,1.000000,0,0,0.000000,,TPL,37,male,5,40,10,diesel,work,0,9990,3.421256,51.199975,Maldegem
163208,163209,1.000000,0,0,0.000000,,TPL,44,male,0,55,6,diesel,private,0,9990,3.421256,51.199975,Maldegem
163209,163210,1.000000,0,0,0.000000,,TPL,50,male,0,40,10,diesel,private,0,9990,3.421256,51.199975,Maldegem
163210,163211,1.000000,0,0,0.000000,,TPL,43,male,0,66,7,gasoline,private,0,9990,3.421256,51.199975,Maldegem


In [11]:
# Provenance: the density table was originally scraped from Wikipedia with the
# commented-out code below, saved to 'wiki_scraping.csv', and is now read back
# from that file.
# NOTE(review): the scraping code saved the file without index=False, which is
# presumably why an 'Unnamed: 0' column shows up after the density merge.
#url = "https://fr.wikipedia.org/wiki/Liste_des_communes_de_Belgique_par_population"

#tables = pd.read_html(url)
#df_densite = tables[0][['Commune', 'Hab. par km²']]

# Clean the density values: remove \xa0 (non-breaking spaces) and regular spaces
#df_densite['Hab. par km²'] = df_densite['Hab. par km²'].astype(str).str.replace('\xa0', '', regex=False).str.replace(' ', '', regex=False).astype(int)
#df_densite.to_csv('wiki_scraping.csv')
# Show the unique values
#print(df_densite["Hab. par km²"].unique())
# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
df_densite=pd.read_csv('/home/onyxia/work/Federated_Learning_Milliman/clean_data/wiki_scraping.csv')

In [12]:

# --- 1. Prepare the wiki density table ---
# Remove '*' / '**' markers from the commune names, trim and lower-case them,
# then keep a single row per commune.
df_densite['Commune'] = df_densite['Commune'].str.replace(r'\*\*?', '', regex=True).str.strip().str.lower()
df_densite = df_densite.drop_duplicates(subset='Commune', keep='first')

# --- 2. Normalise the locality names in df_beMTPL ---
# Deliberately no astype(str): it would turn a missing locality into the
# literal string 'nan', which the dropna() below could not remove and which the
# fuzzy matcher would then pair with some real commune (bogus density).
df_beMTPL['Localite'] = df_beMTPL['Localite'].str.strip().str.lower()

# --- 3. Build a fuzzy mapping locality -> closest wiki commune name ---
communes_wiki = df_densite['Commune'].tolist()
unique_communes = df_beMTPL['Localite'].dropna().unique()

# extractOne returns a (match, score, index) tuple; only the matched name is kept.
# NOTE(review): no score_cutoff is used, so every locality is mapped to its
# nearest commune name however poor the similarity; consider inspecting the scores.
fuzzy_map = {
    c: process.extractOne(c, communes_wiki, scorer=fuzz.token_sort_ratio)[0]
    for c in unique_communes
}

# --- 4. Apply the mapping and merge the density ---
# Rows without a locality keep a missing 'commune_match' and therefore a missing
# density; drop or impute them explicitly before modelling.
df_beMTPL['commune_match'] = df_beMTPL['Localite'].map(fuzzy_map)

df_beMTPL = df_beMTPL.merge(
    df_densite.rename(columns={'Commune': 'commune_match', 'Hab. par km²': 'hab_par_km2'}),
    on='commune_match',
    how='left'
)

In [13]:
# List the columns after the density merge. The 'Unnamed: 0' entry presumably
# comes from the density CSV (a saved index column).
print(df_beMTPL.columns)

Index(['id', 'expo', 'claim', 'nclaims', 'amount', 'average', 'coverage',
       'ageph', 'sex', 'bm', 'power', 'agec', 'fuel', 'use', 'fleet',
       'postcode', 'long', 'lat', 'Localite', 'commune_match', 'Unnamed: 0',
       'hab_par_km2'],
      dtype='object')


In [14]:
# Keep only the columns listed here. Every other column, including the merge
# helpers 'commune_match' and 'Unnamed: 0', is dropped.
df_beMTPL=df_beMTPL[["id","sex","amount","nclaims","expo","power", "ageph","fuel","Localite", "hab_par_km2"]]

In [15]:
# Display the reduced table (selected columns only).
df_beMTPL

Unnamed: 0,id,sex,amount,nclaims,expo,power,ageph,fuel,Localite,hab_par_km2
0,1,male,1618.001036,1,1.000000,77,50,gasoline,bruxelles,13928
1,2,female,0.000000,0,1.000000,66,64,gasoline,bruxelles,13928
2,3,male,0.000000,0,1.000000,70,60,diesel,bruxelles,13928
3,4,male,0.000000,0,1.000000,57,77,gasoline,bruxelles,13928
4,5,female,155.974606,1,0.046575,70,28,gasoline,bruxelles,13928
...,...,...,...,...,...,...,...,...,...,...
163207,163208,male,0.000000,0,1.000000,40,37,diesel,maldegem,259
163208,163209,male,0.000000,0,1.000000,55,44,diesel,maldegem,259
163209,163210,male,0.000000,0,1.000000,40,50,diesel,maldegem,259
163210,163211,male,0.000000,0,1.000000,66,43,gasoline,maldegem,259


In [16]:
# Rename the source columns to the target schema.
df_beMTPL = df_beMTPL.rename(columns={
    'id': 'Id_policy',
    'amount': 'ClaimAmount',
    'nclaims': 'ClaimNb',
    'expo': 'Exposure',
    'power': 'Power',
    'ageph': 'DriverAge',
    'sex': 'Sex',
    'fuel': 'Fuel_type',
    'hab_par_km2': 'Density'
})
# Columns kept for modelling; 'Sinistre' is the target created just below.
features = ["Exposure", 'Power', 'DriverAge', "Fuel_type", "Density", "Sex", 'Sinistre']

# Binary target: 1 when the claim amount is strictly positive, else 0.
df_beMTPL['Sinistre'] = (df_beMTPL['ClaimAmount'] > 0).astype(int)

# Vectorised 0/1 encodings (diesel -> 1, male -> 1, anything else -> 0).
# A comparison followed by astype(int) always yields a plain integer column,
# whereas a per-element apply() on a categorical column can return a
# categorical result.
df_beMTPL['Fuel_type'] = (df_beMTPL['Fuel_type'] == 'diesel').astype(int)
df_beMTPL['Sex'] = (df_beMTPL['Sex'] == 'male').astype(int)

# Drop everything that is not a model feature or the target.
df_beMTPL=df_beMTPL[features]
df_beMTPL

Unnamed: 0,Exposure,Power,DriverAge,Fuel_type,Density,Sex,Sinistre
0,1.000000,77,50,0,13928,1,1
1,1.000000,66,64,0,13928,0,0
2,1.000000,70,60,1,13928,1,0
3,1.000000,57,77,0,13928,1,0
4,0.046575,70,28,0,13928,0,1
...,...,...,...,...,...,...,...
163207,1.000000,40,37,1,259,1,0
163208,1.000000,55,44,1,259,1,0
163209,1.000000,40,50,1,259,1,0
163210,1.000000,66,43,0,259,1,0


In [17]:
# Class proportions of the target 'Sinistre' in df_beMTPL
print(df_beMTPL['Sinistre'].value_counts(normalize=True))


Sinistre
0    0.888023
1    0.111977
Name: proportion, dtype: float64


In [18]:
# Save the prepared table (before any resampling), without the index column.
df_beMTPL.to_csv('../data/belgium_data.csv', index=False)

In [19]:
from imblearn.over_sampling import SMOTENC
from sklearn.model_selection import train_test_split

# Split the predictors from the target.
X = df_beMTPL.drop(columns='Sinistre')
y = df_beMTPL['Sinistre']

# Positions of the categorical predictors, looked up by name so they remain
# correct if the feature order changes (currently Fuel_type -> 3, Sex -> 5).
cat_features = [X.columns.get_loc(col) for col in ('Fuel_type', 'Sex')]

# sampling_strategy is the minority/majority ratio after resampling:
# 0.43 / (1 + 0.43) is roughly 30 % of rows in the positive class.
smote_nc = SMOTENC(categorical_features=cat_features, sampling_strategy=0.43, random_state=42)
X_res, y_res = smote_nc.fit_resample(X, y)

# Rebuild a single table from the resampled predictors and target.
# Note: this overwrites df_beMTPL with the augmented data.
df_beMTPL = X_res.copy()
df_beMTPL['Sinistre'] = y_res

# Check the resulting class proportions.
proportions = df_beMTPL['Sinistre'].value_counts(normalize=True)
print(proportions)


Sinistre
0    0.699302
1    0.300698
Name: proportion, dtype: float64


In [20]:
# Display the resampled table (the recorded output has more rows than before resampling).
df_beMTPL

Unnamed: 0,Exposure,Power,DriverAge,Fuel_type,Density,Sex,Sinistre
0,1.000000,77,50,0,13928,1,1
1,1.000000,66,64,0,13928,0,0
2,1.000000,70,60,1,13928,1,0
3,1.000000,57,77,0,13928,1,0
4,0.046575,70,28,0,13928,0,1
...,...,...,...,...,...,...,...
207253,0.653100,51,34,1,1985,1,1
207254,1.000000,52,30,0,2639,1,1
207255,1.000000,58,53,1,4762,1,1
207256,1.000000,85,42,0,473,1,1


In [22]:
# Save df_beMTPL as left by the resampling cell, without the index column.
# NOTE(review): absolute, machine-specific path; consider a configurable base directory.
df_beMTPL.to_csv('/home/onyxia/work/Federated_Learning_Milliman/data_augmentation/belgium_data.csv', index=False)