In [57]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyreadr
import os
import numpy as np

In [58]:
def load_rda(file_paths):

  dataframes = {}

  for file_path in file_paths:
      result = pyreadr.read_r(file_path)

      for key in result.keys():
          dataframes[key] = result[key]

  return dataframes

In [59]:
def missing_values(df):
  return(df.isnull().sum())

In [60]:
import pandas as pd

def outlier_values(df):
    outliers = pd.Series(dtype="float64")

    for col in df.select_dtypes(include=["number"]):  
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
        outliers[col] = mask.mean() * 100  
    return outliers


In [63]:
def clean_data(data_dict):
    cleaned_dict = {}
    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()
        if 'RecordBeg' in df_cleaned.columns:
            df_cleaned.loc[:, 'RecordBeg'] = pd.to_datetime(df_cleaned['RecordBeg'], errors='coerce')
        if 'RecordEnd' in df_cleaned.columns:
            df_cleaned.loc[:, 'RecordEnd'] = pd.to_datetime(df_cleaned['RecordEnd'], errors='coerce')

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

## euMTPL

In [64]:
file_paths = ['euMTPL.rda']

In [65]:
df = load_rda(file_paths)

In [66]:
df1=clean_data(df)

In [67]:
df_euMTPL=df1['euMTPL']
df_euMTPL

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,B,9.0,1,1,RM,14,F,40,0.246575,0.0,0,0.0,0,0.0,0,0.0,0
2373193,2595211,train,B,9.0,1,1,LE,14,M,52,0.756164,0.0,0,0.0,0,0.0,0,0.0,0
2373194,2595212,test,E,9.0,1,1,LE,1,F,46,0.008219,0.0,0,0.0,0,0.0,0,0.0,0
2373195,2595213,test,T,9.0,1,1,LE,15,M,25,0.723288,0.0,0,0.0,0,0.0,0,0.0,0


In [68]:
# Création des nouvelles variables
df_euMTPL["ClaimNb"] = df_euMTPL[["num_nc", "num_cg", "num_cd", "num_fcg"]].fillna(0).sum(axis=1)
df_euMTPL["ClaimAmount"] = df_euMTPL[['cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']].fillna(0).sum(axis=1)

# Suppression des anciennes colonnes
cols_to_drop = ["num_nc", "num_cg", "num_cd", "num_fcg", 'cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']
df_euMTPL.drop(columns=cols_to_drop, inplace=True)

# Vérification
print(df_euMTPL.head())




   policy_id  group fuel_type  year vehicle_category vehicle_use province  \
0          1   test         B   7.0                1           1       PA   
1          2  train         B   7.0                1           1       NA   
2          4  train         B   7.0                1           1       CN   
3          5  train         B   7.0                1           1       NA   
4          6  train         B   7.0                1           1       NA   

   horsepower gender  age  exposure  ClaimNb  ClaimAmount  
0          14      M   77  0.487671        0          0.0  
1          12      M   40  0.019178        0          0.0  
2          14      M   75  0.032877        0          0.0  
3          13      M   48  0.043836        0          0.0  
4          12      F   54  0.046575        0          0.0  


In [69]:
df_euMTPL.head()

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,ClaimNb,ClaimAmount
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0,0.0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0,0.0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0,0.0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0,0.0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0,0.0


## beMTPL

In [79]:
file_paths = ['/home/onyxia/work/Federated_Learning_Milliman/raw_data/beMTPL97.rda']
df = load_rda(file_paths)
df_1=clean_data(df)
df_beMTPL=df_1['beMTPL97']
df_beMTPL

Unnamed: 0,id,expo,claim,nclaims,amount,average,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,postcode,long,lat
0,1,1.000000,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386
1,2,1.000000,0,0,0.000000,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386
2,3,1.000000,0,0,0.000000,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386
3,4,1.000000,0,0,0.000000,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163207,163208,1.000000,0,0,0.000000,,TPL,37,male,5,40,10,diesel,work,0,9990,3.421256,51.199975
163208,163209,1.000000,0,0,0.000000,,TPL,44,male,0,55,6,diesel,private,0,9990,3.421256,51.199975
163209,163210,1.000000,0,0,0.000000,,TPL,50,male,0,40,10,diesel,private,0,9990,3.421256,51.199975
163210,163211,1.000000,0,0,0.000000,,TPL,43,male,0,66,7,gasoline,private,0,9990,3.421256,51.199975


## freMTPL

In [85]:
df_freMTPL1 = pd.read_csv("freMTPL1.csv")
df_freMTPL2 = pd.read_csv("freMTPL2.csv")
# Afficher les premières lignes
print(df_freMTPL1.head())
print(df_freMTPL2.head())


   PolicyID  ClaimAmount  ClaimNb  Exposure Power  CarAge  DriverAge  \
0        33          302        1      0.75     g       1         61   
1        41         2001        1      0.14     l       5         50   
2        92         1449        1      0.14     d       0         36   
3        96         9924        2      0.62     j       0         51   
4        96          946        2      0.62     j       0         51   

                                Brand      Gas           Region  Density  
0  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
1  Japanese (except Nissan) or Korean   Diesel  Basse-Normandie       56  
2  Japanese (except Nissan) or Korean  Regular    Ile-de-France     4792  
3  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
4  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
   IDpol  ClaimAmount  ClaimNb  Exposure  VehPower  VehAge  DrivAge  \
0    139       303.00        1      0.75      

In [86]:
print(df_freMTPL1.columns)
print(df_freMTPL2.columns)

Index(['PolicyID', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'CarAge',
       'DriverAge', 'Brand', 'Gas', 'Region', 'Density'],
      dtype='object')
Index(['IDpol', 'ClaimAmount', 'ClaimNb', 'Exposure', 'VehPower', 'VehAge',
       'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas', 'Area', 'Density',
       'Region'],
      dtype='object')


In [87]:
# Renommer les colonnes pour df_freMTPL1
df_freMTPL1 = df_freMTPL1.rename(columns={
    "PolicyID": "IdPolicy",
    "Power": "power",
    "CarAge": "CarAge",
    "DriverAge": "DriverAge",
    "Brand": "CarBrand",
    "Gas": "CarGas"
})

# Renommer les colonnes pour df_freMTPL2
df_freMTPL2 = df_freMTPL2.rename(columns={
    "IDpol": "IdPolicy",
    "VehPower": "power",
    "VehAge": "CarAge",
    "DrivAge": "DriverAge",
    "VehBrand": "CarBrand",
    "VehGas": "CarGas"
})

# Supprimer les colonnes BonusMalus et Area de df_freMTPL2
df_freMTPL2 = df_freMTPL2.drop(columns=["BonusMalus", "Area"])

# Réorganiser les colonnes dans le même ordre
columns_order = ["IdPolicy", "ClaimAmount", "ClaimNb", "Exposure", "power", "CarAge", 
                 "DriverAge", "CarBrand", "CarGas", "Region", "Density"]

df_freMTPL1 = df_freMTPL1[columns_order]
df_freMTPL2 = df_freMTPL2[columns_order]


In [88]:
#Fusion verticale des bases
df_freMTPL = pd.concat([df_freMTPL1, df_freMTPL2], ignore_index=True)

# Vérification de la fusion
print(df_freMTPL.shape)  # Vérifie le nombre total de lignes et de colonnes
print(df_freMTPL.head())  # Affiche les premières lignes du DataFrame final


(42625, 11)
   IdPolicy  ClaimAmount  ClaimNb  Exposure power  CarAge  DriverAge  \
0        33        302.0        1      0.75     g       1         61   
1        41       2001.0        1      0.14     l       5         50   
2        92       1449.0        1      0.14     d       0         36   
3        96       9924.0        2      0.62     j       0         51   
4        96        946.0        2      0.62     j       0         51   

                             CarBrand   CarGas           Region  Density  
0  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
1  Japanese (except Nissan) or Korean   Diesel  Basse-Normandie       56  
2  Japanese (except Nissan) or Korean  Regular    Ile-de-France     4792  
3  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
4  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  


### Comparaison et fusion des bases euMTPL,beMTPL et freMTPL

In [89]:
print(df_freMTPL.columns)
print(df_beMTPL.columns)
print(df_euMTPL.columns)



Index(['IdPolicy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'power', 'CarAge',
       'DriverAge', 'CarBrand', 'CarGas', 'Region', 'Density'],
      dtype='object')
Index(['Id_policy', 'Exposure', 'claim', 'ClaimNb', 'ClaimAmount', 'average',
       'coverage', 'DriverAge', 'sex', 'bm', 'Power', 'agec', 'Fuel_type',
       'use', 'fleet', 'postcode', 'long', 'lat'],
      dtype='object')
Index(['Id_policy', 'group', 'Fuel_type', 'year', 'vehicle_category',
       'vehicle_use', 'province', 'Power', 'gender', 'DriverAge', 'Exposure',
       'ClaimNb', 'ClaimAmount'],
      dtype='object')


In [None]:
#RENOMMER LES NOMS DES VARIABLES

df_freMTPL = df_freMTPL.rename(columns={
    'IdPolicy': 'Id_policy',
    'ClaimAmount': 'ClaimAmount',
    'ClaimNb': 'ClaimNb',
    'Exposure': 'Exposure',
    'power': 'Power',
    'DriverAge': 'DriverAge',
    'CarGas': 'Fuel_type'
})

df_beMTPL = df_beMTPL.rename(columns={
    'id': 'Id_policy',
    'amount': 'ClaimAmount',
    'nclaims': 'ClaimNb',
    'expo': 'Exposure',
    'power': 'Power',
    'ageph': 'DriverAge',
    'fuel': 'Fuel_type'
})

df_euMTPL = df_euMTPL.rename(columns={
    'policy_id': 'Id_policy',
    'ClaimAmount': 'ClaimAmount',
    'ClaimNb': 'ClaimNb',
    'exposure': 'Exposure',
    'horsepower': 'Power',
    'age': 'DriverAge',
    'fuel_type': 'Fuel_type'
})

# Affichage des premières lignes pour vérifier
df_euMTPL
df_freMTPL
df_beMTPL

Unnamed: 0,Id_policy,Exposure,claim,ClaimNb,ClaimAmount,average,coverage,DriverAge,sex,bm,Power,agec,Fuel_type,use,fleet,postcode,long,lat
0,1,1.000000,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386
1,2,1.000000,0,0,0.000000,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386
2,3,1.000000,0,0,0.000000,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386
3,4,1.000000,0,0,0.000000,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163207,163208,1.000000,0,0,0.000000,,TPL,37,male,5,40,10,diesel,work,0,9990,3.421256,51.199975
163208,163209,1.000000,0,0,0.000000,,TPL,44,male,0,55,6,diesel,private,0,9990,3.421256,51.199975
163209,163210,1.000000,0,0,0.000000,,TPL,50,male,0,40,10,diesel,private,0,9990,3.421256,51.199975
163210,163211,1.000000,0,0,0.000000,,TPL,43,male,0,66,7,gasoline,private,0,9990,3.421256,51.199975


In [None]:
# Création de la variable Sinistre dans chaque base
df_freMTPL['Sinistre'] = (df_freMTPL['ClaimAmount'] > 0).astype(int)
df_beMTPL['Sinistre'] = (df_beMTPL['ClaimAmount'] > 0).astype(int)
df_euMTPL['Sinistre'] = (df_euMTPL['ClaimAmount'] > 0).astype(int)

Index(['Id_policy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'CarAge',
       'DriverAge', 'CarBrand', 'Fuel_type', 'Region', 'Density', 'Sinistre'],
      dtype='object')
Index(['Id_policy', 'Exposure', 'claim', 'ClaimNb', 'ClaimAmount', 'average',
       'coverage', 'DriverAge', 'sex', 'bm', 'Power', 'agec', 'Fuel_type',
       'use', 'fleet', 'postcode', 'long', 'lat', 'Sinistre'],
      dtype='object')
Index(['Id_policy', 'group', 'Fuel_type', 'year', 'vehicle_category',
       'vehicle_use', 'province', 'Power', 'gender', 'DriverAge', 'Exposure',
       'ClaimNb', 'ClaimAmount', 'Sinistre'],
      dtype='object')


In [94]:
df_euMTPL['Fuel_type'].unique()

['B', 'G', 'S', 'D', 'P', 'T', 'M', 'E']
Categories (8, object): ['B', 'D', 'E', 'G', 'M', 'P', 'S', 'T']

In [97]:
# Dictionnaire pour la conversion des carburants en catégories
carburant_dict = {
    'B': 'Regular',   # Bioéthanol ou autres carburants bio
    'E': 'Regular',   # Ethanol
    'S': 'Regular',   # Super essence
    'T': 'Regular',   # Autre carburant de type essence
    'D': 'Diesel',    # Diesel
    'G': 'Diesel',    # Gaz naturel comprimé (GNC)
    'M': 'Diesel',    # Méthane (utilisé dans les véhicules au gaz)
    'P': 'Diesel'     # Propane (utilisé pour des véhicules à gaz)
}

# Appliquer la catégorisation au dataframe df_euMTPL
df_euMTPL['Fuel_type'] = df_euMTPL['Fuel_type'].map(carburant_dict)
df_euMTPL


Unnamed: 0,Id_policy,group,Fuel_type,year,vehicle_category,vehicle_use,province,Power,gender,DriverAge,Exposure,ClaimNb,ClaimAmount,Sinistre
0,1,test,Regular,7.0,1,1,PA,14,M,77,0.487671,0,0.0,0
1,2,train,Regular,7.0,1,1,,12,M,40,0.019178,0,0.0,0
2,4,train,Regular,7.0,1,1,CN,14,M,75,0.032877,0,0.0,0
3,5,train,Regular,7.0,1,1,,13,M,48,0.043836,0,0.0,0
4,6,train,Regular,7.0,1,1,,12,F,54,0.046575,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,Regular,9.0,1,1,RM,14,F,40,0.246575,0,0.0,0
2373193,2595211,train,Regular,9.0,1,1,LE,14,M,52,0.756164,0,0.0,0
2373194,2595212,test,Regular,9.0,1,1,LE,1,F,46,0.008219,0,0.0,0
2373195,2595213,test,Regular,9.0,1,1,LE,15,M,25,0.723288,0,0.0,0


In [95]:
df_beMTPL['Fuel_type'].unique()

['gasoline', 'diesel']
Categories (2, object): ['diesel', 'gasoline']

In [96]:
df_freMTPL['Fuel_type'].unique()

array(['Regular', 'Diesel'], dtype=object)

In [None]:
#Catégorisation
# Dictionnaire pour catégoriser Diesel (1) et Regular (0) dans df_beMTPL
df_beMTPL['Fuel_type'] = df_beMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)
df_euMTPL['Fuel_type'] = df_euMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)
df_freMTPL['Fuel_type'] = df_freMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)

In [100]:
df_euMTPL
df_beMTPL
df_freMTPL

Unnamed: 0,Id_policy,ClaimAmount,ClaimNb,Exposure,Power,CarAge,DriverAge,CarBrand,Fuel_type,Region,Density,Sinistre
0,33,302.00,1,0.75,g,1,61,Japanese (except Nissan) or Korean,0,Ile-de-France,27000,1
1,41,2001.00,1,0.14,l,5,50,Japanese (except Nissan) or Korean,1,Basse-Normandie,56,1
2,92,1449.00,1,0.14,d,0,36,Japanese (except Nissan) or Korean,0,Ile-de-France,4792,1
3,96,9924.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,0,Ile-de-France,27000,1
4,96,946.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,0,Ile-de-France,27000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
42620,6113793,1769.88,1,0.14,7,2,51,B12,1,Aquitaine,178,1
42621,6113817,1288.28,1,0.17,4,0,35,B12,0,Rhone-Alpes,1719,1
42622,6113834,1940.40,2,0.17,15,3,36,B12,0,Poitou-Charentes,181,1
42623,6113834,10290.00,2,0.17,15,3,36,B12,0,Poitou-Charentes,181,1


In [103]:
# Identifier les variables communes aux trois bases
common_columns = ['Id_policy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'DriverAge', 'Fuel_type', 'Sinistre']

# Sélectionner uniquement les colonnes communes dans chaque base
df_euMTPL_common = df_euMTPL[common_columns]
df_beMTPL_common = df_beMTPL[common_columns]
df_freMTPL_common = df_freMTPL[common_columns]

# Ajouter une colonne 'Dataset' pour indiquer la provenance de chaque ligne
df_euMTPL_common['Dataset'] = 'european'
df_beMTPL_common['Dataset'] = 'belgium'
df_freMTPL_common['Dataset'] = 'french'

# Fusionner les bases en une seule base
Data = pd.concat([df_euMTPL_common, df_beMTPL_common, df_freMTPL_common], ignore_index=True)
Data



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_euMTPL_common['Dataset'] = 'european'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_beMTPL_common['Dataset'] = 'belgium'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_freMTPL_common['Dataset'] = 'french'


Unnamed: 0,Id_policy,ClaimAmount,ClaimNb,Exposure,Power,DriverAge,Fuel_type,Sinistre,Dataset
0,1,0.00,0,0.487671,14,77,0,0,european
1,2,0.00,0,0.019178,12,40,0,0,european
2,4,0.00,0,0.032877,14,75,0,0,european
3,5,0.00,0,0.043836,13,48,0,0,european
4,6,0.00,0,0.046575,12,54,0,0,european
...,...,...,...,...,...,...,...,...,...
2579029,6113793,1769.88,1,0.140000,7,51,1,1,french
2579030,6113817,1288.28,1,0.170000,4,35,0,1,french
2579031,6113834,1940.40,2,0.170000,15,36,0,1,french
2579032,6113834,10290.00,2,0.170000,15,36,0,1,french
