In [51]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyreadr
import os
import numpy as np

In [52]:
def load_rda(file_paths):
    """Read several R data files and gather their objects in a single dict.

    Each path is handed to ``pyreadr.read_r``; every key of the mapping it
    returns is copied into the result. When the same key shows up in more
    than one file, the object read last replaces the earlier one.

    Parameters
    ----------
    file_paths : iterable
        Paths passed one by one to ``pyreadr.read_r``.

    Returns
    -------
    dict
        Object name -> object as returned by ``pyreadr.read_r``.
    """
    frames_by_name = {}
    for rda_path in file_paths:
        loaded = pyreadr.read_r(rda_path)
        frames_by_name.update({name: loaded[name] for name in loaded.keys()})
    return frames_by_name

In [53]:
def missing_values(df):
    """Count the missing entries of every column.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.Series
        Indexed by column label; each value is the number of null cells
        found in that column.
    """
    null_flags = df.isnull()
    return null_flags.sum()

In [54]:
import pandas as pd

def outlier_values(df):
    """Percentage of IQR-rule outliers in each numeric column.

    For every column selected by ``select_dtypes(include=["number"])`` the
    25th and 75th percentiles (Q1, Q3) are computed. A value is flagged when
    it is strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR``. The share of flagged rows is expressed in percent of
    all rows of the column.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.Series
        float64 Series indexed by numeric column label.
    """
    pct_by_column = pd.Series(dtype="float64")

    numeric_part = df.select_dtypes(include=["number"])
    for name in numeric_part:
        values = df[name]
        lower_q = values.quantile(0.25)
        upper_q = values.quantile(0.75)
        spread = upper_q - lower_q
        low_fence = lower_q - 1.5 * spread
        high_fence = upper_q + 1.5 * spread

        # Strict inequalities on both sides: values sitting on a fence are kept
        flagged = values.lt(low_fence) | values.gt(high_fence)
        pct_by_column[name] = flagged.mean() * 100
    return pct_by_column


In [55]:
def plot_variable_distributions(dataframe, output_dir="plots"):
    """Plot, save and display the distribution of each numeric and categorical column.

    Numeric columns (``select_dtypes(include='number')``) get a histogram with
    a KDE overlay; columns of dtype ``category`` get a horizontal count plot
    ordered by decreasing frequency. Every figure is written as a PNG file
    into ``output_dir`` before being shown, then closed so that figures do
    not pile up in memory when the frame has many columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data whose columns are plotted.
    output_dir : str, default "plots"
        Directory receiving the PNG files; created if it does not exist.
        Histograms are saved as ``hist_<column>.png`` and count plots as
        ``count_<column>.png``, where characters of the column label that are
        not alphanumeric, ``-``, ``_`` or ``.`` are replaced by ``_``.

    Returns
    -------
    None
    """
    os.makedirs(output_dir, exist_ok=True)

    def _target_path(prefix, column):
        # Column labels may contain characters that are not valid in file names
        safe = "".join(ch if ch.isalnum() or ch in "-_." else "_" for ch in str(column))
        return os.path.join(output_dir, f"{prefix}_{safe}.png")

    # Select the numeric columns once; one palette colour per column
    numeric_columns = dataframe.select_dtypes(include='number').columns
    colors = sns.color_palette("husl", n_colors=len(numeric_columns))

    for color, column in zip(colors, numeric_columns):
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(dataframe[column], kde=True,
                     color=color, edgecolor="black", alpha=0.5,
                     line_kws={"linewidth": 2}, ax=ax)
        ax.set_title(f"Distribution de {column}")
        ax.set_xlabel(column)
        ax.set_ylabel("Fréquence")
        # Save before show(): some backends discard the figure once it is shown
        fig.savefig(_target_path("hist", column), bbox_inches="tight")
        plt.show()
        plt.close(fig)

    for column in dataframe.select_dtypes(include='category').columns:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.countplot(y=dataframe[column], order=dataframe[column].value_counts().index,
                      palette="Set2", ax=ax)
        ax.set_title(f"Distribution de {column}")
        ax.set_xlabel("Fréquence")
        ax.set_ylabel(column)
        fig.savefig(_target_path("count", column), bbox_inches="tight")
        plt.show()
        plt.close(fig)


In [56]:
def plot_boxplots(df):
    """Draw one boxplot per numeric column of ``df`` on a single figure.

    The subplots are laid out on a grid of 4 columns. The grid has at least
    4 rows (the historical 4x4 layout) and grows by as many rows as needed,
    so frames with more than 16 numeric columns no longer make
    ``plt.subplot`` fail. The figure height scales with the number of rows
    (2.5 inches per row, i.e. 15x10 for the 4x4 case).

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    None
    """
    numeric_columns = df.select_dtypes(include='number').columns

    n_cols = 4
    # Ceiling division without importing math; never fewer than 4 rows
    n_rows = max(4, (len(numeric_columns) + n_cols - 1) // n_cols)
    plt.figure(figsize=(15, 2.5 * n_rows))

    # One boxplot per numeric variable
    for i, col in enumerate(numeric_columns, 1):
        plt.subplot(n_rows, n_cols, i)
        sns.boxplot(x=df[col])
        plt.title(f'Boxplot de {col}')

    plt.tight_layout()
    plt.show()


In [57]:
def clean_data(data_dict):
    """Drop duplicate rows and parse the record date columns of every table.

    For each DataFrame in ``data_dict`` exact duplicate rows are removed.
    When a ``RecordBeg`` and/or ``RecordEnd`` column is present it is parsed
    with ``pd.to_datetime(..., errors='coerce')`` (unparseable values become
    ``NaT``). The parsed columns replace the originals through
    ``DataFrame.assign``, which builds a new frame: unlike
    ``df.loc[:, col] = ...``, which writes into the existing column and can
    leave its dtype unchanged, this guarantees a datetime64 column.

    Parameters
    ----------
    data_dict : dict
        Mapping name -> pandas.DataFrame. The input frames are not modified.

    Returns
    -------
    dict
        Same keys as ``data_dict``, each mapped to its cleaned DataFrame.
    """
    date_columns = ('RecordBeg', 'RecordEnd')

    cleaned_dict = {}
    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()

        present = [col for col in date_columns if col in df_cleaned.columns]
        if present:
            df_cleaned = df_cleaned.assign(
                **{col: pd.to_datetime(df_cleaned[col], errors='coerce') for col in present}
            )

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

## euMTPL

In [58]:
# R data file(s) to load for the euMTPL section (path relative to the working directory)
file_paths = ['euMTPL.rda']

In [59]:
# Despite its name, `df` is the dict returned by load_rda (object name -> loaded object)
df = load_rda(file_paths)

In [60]:
# Apply clean_data to every loaded table; `df1` is again a dict keyed like `df`
df1=clean_data(df)

In [61]:
# Take the 'euMTPL' table out of the cleaned dict (same object, not a copy) and display it
df_euMTPL=df1['euMTPL']
df_euMTPL

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,B,9.0,1,1,RM,14,F,40,0.246575,0.0,0,0.0,0,0.0,0,0.0,0
2373193,2595211,train,B,9.0,1,1,LE,14,M,52,0.756164,0.0,0,0.0,0,0.0,0,0.0,0
2373194,2595212,test,E,9.0,1,1,LE,1,F,46,0.008219,0.0,0,0.0,0,0.0,0,0.0,0
2373195,2595213,test,T,9.0,1,1,LE,15,M,25,0.723288,0.0,0,0.0,0,0.0,0,0.0,0


In [62]:
# Claim-cost columns that are added up into the total claim amount
cols_to_sum = ['cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']

# Check that these columns exist in the table and fail early with a clear message
missing_cols = [col for col in cols_to_sum if col not in df_euMTPL.columns]
if missing_cols:
    raise KeyError(f"Columns missing from df_euMTPL: {missing_cols}")

# Replace NaN values with 0 before summing.
# The source frame is df_euMTPL (the table loaded above): the previous name
# `euMTPL_df` is not defined anywhere in this notebook.
df_euMTPL[cols_to_sum] = df_euMTPL[cols_to_sum].fillna(0)

# Create the new claimAmount variable as the row-wise sum of the cost columns
df_euMTPL['claimAmount'] = df_euMTPL[cols_to_sum].sum(axis=1)

In [63]:
# Show the first rows to check that the claimAmount column was added
df_euMTPL.head()

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd,claimAmount
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0,0.0


## beMTPL

In [64]:
# Directory holding beMTPL16.rda. It can be overridden with the MTPL_DATA_DIR
# environment variable so the notebook also runs outside the original
# workspace; when the variable is unset the original location is used.
data_dir = os.environ.get('MTPL_DATA_DIR', '/home/onyxia/work/Federated_Learning_Milliman/Kevin')
file_paths = [os.path.join(data_dir, 'beMTPL16.rda')]
# load_rda returns a dict (object name -> loaded object); clean_data keeps the same keys
df = load_rda(file_paths)
df_1=clean_data(df)
df_beMTPL=df_1['beMTPL16']
df_beMTPL

Unnamed: 0,insurance_contract,policy_year,exposure,insured_birth_year,vehicle_age,policy_holder_age,driver_license_age,vehicle_brand,vehicle_model,mileage,vehicle_power,catalog_value,claim_value,number_of_liability_claims,number_of_bodily_injury_liability_claims,claim_time,claim_responsibility_rate,driving_training_label,signal
0,C1,1,0.386301,1945,10,9,40,MERCEDES,ME-1245,30000,75,983732,2,0,0,00:00,0,No,0
1,C2,1,0.493151,1941,4,25,24,VOLKSWAGEN,VO-2461,30000,55,510562,8,0,0,07:45,0,No,0
2,C3,1,0.290411,1944,0,2,39,AUDI,AU-967,30000,120,1934768,10,0,0,00:00,0,No,0
3,C4,1,0.336986,1948,1,14,37,LANCIA,LA-2346,30000,51,536755,13,0,0,18:50,0,No,0
4,C5,1,0.219178,1928,3,7,59,CITROEN,CI-1258,30000,54,446725,14,0,0,00:00,100,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70786,C58720,4,0.650273,1946,6,1,40,OPEL,OP-3248,30000,74,605100,169567,0,0,00:00,0,No,0
70787,C58721,4,0.229508,1945,8,13,43,CITROEN,CI-1263,30000,85,781789,169588,0,0,00:00,0,No,0
70788,C58722,4,0.767760,1937,12,5,44,RENAULT,RE-2061,30000,55,0,169633,0,0,00:00,0,No,0
70789,C52079,4,0.289617,1949,3,9,42,CITROEN,CI-1265,30000,120,1216867,169684,0,0,00:00,0,No,0


## freMTPL

In [65]:
# Read the two freMTPL CSV files (paths relative to the working directory)
df_freMTPL1, df_freMTPL2 = (
    pd.read_csv(csv_name) for csv_name in ("freMTPL1.csv", "freMTPL2.csv")
)
# Print the first rows of each table, one after the other
print(df_freMTPL1.head(), df_freMTPL2.head(), sep="\n")


   PolicyID  ClaimAmount  ClaimNb  Exposure Power  CarAge  DriverAge  \
0        33          302        1      0.75     g       1         61   
1        41         2001        1      0.14     l       5         50   
2        92         1449        1      0.14     d       0         36   
3        96         9924        2      0.62     j       0         51   
4        96          946        2      0.62     j       0         51   

                                Brand      Gas           Region  Density  
0  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
1  Japanese (except Nissan) or Korean   Diesel  Basse-Normandie       56  
2  Japanese (except Nissan) or Korean  Regular    Ile-de-France     4792  
3  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
4  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
   IDpol  ClaimAmount  ClaimNb  Exposure  VehPower  VehAge  DrivAge  \
0    139       303.00        1      0.75      

In [66]:
# Column labels of both freMTPL tables, printed one Index per line
print(df_freMTPL1.columns, df_freMTPL2.columns, sep="\n")

Index(['PolicyID', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'CarAge',
       'DriverAge', 'Brand', 'Gas', 'Region', 'Density'],
      dtype='object')
Index(['IDpol', 'ClaimAmount', 'ClaimNb', 'Exposure', 'VehPower', 'VehAge',
       'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas', 'Area', 'Density',
       'Region'],
      dtype='object')


In [67]:
# Rename the columns of df_freMTPL1 to the common naming scheme.
# CarAge and DriverAge already carry their target names, so they need no entry.
# NOTE(review): Power is letter-coded in freMTPL1 (e.g. 'g', 'l', 'd' in the
# preview above) while VehPower in freMTPL2 is presumably numeric; once both
# are called "power" the stacked column mixes the two codings. Confirm how
# they should be harmonised.
df_freMTPL1 = df_freMTPL1.rename(columns={
    "PolicyID": "IdPolicy",
    "Power": "power",
    "Brand": "CarBrand",
    "Gas": "CarGas"
})

# Rename the columns of df_freMTPL2 to the same scheme
df_freMTPL2 = df_freMTPL2.rename(columns={
    "IDpol": "IdPolicy",
    "VehPower": "power",
    "VehAge": "CarAge",
    "DrivAge": "DriverAge",
    "VehBrand": "CarBrand",
    "VehGas": "CarGas"
})

# Drop BonusMalus and Area from df_freMTPL2 (they have no counterpart in df_freMTPL1).
# errors="ignore" keeps the cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df_freMTPL2 = df_freMTPL2.drop(columns=["BonusMalus", "Area"], errors="ignore")

# Put the columns of both tables in the same order
columns_order = ["IdPolicy", "ClaimAmount", "ClaimNb", "Exposure", "power", "CarAge", 
                 "DriverAge", "CarBrand", "CarGas", "Region", "Density"]

df_freMTPL1 = df_freMTPL1[columns_order]
df_freMTPL2 = df_freMTPL2[columns_order]

# Print the resulting columns to check that both tables now match
print(df_freMTPL1.columns)
print(df_freMTPL2.columns)


Index(['IdPolicy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'power', 'CarAge',
       'DriverAge', 'CarBrand', 'CarGas', 'Region', 'Density'],
      dtype='object')
Index(['IdPolicy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'power', 'CarAge',
       'DriverAge', 'CarBrand', 'CarGas', 'Region', 'Density'],
      dtype='object')


In [68]:
# Stack the two harmonised freMTPL tables on top of each other, then recode
# the fuel type as an integer flag (Regular -> 1, Diesel -> 0)
df_freMTPL = (
    pd.concat([df_freMTPL1, df_freMTPL2], ignore_index=True)
    .assign(CarGas=lambda stacked: stacked['CarGas'].map({'Regular': 1, 'Diesel': 0}))
)
# Check the merge
print(df_freMTPL.shape)  # total number of rows and columns
print(df_freMTPL.head())  # first rows of the final DataFrame


(42625, 11)
   IdPolicy  ClaimAmount  ClaimNb  Exposure power  CarAge  DriverAge  \
0        33        302.0        1      0.75     g       1         61   
1        41       2001.0        1      0.14     l       5         50   
2        92       1449.0        1      0.14     d       0         36   
3        96       9924.0        2      0.62     j       0         51   
4        96        946.0        2      0.62     j       0         51   

                             CarBrand  CarGas           Region  Density  
0  Japanese (except Nissan) or Korean       1    Ile-de-France    27000  
1  Japanese (except Nissan) or Korean       0  Basse-Normandie       56  
2  Japanese (except Nissan) or Korean       1    Ile-de-France     4792  
3  Japanese (except Nissan) or Korean       1    Ile-de-France    27000  
4  Japanese (except Nissan) or Korean       1    Ile-de-France    27000  


### Comparaison et fusion des bases euMTPL, beMTPL et freMTPL

In [74]:
# Column labels of the three tables, printed one Index per line, to compare their schemas
print(df_freMTPL.columns, df_beMTPL.columns, df_euMTPL.columns, sep="\n")



Index(['IdPolicy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'power', 'CarAge',
       'DriverAge', 'CarBrand', 'CarGas', 'Region', 'Density'],
      dtype='object')
Index(['insurance_contract', 'policy_year', 'exposure', 'insured_birth_year',
       'vehicle_age', 'policy_holder_age', 'driver_license_age',
       'vehicle_brand', 'vehicle_model', 'mileage', 'vehicle_power',
       'catalog_value', 'claim_value', 'number_of_liability_claims',
       'number_of_bodily_injury_liability_claims', 'claim_time',
       'claim_responsibility_rate', 'driving_training_label', 'signal'],
      dtype='object')
Index(['policy_id', 'group', 'fuel_type', 'year', 'vehicle_category',
       'vehicle_use', 'province', 'horsepower', 'gender', 'age', 'exposure',
       'cost_nc', 'num_nc', 'cost_cg', 'num_cg', 'cost_fcg', 'num_fcg',
       'cost_cd', 'num_cd', 'claimAmount'],
      dtype='object')
