In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyreadr
import os
import numpy as np

In [11]:
def load_rda(file_paths):
    """Read R data files and gather every object they contain into one dict.

    Parameters
    ----------
    file_paths : iterable
        Paths handed, one at a time, to ``pyreadr.read_r``.

    Returns
    -------
    dict
        Maps each object name found in the files to the value returned for it
        by ``pyreadr.read_r``. If two files hold an object with the same name,
        the one read last replaces the earlier one.
    """
    collected = {}
    for rda_path in file_paths:
        # read_r returns a mapping of object name -> loaded object; merge it in.
        collected.update(pyreadr.read_r(rda_path))
    return collected

In [12]:
def missing_values(df):
    """Return the number of missing entries in each column of ``df``.

    The result is a Series indexed by column name.
    """
    null_mask = df.isna()
    return null_mask.sum()

In [13]:
import pandas as pd

def outlier_values(df):
    """Percentage of outliers in every numeric column of ``df``.

    A value counts as an outlier when it lies more than 1.5 times the
    interquartile range below the first quartile or above the third quartile.
    Missing values compare as False on both sides, so they are never counted
    as outliers but still count in the denominator.

    Returns
    -------
    pandas.Series
        Indexed by numeric column name; each entry is the share of outliers
        in that column, expressed in percent.
    """
    shares = pd.Series(dtype="float64")
    numeric_part = df.select_dtypes(include=["number"])

    for name in numeric_part.columns:
        values = df[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        spread = third_quartile - first_quartile

        too_low = values.lt(first_quartile - 1.5 * spread)
        too_high = values.gt(third_quartile + 1.5 * spread)
        # Mean of a boolean mask is the fraction of True values.
        shares[name] = (too_low | too_high).mean() * 100

    return shares


In [14]:
def plot_variable_distributions(dataframe, output_dir="plots"):
    """Plot, display and save the distribution of every column of ``dataframe``.

    Numeric columns get a histogram with a KDE overlay, each in its own colour
    from the "husl" palette. Columns of dtype ``category`` get a horizontal
    count plot ordered by decreasing frequency. Columns of any other dtype
    (for example plain ``object`` strings) are not plotted.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose columns are plotted.
    output_dir : str, default "plots"
        Directory (created if missing) that receives one PNG per plotted
        column, named ``distribution_<column>.png``.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _target_path(column):
        # Path separators in a column name would point outside output_dir.
        safe_name = str(column).replace("/", "_").replace("\\", "_")
        return os.path.join(output_dir, f"distribution_{safe_name}.png")

    # Select the numeric columns once and reuse them for palette and loop.
    numeric_columns = dataframe.select_dtypes(include='number').columns
    num_colors = len(numeric_columns)
    colors = sns.color_palette("husl", n_colors=num_colors)

    for i, column in enumerate(numeric_columns):
        fig = plt.figure(figsize=(10, 6))
        sns.histplot(dataframe[column], kde=True,
                     color=colors[i], edgecolor="black", alpha=0.5,
                     line_kws={"linewidth": 2})
        plt.title(f"Distribution de {column}")
        plt.xlabel(column)
        plt.ylabel("Fréquence")
        # Save before show(): some backends discard the figure once shown.
        fig.savefig(_target_path(column), bbox_inches="tight")
        plt.show()
        # Release the figure so long column lists do not pile up in memory.
        plt.close(fig)

    for column in dataframe.select_dtypes(include='category').columns:
        fig = plt.figure(figsize=(10, 6))
        # NOTE(review): recent seaborn releases may warn when `palette` is
        # passed without `hue` — confirm against the installed version.
        sns.countplot(y=dataframe[column], order=dataframe[column].value_counts().index, palette="Set2")
        plt.title(f"Distribution de {column}")
        plt.xlabel("Fréquence")
        plt.ylabel(column)
        fig.savefig(_target_path(column), bbox_inches="tight")
        plt.show()
        plt.close(fig)


In [15]:
def plot_boxplots(df):
    """Draw one boxplot per numeric column of ``df`` in a single figure.

    The subplots are laid out on a grid four columns wide. The grid has four
    rows (15x10 inch figure) for up to 16 numeric columns and gains extra
    rows, with a proportionally taller figure, when there are more, so the
    subplot index never exceeds the grid size.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose numeric columns are plotted.
    """
    numeric_columns = df.select_dtypes(include='number').columns

    grid_cols = 4
    # Ceiling division without importing math; never fewer than 4 rows so the
    # layout for 16 or fewer columns is unchanged.
    grid_rows = max(4, -(-len(numeric_columns) // grid_cols))
    plt.figure(figsize=(15, 10 * grid_rows / 4))

    # One boxplot per numeric variable (subplot indices start at 1).
    for i, col in enumerate(numeric_columns, 1):
        plt.subplot(grid_rows, grid_cols, i)
        sns.boxplot(x=df[col])
        plt.title(f'Boxplot de {col}')

    plt.tight_layout()
    plt.show()


In [16]:
def clean_data(data_dict):
    """Drop duplicate rows and parse date columns in every table of a dict.

    Parameters
    ----------
    data_dict : dict
        Maps a name to a pandas DataFrame.

    Returns
    -------
    dict
        Same keys as ``data_dict``. Each value is a new DataFrame with exact
        duplicate rows removed and, when present, the ``RecordBeg`` and
        ``RecordEnd`` columns converted to datetimes (unparseable entries
        become ``NaT``). The input frames are left untouched.
    """
    date_columns = ('RecordBeg', 'RecordEnd')

    cleaned_dict = {}
    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()

        parsed = {
            col: pd.to_datetime(df_cleaned[col], errors='coerce')
            for col in date_columns
            if col in df_cleaned.columns
        }
        if parsed:
            # assign() replaces the whole column, so it really takes the
            # datetime64 dtype. Writing through `.loc[:, col] = ...` sets the
            # values into the existing column and can keep its old dtype.
            df_cleaned = df_cleaned.assign(**parsed)

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

## euMTPL

In [17]:
# R data file(s) for the euMTPL section; the list is passed to load_rda.
# The path is relative, so it resolves against the kernel's working directory.
file_paths = ['euMTPL.rda']

In [18]:
# Load every object stored in the listed files.
# NOTE: despite its name, `df` is a dict keyed by object name, not a DataFrame.
df = load_rda(file_paths)

In [19]:
# Clean each loaded table with clean_data; the result is again a dict with the same keys.
df1=clean_data(df)

In [20]:
# Pick the 'euMTPL' table out of the cleaned dict and show it as the cell output.
euMTPL_df=df1['euMTPL']
euMTPL_df

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,B,9.0,1,1,RM,14,F,40,0.246575,0.0,0,0.0,0,0.0,0,0.0,0
2373193,2595211,train,B,9.0,1,1,LE,14,M,52,0.756164,0.0,0,0.0,0,0.0,0,0.0,0
2373194,2595212,test,E,9.0,1,1,LE,1,F,46,0.008219,0.0,0,0.0,0,0.0,0,0.0,0
2373195,2595213,test,T,9.0,1,1,LE,15,M,25,0.723288,0.0,0,0.0,0,0.0,0,0.0,0


In [None]:
# Cost columns that are added up into the total claim amount.
# NOTE(review): no existence check is performed here; a column missing from
# euMTPL_df would presumably surface as a KeyError on the next statement.
cols_to_sum = ['cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']

# Replace NaN with 0 in the cost columns before summing (this overwrites the
# columns of euMTPL_df in place).
euMTPL_df[cols_to_sum] = euMTPL_df[cols_to_sum].fillna(0)

# Create the new variable claimAmount as the row-wise sum of the cost columns.
euMTPL_df['claimAmount'] = euMTPL_df[cols_to_sum].sum(axis=1)

   cost_nc  cost_cg  cost_fcg  cost_cd  claimAmount
0      0.0      0.0       0.0      0.0          0.0
1      0.0      0.0       0.0      0.0          0.0
2      0.0      0.0       0.0      0.0          0.0
3      0.0      0.0       0.0      0.0          0.0
4      0.0      0.0       0.0      0.0          0.0


In [26]:
# Preview the first rows of the euMTPL table.
euMTPL_df.head()

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd,claimAmount
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0,0.0


## beMTPL

In [28]:
# Same load -> clean -> select pipeline as the euMTPL section, for beMTPL16.
# NOTE(review): absolute, machine-specific path; this cell also rebinds the
# `file_paths` and `df` names used by the euMTPL section.
file_paths = ['/home/onyxia/work/Federated_Learning_Milliman/Kevin/beMTPL16.rda']
df = load_rda(file_paths)
df_1=clean_data(df)
# Pick the 'beMTPL16' table out of the cleaned dict and show it as the cell output.
beMTPL_df=df_1['beMTPL16']
beMTPL_df

Unnamed: 0,insurance_contract,policy_year,exposure,insured_birth_year,vehicle_age,policy_holder_age,driver_license_age,vehicle_brand,vehicle_model,mileage,vehicle_power,catalog_value,claim_value,number_of_liability_claims,number_of_bodily_injury_liability_claims,claim_time,claim_responsibility_rate,driving_training_label,signal
0,C1,1,0.386301,1945,10,9,40,MERCEDES,ME-1245,30000,75,983732,2,0,0,00:00,0,No,0
1,C2,1,0.493151,1941,4,25,24,VOLKSWAGEN,VO-2461,30000,55,510562,8,0,0,07:45,0,No,0
2,C3,1,0.290411,1944,0,2,39,AUDI,AU-967,30000,120,1934768,10,0,0,00:00,0,No,0
3,C4,1,0.336986,1948,1,14,37,LANCIA,LA-2346,30000,51,536755,13,0,0,18:50,0,No,0
4,C5,1,0.219178,1928,3,7,59,CITROEN,CI-1258,30000,54,446725,14,0,0,00:00,100,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70786,C58720,4,0.650273,1946,6,1,40,OPEL,OP-3248,30000,74,605100,169567,0,0,00:00,0,No,0
70787,C58721,4,0.229508,1945,8,13,43,CITROEN,CI-1263,30000,85,781789,169588,0,0,00:00,0,No,0
70788,C58722,4,0.767760,1937,12,5,44,RENAULT,RE-2061,30000,55,0,169633,0,0,00:00,0,No,0
70789,C52079,4,0.289617,1949,3,9,42,CITROEN,CI-1265,30000,120,1216867,169684,0,0,00:00,0,No,0


## freMTPL

In [31]:
# Both modules are already imported in the first cell of the notebook.
import pyreadr
import os

# Input and output locations.
# NOTE(review): absolute, machine-specific paths.
input_folder = "/home/onyxia/work/Federated_Learning_Milliman/raw_data/"
output_folder = "/home/onyxia/work/Federated_Learning_Milliman/converted_data/"

# Files to convert: .rda file name -> name of the object expected inside it.
file_mappings = {
    "freMTPLfreq.rda": "freMTPLfreq",
    "freMTPL2freq.rda": "freMTPL2freq",
    "freMTPLsev.rda": "freMTPLsev",
    "freMTPL2sev.rda": "freMTPL2sev"
}

# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Convert each .rda file to a .csv file.
for file_name, object_name in file_mappings.items():
    file_path = os.path.join(input_folder, file_name)  # Full path of the input file
    result = pyreadr.read_r(file_path)  # Load the file
    
    if object_name in result:
        # NOTE: this rebinds the notebook-level name `df` used by earlier cells.
        df = result[object_name]  # Extract the DataFrame
        output_file = os.path.join(output_folder, f"{object_name}.csv")  # CSV file name
        df.to_csv(output_file, index=False)  # Save as CSV, without the index column
        
        print(f"✅ {file_name} converti en {output_file} (Taille: {df.shape})")
    else:
        # The expected object is missing from the file: report it and move on.
        print(f"⚠️ Objet {object_name} non trouvé dans {file_name}")

print("\n🎯 Conversion terminée ! 🚀")


KeyboardInterrupt: 