In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyreadr
import os   
import numpy as np
import random

In [4]:
def load_rda(file_paths):
    """Read several R data files and pool their objects into a single dict.

    Every path is handed to ``pyreadr.read_r``; each object name found in the
    file becomes a key of the result. If two files expose the same object
    name, the one read last replaces the earlier one.

    Args:
        file_paths: Iterable of paths to the files to read.

    Returns:
        dict mapping each object name to the value pyreadr returned for it.
    """
    pooled = {}
    for path in file_paths:
        pooled.update(pyreadr.read_r(path))
    return pooled

In [5]:
def missing_values(df):
    """Count the missing entries of every column.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        pandas Series indexed by column name with the number of null cells
        found in that column.
    """
    null_flags = df.isna()
    return null_flags.sum()

In [2]:
def outlier_values(df, iqr_factor=1.5):
    """Percentage of IQR outliers in every numeric column.

    A value is flagged when it lies strictly below ``Q1 - iqr_factor * IQR``
    or strictly above ``Q3 + iqr_factor * IQR``, where Q1/Q3 are the 25% and
    75% quantiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: pandas DataFrame. Only the columns kept by
            ``select_dtypes(include=["number"])`` are analysed.
        iqr_factor: Multiplier applied to the IQR to build the fences.
            The default of 1.5 is the usual Tukey rule.

    Returns:
        float64 pandas Series indexed by column name holding the share of
        flagged rows, expressed in percent. The Series is empty when ``df``
        has no numeric column.
    """
    numeric = df.select_dtypes(include=["number"])
    percentages = {}

    for col in numeric.columns:
        values = numeric[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1

        lower_fence = q1 - iqr_factor * spread
        upper_fence = q3 + iqr_factor * spread
        flagged = (values < lower_fence) | (values > upper_fence)

        # The mean of a boolean mask is the fraction of True entries. NaN
        # entries compare False, so they count as non-outliers.
        percentages[col] = flagged.mean() * 100

    # Build the result once instead of enlarging an empty Series per column.
    return pd.Series(percentages, dtype="float64")

In [3]:
def clean_data(data_dict, date_columns=("RecordBeg", "RecordEnd")):
    """Deduplicate every frame of a dict and parse its date columns.

    Args:
        data_dict: Mapping of name -> pandas DataFrame. The input frames are
            not modified.
        date_columns: Names of the columns to convert with
            ``pd.to_datetime``. Columns missing from a frame are skipped.
            Defaults to the two record-period columns.

    Returns:
        dict with the same keys, each holding a cleaned copy of the frame:
        exact duplicate rows removed and the date columns converted to
        datetime64, with unparseable entries turned into NaT.
    """
    cleaned_dict = {}
    for key, df in data_dict.items():
        # Work on an explicit copy so the column assignment below never
        # targets a possible view of the caller's frame.
        df_cleaned = df.drop_duplicates().copy()

        for col in date_columns:
            if col in df_cleaned.columns:
                # Plain column assignment replaces the column, so the dtype
                # really becomes datetime64. Writing through `.loc[:, col]`
                # sets values inside the existing column and can leave it
                # as object dtype.
                df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors="coerce")

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

## freMTPL

In [7]:
# Load the two raw freMTPL extracts from the raw_data folder.
df_freMTPL1, df_freMTPL2 = (
    pd.read_csv(csv_path)
    for csv_path in ("../raw_data/freMTPL1.csv", "../raw_data/freMTPL2.csv")
)

In [8]:
# Show the column labels of both extracts, one Index per line.
print(df_freMTPL1.columns, df_freMTPL2.columns, sep="\n")

Index(['PolicyID', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'CarAge',
       'DriverAge', 'Brand', 'Gas', 'Region', 'Density'],
      dtype='object')
Index(['IDpol', 'ClaimAmount', 'ClaimNb', 'Exposure', 'VehPower', 'VehAge',
       'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas', 'Area', 'Density',
       'Region'],
      dtype='object')


In [9]:
# Align the column names of df_freMTPL1 with the shared schema.
# CarAge and DriverAge already carry their final names in this frame.
df_freMTPL1 = df_freMTPL1.rename(columns={
    "PolicyID": "IdPolicy",
    "Power": "power",
    "Brand": "CarBrand",
    "Gas": "CarGas"
})

# Align the column names of df_freMTPL2 with the same schema.
df_freMTPL2 = df_freMTPL2.rename(columns={
    "IDpol": "IdPolicy",
    "VehPower": "power",
    "VehAge": "CarAge",
    "DrivAge": "DriverAge",
    "VehBrand": "CarBrand",
    "VehGas": "CarGas"
})

# Drop BonusMalus and Area, which only exist in df_freMTPL2.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a strict drop would raise KeyError.
df_freMTPL2 = df_freMTPL2.drop(columns=["BonusMalus", "Area"], errors="ignore")

# Put the columns of both frames in the same order.
columns_order = ["IdPolicy", "ClaimAmount", "ClaimNb", "Exposure", "power", "CarAge", 
                 "DriverAge", "CarBrand", "CarGas", "Region", "Density"]

df_freMTPL1 = df_freMTPL1[columns_order]
df_freMTPL2 = df_freMTPL2[columns_order]


In [11]:
# Stack the two harmonised frames vertically under a fresh 0..n-1 index.
# NOTE(review): in the recorded preview `power` holds letter codes for the
# freMTPL1 rows and numbers for the freMTPL2 rows - confirm the two
# encodings are reconciled before this column is used.
df_freMTPL = pd.concat((df_freMTPL1, df_freMTPL2), axis=0, ignore_index=True)

# Check the merge: total number of rows and columns, then the first rows.
print(df_freMTPL.shape, df_freMTPL.head(), sep="\n")


(42625, 11)
   IdPolicy  ClaimAmount  ClaimNb  Exposure power  CarAge  DriverAge  \
0        33        302.0        1      0.75     g       1         61   
1        41       2001.0        1      0.14     l       5         50   
2        92       1449.0        1      0.14     d       0         36   
3        96       9924.0        2      0.62     j       0         51   
4        96        946.0        2      0.62     j       0         51   

                             CarBrand   CarGas           Region  Density  
0  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
1  Japanese (except Nissan) or Korean   Diesel  Basse-Normandie       56  
2  Japanese (except Nissan) or Korean  Regular    Ile-de-France     4792  
3  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  
4  Japanese (except Nissan) or Korean  Regular    Ile-de-France    27000  


In [12]:
# Display the merged frame (the notebook renders a truncated preview).
df_freMTPL

Unnamed: 0,IdPolicy,ClaimAmount,ClaimNb,Exposure,power,CarAge,DriverAge,CarBrand,CarGas,Region,Density
0,33,302.00,1,0.75,g,1,61,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000
1,41,2001.00,1,0.14,l,5,50,Japanese (except Nissan) or Korean,Diesel,Basse-Normandie,56
2,92,1449.00,1,0.14,d,0,36,Japanese (except Nissan) or Korean,Regular,Ile-de-France,4792
3,96,9924.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000
4,96,946.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000
...,...,...,...,...,...,...,...,...,...,...,...
42620,6113793,1769.88,1,0.14,7,2,51,B12,Diesel,Aquitaine,178
42621,6113817,1288.28,1,0.17,4,0,35,B12,Regular,Rhone-Alpes,1719
42622,6113834,1940.40,2,0.17,15,3,36,B12,Regular,Poitou-Charentes,181
42623,6113834,10290.00,2,0.17,15,3,36,B12,Regular,Poitou-Charentes,181


### Comparaison et fusion des bases euMTPL, beMTPL et freMTPL

In [22]:
# NOTE(review): this cell carries execution count 22, i.e. it was last run
# after the rename / column-selection cells that follow it, so its recorded
# output lists the final columns. On a fresh top-to-bottom run it prints the
# columns of the frame as it is at this point of the notebook instead.
print(df_freMTPL.columns)


Index(['Id_policy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'DriverAge',
       'Fuel_type', 'Sinistre'],
      dtype='object')


In [13]:
# Rename the variables to the common naming used in this comparison section.
# Only the columns whose name actually changes are listed.
df_freMTPL = df_freMTPL.rename(
    columns={'IdPolicy': 'Id_policy', 'power': 'Power', 'CarGas': 'Fuel_type'}
)

df_freMTPL


Unnamed: 0,Id_policy,ClaimAmount,ClaimNb,Exposure,Power,CarAge,DriverAge,CarBrand,Fuel_type,Region,Density
0,33,302.00,1,0.75,g,1,61,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000
1,41,2001.00,1,0.14,l,5,50,Japanese (except Nissan) or Korean,Diesel,Basse-Normandie,56
2,92,1449.00,1,0.14,d,0,36,Japanese (except Nissan) or Korean,Regular,Ile-de-France,4792
3,96,9924.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000
4,96,946.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000
...,...,...,...,...,...,...,...,...,...,...,...
42620,6113793,1769.88,1,0.14,7,2,51,B12,Diesel,Aquitaine,178
42621,6113817,1288.28,1,0.17,4,0,35,B12,Regular,Rhone-Alpes,1719
42622,6113834,1940.40,2,0.17,15,3,36,B12,Regular,Poitou-Charentes,181
42623,6113834,10290.00,2,0.17,15,3,36,B12,Regular,Poitou-Charentes,181


In [15]:
# Binary claim indicator: 1 when ClaimAmount is strictly positive, else 0.
df_freMTPL['Sinistre'] = df_freMTPL['ClaimAmount'].gt(0).astype(int)

In [16]:
# List the distinct fuel labels present in the column.
df_freMTPL['Fuel_type'].unique()

array(['Regular', 'Diesel'], dtype=object)

In [17]:
# Encode the fuel type of df_freMTPL as a binary flag:
# Diesel -> 1, anything else (here: Regular) -> 0.
# Matching against both 'Diesel' and 1 keeps this cell idempotent: a plain
# `== 'Diesel'` test would turn the already-encoded column into all zeros
# if the cell were executed a second time.
df_freMTPL['Fuel_type'] = df_freMTPL['Fuel_type'].isin(['Diesel', 1]).astype(int)

In [18]:
# Display the frame again to check the Sinistre and Fuel_type columns.
df_freMTPL

Unnamed: 0,Id_policy,ClaimAmount,ClaimNb,Exposure,Power,CarAge,DriverAge,CarBrand,Fuel_type,Region,Density,Sinistre
0,33,302.00,1,0.75,g,1,61,Japanese (except Nissan) or Korean,0,Ile-de-France,27000,1
1,41,2001.00,1,0.14,l,5,50,Japanese (except Nissan) or Korean,1,Basse-Normandie,56,1
2,92,1449.00,1,0.14,d,0,36,Japanese (except Nissan) or Korean,0,Ile-de-France,4792,1
3,96,9924.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,0,Ile-de-France,27000,1
4,96,946.00,2,0.62,j,0,51,Japanese (except Nissan) or Korean,0,Ile-de-France,27000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
42620,6113793,1769.88,1,0.14,7,2,51,B12,1,Aquitaine,178,1
42621,6113817,1288.28,1,0.17,4,0,35,B12,0,Rhone-Alpes,1719,1
42622,6113834,1940.40,2,0.17,15,3,36,B12,0,Poitou-Charentes,181,1
42623,6113834,10290.00,2,0.17,15,3,36,B12,0,Poitou-Charentes,181,1


In [20]:
# Variables shared by the three bases.
common_columns = [
    'Id_policy', 'ClaimAmount', 'ClaimNb', 'Exposure',
    'Power', 'DriverAge', 'Fuel_type', 'Sinistre',
]

# Keep only those shared columns in the frame.
df_freMTPL = df_freMTPL.loc[:, common_columns]

In [21]:
# Display the frame restricted to the common columns.
df_freMTPL

Unnamed: 0,Id_policy,ClaimAmount,ClaimNb,Exposure,Power,DriverAge,Fuel_type,Sinistre
0,33,302.00,1,0.75,g,61,0,1
1,41,2001.00,1,0.14,l,50,1,1
2,92,1449.00,1,0.14,d,36,0,1
3,96,9924.00,2,0.62,j,51,0,1
4,96,946.00,2,0.62,j,51,0,1
...,...,...,...,...,...,...,...,...
42620,6113793,1769.88,1,0.14,7,51,1,1
42621,6113817,1288.28,1,0.17,4,35,0,1
42622,6113834,1940.40,2,0.17,15,36,0,1
42623,6113834,10290.00,2,0.17,15,36,0,1
