In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pyreadr
import os   
import numpy as np
import random

In [3]:
def load_rda(file_paths):

  dataframes = {}

  for file_path in file_paths:
      result = pyreadr.read_r(file_path)

      for key in result.keys():
          dataframes[key] = result[key]

  return dataframes

In [4]:
def missing_values(df):
  return(df.isnull().sum())

In [5]:
def outlier_values(df):
    outliers = pd.Series(dtype="float64")

    for col in df.select_dtypes(include=["number"]):  
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
        outliers[col] = mask.mean() * 100  
    return outliers

In [6]:
def clean_data(data_dict):
    cleaned_dict = {}
    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()
        if 'RecordBeg' in df_cleaned.columns:
            df_cleaned.loc[:, 'RecordBeg'] = pd.to_datetime(df_cleaned['RecordBeg'], errors='coerce')
        if 'RecordEnd' in df_cleaned.columns:
            df_cleaned.loc[:, 'RecordEnd'] = pd.to_datetime(df_cleaned['RecordEnd'], errors='coerce')

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

## euMTPL

In [8]:
file_paths = ['crespin/euMTPL.rda']
df = load_rda(file_paths)
df1=clean_data(df)
df_euMTPL=df1['euMTPL']
df_euMTPL

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,B,9.0,1,1,RM,14,F,40,0.246575,0.0,0,0.0,0,0.0,0,0.0,0
2373193,2595211,train,B,9.0,1,1,LE,14,M,52,0.756164,0.0,0,0.0,0,0.0,0,0.0,0
2373194,2595212,test,E,9.0,1,1,LE,1,F,46,0.008219,0.0,0,0.0,0,0.0,0,0.0,0
2373195,2595213,test,T,9.0,1,1,LE,15,M,25,0.723288,0.0,0,0.0,0,0.0,0,0.0,0


In [9]:
#Création des nouvelles variables
df_euMTPL["ClaimNb"] = df_euMTPL[["num_nc", "num_cg", "num_cd", "num_fcg"]].fillna(0).sum(axis=1)
df_euMTPL["ClaimAmount"] = df_euMTPL[['cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']].fillna(0).sum(axis=1)

#Suppression des anciennes colonnes
cols_to_drop = ["num_nc", "num_cg", "num_cd", "num_fcg", 'cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']
df_euMTPL.drop(columns=cols_to_drop, inplace=True)

# Vérification
df_euMTPL.head()

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,ClaimNb,ClaimAmount
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0,0.0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0,0.0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0,0.0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0,0.0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0,0.0


In [10]:
# Affichage des 108 modalités de la variable province afin de voir à quoi correspond chaque modalité 
modalites_province = df_euMTPL['province'].unique()
modalites_province = sorted(modalites_province)

print(f"Nombre de modalités : {len(modalites_province)}\n")
print("Liste des 108 modalités :\n")

for i in range(0, len(modalites_province), 12):
    ligne = modalites_province[i:i+12]
    print(', '.join(ligne))


Nombre de modalités : 108

Liste des 108 modalités :

AG, AL, AN, AO, AP, AQ, AR, AT, AV, BA, BG, BI
BL, BN, BO, BR, BS, BZ, CA, CB, CE, CH, CI, CL
CN, CO, CR, CS, CT, CZ, EN, FC, FE, FG, FI, FR
GE, GO, GR, IM, IS, KR, LC, LE, LI, LO, LT, LU
MC, ME, MI, MN, MO, MS, MT, NA, NO, NU, OG, OR
OT, PA, PC, PD, PE, PG, PI, PN, PO, PR, PT, PU
PV, PZ, RA, RC, RE, RG, RI, RM, RN, RO, SA, SI
SM, SO, SP, SR, SS, SV, TA, TE, TN, TO, TP, TR
TS, TV, UD, VA, VB, VC, VE, VI, VR, VS, VT, VV


In [11]:
!pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [12]:
# Cherger le fichier contenant les densité de population par KM2 en Europe
fichier_excel = "demo_r_d3dens.xlsx"
df = pd.read_excel(fichier_excel)
df.head()

Unnamed: 0,Observatoire des territoires - ANCT,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Densité de population en Europe (NUTS 3 (2016)),,,
1,,,,
2,,,,Densité de population - Europe
3,codgeo,libgeo,an,per_km2
4,AT111,Mittelburgenland,1990,55.4


In [13]:
# Ajout de la densité de la population par Km2 dans la base euMTPL
fichier_excel = "demo_r_d3dens.xlsx"
df_density = pd.read_excel(fichier_excel, skiprows=4)

#Sélectionner la 4e colonne (index 3 → "colonne D" dans Excel)
donnees_density = df_density.iloc[:, 3]  # colonne numéro 4 (index Python 3)
donnees_density.dropna(inplace=True)

#Convertir en liste
liste_densites = donnees_density.tolist()

#Récupérer les provinces de ta base existante
provinces_uniques = df_euMTPL["province"].unique()

#Vérifier que tu as assez de densités
if len(liste_densites) < len(provinces_uniques):
    raise ValueError("Pas assez de valeurs de densité pour couvrir toutes les provinces.")

#Assigner les densités de façon aléatoire aux provinces
random.shuffle(liste_densites)
affectation_density = dict(zip(provinces_uniques, liste_densites[:len(provinces_uniques)]))

# Ajouter la colonne "density" dans la base
df_euMTPL["density"] = df_euMTPL["province"].map(affectation_density)

# ✅ Aperçu
df_euMTPL.head()


Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,ClaimNb,ClaimAmount,density
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0,0.0,34.3
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0,0.0,56.9
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0,0.0,21.5
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0,0.0,56.9
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0,0.0,56.9


## beMTPL

In [8]:
file_paths = ['raw_data/beMTPL97.rda']
df = load_rda(file_paths)
df_1=clean_data(df)
df_beMTPL=df_1['beMTPL97']
df_beMTPL

Unnamed: 0,id,expo,claim,nclaims,amount,average,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,postcode,long,lat
0,1,1.000000,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386
1,2,1.000000,0,0,0.000000,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386
2,3,1.000000,0,0,0.000000,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386
3,4,1.000000,0,0,0.000000,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163207,163208,1.000000,0,0,0.000000,,TPL,37,male,5,40,10,diesel,work,0,9990,3.421256,51.199975
163208,163209,1.000000,0,0,0.000000,,TPL,44,male,0,55,6,diesel,private,0,9990,3.421256,51.199975
163209,163210,1.000000,0,0,0.000000,,TPL,50,male,0,40,10,diesel,private,0,9990,3.421256,51.199975
163210,163211,1.000000,0,0,0.000000,,TPL,43,male,0,66,7,gasoline,private,0,9990,3.421256,51.199975


In [9]:
df_beMTPL["postcode"].unique()

array([1000, 1030, 1040, 1050, 1060, 1070, 1080, 1081, 1082, 1083, 1090,
       1140, 1150, 1160, 1170, 1180, 1190, 1200, 1210, 1300, 1310, 1315,
       1320, 1325, 1330, 1340, 1350, 1357, 1360, 1367, 1370, 1380, 1390,
       1400, 1410, 1420, 1430, 1435, 1440, 1450, 1457, 1460, 1470, 1480,
       1490, 1495, 1500, 1540, 1547, 1560, 1570, 1600, 1620, 1630, 1640,
       1650, 1670, 1700, 1730, 1740, 1745, 1750, 1755, 1760, 1770, 1780,
       1785, 1790, 1800, 1820, 1830, 1840, 1850, 1860, 1880, 1910, 1930,
       1950, 1970, 1980, 2000, 2070, 2110, 2150, 2160, 2200, 2220, 2230,
       2235, 2240, 2250, 2260, 2270, 2275, 2280, 2290, 2300, 2310, 2320,
       2330, 2340, 2350, 2360, 2370, 2380, 2387, 2390, 2400, 2430, 2440,
       2450, 2460, 2470, 2480, 2490, 2500, 2520, 2530, 2540, 2547, 2550,
       2560, 2570, 2580, 2590, 2620, 2627, 2630, 2640, 2650, 2800, 2820,
       2830, 2840, 2845, 2850, 2860, 2870, 2880, 2890, 2900, 2910, 2920,
       2930, 2940, 2950, 2960, 2970, 2980, 2990, 30

In [27]:

postcode_to_region = {"Bruxelles": range(1000, 1300),"Brabant Wallon": range(1300, 1400),"Brabant Flamand": range(1500, 2000),"Hainaut": range(6000, 6600),
"Liège": range(4000, 4800),"Namur": range(5000, 5600),"Luxembourg": range(6600, 7000),"Flandre occidentale": range(8000, 9000),"Flandre orientale": range(9000, 9500),
"Limbourg": range(3500, 4000),"Anvers": range(2000, 3000),"Province du Brabant flamand": range(3000, 3500),}

# Mapping région → population
region_to_density = {"Bruxelles": 7500,"Brabant Wallon": 370,"Brabant Flamand": 490,"Hainaut": 350,"Liège": 330,"Namur": 120,"Luxembourg": 70,"Flandre occidentale": 370,
"Flandre orientale": 450,"Limbourg": 370,"Anvers": 660,"Province du Brabant flamand": 490,}

# Fonction pour attribuer la région
def get_region(postcode):
    try:
        postcode = int(postcode)
        for region, codes in postcode_to_region.items():
            if postcode in codes:
                return region
        return "Autre"
    except:
        return "Invalide"


df_beMTPL["région"] = df_beMTPL["postcode"].apply(get_region)

df_beMTPL["density"] = df_beMTPL["région"].map(region_to_density)

print(df_beMTPL[["postcode", "région", "density"]].head())

   postcode     région  density
0      1000  Bruxelles   7500.0
1      1000  Bruxelles   7500.0
2      1000  Bruxelles   7500.0
3      1000  Bruxelles   7500.0
4      1000  Bruxelles   7500.0


In [25]:
df_beMTPL.head()

Unnamed: 0,id,expo,claim,nclaims,amount,average,coverage,ageph,sex,bm,power,agec,fuel,use,fleet,postcode,long,lat,région
0,1,1.0,1,1,1618.001036,1618.001036,TPL,50,male,5,77,12,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
1,2,1.0,0,0,0.0,,TPL+,64,female,5,66,3,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
2,3,1.0,0,0,0.0,,TPL,60,male,0,70,10,diesel,private,0,1000,4.355223,50.845386,Bruxelles
3,4,1.0,0,0,0.0,,TPL,77,male,0,57,15,gasoline,private,0,1000,4.355223,50.845386,Bruxelles
4,5,0.046575,1,1,155.974606,155.974606,TPL,28,female,9,70,7,gasoline,private,0,1000,4.355223,50.845386,Bruxelles


In [26]:
df_beMTPL["région"].unique()

array(['Bruxelles', 'Brabant Wallon', 'Autre', 'Brabant Flamand',
       'Anvers', 'Province du Brabant flamand', 'Limbourg', 'Liège',
       'Namur', 'Hainaut', 'Luxembourg', 'Flandre occidentale',
       'Flandre orientale'], dtype=object)

## freMTPL

In [None]:
df_freMTPL1 = pd.read_csv("freMTPL1.csv")
df_freMTPL2 = pd.read_csv("freMTPL2.csv")
# Afficher les premières lignes
print(df_freMTPL1.head())
print(df_freMTPL2.head())

In [None]:
print(df_freMTPL1.columns)
print(df_freMTPL2.columns)

In [None]:
# Renommer les colonnes pour df_freMTPL1
df_freMTPL1 = df_freMTPL1.rename(columns={
    "PolicyID": "IdPolicy",
    "Power": "power",
    "CarAge": "CarAge",
    "DriverAge": "DriverAge",
    "Brand": "CarBrand",
    "Gas": "CarGas"
})

# Renommer les colonnes pour df_freMTPL2
df_freMTPL2 = df_freMTPL2.rename(columns={
    "IDpol": "IdPolicy",
    "VehPower": "power",
    "VehAge": "CarAge",
    "DrivAge": "DriverAge",
    "VehBrand": "CarBrand",
    "VehGas": "CarGas"
})

# Supprimer les colonnes BonusMalus et Area de df_freMTPL2
df_freMTPL2 = df_freMTPL2.drop(columns=["BonusMalus", "Area"])

# Réorganiser les colonnes dans le même ordre
columns_order = ["IdPolicy", "ClaimAmount", "ClaimNb", "Exposure", "power", "CarAge", 
                 "DriverAge", "CarBrand", "CarGas", "Region", "Density"]

df_freMTPL1 = df_freMTPL1[columns_order]
df_freMTPL2 = df_freMTPL2[columns_order]


In [None]:
#Fusion verticale des bases
df_freMTPL = pd.concat([df_freMTPL1, df_freMTPL2], ignore_index=True)

# Vérification de la fusion
print(df_freMTPL.shape)  # Vérifie le nombre total de lignes et de colonnes
print(df_freMTPL.head())  # Affiche les premières lignes du DataFrame final


### Comparaison et fusion des bases euMTPL,beMTPL et freMTPL

In [None]:
print(df_freMTPL.columns)
print(df_beMTPL.columns)
print(df_euMTPL.columns)



In [None]:
#RENOMMER LES NOMS DES VARIABLES

df_freMTPL = df_freMTPL.rename(columns={
    'IdPolicy': 'Id_policy',
    'ClaimAmount': 'ClaimAmount',
    'ClaimNb': 'ClaimNb',
    'Exposure': 'Exposure',
    'power': 'Power',
    'DriverAge': 'DriverAge',
    'CarGas': 'Fuel_type'
})

df_beMTPL = df_beMTPL.rename(columns={
    'id': 'Id_policy',
    'amount': 'ClaimAmount',
    'nclaims': 'ClaimNb',
    'expo': 'Exposure',
    'power': 'Power',
    'ageph': 'DriverAge',
    'fuel': 'Fuel_type'
})

df_euMTPL = df_euMTPL.rename(columns={
    'policy_id': 'Id_policy',
    'ClaimAmount': 'ClaimAmount',
    'ClaimNb': 'ClaimNb',
    'exposure': 'Exposure',
    'horsepower': 'Power',
    'age': 'DriverAge',
    'fuel_type': 'Fuel_type'
})

# Affichage des premières lignes pour vérifier
df_euMTPL
df_freMTPL
df_beMTPL

In [None]:
# Création de la variable Sinistre dans chaque base
df_freMTPL['Sinistre'] = (df_freMTPL['ClaimAmount'] > 0).astype(int)
df_beMTPL['Sinistre'] = (df_beMTPL['ClaimAmount'] > 0).astype(int)
df_euMTPL['Sinistre'] = (df_euMTPL['ClaimAmount'] > 0).astype(int)

In [None]:
df_euMTPL['Fuel_type'].unique()

In [None]:
# Dictionnaire pour la conversion des carburants en catégories
carburant_dict = {
    'B': 'Regular',   # Bioéthanol ou autres carburants bio
    'E': 'Regular',   # Ethanol
    'S': 'Regular',   # Super essence
    'T': 'Regular',   # Autre carburant de type essence
    'D': 'Diesel',    # Diesel
    'G': 'Diesel',    # Gaz naturel comprimé (GNC)
    'M': 'Diesel',    # Méthane (utilisé dans les véhicules au gaz)
    'P': 'Diesel'     # Propane (utilisé pour des véhicules à gaz)
}

# Appliquer la catégorisation au dataframe df_euMTPL
df_euMTPL['Fuel_type'] = df_euMTPL['Fuel_type'].map(carburant_dict)
df_euMTPL


In [None]:
df_beMTPL['Fuel_type'].unique()

In [None]:
df_freMTPL['Fuel_type'].unique()

In [None]:
#Catégorisation
# Dictionnaire pour catégoriser Diesel (1) et Regular (0) dans df_beMTPL
df_beMTPL['Fuel_type'] = df_beMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)
df_euMTPL['Fuel_type'] = df_euMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)
df_freMTPL['Fuel_type'] = df_freMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)

In [None]:
df_euMTPL
df_beMTPL
df_freMTPL

In [1]:
# Identifier les variables communes aux trois bases
common_columns = ['Id_policy', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'DriverAge', 'Fuel_type', 'Sinistre']

# Sélectionner uniquement les colonnes communes dans chaque base
df_euMTPL_common = df_euMTPL[common_columns]
df_beMTPL_common = df_beMTPL[common_columns]
df_freMTPL_common = df_freMTPL[common_columns]

# Ajouter une colonne 'Dataset' pour indiquer la provenance de chaque ligne
df_euMTPL_common['Dataset'] = 'european'
df_beMTPL_common['Dataset'] = 'belgium'
df_freMTPL_common['Dataset'] = 'french'

# Fusionner les bases en une seule base
Data = pd.concat([df_euMTPL_common, df_beMTPL_common, df_freMTPL_common], ignore_index=True)
Data



NameError: name 'df_euMTPL' is not defined