In [1]:
import pandas as pd
import pyreadr
import numpy as np

In [2]:
def load_rda(file_paths):
    """Read several R data files and pool their objects into one dict.

    Each path in ``file_paths`` is read with ``pyreadr.read_r``. Every object
    name found in a file becomes a key of the returned dict, mapped to the
    object that was read. If the same object name appears in more than one
    file, the one from the later path replaces the earlier one.
    """
    pooled = {}
    for path in file_paths:
        # read_r returns a mapping of object name -> object; merge it wholesale.
        pooled.update(pyreadr.read_r(path))
    return pooled

In [3]:
def missing_values(df):
    """Return, for each column of ``df``, the number of missing entries."""
    null_flags = df.isna()
    return null_flags.sum()

In [4]:
def outlier_values(df):
    """Percentage of IQR-rule outliers for every numeric column of ``df``.

    A value is flagged when it lies more than 1.5 interquartile ranges below
    the first quartile or above the third quartile. Non-numeric columns are
    skipped. The result is a float Series indexed by column name.
    """
    percentages = pd.Series(dtype="float64")
    numeric_columns = df.select_dtypes(include=["number"]).columns

    for name in numeric_columns:
        values = df[name]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        spread = upper_quartile - lower_quartile

        low_fence = lower_quartile - 1.5 * spread
        high_fence = upper_quartile + 1.5 * spread
        is_outlier = (values < low_fence) | (values > high_fence)

        # The mean of a boolean mask is the share of flagged rows.
        percentages[name] = is_outlier.mean() * 100
    return percentages

In [5]:
def clean_data(data_dict):
    """Drop duplicate rows and parse the record-date columns of each frame.

    Parameters
    ----------
    data_dict : dict
        Maps a name to a pandas DataFrame.

    Returns
    -------
    dict
        Same keys as ``data_dict``. Each value is a new DataFrame with exact
        duplicate rows removed and, when the columns exist, 'RecordBeg' and
        'RecordEnd' converted to datetime64 (values that cannot be parsed
        become NaT). The input frames are not modified.
    """
    date_columns = ('RecordBeg', 'RecordEnd')
    cleaned_dict = {}
    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()
        parsed = {
            col: pd.to_datetime(df_cleaned[col], errors='coerce')
            for col in date_columns
            if col in df_cleaned.columns
        }
        # assign() swaps in whole new columns on a fresh frame. Writing through
        # `.loc[:, col] = ...` instead sets values in place, which keeps the
        # column's original object dtype and can emit SettingWithCopyWarning.
        cleaned_dict[key] = df_cleaned.assign(**parsed)

    return cleaned_dict

In [6]:
# Read the two raw freMTPL CSV extracts (relative paths).
df_freMTPL1, df_freMTPL2 = (
    pd.read_csv(csv_path)
    for csv_path in ("../raw_data/freMTPL1.csv", "../raw_data/freMTPL2.csv")
)

In [7]:
# Show the column labels of both raw frames, one Index per line.
print(df_freMTPL1.columns, df_freMTPL2.columns, sep="\n")

Index(['PolicyID', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'CarAge',
       'DriverAge', 'Brand', 'Gas', 'Region', 'Density'],
      dtype='object')
Index(['IDpol', 'ClaimAmount', 'ClaimNb', 'Exposure', 'VehPower', 'VehAge',
       'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas', 'Area', 'Density',
       'Region'],
      dtype='object')


In [8]:
# Target schema shared by both frames after harmonisation, in final column order.
columns_order = ["Exposure", "IdPolicy", "ClaimAmount", "ClaimNb", "power", "CarAge", 
                 "DriverAge", "CarBrand", "CarGas", "Region", "Density"]

# freMTPL1: rename to the shared schema, then reorder.
# (CarAge and DriverAge already carry their target names.)
df_freMTPL1 = df_freMTPL1.rename(columns={
    "PolicyID": "IdPolicy",
    "Power": "power",
    "Brand": "CarBrand",
    "Gas": "CarGas",
})[columns_order]

# freMTPL2: rename to the shared schema, discard the two columns that have no
# counterpart in freMTPL1, then reorder.
df_freMTPL2 = (
    df_freMTPL2
    .rename(columns={
        "IDpol": "IdPolicy",
        "VehPower": "power",
        "VehAge": "CarAge",
        "DrivAge": "DriverAge",
        "VehBrand": "CarBrand",
        "VehGas": "CarGas",
    })
    .drop(columns=["BonusMalus", "Area"])
    [columns_order]
)


In [9]:
# Stack the two harmonised frames into one, renumbering rows from 0.
df_freMTPL = pd.concat([df_freMTPL1, df_freMTPL2], ignore_index=True)

# Binary claim indicator: 1 when ClaimAmount is strictly positive, else 0.
df_freMTPL['Sinistre'] = (df_freMTPL['ClaimAmount'] > 0).astype(int)

# Artificially generated sex ('0' = female, 51%; '1' = male, 49%).
# A dedicated, seeded generator makes the draw identical on every
# Restart & Run All instead of depending on the global NumPy RNG state.
# NOTE(review): labels stay strings as before, whereas the other binary
# columns are ints -- confirm which type downstream code expects.
sex_rng = np.random.default_rng(42)
df_freMTPL['Sex'] = sex_rng.choice(['0', '1'], size=df_freMTPL.shape[0], p=[0.51, 0.49])


In [10]:
# Display the combined frame as this cell's output.
df_freMTPL

Unnamed: 0,Exposure,IdPolicy,ClaimAmount,ClaimNb,power,CarAge,DriverAge,CarBrand,CarGas,Region,Density,Sinistre,Sex
0,0.75,33,302.00,1,g,1,61,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000,1,1
1,0.14,41,2001.00,1,l,5,50,Japanese (except Nissan) or Korean,Diesel,Basse-Normandie,56,1,0
2,0.14,92,1449.00,1,d,0,36,Japanese (except Nissan) or Korean,Regular,Ile-de-France,4792,1,0
3,0.62,96,9924.00,2,j,0,51,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000,1,0
4,0.62,96,946.00,2,j,0,51,Japanese (except Nissan) or Korean,Regular,Ile-de-France,27000,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
42620,0.14,6113793,1769.88,1,7,2,51,B12,Diesel,Aquitaine,178,1,0
42621,0.17,6113817,1288.28,1,4,0,35,B12,Regular,Rhone-Alpes,1719,1,0
42622,0.17,6113834,1940.40,2,15,3,36,B12,Regular,Poitou-Charentes,181,1,1
42623,0.17,6113834,10290.00,2,15,3,36,B12,Regular,Poitou-Charentes,181,1,0


In [11]:
# Restrict the frame to the selected columns, in this order.
features = ["Exposure", 'power', 'DriverAge', "CarGas", "Density", "Sex", 'Sinistre']
df_freMTPL = df_freMTPL.loc[:, features]

In [12]:

# Only these two columns need a new name: the frame was already reduced to
# the `features` columns, and every other remaining column keeps its name.
df_freMTPL = df_freMTPL.rename(columns={
    'power': 'Power',
    'CarGas': 'Fuel_type',
})

# Encode fuel as 1 for 'Diesel' and 0 for anything else, using a vectorised
# comparison rather than a per-row Python lambda.
# NOTE(review): the original comment says this is the Diesel (1) / Regular (0)
# coding "in df_beMTPL" -- presumably another dataset this must match; verify.
df_freMTPL['Fuel_type'] = (df_freMTPL['Fuel_type'] == 'Diesel').astype('int64')


In [13]:
# Display the reduced, renamed frame as this cell's output.
df_freMTPL

Unnamed: 0,Exposure,Power,DriverAge,Fuel_type,Density,Sex,Sinistre
0,0.75,g,61,0,27000,1,1
1,0.14,l,50,1,56,0,1
2,0.14,d,36,0,4792,0,1
3,0.62,j,51,0,27000,0,1
4,0.62,j,51,0,27000,1,1
...,...,...,...,...,...,...,...
42620,0.14,7,51,1,178,0,1
42621,0.17,4,35,0,1719,0,1
42622,0.17,15,36,0,181,1,1
42623,0.17,15,36,0,181,0,1
