In [43]:
import pandas as pd
import pyreadr
import numpy as np

In [44]:
def load_rda(file_paths):
    """Read several R data files and merge their contents into one dict.

    Parameters
    ----------
    file_paths : iterable
        Paths handed, one at a time, to ``pyreadr.read_r``.

    Returns
    -------
    dict
        Every key of each ``read_r`` result mapped to its value. When the
        same key appears in more than one file, the later file wins.
    """
    merged = {}
    for path in file_paths:
        merged.update(pyreadr.read_r(path))
    return merged

In [45]:
def missing_values(df):
    """Return the number of missing entries per column of ``df``.

    The result is whatever ``df.isna().sum()`` yields: for a DataFrame, a
    Series indexed by column name.
    """
    null_flags = df.isna()
    return null_flags.sum()

In [46]:
def outlier_values(df):
    """Percentage of IQR outliers in each numeric column of ``df``.

    A value counts as an outlier when it lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the column's 25% and 75% quantiles.

    Returns
    -------
    pandas.Series
        float64 Series indexed by numeric column name; each value is the
        share of rows flagged as outliers, expressed in percent (0-100).
        Non-numeric columns are skipped.
    """
    pct_by_column = pd.Series(dtype="float64")
    numeric_columns = df.select_dtypes(include=["number"]).columns

    for name in numeric_columns:
        values = df[name]
        lower_q, upper_q = values.quantile(0.25), values.quantile(0.75)
        fence = 1.5 * (upper_q - lower_q)

        too_low = values.lt(lower_q - fence)
        too_high = values.gt(upper_q + fence)
        # Mean of a boolean mask is the fraction of True entries.
        pct_by_column[name] = (too_low | too_high).mean() * 100
    return pct_by_column

In [47]:
def clean_data(data_dict):
    """Drop duplicate rows and parse the record date columns of each frame.

    Parameters
    ----------
    data_dict : dict
        Maps a name to a pandas DataFrame.

    Returns
    -------
    dict
        Same keys as ``data_dict``. Each value is a new DataFrame with exact
        duplicate rows removed and, when present, the 'RecordBeg' and
        'RecordEnd' columns converted with ``pd.to_datetime``. Values that
        cannot be parsed become ``NaT`` (``errors='coerce'``). The input
        frames are not modified.
    """
    date_columns = ('RecordBeg', 'RecordEnd')
    cleaned_dict = {}

    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()
        parsed = {
            col: pd.to_datetime(df_cleaned[col], errors='coerce')
            for col in date_columns
            if col in df_cleaned.columns
        }
        # Replace the columns outright instead of writing through
        # ``.loc[:, col] = ...``: the latter assigns in place and, on recent
        # pandas versions, keeps the original object dtype rather than
        # switching the column to datetime64.
        cleaned_dict[key] = df_cleaned.assign(**parsed)

    return cleaned_dict

In [48]:
# Read the two raw CSV extracts into DataFrames (paths are relative to the
# notebook's working directory).
df_freMTPL1, df_freMTPL2 = map(
    pd.read_csv,
    ("../raw_data/freMTPL1.csv", "../raw_data/freMTPL2.csv"),
)

In [49]:
# Lookup from the letter codes found in df_freMTPL1['Power'] to integer values.
letter_to_power = {
    'd': 10, 'e': 30, 'f': 40, 'g': 43, 'h': 50, 'i': 53,
    'j': 56, 'k': 60, 'l': 66, 'm': 80, 'n': 100, 'o': 120
}

# Convert the 'Power' column of df_freMTPL1 from letter codes to int32.
# The dtype guard makes re-running this cell a no-op: mapping an already
# numeric column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df_freMTPL1['Power']):
    mapped_power = df_freMTPL1['Power'].map(letter_to_power)
    unmapped_codes = df_freMTPL1.loc[mapped_power.isna(), 'Power'].unique()
    if len(unmapped_codes):
        # Fail with the offending codes instead of the opaque NaN-to-int
        # cast error that astype('int32') would raise.
        raise ValueError(
            f"Unmapped 'Power' codes in df_freMTPL1: {sorted(map(str, unmapped_codes))}"
        )
    df_freMTPL1['Power'] = mapped_power.astype('int32')

In [50]:
# Show the column labels of both raw frames so they can be compared before
# harmonising the names.
for raw_frame in (df_freMTPL1, df_freMTPL2):
    print(raw_frame.columns)

Index(['PolicyID', 'ClaimAmount', 'ClaimNb', 'Exposure', 'Power', 'CarAge',
       'DriverAge', 'Brand', 'Gas', 'Region', 'Density'],
      dtype='object')
Index(['IDpol', 'ClaimAmount', 'ClaimNb', 'Exposure', 'VehPower', 'VehAge',
       'DrivAge', 'BonusMalus', 'VehBrand', 'VehGas', 'Area', 'Density',
       'Region'],
      dtype='object')


In [51]:
# Shared column layout that both frames must follow before being stacked.
columns_order = [
    "Exposure", "IdPolicy", "ClaimAmount", "ClaimNb", "power", "CarAge",
    "DriverAge", "CarBrand", "CarGas", "Region", "Density",
]

# Harmonise df_freMTPL1's column names, then put them in the shared order.
df_freMTPL1 = df_freMTPL1.rename(
    columns={
        "PolicyID": "IdPolicy",
        "Power": "power",
        "CarAge": "CarAge",
        "DriverAge": "DriverAge",
        "Brand": "CarBrand",
        "Gas": "CarGas",
    }
)[columns_order]

# Same for df_freMTPL2; it also carries "BonusMalus" and "Area", which have
# no counterpart in df_freMTPL1 and are dropped.
df_freMTPL2 = (
    df_freMTPL2
    .rename(
        columns={
            "IDpol": "IdPolicy",
            "VehPower": "power",
            "VehAge": "CarAge",
            "DrivAge": "DriverAge",
            "VehBrand": "CarBrand",
            "VehGas": "CarGas",
        }
    )
    .drop(columns=["BonusMalus", "Area"])
)[columns_order]


In [52]:
# Stack the two harmonised frames into one table with a fresh 0..n-1 index.
# NOTE(review): df_freMTPL1's power was recoded via letter_to_power (values
# 10-120), while the displayed result further down also shows values such as
# 4, 7 and 15 -- the two sources may use different power scales; confirm.
df_freMTPL = pd.concat([df_freMTPL1, df_freMTPL2], ignore_index=True)

# Binary target: 1 when the claim amount is strictly above 3000, else 0.
df_freMTPL['Sinistre'] = (df_freMTPL['ClaimAmount'] > 3000).astype(int)

# Artificially generated sex: '0' = female (p=0.51), '1' = male (p=0.49).
# A dedicated seeded generator keeps this column -- and therefore the exported
# CSV -- identical across "Restart & Run All" executions.
sex_rng = np.random.default_rng(42)
df_freMTPL['Sex'] = sex_rng.choice(['0', '1'], size=df_freMTPL.shape[0], p=[0.51, 0.49])

In [53]:
# Columns kept for the exported dataset; everything else is discarded.
features = [
    "Exposure",
    'power',
    'DriverAge',
    "CarGas",
    "Density",
    "Sex",
    'Sinistre',
]
df_freMTPL = df_freMTPL.loc[:, features]

In [54]:

# Final column names. Labels absent from the frame are ignored by rename(),
# so the 'IdPolicy' entry only matters if that column is kept upstream.
df_freMTPL = df_freMTPL.rename(columns={
    'IdPolicy': 'Id_policy',
    'power': 'Power',
    'CarGas': 'Fuel_type'
})

# Encode fuel as 1 for 'Diesel' and 0 for anything else (e.g. 'Regular').
# Presumably this mirrors the encoding used for df_beMTPL -- verify.
# The dtype guard keeps the cell safe to re-run: comparing the already
# encoded integers with 'Diesel' would silently set every row to 0.
if not pd.api.types.is_numeric_dtype(df_freMTPL['Fuel_type']):
    df_freMTPL['Fuel_type'] = (df_freMTPL['Fuel_type'] == 'Diesel').astype('int64')


In [55]:
# Display the final frame; the rendered output is truncated to its first and
# last rows.
df_freMTPL

Unnamed: 0,Exposure,Power,DriverAge,Fuel_type,Density,Sex,Sinistre
0,0.75,43,61,0,27000,0,0
1,0.14,66,50,1,56,0,0
2,0.14,10,36,0,4792,0,0
3,0.62,56,51,0,27000,1,1
4,0.62,56,51,0,27000,1,0
...,...,...,...,...,...,...,...
42620,0.14,7,51,1,178,0,0
42621,0.17,4,35,0,1719,0,0
42622,0.17,15,36,0,181,1,0
42623,0.17,15,36,0,181,1,1


In [56]:
# Persist the prepared table as CSV, without writing the row index.
output_path = "../data/french_data.csv"
df_freMTPL.to_csv(output_path, index=False)