In [1]:
import pandas as pd
import pyreadr

In [2]:
def load_rda(file_paths):

  dataframes = {}

  for file_path in file_paths:
      result = pyreadr.read_r(file_path)

      for key in result.keys():
          dataframes[key] = result[key]

  return dataframes

In [3]:
def missing_values(df):
  return(df.isnull().sum())

In [4]:
def outlier_values(df):
    outliers = pd.Series(dtype="float64")

    for col in df.select_dtypes(include=["number"]):  
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        mask = (df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))
        outliers[col] = mask.mean() * 100  
    return outliers

In [5]:
def clean_data(data_dict):
    cleaned_dict = {}
    for key, df in data_dict.items():
        df_cleaned = df.drop_duplicates()
        if 'RecordBeg' in df_cleaned.columns:
            df_cleaned.loc[:, 'RecordBeg'] = pd.to_datetime(df_cleaned['RecordBeg'], errors='coerce')
        if 'RecordEnd' in df_cleaned.columns:
            df_cleaned.loc[:, 'RecordEnd'] = pd.to_datetime(df_cleaned['RecordEnd'], errors='coerce')

        cleaned_dict[key] = df_cleaned

    return cleaned_dict

In [6]:
file_paths = ['../raw_data/euMTPL.rda']
df = load_rda(file_paths)
df1=clean_data(df)
df_euMTPL=df1['euMTPL']
df_euMTPL

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,cost_nc,num_nc,cost_cg,num_cg,cost_fcg,num_fcg,cost_cd,num_cd
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0.0,0,0.0,0,0.0,0,0.0,0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0.0,0,0.0,0,0.0,0,0.0,0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0.0,0,0.0,0,0.0,0,0.0,0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0.0,0,0.0,0,0.0,0,0.0,0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0.0,0,0.0,0,0.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,B,9.0,1,1,RM,14,F,40,0.246575,0.0,0,0.0,0,0.0,0,0.0,0
2373193,2595211,train,B,9.0,1,1,LE,14,M,52,0.756164,0.0,0,0.0,0,0.0,0,0.0,0
2373194,2595212,test,E,9.0,1,1,LE,1,F,46,0.008219,0.0,0,0.0,0,0.0,0,0.0,0
2373195,2595213,test,T,9.0,1,1,LE,15,M,25,0.723288,0.0,0,0.0,0,0.0,0,0.0,0


In [7]:
df_euMTPL["ClaimNb"] = df_euMTPL[["num_nc", "num_cg", "num_cd", "num_fcg"]].fillna(0).sum(axis=1)
df_euMTPL["ClaimAmount"] = df_euMTPL[['cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']].fillna(0).sum(axis=1)

#Suppression des anciennes colonnes
cols_to_drop = ["num_nc", "num_cg", "num_cd", "num_fcg", 'cost_nc', 'cost_cg', 'cost_fcg', 'cost_cd']
df_euMTPL.drop(columns=cols_to_drop, inplace=True)

# Vérification
df_euMTPL.head()

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,ClaimNb,ClaimAmount
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0,0.0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0,0.0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0,0.0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0,0.0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0,0.0


In [8]:

url = "https://fr.wikipedia.org/wiki/Province_d%27Italie"
tables = pd.read_html(url)

df = tables[3]

df_provinces_italie = df[['Sigle', 'Densité (hab./km2)']].copy()

df_provinces_italie['Densité (hab./km2)'] = (
    df_provinces_italie['Densité (hab./km2)']
    .astype(str)
    .str.replace('+', '', regex=False)
    .str.replace('\xa0', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)

# Conversion en float (avec coercition des erreurs)
df_provinces_italie['Densité (hab./km2)'] = pd.to_numeric(
    df_provinces_italie['Densité (hab./km2)'], errors='coerce'
)

# densités manquantes à la main
densites_manquantes = {
    'CA': 123.0,   
    'CI': 87.0,   
    'NU': 36.0,
    'OG': 188.0,
    'OR': 378.0,
    'SS': 221.0,
    'VS': 68.0
}

# Ajouter la province de Novare (NA) qui ne s'est pas enregistré lors du scraping
if 'NA' not in df_provinces_italie['Sigle'].values:
    df_provinces_italie = pd.concat([
        df_provinces_italie,
        pd.DataFrame([{'Sigle': 'NA', 'Densité (hab./km2)': 429}])
    ], ignore_index=True)


for code, densite in densites_manquantes.items():
    df_provinces_italie.loc[df_provinces_italie['Sigle'] == code, 'Densité (hab./km2)'] = densite

# Trier par ordre alphabétique du sigle
df_provinces_italie = df_provinces_italie.sort_values(by='Sigle').reset_index(drop=True)

df_provinces_italie.to_csv('provinces_italie.csv', index=False)

In [9]:
df_euMTPL = df_euMTPL.merge(
    df_provinces_italie.rename(columns={'Sigle': 'province', 'Densité (hab./km2)': 'Density'}),
    on='province',
    how='left'
)
df_euMTPL

Unnamed: 0,policy_id,group,fuel_type,year,vehicle_category,vehicle_use,province,horsepower,gender,age,exposure,ClaimNb,ClaimAmount,Density
0,1,test,B,7.0,1,1,PA,14,M,77,0.487671,0,0.0,250.0
1,2,train,B,7.0,1,1,,12,M,40,0.019178,0,0.0,429.0
2,4,train,B,7.0,1,1,CN,14,M,75,0.032877,0,0.0,86.0
3,5,train,B,7.0,1,1,,13,M,48,0.043836,0,0.0,429.0
4,6,train,B,7.0,1,1,,12,F,54,0.046575,0,0.0,429.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2373192,2595210,test,B,9.0,1,1,RM,14,F,40,0.246575,0,0.0,784.0
2373193,2595211,train,B,9.0,1,1,LE,14,M,52,0.756164,0,0.0,296.0
2373194,2595212,test,E,9.0,1,1,LE,1,F,46,0.008219,0,0.0,296.0
2373195,2595213,test,T,9.0,1,1,LE,15,M,25,0.723288,0,0.0,296.0


In [10]:
df_euMTPL = df_euMTPL.rename(columns={
    'policy_id': 'Id_policy',
    'ClaimAmount': 'ClaimAmount',
    'ClaimNb': 'ClaimNb',
    'exposure': 'Exposure',
    'horsepower': 'Power',
    'age': 'DriverAge',
    'fuel_type': 'Fuel_type',
    'gender':'Sex'
})

df_euMTPL['Sinistre'] = (df_euMTPL['ClaimAmount'] > 0).astype(int)
df_euMTPL['Sex'] = df_euMTPL['Sex'].apply(lambda x: 1 if x == 'M' else 0)

features = ["Exposure", 'Power', 'DriverAge', "Fuel_type", "Density", "Sex", 'Sinistre']

df_euMTPL=df_euMTPL[features]
df_euMTPL["Fuel_type"].unique()


['B', 'G', 'S', 'D', 'P', 'T', 'M', 'E']
Categories (8, object): ['B', 'D', 'E', 'G', 'M', 'P', 'S', 'T']

In [11]:
carburant_dict = {
    'B': 'Regular',   # Bioéthanol ou autres carburants bio
    'E': 'Regular',   # Ethanol
    'S': 'Regular',   # Super essence
    'T': 'Regular',   # Autre carburant de type essence
    'D': 'Diesel',    # Diesel
    'G': 'Diesel',    # Gaz naturel comprimé (GNC)
    'M': 'Diesel',    # Méthane (utilisé dans les véhicules au gaz)
    'P': 'Diesel'     # Propane (utilisé pour des véhicules à gaz)
}

df_euMTPL['Fuel_type'] = df_euMTPL['Fuel_type'].map(carburant_dict)



In [12]:
df_euMTPL['Fuel_type'] = df_euMTPL['Fuel_type'].apply(lambda x: 1 if x == 'Diesel' else 0)
df_euMTPL["Fuel_type"].unique()

array([0, 1])

In [13]:
df_euMTPL.to_csv('../data/european_data.csv', index=False)