## Exemple d'un utilitaire pour nettoyer les valeurs manquantes

In [2]:
# Load the dataset.
import pandas as pd

# Show floats with two decimals in every rendered table.
pd.set_option('display.float_format', '{:.2f}'.format)

df = pd.read_csv('FraudeAssuranceAuto.csv')
# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,ID,Insurance Type,Income of Policy Holder,Marital Status,Num Claimants,Injury Type,Overnight Hospital Stay,Claim Amount,Total Claimed,Num Claims,Num Soft Tissue,% Soft Tissue,Claim Amount Received,Fraud Flag
0,1,CI,0,,2,Soft Tissue,No,1625,3250,2,2.0,1.0,0,1
1,2,CI,0,,2,Back,Yes,15028,60112,1,0.0,0.0,15028,0
2,3,CI,54613,Married,1,Broken Limb,No,-99999,0,0,0.0,0.0,572,0
3,4,CI,0,,3,Serious,Yes,270200,0,0,0.0,0.0,270200,0
4,5,CI,0,,4,Soft Tissue,No,8869,0,0,0.0,0.0,0,1
5,6,CI,0,,1,Broken Limb,Yes,17480,0,0,0.0,0.0,17480,0
6,7,CI,52567,Single,3,Broken Limb,No,3017,18102,2,1.0,0.5,0,1
7,8,CI,0,,2,Back,Yes,7463,0,0,0.0,0.0,7463,0
8,9,CI,0,,1,Soft Tissue,No,2067,0,0,,0.0,2067,0
9,10,CI,42300,Married,4,Back,No,2260,0,0,0.0,0.0,2260,0


In [5]:
# Summary statistics for every column, numeric and categorical alike;
# a `count` below the number of rows reveals a column with missing values.
df.describe(include="all")

Unnamed: 0,ID,Insurance Type,Income of Policy Holder,Marital Status,Num Claimants,Injury Type,Overnight Hospital Stay,Claim Amount,Total Claimed,Num Claims,Num Soft Tissue,% Soft Tissue,Claim Amount Received,Fraud Flag
count,500.0,500,500.0,170,500.0,500,500,500.0,500.0,500.0,490.0,500.0,500.0,500.0
unique,,1,,3,,4,2,,,,,,,
top,,CI,,Married,,Broken Limb,No,,,,,,,
freq,,500,,99,,177,354,,,,,,,
mean,250.5,,13739.99,,1.91,,,16373.2,9597.19,0.8,0.23,0.17,13051.94,0.34
std,144.48,,20081.54,,1.01,,,29426.28,35655.69,2.67,0.59,0.43,30547.19,0.47
min,1.0,,0.0,,1.0,,,-99999.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,125.75,,0.0,,1.0,,,3322.25,0.0,0.0,0.0,0.0,0.0,0.0
50%,250.5,,0.0,,2.0,,,5663.0,0.0,0.0,0.0,0.0,3253.5,0.0
75%,375.25,,33918.5,,3.0,,,12245.5,11282.75,1.0,0.0,0.0,8191.75,1.0


In [6]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       500 non-null    int64  
 1   Insurance Type           500 non-null    object 
 2   Income of Policy Holder  500 non-null    int64  
 3   Marital Status           170 non-null    object 
 4   Num Claimants            500 non-null    int64  
 5   Injury Type              500 non-null    object 
 6   Overnight Hospital Stay  500 non-null    object 
 7   Claim Amount             500 non-null    int64  
 8   Total Claimed            500 non-null    int64  
 9   Num Claims               500 non-null    int64  
 10  Num Soft Tissue          490 non-null    float64
 11  % Soft Tissue            500 non-null    float64
 12  Claim Amount Received    500 non-null    int64  
 13  Fraud Flag               500 non-null    int64  
dtypes: float64(2), int64(8), object(4)

## scikit-learn fournit une classe bien pratique pour gérer les données manquantes : SimpleImputer

In [7]:
# Import the imputation utility.
from sklearn.impute import SimpleImputer

In [10]:
# How to use it: first create an instance of the SimpleImputer class,
# stating that the missing values of each feature (marked NaN, "Not a Number")
# are to be replaced. The replacement strategy chosen here is the mean.
# Instantiating SimpleImputer defines a transformer object; nothing is
# computed until fit() is called.

imp = SimpleImputer(strategy='mean')


In [11]:
# The mean can only be computed on numeric variables, so build a copy of the
# data without the categorical (object-dtype) features.
# NOTE(review): 'Insurance Type ' carries a trailing space; it is kept as-is —
# confirm it matches the CSV header exactly.
# NOTE(review): 'Claim Amount' shows -99999 in the preview/describe outputs,
# which looks like a missing-value placeholder; it is not NaN, so the imputer
# will not replace it — confirm how it should be handled.
v_cat = [
    'Insurance Type ',
    'Marital Status',
    'Injury Type',
    'Overnight Hospital Stay',
]
df_num = df.drop(columns=v_cat)

In [12]:
# Now the SimpleImputer instance can be applied
# to the -TBA- training data set using the fit() method.
# fit() only estimates the mean of each variable; the DataFrame (the data) is not modified.
imp.fit(df_num)

In [13]:
# Fitting the SimpleImputer simply computed the mean of each feature
# and stored the result in its instance attribute statistics_.
imp.statistics_

array([2.50500000e+02, 1.37399940e+04, 1.90800000e+00, 1.63732040e+04,
       9.59718600e+03, 7.98000000e-01, 2.34693878e-01, 1.72011905e-01,
       1.30519420e+04, 3.36000000e-01])

In [15]:
# Cross-check: the per-column means computed directly with pandas
# should match the values stored in imp.statistics_.
df_num.mean().values

array([2.50500000e+02, 1.37399940e+04, 1.90800000e+00, 1.63732040e+04,
       9.59718600e+03, 7.98000000e-01, 2.34693878e-01, 1.72011905e-01,
       1.30519420e+04, 3.36000000e-01])

In [14]:
# The fitted SimpleImputer can now be used to transform the training set,
# replacing the missing values with the means it learned.
X_num= imp.transform(df_num)

In [18]:
# X_num is a complete NumPy array holding the transformed values.
# To put it back into a pandas DataFrame, restore both the column labels and
# the row index of df_num; without `index=` the rows would be relabelled
# 0..n-1 and could no longer be matched to the source data if df_num had a
# non-default index.
df_fraude_aa_num = pd.DataFrame(X_num, columns=df_num.columns, index=df_num.index)
df_fraude_aa_num.head(10)

Unnamed: 0,ID,Income of Policy Holder,Num Claimants,Claim Amount,Total Claimed,Num Claims,Num Soft Tissue,% Soft Tissue,Claim Amount Received,Fraud Flag
0,1.0,0.0,2.0,1625.0,3250.0,2.0,2.0,1.0,0.0,1.0
1,2.0,0.0,2.0,15028.0,60112.0,1.0,0.0,0.0,15028.0,0.0
2,3.0,54613.0,1.0,-99999.0,0.0,0.0,0.0,0.0,572.0,0.0
3,4.0,0.0,3.0,270200.0,0.0,0.0,0.0,0.0,270200.0,0.0
4,5.0,0.0,4.0,8869.0,0.0,0.0,0.0,0.0,0.0,1.0
5,6.0,0.0,1.0,17480.0,0.0,0.0,0.0,0.0,17480.0,0.0
6,7.0,52567.0,3.0,3017.0,18102.0,2.0,1.0,0.5,0.0,1.0
7,8.0,0.0,2.0,7463.0,0.0,0.0,0.0,0.0,7463.0,0.0
8,9.0,0.0,1.0,2067.0,0.0,0.0,0.23,0.0,2067.0,0.0
9,10.0,42300.0,4.0,2260.0,0.0,0.0,0.0,0.0,2260.0,0.0


In [16]:
# Summary statistics of the imputed numeric frame; every column should now
# report a count equal to the number of rows.
df_fraude_aa_num.describe(include="all")

Unnamed: 0,ID,Income of Policy Holder,Num Claimants,Claim Amount,Total Claimed,Num Claims,Num Soft Tissue,% Soft Tissue,Claim Amount Received,Fraud Flag
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,13739.99,1.91,16373.2,9597.19,0.8,0.23,0.17,13051.94,0.34
std,144.48,20081.54,1.01,29426.28,35655.69,2.67,0.58,0.43,30547.19,0.47
min,1.0,0.0,1.0,-99999.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,125.75,0.0,1.0,3322.25,0.0,0.0,0.0,0.0,0.0,0.0
50%,250.5,0.0,2.0,5663.0,0.0,0.0,0.0,0.0,3253.5,0.0
75%,375.25,33918.5,3.0,12245.5,11282.75,1.0,0.0,0.0,8191.75,1.0
max,500.0,71284.0,4.0,270200.0,729792.0,56.0,5.0,2.0,295303.0,1.0


In [27]:
# So far imputation has been applied only to the numeric features.
# We now do the same for the qualitative variables,
# starting by retrieving a copy of the qualitative columns.
df_cat=df[v_cat]

In [28]:
# Preview the first rows of the categorical subset.
df_cat.head()

Unnamed: 0,Insurance Type,Marital Status,Injury Type,Overnight Hospital Stay
0,CI,,Soft Tissue,No
1,CI,,Back,Yes
2,CI,Married,Broken Limb,No
3,CI,,Serious,Yes
4,CI,,Soft Tissue,No


In [29]:
# Use the mode (the most frequent value) to
# replace the missing values.
# NOTE(review): this rebinds `imp`, so the mean imputer fitted earlier is no
# longer reachable; re-running the numeric transform cell after this one would
# use the wrong imputer — consider a distinct name.
imp=SimpleImputer(strategy='most_frequent')

In [30]:
# Learn the most frequent value of each categorical column.
imp.fit(df_cat)

In [31]:
# Replace the missing categorical values with the learned modes.
X_cat=imp.transform(df_cat)

In [32]:
# Wrap the imputed categorical array back into a DataFrame, restoring both the
# column labels and the row index of df_cat so the original row labels are
# kept instead of being reset to 0..n-1.
df_fraude_aa_cat = pd.DataFrame(X_cat, columns=df_cat.columns, index=df_cat.index)

In [33]:
# Summary of the imputed categorical frame.
# NOTE(review): 'Married' appeared 99 times among the 170 known Marital Status
# values before imputation and 429 times after it, because the 330 missing
# entries were all filled with the mode — check that this is acceptable.
df_fraude_aa_cat.describe(include='all')

Unnamed: 0,Insurance Type,Marital Status,Injury Type,Overnight Hospital Stay
count,500,500,500,500
unique,1,3,4,2
top,CI,Married,Broken Limb,No
freq,500,429,177,354


In [34]:
# Dimensions (rows, columns) of the imputed categorical frame.
df_fraude_aa_cat.shape

(500, 4)

In [35]:
# Build the final DataFrame by placing the numeric frame and the categorical
# frame side by side (column-wise concatenation).
df_fraude_aa = pd.concat(
    objs=[df_fraude_aa_num, df_fraude_aa_cat],
    axis='columns',
)

In [36]:
# Display the combined frame (pandas abbreviates long frames in the rendering).
df_fraude_aa

Unnamed: 0,ID,Income of Policy Holder,Num Claimants,Claim Amount,Total Claimed,Num Claims,Num Soft Tissue,% Soft Tissue,Claim Amount Received,Fraud Flag,Insurance Type,Marital Status,Injury Type,Overnight Hospital Stay
0,1.0,0.0,2.0,1625.0,3250.0,2.0,2.000000,1.0,0.0,1.0,CI,Married,Soft Tissue,No
1,2.0,0.0,2.0,15028.0,60112.0,1.0,0.000000,0.0,15028.0,0.0,CI,Married,Back,Yes
2,3.0,54613.0,1.0,-99999.0,0.0,0.0,0.000000,0.0,572.0,0.0,CI,Married,Broken Limb,No
3,4.0,0.0,3.0,270200.0,0.0,0.0,0.000000,0.0,270200.0,0.0,CI,Married,Serious,Yes
4,5.0,0.0,4.0,8869.0,0.0,0.0,0.000000,0.0,0.0,1.0,CI,Married,Soft Tissue,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,496.0,0.0,1.0,2118.0,0.0,0.0,0.000000,0.0,0.0,1.0,CI,Married,Soft Tissue,No
496,497.0,29280.0,4.0,3199.0,0.0,0.0,0.234694,0.0,0.0,1.0,CI,Married,Broken Limb,Yes
497,498.0,0.0,1.0,32469.0,0.0,0.0,0.000000,0.0,16763.0,0.0,CI,Married,Broken Limb,Yes
498,499.0,46683.0,1.0,179448.0,0.0,0.0,0.000000,0.0,179448.0,0.0,CI,Married,Broken Limb,No


In [36]:
# Dimensions (rows, columns) of the final frame.
df_fraude_aa.shape

(500, 14)