# Nettoyage des données

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import SGDClassifier

In [2]:
from sklearn.impute import SimpleImputer

In [4]:
# Toy feature matrix (4 samples x 2 features); the last sample is missing its
# first feature.
X = np.array([
    [10, 3],
    [0, 4],
    [5, 3],
    [np.nan, 3],
])
X

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [nan,  3.]])

In [7]:
# Mean imputation: each NaN is replaced by the mean of its column, learned
# from the data the imputer is fitted on.
# Several imputers may be needed to clean a dataframe (e.g. one more to deal
# with negative values).
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit(X).transform(X)

array([[10.,  3.],
       [ 0.,  4.],
       [ 5.,  3.],
       [ 5.,  3.]])

In [8]:
# Unseen data: the already-fitted imputer fills the NaNs with the column means
# learned from the training matrix, not with statistics of X_test itself.
X_test = np.array([
    [12, 5],
    [40, 2],
    [2, 1],
    [np.nan, np.nan],
])
imputer.transform(X_test)

array([[12.  ,  5.  ],
       [40.  ,  2.  ],
       [ 2.  ,  1.  ],
       [ 5.  ,  3.25]])

In [14]:
from sklearn.impute import KNNImputer
#remplace chaque valeur manquante par la moyenne des valeurs de ses k plus proches voisins

In [12]:
# Toy matrix for the KNN imputer: the last row is missing its first feature.
X = np.array([
    [1, 100],
    [2, 30],
    [3, 15],
    [np.nan, 20],
])

In [13]:
# With a single neighbour, a missing entry simply takes the value found in the
# closest row (closeness is measured on the features that are present).
imputer = KNNImputer(n_neighbors=1)
imputer.fit(X).transform(X)

array([[  1., 100.],
       [  2.,  30.],
       [  3.,  15.],
       [  3.,  20.]])

In [19]:
from sklearn.impute import MissingIndicator
#indique où il manque des données
from sklearn.pipeline import make_union

In [20]:
# Same toy matrix, but the last row is now missing both of its features.
X = np.array([
    [1, 100],
    [2, 30],
    [3, 15],
    [np.nan, np.nan],
])

In [21]:
# Boolean mask flagging which entries of X are missing.
indicator = MissingIndicator()
indicator.fit_transform(X)

array([[False, False],
       [False, False],
       [False, False],
       [ True,  True]])

In [22]:
# Apply both transformers to X and stack their outputs column-wise:
# the features with NaN replaced by the sentinel -99, followed by the
# missing-value flags, so the "was missing" information is not lost.
constant_filler = SimpleImputer(strategy='constant', fill_value=-99)
missing_flags = MissingIndicator()
pipeline = make_union(constant_filler, missing_flags)
pipeline.fit_transform(X)

array([[  1., 100.,   0.,   0.],
       [  2.,  30.,   0.,   0.],
       [  3.,  15.,   0.,   0.],
       [-99., -99.,   1.,   1.]])

Il arrive que l'absence d'une information soit elle-même une information. <br>
Exemple : dans le dataframe titanic, l'absence de données pour la classe et le prix du billet peut indiquer qu'il s'agissait d'un membre d'équipage.

## Application

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
import seaborn as sns

In [28]:
# Load the Titanic passenger dataset shipped with seaborn and keep two
# features to predict survival. `age` has missing entries in this dataset,
# which is what the imputation step of the model has to deal with.
titanic = sns.load_dataset('titanic')
X = titanic[['pclass', 'age']]
y = titanic['survived']

# Hold out 20% of the rows for testing. The seed is fixed so that the split --
# and therefore every score and the selected hyper-parameter -- is the same
# on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [29]:
# Pipeline: fill the missing feature values with a KNN imputer, then fit a
# linear classifier trained by stochastic gradient descent.
# make_pipeline names each step after its lowercased class name
# ('knnimputer', 'sgdclassifier'); the grid-search keys rely on these names.
# SGDClassifier shuffles the training data by default, so its seed is pinned
# to make the fitted model and the cross-validation scores reproducible.
model = make_pipeline(KNNImputer(), SGDClassifier(random_state=0))

In [30]:
# Search space for the grid search: candidate neighbour counts for the
# imputation step. Keys follow the '<step name>__<parameter>' convention.
params = {'knnimputer__n_neighbors': list(range(1, 5))}

In [31]:
# Exhaustive search over `params`, scoring each candidate with 5-fold
# cross-validation.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=5,
)

In [32]:
# Run the search on the training split only; the test split stays untouched.
grid.fit(X_train, y=y_train)

In [33]:
# Display the parameter combination that the fitted grid search selected.
grid.best_params_

{'knnimputer__n_neighbors': 3}