<a href="https://colab.research.google.com/github/lmcanavals/data_mining/blob/main/notebooks/aa_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn import preprocessing
import numpy as np

## Standardization

Llevar los datos de cada columna a una escala común: media 0 y desviación estándar 1.

In [None]:
# Toy training matrix: three samples (rows) by three features (columns).
X_train = np.array([
    [1, -1, 2],
    [2, 0, 0],
    [0, 1, -1],
])

# One-shot standardization of every column.
X_scaled = preprocessing.scale(X_train)

# Show the scaled data, then confirm each column now has mean 0 and
# standard deviation 1 (axis=0 reduces over the rows).
print(X_scaled, X_scaled.mean(axis=0), X_scaled.std(axis=0), sep="\n")

[[ 0.         -1.22474487  1.33630621]
 [ 1.22474487  0.         -0.26726124]
 [-1.22474487  1.22474487 -1.06904497]]
[0. 0. 0.]
[1. 1. 1.]


Cuando queremos aplicar el mismo escalado a otros datos (por ejemplo, al conjunto de prueba), es mejor crear un scaler ajustado con los datos de entrenamiento.

In [None]:
# Build the scaler first, then learn the per-column statistics from X_train
# so the very same transformation can be replayed on other data later.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# Estimator repr, followed by the learned column means and column scales.
print(scaler, scaler.mean_, scaler.scale_, sep="\n")

# Last expression of the cell: the standardized training matrix.
scaler.transform(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)
[1.         0.         0.33333333]
[0.81649658 0.81649658 1.24721913]


array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [None]:
# A single new sample with three values, matching the three columns of X_train.
X_test = [[-1, 1, 0]]
# Apply the already-fitted scaler; it is not re-fitted on this sample.
scaler.transform(X_test)

array([[-2.44948974,  1.22474487, -0.26726124]])

## Normalization

In [None]:
# Rescale every sample (row) of X_train to unit L2 norm.
X_normalized = preprocessing.normalize(
    X_train,
    norm="l2",
)
print(X_normalized)

[[ 0.40824829 -0.40824829  0.81649658]
 [ 1.          0.          0.        ]
 [ 0.          0.70710678 -0.70710678]]


In [None]:
# Estimator form of the normalization, so the same object can be reused
# on other data through .transform().
normalizer = preprocessing.Normalizer()
normalizer.fit(X_train)
print(normalizer)

# Last expression of the cell: the normalized training matrix.
normalizer.transform(X_train)

Normalizer(copy=True, norm='l2')


array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [None]:
# Rescale the test sample to unit L2 norm, the same row-wise operation applied to X_train.
normalizer.transform(X_test)

array([[-0.70710678,  0.70710678,  0.        ]])

## Missing values

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Mean imputation: NaN marks a missing entry, and each column's fill value is
# the mean of the observed entries seen during fit.
imp = SimpleImputer(strategy="mean", missing_values=np.nan)

# Training data: column 0 has observed values 1 and 7, column 1 has 2, 3 and 6.
imp.fit([
    [1, 2],
    [np.nan, 3],
    [7, 6],
])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [None]:
# New samples with gaps in either column, including one fully-missing row.
X = [
    [np.nan, 2],
    [6, np.nan],
    [7, 6],
    [np.nan, 4],
    [1, 1],
    [np.nan, np.nan],
]

# Fill each NaN using the imputer fitted earlier; observed values pass through.
imp.transform(X)

array([[4.        , 2.        ],
       [6.        , 3.66666667],
       [7.        , 6.        ],
       [4.        , 4.        ],
       [1.        , 1.        ],
       [4.        , 3.66666667]])

In [None]:
import scipy.sparse as sp

# NOTE(review): this cell rebinds X, imp and X_test, which earlier cells also
# define; re-running those earlier cells afterwards will see these values.

# Sparse (CSC) training matrix. Here -1, not NaN, is the missing-value marker.
X = sp.csc_matrix([[1, 2], [0, -1], [7, 6]])

# Fit a mean imputer that treats -1 as missing.
imp = SimpleImputer(missing_values=-1, strategy="mean").fit(X)

# Sparse test matrix with one -1 in each column.
X_test = sp.csc_matrix([[-1, 2], [6, -1], [7, 6]])

# Convert the imputed result to a dense array for display.
imp.transform(X_test).toarray()

array([[2.66666667, 2.        ],
       [6.        , 4.        ],
       [7.        , 6.        ]])

## Rescaling

In [None]:
import pandas as pd
import scipy
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Location of the Pima Indians diabetes CSV (raw GitHub gist).
# NOTE(review): the path contains a revision hash, which presumably pins the
# file contents -- confirm before relying on it.
url = (
    "https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f"
    "/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv"
)

In [None]:
# Column labels passed to read_csv; the final one, 'class', is split off
# below as Y.
names = [
    "preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class",
]

# Lines starting with '#' in the file are treated as comments and skipped.
dataframe = pd.read_csv(url, comment="#", names=names)
array = dataframe.values

# Every column but the last goes to X; the last column goes to Y.
X, Y = array[:, :-1], array[:, -1]

# Global NumPy display setting: arrays print with 3 decimals from here on.
np.set_printoptions(precision=3)

# NOTE(review): `scaler` is rebound here, replacing the StandardScaler that an
# earlier cell stored under the same name.
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)

# Preview the first five rescaled rows.
X_rescaled[:5]

array([[0.353, 0.744, 0.59 , 0.354, 0.   , 0.501, 0.234, 0.483],
       [0.059, 0.427, 0.541, 0.293, 0.   , 0.396, 0.117, 0.167],
       [0.471, 0.92 , 0.525, 0.   , 0.   , 0.347, 0.254, 0.183],
       [0.059, 0.447, 0.541, 0.232, 0.111, 0.419, 0.038, 0.   ],
       [0.   , 0.688, 0.328, 0.354, 0.199, 0.642, 0.944, 0.2  ]])

## Binarization

In [None]:
from sklearn.preprocessing import Binarizer

In [None]:
# Threshold the rescaled features at 0.5: values above the threshold
# become 1 and the rest become 0.
binarizer = Binarizer(threshold=0.5)
X_binarized = binarizer.fit_transform(X_rescaled)

# Preview the first five binarized rows.
X_binarized[:5]

array([[0., 1., 1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 1., 0.]])

## Standardization (Pima diabetes data)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# NOTE(review): `scaler` is rebound once more, now to a StandardScaler fitted
# on the diabetes feature matrix X.
scaler = StandardScaler()

# Fit on X and standardize it in a single step.
rescaledX = scaler.fit_transform(X)

# Preview the first five standardized rows.
rescaledX[:5]

array([[ 0.64 ,  0.848,  0.15 ,  0.907, -0.693,  0.204,  0.468,  1.426],
       [-0.845, -1.123, -0.161,  0.531, -0.693, -0.684, -0.365, -0.191],
       [ 1.234,  1.944, -0.264, -1.288, -0.693, -1.103,  0.604, -0.106],
       [-0.845, -0.998, -0.161,  0.155,  0.123, -0.494, -0.921, -1.042],
       [-1.142,  0.504, -1.505,  0.907,  0.766,  1.41 ,  5.485, -0.02 ]])