In [1]:
import numpy as np 
import pandas as pd

from sklearn.datasets import make_classification
# Synthetic classification dataset: 1000 samples, 20 features.
# A fixed random_state makes X and y identical on every Restart & Run All.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

In [2]:
from sklearn.datasets import make_classification
# Synthetic classification dataset: 1000 samples, 20 features.
# A fixed random_state means re-running this cell always yields the same X and y.
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X

array([[-0.75285871, -1.43248509, -0.11892659, ...,  0.39191953,
         0.40009368, -0.99459449],
       [ 0.58569488,  0.3447027 ,  0.09085676, ...,  0.85204653,
        -0.77756038,  1.10456842],
       [ 1.69241881,  1.04460372, -0.21495329, ..., -0.10469111,
         0.77719899, -0.90804375],
       ...,
       [ 1.57283237,  0.82537875, -0.50845332, ...,  1.09857313,
        -0.74249934,  0.38561316],
       [-0.19286062, -0.5864671 , -1.88490717, ...,  1.08221682,
        -0.34381541, -0.47112687],
       [-0.67634875, -1.53375868,  0.00918866, ..., -0.84313717,
        -0.01138071,  0.05962186]])

In [4]:
# Class labels (0/1) generated together with X by make_classification
y

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,

In [2]:
# Small demo matrix: each row is a sample, each column a feature.
x = np.array(
    [
        [0.1, 1.0, 22.8],
        [0.5, 5.0, 41.2],
        [1.2, 12.0, 2.8],
        [0.8, 8.0, 14.0],
    ]
)
x

array([[ 0.1,  1. , 22.8],
       [ 0.5,  5. , 41.2],
       [ 1.2, 12. ,  2.8],
       [ 0.8,  8. , 14. ]])

## Step: Data Preprocessing
- You often need to transform data so that the __mean of each column__ (__feature__) is __zero__ and the __standard deviation__ is __one__.
- You can use the `sklearn.preprocessing.StandardScaler` class to do this.

### Data Preprocessing 3: Missing-Value Imputation with `SimpleImputer`

In [4]:
# Single-column frame with one missing Age (row 3), used to demonstrate imputation.
age_sample_df = pd.DataFrame([20, 30, 10, np.nan, 10], columns=['Age'])
age_sample_df

Unnamed: 0,Age
0,20.0
1,30.0
2,10.0
3,
4,10.0


In [5]:
from sklearn.impute import SimpleImputer

# Mean imputation: the missing Age is replaced by the mean of the observed values.
# strategy='mean' is SimpleImputer's default, spelled out here for clarity.
imputer = SimpleImputer(strategy='mean')
imputer.fit_transform(age_sample_df)

array([[20. ],
       [30. ],
       [10. ],
       [17.5],
       [10. ]])

**Remark**
- In order to encode the missingness of values as a feature, we can set the `add_indicator` argument to `True` and observe the output.

In [6]:
# Mean-impute the missing value and append a 0/1 indicator column that flags
# which rows were originally missing (add_indicator needs scikit-learn >= 0.21).
imputer = SimpleImputer(strategy='mean', add_indicator=True)
imputer.fit_transform(age_sample_df)

array([[20. ,  0. ],
       [30. ,  0. ],
       [10. ,  0. ],
       [17.5,  1. ],
       [10. ,  0. ]])

**Remark**
- In the output, the indicator value 1 appears at index 3, where the original value was missing. The `add_indicator` option is available in scikit-learn 0.21 and above. In the next section, we will see how to use `HistGradientBoostingClassifier`, which handles missing values natively.

### Data Preprocessing 1. `LabelEncoder`

In [2]:
from sklearn import preprocessing
# Three weather categories, six observations each.
weather = ['Clear'] * 6 + ['Rainy'] * 6 + ['Snowy'] * 6
labelEncoder = preprocessing.LabelEncoder()
# Each distinct label is mapped to an integer code (here Clear -> 0, Rainy -> 1, Snowy -> 2).
print(labelEncoder.fit_transform(weather))

[0 0 0 0 0 0 1 1 1 1 1 1 2 2 2 2 2 2]


In [3]:
from sklearn.preprocessing import StandardScaler
# fit() learns each column's mean and standard deviation from x and returns the scaler;
# transform() then standardizes x with those statistics.
scaler = StandardScaler().fit(x)
scaled_x = scaler.transform(x)
# Per-feature standard deviation used to scale each column
scaler.scale_

array([ 0.40311289,  4.03112887, 14.04421589])

In [4]:
# Per-feature means learned from x during fitting
scaler.mean_

array([ 0.65,  6.5 , 20.2 ])

In [5]:
# Per-feature variances learned from x during fitting (the square of scale_)
scaler.var_

array([1.6250e-01, 1.6250e+01, 1.9724e+02])

In [6]:
# Standardized version of x produced by the scaler
scaled_x

array([[-1.36438208, -1.36438208,  0.18512959],
       [-0.3721042 , -0.3721042 ,  1.4952775 ],
       [ 1.36438208,  1.36438208, -1.23894421],
       [ 0.3721042 ,  0.3721042 , -0.44146288]])

In [7]:
# Overall mean of the standardized data, rounded to hide floating-point noise
np.round(scaled_x.mean(), decimals=4)

np.float64(0.0)

In [8]:
# Column-wise means: zero up to floating-point error
np.mean(scaled_x, axis=0)

array([ 1.66533454e-16, -1.38777878e-17,  1.52655666e-16])

In [9]:
# Column-wise standard deviations: one for every feature after scaling
np.std(scaled_x, axis=0)

array([1., 1., 1.])

In [10]:
# Undo the scaling: maps the standardized values back to the original values of x
scaler.inverse_transform(scaled_x)

array([[ 0.1,  1. , 22.8],
       [ 0.5,  5. , 41.2],
       [ 1.2, 12. ,  2.8],
       [ 0.8,  8. , 14. ]])

## Remarks
- Sometimes you'll have categorical data that needs to be converted into meaningful numbers. One way to do that is with the `sklearn.preprocessing.OneHotEncoder` class. Consider the following example with an array of roles in a company:

In [11]:
from sklearn.preprocessing import OneHotEncoder
# One (name, role) record per employee; NumPy stores them as a 2-D array of strings.
roles = np.array([
    ('Tom', 'manager'),
    ('Mary', 'developer'),
    ('Ann', 'recruiter'),
    ('Jim', 'developer'),
])
roles

array([['Tom', 'manager'],
       ['Mary', 'developer'],
       ['Ann', 'recruiter'],
       ['Jim', 'developer']], dtype='<U9')

In [14]:
# One-hot encode only the role column; the 1:2 slice keeps it two-dimensional
# (shape (4, 1)), which is the layout the encoder is given here.
encoder = OneHotEncoder()
encoded_roles = encoder.fit_transform(roles[:, 1:2])
# The encoder output supports .toarray(), which yields a dense array for display.
encoded_roles.toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

Remarks:
In the example above, the columns of `encoded_roles` follow the sorted order of the categories (developer, manager, recruiter). The first column indicates whether each employee is a developer: the second and fourth employees (Mary and Jim) are. The second column corresponds to the manager position; only the first employee (Tom) holds it. Finally, the third column corresponds to the recruiter position, which is held by the third employee (Ann).

## Step: Dimensionality Reduction
Dimensionality reduction involves the selection or extraction of the most important components (features) of a multidimensional dataset. Scikit-learn offers several approaches to dimensionality reduction. One of them is principal component analysis (PCA).