In [1]:
import numpy as np
from sklearn import preprocessing
from sklearn import datasets 

# Build a small synthetic, noise-free regression problem: 100 observations,
# three features (all of them informative) and a single target column.
# random_state is fixed so the generated numbers are the same on every run.
features, targets = datasets.make_regression(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_targets=1,
    noise=0.0,
    coef=False,
    random_state=1,
)

# Preview the first four observations.
print('Features\n', features[:4])

Features
 [[ 1.29322588 -0.61736206 -0.11044703]
 [-2.793085    0.36633201  1.93752881]
 [ 0.80186103 -0.18656977  0.0465673 ]
 [ 0.12910158  0.50274088  1.6169496 ]]


In [2]:
# Min-max scaling: learn each feature column's minimum and maximum, then map
# the column linearly onto the requested range (here 0 to 1).
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scaler.fit(features)
scaled_feature = minmax_scaler.transform(features)
scaled_feature[:4]

array([[0.77946533, 0.37533626, 0.46832847],
       [0.        , 0.59456271, 0.88096364],
       [0.6857373 , 0.4713428 , 0.4999644 ],
       [0.55740819, 0.62496284, 0.81637193]])

In [3]:
# Standardizing a feature: rescale every feature column to zero mean and
# unit variance.
standard_scaler = preprocessing.StandardScaler()
standarized_feature = standard_scaler.fit(features).transform(features)
standarized_feature[:4]

array([[ 1.2770548 , -0.81821179, -0.21248228],
       [-2.81488465,  0.27548689,  1.99459418],
       [ 0.78501313, -0.33924483, -0.04327002],
       [ 0.11132701,  0.4271501 ,  1.6491102 ]])

In [4]:
# When the dataset contains outliers, prefer RobustScaler: it centers on the
# median and scales by the interquartile range, so extreme values have far
# less influence than they do on a mean/variance based scaler.
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit(features)
robustscaled_feature = robust_scaler.transform(features)
robustscaled_feature[:4]

array([[ 0.92989139, -0.59759549, -0.19905129],
       [-2.02848916,  0.23108285,  1.56673201],
       [ 0.57415628, -0.23468974, -0.06367212],
       [ 0.08709631,  0.34599568,  1.29032571]])

In [5]:
# Rescale the feature values of each observation so the observation has unit
# (L2) norm.
#
# Unlike StandardScaler and MinMaxScaler, which work column by column, this
# rescaling is applied to each individual observation (row). It is often used
# when there are many equivalent features, e.g. text classification where
# every word or n-word group is a feature.
normalize_scaler = preprocessing.Normalizer(norm="l2")
normalized_features = normalize_scaler.fit(features).transform(features)
normalized_features[:4]

array([[ 0.89977444, -0.42953564, -0.07684459],
       [-0.81693109,  0.10714605,  0.56669508],
       [ 0.97242943, -0.22625608,  0.05647289],
       [ 0.07602183,  0.29604039,  0.95214535]])

In [6]:
# Create polynomial and interaction features.
#
# Polynomial features are often added when we want the model to capture a
# nonlinear relationship between the features and the target.
# degree=2 adds squares and pairwise products; include_bias=False leaves out
# the constant column of ones.
polynomial_scaler = preprocessing.PolynomialFeatures(
    degree=2,
    include_bias=False,
)
polynomial_scaler.fit(features)
polynomial_features = polynomial_scaler.transform(features)
polynomial_features[:4]

array([[ 1.29322588e+00, -6.17362064e-01, -1.10447026e-01,
         1.67243318e+00, -7.98388600e-01, -1.42832953e-01,
         3.81135918e-01,  6.81858042e-02,  1.21985456e-02],
       [-2.79308500e+00,  3.66332015e-01,  1.93752881e+00,
         7.80132382e+00, -1.02319645e+00, -5.41168267e+00,
         1.34199145e-01,  7.09778834e-01,  3.75401790e+00],
       [ 8.01861032e-01, -1.86569772e-01,  4.65672984e-02,
         6.42981114e-01, -1.49603030e-01,  3.73405020e-02,
         3.48082798e-02, -8.68805025e-03,  2.16851328e-03],
       [ 1.29101580e-01,  5.02740882e-01,  1.61694960e+00,
         1.66672179e-02,  6.49046421e-02,  2.08750748e-01,
         2.52748394e-01,  8.12906667e-01,  2.61452601e+00]])

# Detecting Outliers


In [7]:
from sklearn.covariance import EllipticEnvelope

# Plant one extreme observation in a *copy* of the data. Writing into
# `features` itself would silently change the result of every earlier
# scaling cell if it were re-run after this one.
features_with_outlier = features.copy()
features_with_outlier[0, :2] = 100000  # first two columns of the first row

# `contamination` is the proportion of observations we expect to be outliers.
# The covariance fit uses random sub-sampling, so the seed is pinned to make
# the predicted labels reproducible from run to run.
outlier_detector = EllipticEnvelope(contamination=0.1, random_state=0)
outlier_detector.fit(features_with_outlier)
outlier_detector.predict(features_with_outlier)  # -1 = outlier, 1 = inlier


# A limitation of this approach is that `contamination` has to be supplied up
# front, which is hard to estimate unless we know the dataset is mostly clean.
# Another way of detecting outliers is through the interquartile range (IQR).
# See http://bit.ly/2FzMC2k for a brief overview of how to detect outliers.

array([-1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1])

# Handling Outliers 

In [8]:
# Strategy 1: Drop outliers

import pandas as pd

# Toy housing data; the last row is an obviously implausible listing.
houses = pd.DataFrame({
    'Price': [5343333, 3239333, 292333, 4322032],
    'Bathrooms': [2, 3, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

# Show only the rows with a believable bathroom count; `houses` itself is
# left untouched.
houses.loc[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,5343333,2,1500
1,3239333,3,2500
2,292333,2,1500


In [9]:
# Strategy 2: Keep every row, but flag the outliers in a new 0/1 column
# (1 = outlier, 0 = regular observation).
has_plausible_bathrooms = houses["Bathrooms"].lt(20)
houses['Outliers'] = np.where(has_plausible_bathrooms, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers
0,5343333,2,1500,0
1,3239333,3,2500,0
2,292333,2,1500,0
3,4322032,116,48000,1


In [10]:
# Strategy 3: Dampen the effect of the outlier by transforming the feature.
# np.log is a ufunc, so it is applied to the whole column in one vectorized
# call instead of looping over the values in Python.
houses['Log Square Feet'] = np.log(houses['Square_Feet'])
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers,Log Square Feet
0,5343333,2,1500,0,7.31322
1,3239333,3,2500,0,7.824046
2,292333,2,1500,0,7.31322
3,4322032,116,48000,1,10.778956


# Discretizing Features

In [11]:
# You have a numerical feature and want to break it up into discrete bins.

# Depending on how we want to break up the data, there are two techniques.

# Technique 1: Split on a single threshold
age = np.array([[6],
                [12],
                [25],
                [42],
                [78]])

# `threshold` must be passed by keyword: it is a keyword-only parameter in
# current scikit-learn releases, so `Binarizer(18)` raises a TypeError.
# Values greater than the threshold become 1, all others become 0.
binarizer = preprocessing.Binarizer(threshold=18)
binarizer.fit_transform(age)


array([[0],
       [0],
       [1],
       [1],
       [1]])

In [12]:
# Technique 2: Split on several thresholds at once.
# Each age is replaced by the index of the bin it falls into:
# 0 -> below 20, 1 -> 20 to 29, 2 -> 30 to 63, 3 -> 64 and above.
age_bin_edges = [20, 30, 64]
np.digitize(age, bins=age_bin_edges)

array([[0],
       [0],
       [1],
       [2],
       [3]])

# Grouping Observations Using Clustering

In [13]:
# Goal: cluster the observations so that similar ones end up in the same group.

# When the number of groups k is known in advance, k-means clustering can be used.

from sklearn.cluster import KMeans

# 50 two-dimensional points drawn around three centers; the true blob labels
# are discarded because the clustering below is unsupervised.
features, _ = datasets.make_blobs(
    n_samples=50,
    n_features=2,
    centers=3,
    random_state=1,
)

df = pd.DataFrame(features, columns=["Feature-1", "Feature-2"])

# Fit k-means with k=3 (seeded for reproducibility), then record the cluster
# assigned to each observation as a new column.
clusterer = KMeans(n_clusters=3, random_state=0)
clusterer.fit(features)
group_labels = clusterer.predict(features)
df['Group'] = group_labels
df.head(10)

Unnamed: 0,Feature-1,Feature-2,Group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0
5,-0.794152,2.104951,1
6,-2.760179,5.551214,1
7,-9.946905,-4.590344,2
8,-0.52579,3.306599,1
9,-1.981977,4.022436,1


# Imputing Missing Values

In [18]:
# You have missing values in your data and want to fill in or predict them.

from fancyimpute import KNN
from sklearn.impute import SimpleImputer

features, _ = datasets.make_blobs(
    n_samples=1000,
    n_features=2,
    random_state=1,
)

# Standardize first so both features are on the same scale.
scaler = preprocessing.StandardScaler()
standarized_features = scaler.fit_transform(features)

# Remember the first value of the first feature, then replace it with NaN so
# the imputed value can be compared against the truth.
true_value = standarized_features[0, 0]
standarized_features[0, 0] = np.nan

# With a small amount of data, predict the missing value from its
# k nearest neighbours (k = 5).
knn_imputer = KNN(k=5, verbose=0)
features_knn_imputed = knn_imputer.fit_transform(standarized_features)
print('True value: ', true_value)
print('Imputed value: ', features_knn_imputed[0, 0])

# For a large dataset, use scikit-learn's SimpleImputer to fill in the missing
# value with the feature's mean, median or mode (here: the mean).
mean_imputer = SimpleImputer(strategy='mean')
features_mean_imputed = mean_imputer.fit_transform(standarized_features)
print('True value: ', true_value)
print('Imputed value: ', features_mean_imputed[0, 0])


True value:  0.8730186113995938
Imputed value:  1.0955332713113226
True value:  0.8730186113995938
Imputed value:  -0.000873892503901796
