<h4 class="text-center"> Rescale Feature </h4>

In [1]:
#### import lib
import numpy as np
import pandas as pd

<span class="badge"> Min-Max Scaler </span>

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Build one feature as a column vector: four observations, a single column.
column_one = np.array([[value] for value in (500.5, -100.5, 1, 900.2)])

In [4]:
column_one

array([[ 500.5],
       [-100.5],
       [   1. ],
       [ 900.2]])

In [5]:
min_max_scaler = MinMaxScaler(feature_range=(0,1))

In [6]:
min_max_scaler.fit_transform(column_one) #converted all values between 0 and 1

array([[0.60057959],
       [0.        ],
       [0.101429  ],
       [1.        ]])

<span class="badge"> Standardizing </span>

In [7]:
column_one

array([[ 500.5],
       [-100.5],
       [   1. ],
       [ 900.2]])

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
std_scaler = StandardScaler()

In [10]:
std_scaler.fit_transform(column_one) #it will scale the column where mean is 0 and std =1

array([[ 0.43539502],
       [-1.05816895],
       [-0.80592811],
       [ 1.42870204]])

In [11]:
std_value = std_scaler.fit_transform(column_one)
round(np.mean(std_value))


0.0

In [12]:
round(np.std(std_value))

1.0

In [13]:
#StandardScaler is the most commonly used scaler
#Min-max scaling is often recommended for neural networks

<span class="badge"> Robust Scaler </span>

In [14]:
#If the data has many outliers, we should use RobustScaler

In [15]:
from sklearn.preprocessing import RobustScaler

In [16]:
robust_scaler = RobustScaler()

In [17]:
robust_scaler.fit_transform(column_one)

array([[ 0.39972791],
       [-0.5621799 ],
       [-0.39972791],
       [ 1.03945262]])

<h4 class="text-center"> Normalize Observation </h4>

<span class="badge"> l2 Norm </span>

In [18]:
# Rescale each observation (row) so that it has unit norm (length 1)

In [19]:
from sklearn.preprocessing import Normalizer

In [20]:
column_two = np.array([ [4,1],
                        [5,2],
                        [10,4],
                        [0.1,4]])

In [21]:
column_two

array([[ 4. ,  1. ],
       [ 5. ,  2. ],
       [10. ,  4. ],
       [ 0.1,  4. ]])

In [22]:
normalizer = Normalizer(norm='l2')

In [23]:
l2_norm = normalizer.transform(column_two)
l2_norm

array([[0.9701425 , 0.24253563],
       [0.92847669, 0.37139068],
       [0.92847669, 0.37139068],
       [0.02499219, 0.99968765]])

<span class="badge"> l1 norm </span>

In [24]:
l1_norm = Normalizer(norm='l1').transform(column_two)
l1_norm

array([[0.8       , 0.2       ],
       [0.71428571, 0.28571429],
       [0.71428571, 0.28571429],
       [0.02439024, 0.97560976]])

In [25]:
#l1 is the “Manhattan norm” (also called the “Taxicab norm”); l2 is the Euclidean norm
l1_norm[0,0] + l1_norm[0,1]


1.0

In [26]:
# Plain sum of the first row's two components. Under the l2 norm it is the
# sum of the squared components that equals 1, so this sum need not be 1.
first_component, second_component = l2_norm[0]
first_component + second_component

1.212678125181665

<h4 class="text-center"> Polynomial and Interaction Feature </h4>

<span class="badge"> Polynomial </span>

In [27]:
column_three = np.array([
    [3,4],
    [3,4],
    [3,4]
])

In [28]:
column_three

array([[3, 4],
       [3, 4],
       [3, 4]])

In [29]:
from sklearn.preprocessing import PolynomialFeatures

In [30]:
polynomial_scale = PolynomialFeatures(degree=2, include_bias=False)

In [31]:
polynomial_scale.fit_transform(column_three)

array([[ 3.,  4.,  9., 12., 16.],
       [ 3.,  4.,  9., 12., 16.],
       [ 3.,  4.,  9., 12., 16.]])

In [32]:
# degree = 2 means
#x1, x2, x1^2, x1*x2 (interaction), x2^2


<span class="badge"> Interaction </span>

In [33]:
#with only Interaction
polynomial_scale2 = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)

In [34]:
polynomial_scale2.fit_transform(column_three)

array([[ 3.,  4., 12.],
       [ 3.,  4., 12.],
       [ 3.,  4., 12.]])

<h4 class="text-center"> Transform Feature </h4>

<span class="badge"> Custom Transform </span>

In [35]:
from sklearn.preprocessing import FunctionTransformer

In [36]:
column_four = np.array(
[
    [2,3],
    [3,4],
    [5,6]
])

In [37]:
column_four

array([[2, 3],
       [3, 4],
       [5, 6]])

In [38]:
# Helper used by the custom transformers below.
def add_five(x):
    """Return ``x`` shifted up by five.

    Only requires that ``x`` supports ``+ 5``; for the NumPy arrays and pandas
    objects passed in this notebook the addition is applied element-wise.
    """
    shifted = x + 5
    return shifted

In [39]:
#Create Transform

In [40]:
transform_five = FunctionTransformer(add_five)

In [41]:
transform_five.transform(column_four)

array([[ 7,  8],
       [ 8,  9],
       [10, 11]])

<span class="badge"> Transform Using Pandas </span>

In [42]:
data = pd.DataFrame(column_four,columns=['one','two'])

In [43]:
data

Unnamed: 0,one,two
0,2,3
1,3,4
2,5,6


In [44]:
data.apply(add_five)

Unnamed: 0,one,two
0,7,8
1,8,9
2,10,11


<h4 class="text-center"> Detecting Outliers </h4>

In [45]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

<span class="badge"> Creating a Outlier Dataset </span>

In [46]:
blob, a = make_blobs(n_samples=10,n_features=2,centers=1,random_state=42)

In [47]:
blob

array([[-2.743351  ,  8.78014917],
       [-3.4172217 ,  7.60198243],
       [-3.52202874,  9.32853346],
       [-2.26723535,  7.10100588],
       [-2.97261532,  8.54855637],
       [-1.04354885,  8.78850983],
       [-1.86150908, 10.53731598],
       [-2.97867201,  9.55684617],
       [-4.23411546,  8.4519986 ],
       [-0.92998481,  9.78172086]])

In [48]:
a

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [49]:
# Plant an outlier: overwrite every feature of the first sample with an extreme value.
blob[0, :] = 10000

In [50]:
blob

array([[ 1.00000000e+04,  1.00000000e+04],
       [-3.41722170e+00,  7.60198243e+00],
       [-3.52202874e+00,  9.32853346e+00],
       [-2.26723535e+00,  7.10100588e+00],
       [-2.97261532e+00,  8.54855637e+00],
       [-1.04354885e+00,  8.78850983e+00],
       [-1.86150908e+00,  1.05373160e+01],
       [-2.97867201e+00,  9.55684617e+00],
       [-4.23411546e+00,  8.45199860e+00],
       [-9.29984808e-01,  9.78172086e+00]])

<span class="badge"> Outlier Detector </span>

In [51]:
# contamination is the proportion of observations expected to be outliers.
# random_state pins the randomised robust-covariance estimate so that re-running
# the notebook gives the same fit (consistent with make_blobs / KMeans here).
outlier_detector = EllipticEnvelope(contamination=.1, random_state=42)

In [52]:
outlier_detector.fit(blob)

EllipticEnvelope()

In [53]:
outlier_detector.predict(blob)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [54]:
#so it is showing that the first row is a possible outlier
#contamination is the proportion of observations we expect to be outliers
#we can set it to a higher value to flag more observations as possible outliers

<span class="badge"> IQR </span>

In [55]:
#Interquartile Range

In [56]:
blob_2  = blob[:,0]

In [57]:
blob_2

array([ 1.00000000e+04, -3.41722170e+00, -3.52202874e+00, -2.26723535e+00,
       -2.97261532e+00, -1.04354885e+00, -1.86150908e+00, -2.97867201e+00,
       -4.23411546e+00, -9.29984808e-01])

In [58]:
def iqr_detection(x, factor=1.5):
    """Locate outliers with the interquartile-range (IQR) fence rule.

    Parameters
    ----------
    x : array-like
        Numeric values. Lists and tuples are accepted; the input is converted
        to an ndarray before the comparisons.
    factor : float, default 1.5
        Multiple of the IQR placed below Q1 and above Q3 to form the fences.

    Returns
    -------
    tuple of ndarray
        The result of ``np.where``: one index array per dimension of ``x``,
        marking values strictly below ``Q1 - factor * IQR`` or strictly above
        ``Q3 + factor * IQR``.
    """
    # Convert up front so plain Python sequences support element-wise comparison.
    values = np.asarray(x)
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * factor)
    upper_bound = q3 + (iqr * factor)
    return np.where((values > upper_bound) | (values < lower_bound))

In [59]:
iqr_detection(blob_2)

(array([0], dtype=int64),)

In [60]:
# There is an outlier at index 0

In [61]:
#### There is no single best technique for detecting outliers: every method rests on assumptions, so it is more a judgment call than an exact science
#### It is better to try several techniques and compare what they flag

<h4 class="text-center"> Handling Outliers </h4>

In [62]:
#creating a DataFrame with Outliers

In [63]:
house = pd.DataFrame()

In [64]:
house['bathroom'] = [2,3,4,2,3,100]
house['square'] = [1500,2500,3500,1500,2500,3500]
house['price'] = [10000,15000,25000,10000,15000,25000]

In [65]:
house

Unnamed: 0,bathroom,square,price
0,2,1500,10000
1,3,2500,15000
2,4,3500,25000
3,2,1500,10000
4,3,2500,15000
5,100,3500,25000


<span class="badge"> Drop </span>

In [66]:
house[house['bathroom']<10]

Unnamed: 0,bathroom,square,price
0,2,1500,10000
1,3,2500,15000
2,4,3500,25000
3,2,1500,10000
4,3,2500,15000


<span class="badge"> Mark Outliers </span>

In [67]:
house['Outliers'] = np.where(house['bathroom']<20,0,1)

In [68]:
house

Unnamed: 0,bathroom,square,price,Outliers
0,2,1500,10000,0
1,3,2500,15000,0
2,4,3500,25000,0
3,2,1500,10000,0
4,3,2500,15000,0
5,100,3500,25000,1


<span class="badge"> Transform Feature </span>

In [69]:
# Log-transform the feature to compress the extreme value. np.log is applied to
# the whole column at once instead of looping over the values in Python.
# NOTE: np.log gives -inf for 0 and NaN for negative inputs; consider np.log1p
# if the column can contain zeros.
house['transform_feature'] = np.log(house['bathroom'])

In [70]:
house

Unnamed: 0,bathroom,square,price,Outliers,transform_feature
0,2,1500,10000,0,0.693147
1,3,2500,15000,0,1.098612
2,4,3500,25000,0,1.386294
3,2,1500,10000,0,0.693147
4,3,2500,15000,0,1.098612
5,100,3500,25000,1,4.60517


In [71]:
# There is no single correct way to deal with outliers
# Outliers should be researched first (what causes them?)
# and then handled according to what that research shows
# RobustScaler is a good way to scale features that contain outliers

<h4 class="text-center"> Numeric to Discrete </h4>

<span class="badge"> Binarizer </span>

In [72]:
from sklearn.preprocessing import Binarizer

In [82]:
#### Categorize age
age = np.array([ [4],[5],[10],[20],[30],[35],[40]])

In [83]:
age

array([[ 4],
       [ 5],
       [10],
       [20],
       [30],
       [35],
       [40]])

In [84]:
age.shape

(7, 1)

In [85]:
binarizer_age = Binarizer(threshold=20) 

In [87]:
binarizer_age.fit_transform(age) # 0 when age <= 20 (the threshold itself maps to 0), 1 when age > 20

array([[0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1]])

<span class="badge"> Multiple Threshold </span>

In [88]:
np.digitize(age,bins=[20,30,35]) 

array([[0],
       [0],
       [0],
       [1],
       [2],
       [3],
       [3]], dtype=int64)

In [89]:
np.digitize(age,bins=[20,30,35],right=True) 

array([[0],
       [0],
       [0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [91]:
np.digitize(age,bins=[20])

array([[0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1]], dtype=int64)

<h4 class="text-center"> Cluster Data Scaling </h4>

<span class="badge"> Cluster </span>

In [94]:
# Reuse the blob created earlier with sklearn's make_blobs (its first row was replaced by an outlier)
blob

array([[ 1.00000000e+04,  1.00000000e+04],
       [-3.41722170e+00,  7.60198243e+00],
       [-3.52202874e+00,  9.32853346e+00],
       [-2.26723535e+00,  7.10100588e+00],
       [-2.97261532e+00,  8.54855637e+00],
       [-1.04354885e+00,  8.78850983e+00],
       [-1.86150908e+00,  1.05373160e+01],
       [-2.97867201e+00,  9.55684617e+00],
       [-4.23411546e+00,  8.45199860e+00],
       [-9.29984808e-01,  9.78172086e+00]])

In [95]:
blob_df = pd.DataFrame(blob,columns=['cluster_1','cluster_2'])

In [96]:
blob_df

Unnamed: 0,cluster_1,cluster_2
0,10000.0,10000.0
1,-3.417222,7.601982
2,-3.522029,9.328533
3,-2.267235,7.101006
4,-2.972615,8.548556
5,-1.043549,8.78851
6,-1.861509,10.537316
7,-2.978672,9.556846
8,-4.234115,8.451999
9,-0.929985,9.781721


<span class="badge"> Kmeans </span>

In [97]:
from sklearn.cluster import KMeans

In [98]:
cluster = KMeans(n_clusters=3,random_state=42)

In [99]:
cluster.fit(blob)

KMeans(n_clusters=3, random_state=42)

In [102]:
# Predict once and store the cluster label of every row.
# NOTE(review): the frame displayed below and the later missing-value cells use
# a 'Result' column that no visible cell creates (presumably it came from a
# since-removed cell). It is recreated here, with the same labels the displayed
# output shows, so the notebook survives Restart & Run All.
cluster_labels = cluster.predict(blob)
blob_df['Result'] = cluster_labels
blob_df['Scale_data'] = cluster_labels

In [103]:
blob_df

Unnamed: 0,cluster_1,cluster_2,Result,Scale_data
0,10000.0,10000.0,1,1
1,-3.417222,7.601982,2,2
2,-3.522029,9.328533,2,2
3,-2.267235,7.101006,2,2
4,-2.972615,8.548556,2,2
5,-1.043549,8.78851,0,0
6,-1.861509,10.537316,0,0
7,-2.978672,9.556846,2,2
8,-4.234115,8.451999,2,2
9,-0.929985,9.781721,0,0


<h4 class="text-center"> Delete Missing Values </h4>

In [104]:
# First five rows, label columns only. Take an independent copy so that editing
# data_nan later neither writes through to blob_df nor triggers pandas'
# SettingWithCopyWarning.
data_nan = blob_df.iloc[:5, 2:].copy()

In [108]:
# Insert a missing value with a single .loc call. Chained indexing
# (frame[col][row] = ...) may assign to a temporary object and does nothing
# under pandas copy-on-write.
data_nan.loc[2, 'Result'] = np.nan

In [109]:
data_nan

Unnamed: 0,Result,Scale_data
0,1.0,1
1,2.0,2
2,,2
3,2.0,2
4,2.0,2


In [113]:
# NumPy approach: flag every row holding at least one NaN, then keep the rest.
rows_with_nan = np.isnan(data_nan).any(axis=1)
data_nan[~rows_with_nan]

Unnamed: 0,Result,Scale_data
0,1.0,1
1,2.0,2
3,2.0,2
4,2.0,2


<span class="badge"> Pandas Method </span>

In [114]:
data_nan

Unnamed: 0,Result,Scale_data
0,1.0,1
1,2.0,2
2,,2
3,2.0,2
4,2.0,2


In [115]:
data_nan.dropna()

Unnamed: 0,Result,Scale_data
0,1.0,1
1,2.0,2
3,2.0,2
4,2.0,2


<h4 class="text-center"> Impute Missing Values </h4>

In [118]:
from sklearn.impute import SimpleImputer

In [120]:
#mean imputer
mean_imputer = SimpleImputer(strategy='mean')

In [121]:
feature_impute = mean_imputer.fit_transform(data_nan)

In [122]:
feature_impute

array([[1.  , 1.  ],
       [2.  , 2.  ],
       [1.75, 2.  ],
       [2.  , 2.  ],
       [2.  , 2.  ]])

<span class="badge"> Sklearn Dataset </span>

In [124]:
dataset = pd.DataFrame(blob,columns=['one','two'])

In [125]:
dataset

Unnamed: 0,one,two
0,10000.0,10000.0
1,-3.417222,7.601982
2,-3.522029,9.328533
3,-2.267235,7.101006
4,-2.972615,8.548556
5,-1.043549,8.78851
6,-1.861509,10.537316
7,-2.978672,9.556846
8,-4.234115,8.451999
9,-0.929985,9.781721


In [126]:
scaler = StandardScaler()

In [127]:
standardize = scaler.fit_transform(dataset)

In [128]:
standardize

array([[ 2.99999983,  2.99999984],
       [-0.33361206, -0.33375141],
       [-0.33364699, -0.33317539],
       [-0.33322883, -0.33391855],
       [-0.3334639 , -0.33343561],
       [-0.33282104, -0.33335555],
       [-0.33309362, -0.3327721 ],
       [-0.33346591, -0.33309921],
       [-0.33388429, -0.33346782],
       [-0.33278319, -0.33302419]])

In [129]:
true_value = standardize[0,0]

In [131]:
standardize_ = standardize.copy()

In [132]:
standardize_[0,0] = np.nan

In [133]:
standardize_

array([[        nan,  2.99999984],
       [-0.33361206, -0.33375141],
       [-0.33364699, -0.33317539],
       [-0.33322883, -0.33391855],
       [-0.3334639 , -0.33343561],
       [-0.33282104, -0.33335555],
       [-0.33309362, -0.3327721 ],
       [-0.33346591, -0.33309921],
       [-0.33388429, -0.33346782],
       [-0.33278319, -0.33302419]])

In [134]:
mean_imputer.fit_transform(standardize_)

array([[-0.33333331,  2.99999984],
       [-0.33361206, -0.33375141],
       [-0.33364699, -0.33317539],
       [-0.33322883, -0.33391855],
       [-0.3334639 , -0.33343561],
       [-0.33282104, -0.33335555],
       [-0.33309362, -0.3327721 ],
       [-0.33346591, -0.33309921],
       [-0.33388429, -0.33346782],
       [-0.33278319, -0.33302419]])

In [136]:
mean_imputer.fit_transform(standardize_)[0,0], true_value

(-0.33333331420465356, 2.9999998278418816)