# Feature Transformation and Outlier Handling

In [2]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

In [3]:
# Small 3x3 integer matrix used to demonstrate FunctionTransformer.
features = np.array(
    [
        [4, 5, 7],
        [5, 8, 9],
        [11, 45, 9],
    ]
)

In [4]:
# Echo the 3x3 toy matrix so the values before the transform are visible.
features

array([[ 4,  5,  7],
       [ 5,  8,  9],
       [11, 45,  9]])

In [5]:
def add_x(feature_matrix, x=20):
    """Return ``feature_matrix`` with the constant ``x`` added to every element.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Matrix (or any object supporting ``+`` with a scalar) to shift.
    x : int or float, default 20
        Amount added to each element. The default keeps the original
        behaviour of adding 20, so ``FunctionTransformer(add_x)`` is unchanged.

    Returns
    -------
    Same type as ``feature_matrix + x``
        A new object; the input is not modified.
    """
    return feature_matrix + x

In [7]:
# Wrap add_x so it can be applied through the transformer interface.
# NOTE(review): despite the "ten_" prefix, add_x adds 20 to each value — consider renaming.
ten_transformer = FunctionTransformer(add_x)

In [8]:
# No fit() call precedes this; transform hands the matrix to add_x and shows the result.
ten_transformer.transform(features)

array([[24, 25, 27],
       [25, 28, 29],
       [31, 65, 29]])

## Detecting Outliers
## Whole Dataset (All Features Together)

In [10]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [12]:
# Synthetic data: 20 samples with 4 features, generated around 3 centers.
# The second return value (the cluster labels) is discarded.
features, _ = make_blobs(
    n_samples=20,
    n_features=4,
    centers=3,
    random_state=1,
)

In [13]:
# Show the generated matrix before any of its values are overwritten.
features

array([[ -2.37085453,   1.60430932,  -1.38601498,   4.46640119],
       [ -2.34673261,   3.56128423, -10.66895863,  -3.96601315],
       [ -6.22589877,  -7.22212602,  -5.98920845,  -2.20364429],
       [ -6.94472323,  -7.53602499,  -5.97462545,  -3.44103531],
       [ -1.57613237,   0.70076297,  -0.48448032,   5.22420682],
       [ -7.01407443,  -8.79022375,  -6.08388029,  -0.98853032],
       [ -1.83198811,   3.52863145,  -9.95549876,  -3.37053333],
       [ -2.28697866,   0.57557661,  -1.42954832,   4.11444166],
       [ -2.76017908,   5.55121358,  -9.09612178,  -3.45085421],
       [ -1.34052081,   4.15711949,  -8.53560457,  -6.01348926],
       [ -8.18219253,  -7.91881241,  -4.6149936 ,  -2.3467413 ],
       [ -1.90461345,   1.6525036 ,  -1.30047476,   1.68218879],
       [  0.12092489,  -0.62016166,  -3.06022352,   3.19992414],
       [ -1.98197711,   4.02243551,  -8.86394306,  -5.05323981],
       [ -8.20740038,  -8.50257083,  -6.48369001,  -2.50216227],
       [ -7.81928012,  -6

In [14]:
# Overwrite five cells with the value 10 so there are known extreme entries to look for.
# Row indices and column indices are paired position by position:
# (0, 0), (0, 1), (10, 0), (10, 2), (9, 3).
features[[0, 0, 10, 10, 9], [0, 1, 0, 2, 3]] = 10

In [16]:
# Re-display the matrix: the value 10 now appears in rows 0, 9 and 10.
features

array([[ 10.        ,  10.        ,  -1.38601498,   4.46640119],
       [ -2.34673261,   3.56128423, -10.66895863,  -3.96601315],
       [ -6.22589877,  -7.22212602,  -5.98920845,  -2.20364429],
       [ -6.94472323,  -7.53602499,  -5.97462545,  -3.44103531],
       [ -1.57613237,   0.70076297,  -0.48448032,   5.22420682],
       [ -7.01407443,  -8.79022375,  -6.08388029,  -0.98853032],
       [ -1.83198811,   3.52863145,  -9.95549876,  -3.37053333],
       [ -2.28697866,   0.57557661,  -1.42954832,   4.11444166],
       [ -2.76017908,   5.55121358,  -9.09612178,  -3.45085421],
       [ -1.34052081,   4.15711949,  -8.53560457,  10.        ],
       [ 10.        ,  -7.91881241,  10.        ,  -2.3467413 ],
       [ -1.90461345,   1.6525036 ,  -1.30047476,   1.68218879],
       [  0.12092489,  -0.62016166,  -3.06022352,   3.19992414],
       [ -1.98197711,   4.02243551,  -8.86394306,  -5.05323981],
       [ -8.20740038,  -8.50257083,  -6.48369001,  -2.50216227],
       [ -7.81928012,  -6

In [26]:
# Assume roughly 20% of the rows are outliers.
# NOTE(review): no random_state is passed — confirm whether the fit is reproducible across runs.
outlier_detector = EllipticEnvelope(contamination=0.2)

In [27]:
# Fit the envelope on the 20x4 matrix that now contains the overwritten values.
outlier_detector.fit(features)

EllipticEnvelope(contamination=0.2)

In [28]:
# Label every row: -1 marks rows flagged as outliers, 1 marks the rest.
# In the recorded output 4 of the 20 rows are flagged (rows 0, 9, 10 and 12).
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1,
        1,  1,  1])

In [29]:
# Raise the assumed outlier share to 30% to see which additional rows get flagged.
outlier_detector = EllipticEnvelope(contamination=0.3)

In [30]:
# Refit on the same matrix with the 0.3 contamination setting.
outlier_detector.fit(features)

EllipticEnvelope(contamination=0.3)

In [31]:
# With contamination=0.3 the recorded output flags 6 of the 20 rows as -1
# (rows 0, 8, 9, 10, 12 and 18).
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1, -1,  1,  1,  1,  1,
        1, -1,  1])

In [32]:
# Lower the assumed outlier share to 10%.
# The fitted estimator's repr below prints no arguments for this value.
outlier_detector = EllipticEnvelope(contamination=0.1)

In [33]:
# Refit on the same matrix with the 0.1 contamination setting.
outlier_detector.fit(features)

EllipticEnvelope()

In [34]:
# With contamination=0.1 the recorded output flags only rows 0 and 10 as -1.
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,
        1,  1,  1])

## Individual Features (IQR Rule)

In [37]:
def outlier_detection(ft, whisker=1.5):
    """Find the positions of values lying outside the IQR fences of ``ft``.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    ft : array-like
        Values of one feature. Lists are accepted and converted to an array.
    whisker : float, default 1.5
        Multiple of the IQR used to place the fences. The default reproduces
        the original fixed factor of 1.5.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: index arrays locating every value that is
        strictly above the upper fence or strictly below the lower fence.

    Notes
    -----
    Prints the upper fence as a side effect.
    """
    # Convert up front so plain Python lists support the element-wise comparisons below.
    ft = np.asarray(ft)
    quartile1, quartile3 = np.percentile(ft, [25, 75])
    IQR = quartile3 - quartile1
    lower_limit = quartile1 - (IQR * whisker)
    upper_limit = quartile3 + (IQR * whisker)
    print(f"The upper limit is {upper_limit}")
    return np.where((ft > upper_limit) | (ft < lower_limit))

In [38]:
# Slice the matrix into its first four columns so each feature can be checked on its own.
feature1, feature2, feature3, feature4 = (features[:, col] for col in range(4))

In [39]:
# Column 0 had 10 written into rows 0 and 10; both positions are reported in the output.
outlier_detection(feature1)

The upper limit is 5.81533362992297


(array([ 0, 10], dtype=int64),)

In [40]:
# Column 1 had 10 written into row 0, but that is below the printed upper limit (~20.4),
# so the recorded output reports no positions.
outlier_detection(feature2)

The upper limit is 20.44510211381755


(array([], dtype=int64),)

In [41]:
# Column 2 had 10 written into row 10; that position is reported in the output.
outlier_detection(feature3)

The upper limit is 8.201816415370471


(array([10], dtype=int64),)

In [42]:
# Column 3 had 10 written into row 9, but that is below the printed upper limit (~13.7),
# so the recorded output reports no positions.
outlier_detection(feature4)

The upper limit is 13.716313940273588


(array([], dtype=int64),)

## Handling Outliers

In [43]:
import pandas as pd

In [45]:
# Four example houses; the last row is deliberately extreme in every column.
houses = pd.DataFrame(
    {
        'Price': [534433, 392333, 293222, 4322032],
        'Bathrooms': [2, 3.5, 2, 116],
        'Square_feet': [1500, 2500, 1500, 48000],
    }
)

## Dropping the Outlier Rows

In [46]:
# Show only the rows with fewer than 40 bathrooms, which leaves out the 116-bathroom row.
# The result is displayed, not assigned, so houses itself keeps all four rows.
houses.loc[houses['Bathrooms'] < 40]

Unnamed: 0,Price,Bathrooms,Square_feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


## Marking Outliers as a Feature


In [47]:
# Add a 0/1 indicator column: 0 where Bathrooms is below 20, 1 everywhere else.
houses['outlier'] = np.where(houses['Bathrooms'].lt(20), 0, 1)

In [48]:
# Display the frame with the new indicator column; only the 116-bathroom row is marked 1.
houses

Unnamed: 0,Price,Bathrooms,Square_feet,outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


## Reducing the Outlier's Effect with a Log Transform

In [50]:
# Take the natural log of the square footage to shrink the gap between the
# very large value and the others.
# np.log is applied to the whole column at once rather than per element in a Python loop.
houses['minimized_Square_Feet'] = np.log(houses["Square_feet"])

In [51]:
# Display the frame with the log-transformed square-footage column added.
houses

Unnamed: 0,Price,Bathrooms,Square_feet,outlier,minimized_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956
