In [1]:
import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import StandardScaler,Binarizer,FunctionTransformer,PolynomialFeatures,Normalizer,MinMaxScaler,RobustScaler
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope

4.1 : Rescaling a Feature

In [2]:
fearure = np.array([[-500.5],
                    [-100.1],
                    [0],
                    [100.1],
                    [900.9]])
minmax_scaler = MinMaxScaler(feature_range=(0,1))
scaled_feature = minmax_scaler.fit_transform(fearure)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

Discussion

x_prime_i = (x_i - min(x)) / (max(x) - min(x))

4.2 : Standardizing a Feature

In [3]:
x = np.array([[-1000.1],
              [-200.2],
              [500.5],
              [600.6],
              [9000.9]])
scaler = StandardScaler()
standardozed = scaler.fit_transform(x)
standardozed

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

Discussion

x_prime_i = (x_i - mean_x) / std_x

In [4]:
print("Mean" , round(standardozed.mean()))
print("Standard Deviation" ,standardozed.std())

Mean 0
Standard Deviation 1.0


In [5]:
robust_scaler = RobustScaler()
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

4.3 : Normalizing Observations

In [6]:
fearures  = np.array([[0.5,0.5],
                      [1.1,3.4],
                      [1.5,20.2],
                      [1.63,34.4],
                      [10.9,3.3]])
normalizer = Normalizer(norm='l2')
normalizer.transform(fearures)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

Discussion

In [7]:
features_l2_norm = Normalizer(norm='l2').transform(fearures)
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [8]:
featires_l1_norm = Normalizer(norm='l1').transform(fearures)
featires_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [9]:
print("Sum of the first observation \'s value: ", features_l2_norm[0,0] + featires_l1_norm[0,1])

Sum of the first observation 's value:  1.2071067811865475


4.4 : Generating Polynomial and Interaction Features

In [10]:
features = np.array([[2,3],
                     [2,3],
                     [2,3]])

polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [11]:
interaction = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

4.5 : Transforming Features

In [12]:
features = np.array([[2,3],
                     [2,3],
                     [2,3]])

def add_ten(x: int) -> int:
    return x + 10

ten_transformer = FunctionTransformer(add_ten)
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [13]:
df = pd.DataFrame(features, columns=['feature1','feature2'])
df.apply(add_ten)

Unnamed: 0,feature1,feature2
0,12,13
1,12,13
2,12,13


4.6 : Detecting Outliers

In [14]:
features, _ = make_blobs(n_samples=10, 
                         n_features=2,
                         centers=1, 
                         random_state=1)
features[0,0] = 10000
features[0,1] = 10000
outlier_detector = EllipticEnvelope(contamination=0.1)
outlier_detector.fit(features)
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [15]:
feature = features[:,0]

def indicies_of_outliers(x: int) -> np.ndarray:
    quartile_1 = np.percentile(x,25)
    quartile_3 = np.percentile(x,75)
    interquartile_range = quartile_3 - quartile_1
    lower_bound = quartile_1 - (1.5 * interquartile_range)
    upper_bound = quartile_3 + (1.5 * interquartile_range)
    return np.where((x > upper_bound) | (x < lower_bound))

indicies_of_outliers(feature)

(array([0]),)

4.7 : Handling Outliers

In [16]:
houses = pd.DataFrame()
houses['Price'] = [543344 , 392333 , 293222 , 4322032]
houses['Bathrooms'] = [2 , 3.5 , 2 , 116]
houses['Square_Feet'] = [1500 , 2500 , 1500 , 48000]

houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,543344,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [18]:
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,543344,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [19]:
houses['Log_Of_Square_Feet'] = [np.log(x) for x in houses['Square_Feet']]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,543344,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


4.8 : Discretizating Features

In [20]:
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])
binarizer = Binarizer(threshold=18)
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [21]:
np.digitize(age, bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]])

In [22]:
np.digitize(age, bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]])

Discussion

In [23]:
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]])

4.9 : Grouping Observations Using Clustering

In [24]:
features,_ = make_blobs(n_samples=50,
                        n_features=2,
                        centers=3,
                        random_state=1)

dataframe = pd.DataFrame(features, columns=['Feature_1','Feature_2'])
clusterer = KMeans(3, random_state=0)
clusterer.fit(features)
dataframe['group'] = clusterer.predict(features)
dataframe.head(5)

Unnamed: 0,Feature_1,Feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


4.10 : Deleting Observations with Missing Values

In [25]:
features = np.array([[1.1,11.1],
                     [2.2,22.2],
                     [3.3,33.3],
                     [4.4,44.4],
                     [np.nan,55.5]])

features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [26]:
dataframe = pd.DataFrame(features, columns=['Feature_1','Feature_2'])
dataframe.dropna()

Unnamed: 0,Feature_1,Feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


4.11 Imputing Missing Values

In [27]:
features,_ = make_blobs(n_samples=1000,
                        n_features=2,
                        random_state=1)

scaler = StandardScaler()
standardozed_features = scaler.fit_transform(features)
true_value = standardozed_features[0,0]
standardozed_features[0,0] = np.nan
knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed = knn_imputer.fit_transform(standardozed_features)

print("True Value: ", true_value)
print("Imputed Value: ", features_knn_imputed[0,0])

True Value:  0.8730186113995938
Imputed Value:  1.0959262913919632


In [28]:
features,_ = make_blobs(n_samples=1000,
                        n_features=2,
                        random_state=1)

scaler = StandardScaler()
standardozed_features = scaler.fit_transform(features)
true_value = standardozed_features[0,0]

mean_imputer = SimpleImputer(strategy='mean')
features_mean_imputed = mean_imputer.fit_transform(features)

print("True Value: ", true_value)
print("Imputed Value: ", features_mean_imputed[0,0])

True Value:  0.8730186113995938
Imputed Value:  -3.058372724614996
