<h1>Chap 04 - Handling Numerical Data</h1>

Imports

In [49]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import Normalizer, PolynomialFeatures, FunctionTransformer, Binarizer, StandardScaler
#from fancyimpute import KNN

## 4.1 Rescaling a Feature 

In [2]:
# Many machine learning algorithms assume that all features are on the same scale.
# Min-max rescaling is a good fit for neural networks.
feature = np.array([-550.0, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

# Learn the column's min/max, then map the values onto the [0, 1] range.
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_feature = minmax_scaler.fit(feature).transform(feature)

scaled_feature

array([[0.        ],
       [0.3100834 ],
       [0.37907506],
       [0.44806672],
       [1.        ]])

## 4.2 Standardizing a Feature 

In [3]:
# Standardization (zero mean, unit variance) is a good fit for PCA.
x = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)

scaler = preprocessing.StandardScaler()
standardized = scaler.fit(x).transform(x)

standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [4]:
# Sanity check: a standardized feature should have mean ~0 and standard deviation 1.
print('Mean:', np.mean(standardized))
print('Deviation', np.std(standardized))

Mean: 4.4408920985006264e-17
Deviation 1.0


In [5]:
# When outliers are present, prefer RobustScaler: it standardizes with the
# median and the quartile range instead of the mean and variance.
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit(x).transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

## 4.3 Normalizing Observations 

In [6]:
# Each row is one observation with two feature values.
features = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])

# Normalizer rescales every observation (row) independently, here to unit L2 length.
normalizer = Normalizer(norm='l2')
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [7]:
# Euclidean (L2) norm: each row is divided by the square root of its sum of squares.
l2_normalizer = Normalizer(norm="l2")
features_l2_norm = l2_normalizer.transform(features)

In [8]:
# Display the L2-normalized observations computed in the previous cell.
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [9]:
# Manhattan (L1) norm: each row is divided by the sum of its absolute values.
l1_normalizer = Normalizer(norm="l1")
features_l1_norm = l1_normalizer.transform(features)

features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [10]:
# Check the L1 normalization: the two values of every observation should add up to 1.
for i, row in enumerate(features_l1_norm):
    print(f'Sum of n = {i} observation is equal to {row[0] + row[1]} ')

Sum of n = 0 observation is equal to 1.0 
Sum of n = 1 observation is equal to 1.0 
Sum of n = 2 observation is equal to 1.0 
Sum of n = 3 observation is equal to 1.0 
Sum of n = 4 observation is equal to 1.0 


## 4.4 Generating Polynomial and Interaction Features 

In [11]:
features = np.array([
    [2, 3],
    [2, 4],
    [2, 5],
])

# degree=2 expands [x1, x2] into [x1, x2, x1^2, x1*x2, x2^2];
# include_bias=False leaves out the constant column of ones.
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)
polynomial_interaction.fit(features).transform(features)

array([[ 2.,  3.,  4.,  6.,  9.],
       [ 2.,  4.,  4.,  8., 16.],
       [ 2.,  5.,  4., 10., 25.]])

In [12]:
# interaction_only=True keeps x1, x2 and the product x1*x2, but drops the squared terms.
interaction = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
interaction.fit(features).transform(features)

array([[ 2.,  3.,  6.],
       [ 2.,  4.,  8.],
       [ 2.,  5., 10.]])

## 4.5 Transforming Features 

In [13]:
# Three identical observations [2, 3], stacked as rows.
features = np.tile([2, 3], (3, 1))

def add_ten(x):
    """Return ``x + 10``; for arrays and Series the addition is element-wise."""
    shifted = x + 10
    return shifted

# Wrap the plain function in a transformer object and apply it to the feature matrix.
ten_transformer = FunctionTransformer(func=add_ten)
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [14]:
# The same feature matrix as a DataFrame with named columns.
column_names = ['feature_1', 'feature_2']
df = pd.DataFrame(data=features, columns=column_names)

In [15]:
# pandas alternative to FunctionTransformer: apply the same function to the DataFrame.
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


## 4.6 Detecting Outliers 

In [16]:
# Ten two-dimensional points drawn around a single center (seeded, so repeatable).
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Replace the first observation with extreme values so it becomes an obvious outlier.
features[0, :] = 10000

# contamination is the proportion of observations assumed to be outliers.
outlier_detector = EllipticEnvelope(contamination=.1)
outlier_detector.fit(features)
outlier_detector.predict(features)  # -1 marks outliers, 1 marks inliers

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [17]:
# Take only the first column, to look for outliers in a single feature.
feature = features[:,0]

def indicies_of_outliers(x, iqr_multiplier=1.5):
    """Locate the values of ``x`` that fall outside its IQR-based fences.

    A value is flagged when it lies more than ``iqr_multiplier`` interquartile
    ranges below the first quartile or above the third quartile.

    Parameters
    ----------
    x : array-like
        Numeric values to inspect (NumPy array, list, Series, ...).
    iqr_multiplier : float, default 1.5
        Width of the fences, in interquartile ranges. The default reproduces
        the previously hard-coded 1.5 rule.

    Returns
    -------
    tuple of numpy.ndarray
        The result of ``np.where``: one index array per dimension of ``x``.
    """
    # Accept plain Python sequences too; element-wise comparisons need an array.
    x = np.asarray(x)

    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1  # interquartile range

    lower_bound = q1 - iqr * iqr_multiplier  # lower fence for outlier detection
    upper_bound = q3 + iqr * iqr_multiplier  # upper fence for outlier detection

    return np.where((x > upper_bound) | (x < lower_bound))

# Positions of the IQR-based outliers in the selected feature.
indicies_of_outliers(feature)

(array([0], dtype=int64),)

## 4.7 Handling Outliers 

In [18]:
# Small housing table; the last row is deliberately extreme in every column.
houses = pd.DataFrame({
    'Price': [53433, 392333, 293222, 4322032],
    'Bathrooms': [2, 3.5, 2, 116],
    'Square_Feet': [1500, 2500, 1500, 48000],
})

In [19]:
# Option 1: drop the outliers - keep only the rows with fewer than 20 bathrooms.
houses.loc[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,53433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [20]:
#Marking as outliers
houses['Outliers'] = np.where(houses["Bathrooms"] < 20, 0, 1) # fills with 0 for true and 1 for false

In [21]:
# Display the table with the new Outliers flag column.
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers
0,53433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [22]:
# Option 3: transform the feature - take the natural log of each Square_Feet value.
houses['Log_Of_Square_Feet'] = list(map(np.log, houses['Square_Feet']))

houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers,Log_Of_Square_Feet
0,53433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


In [23]:
# The same log transform, this time through Series.apply.
houses['Log_Of_Square_Feet_Apply'] = houses['Square_Feet'].apply(np.log)

houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers,Log_Of_Square_Feet,Log_Of_Square_Feet_Apply
0,53433,2.0,1500,0,7.31322,7.31322
1,392333,3.5,2500,0,7.824046,7.824046
2,293222,2.0,1500,0,7.31322,7.31322
3,4322032,116.0,48000,1,10.778956,10.778956


## 4.8 Discretizing Features

In [28]:
# One column of ages, one observation per row.
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)

# by threshold: values greater than 18 become 1, everything else becomes 0
binarizer = Binarizer(threshold=18)
binarizer.fit(age).transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [29]:
# by multiple thresholds
np.digitize(age, bins=[20, 30, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [33]:
# by clustering: label every observation with the k-means cluster it is assigned to
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)
dataframe = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

clusterer = KMeans(n_clusters=3, random_state=0)
dataframe['group'] = clusterer.fit(features).predict(features)

dataframe.head()

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,0
1,-7.28721,-8.353986,2
2,-6.943061,-7.023744,2
3,-7.440167,-8.791959,2
4,-6.641388,-8.075888,2


## 4.10 Deleting Observations with Missing Values

In [37]:
features = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])

# Keep only the observations (rows) in which no value is missing.
has_missing = np.isnan(features).any(axis=1)
features[~has_missing]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [39]:
# pandas version: load the matrix into a DataFrame and drop the rows that contain NaN.
# NOTE(review): 'feture_1' looks like a typo for 'feature_1'; kept as-is because the
# saved output of this cell shows that column name.
column_names = ['feture_1', 'feature_2']
dataframe = pd.DataFrame(features, columns=column_names)

dataframe.dropna()

Unnamed: 0,feture_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


## 4.11 Imputing Missing Values 

In [50]:
# Simulated data: 1000 observations with two features (seeded, so repeatable).
features, _ = make_blobs(n_samples=1000, n_features=2, random_state=1)

scaler = StandardScaler()
standardized_features = scaler.fit(features).transform(features)

# Remember the real value of the first cell, then blank it out so it can be imputed.
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

In [None]:
# with KNN: predict the missing value from the 5 nearest observations.
# scikit-learn's KNNImputer is used here because the fancyimpute import is
# commented out at the top of the notebook, so `KNN` is not defined.
knn_imputer = KNNImputer(n_neighbors=5)
features_knn_imputed = knn_imputer.fit_transform(standardized_features)

# Compare the value that was blanked out with the one KNN filled in.
print('True value:', true_value)
print('Imputed value:', features_knn_imputed[0,0])

In [None]:
# with mean: fill the missing value with the mean of its column.
# SimpleImputer replaces the `Imputer` class, which was removed from scikit-learn;
# it always works column-wise, so the old `axis=0` argument is no longer needed.
mean_imputer = SimpleImputer(strategy="mean")

# Impute on the standardized matrix that actually contains the NaN, so the result
# is comparable to `true_value` (which was taken from that same matrix).
features_mean_imputed = mean_imputer.fit_transform(standardized_features)

print('True value:', true_value)
print('Imputed value:', features_mean_imputed[0,0])