Q1. You need to rescale the values of a numerical feature to be between two values

In [3]:
import numpy as np
from sklearn import preprocessing

In [4]:
# One numeric feature with five observations, one row per observation.
feature = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])

# Rescale linearly so the column minimum maps to 0 and the maximum to 1.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled_feature = minmax_scale.fit(feature).transform(feature)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

Q2.You want to transform a feature to have a mean of 0 and a standard deviation of 1. 

In [5]:
# Single feature containing one very large value (9000.9).
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])

# Standardize: subtract the column mean and divide by the column standard deviation.
scaler = preprocessing.StandardScaler()
scaled_feature = scaler.fit(x).transform(x)
scaled_feature

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [7]:
# Sanity check: the standardized feature should average to (approximately) zero.
print(round(np.mean(scaled_feature)))

0.0


In [8]:
# Sanity check: the standardized feature should have a standard deviation of one.
print(round(np.std(scaled_feature)))

1.0


In [9]:
# RobustScaler centers on the median and scales by the interquartile range,
# so the extreme value in `x` distorts the result less than mean/std scaling does.
# NOTE: `scaler` is rebound here to the scaled array, not to a scaler object.
robust_scaler = preprocessing.RobustScaler()
scaler = robust_scaler.fit(x).transform(x)
scaler

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

Q3.You want to rescale the feature values of observations to have unit norm (a total length of 1).

In [11]:
from sklearn.preprocessing import Normalizer


In [12]:
# Each row is one observation; normalization is applied per row, not per column.
features = np.array(
    [
        [0.5, 0.5],
        [1.1, 3.4],
        [1.5, 20.2],
        [1.63, 34.0],
        [10.9, 3.3],
    ]
)

# Normalizer is stateless, so transform() can be called without fitting first.
normalizer = Normalizer(norm="l2")
normalizer.transform(features)


array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04788618, 0.9988528 ],
       [0.95709822, 0.28976368]])

Alternatively, we can specify the Manhattan norm (L1):

In [13]:
# L1 (Manhattan) norm: scale each row so its absolute values sum to 1.
l1_normalizer = Normalizer(norm="l1")
features_l1_norm = l1_normalizer.transform(features)
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04574797, 0.95425203],
       [0.76760563, 0.23239437]])

In [14]:
 # With the L1 norm, the values of the first observation sum to 1.
 first_l1_row = features_l1_norm[0]
 print(first_l1_row[0] + first_l1_row[1])


1.0


In [15]:
# L2 (Euclidean) norm: scale each row so its squared values sum to 1.
l2_normalizer = Normalizer(norm="l2")
features_l2_norm = l2_normalizer.transform(features)
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04788618, 0.9988528 ],
       [0.95709822, 0.28976368]])

In [16]:
 # With the L2 norm the plain sum of a row is generally not 1 (only the sum of squares is).
 first_l2_row = features_l2_norm[0]
 print(first_l2_row[0] + first_l2_row[1])

1.414213562373095


Q4. You want to create polynomial and interaction features

In [17]:
from sklearn.preprocessing import PolynomialFeatures
# Three observations of two features (x1, x2).
feature = np.array([[2, 4], [5, 6], [2, 3]])

# degree=2 without the bias column yields: x1, x2, x1^2, x1*x2, x2^2.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(feature).transform(feature)


array([[ 2.,  4.,  4.,  8., 16.],
       [ 5.,  6., 25., 30., 36.],
       [ 2.,  3.,  4.,  6.,  9.]])

In [22]:
# interaction_only=True drops the squared terms, leaving x1, x2 and x1*x2.
interaction = PolynomialFeatures(
    degree=2,
    interaction_only=True,
    include_bias=False,
)
interaction.fit(feature).transform(feature)


array([[ 2.,  4.,  8.],
       [ 5.,  6., 30.],
       [ 2.,  3.,  6.]])

Q5.You want to make a custom transformation to one or more features. 

It is common to want to make some custom transformations to one or more features. For example, we might want to create a feature that is the natural log of the values of a different feature. We can do this by creating a function and then mapping it to features using either scikit-learn’s FunctionTransformer or pandas’ apply. In the solution we created a very simple function, add_ten, which added 10 to each input, but there is no reason we could not define a much more complex function. 

In [23]:
from sklearn.preprocessing import FunctionTransformer
# Three identical observations of two features.
features = np.array([[2, 3]] * 3)
def add_ten(x):
    """Return ``x`` increased by 10.

    Works with any operand that supports ``+ 10``; for a NumPy array or a
    pandas object the addition is applied element-wise.
    """
    offset = 10
    return x + offset

# Wrap the plain function so it exposes the scikit-learn transformer API.
ten = FunctionTransformer(func=add_ten)
ten.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [25]:
import pandas as pd
# The same function can be applied column by column with pandas.
df = pd.DataFrame(data=features, columns=["feature_1", "feature_2"])
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


Q6.You want to identify extreme observations. 

Detecting outliers is unfortunately more of an art than a science. However, a common method is to assume the data is normally distributed and based on that assumption “draw” an ellipse around the data, classifying any observation inside the ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an outlier (labeled as -1):


In [30]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
# Simulate ten observations of two features drawn from a single cluster.
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Replace one value in each of the first two observations with an extreme value.
features[0, 0] = 1000
features[1, 1] = 1000

# contamination is the assumed proportion of outliers in the data (20% here).
# random_state is fixed so the robust covariance fit is reproducible across runs.
outlier_detector = EllipticEnvelope(contamination=0.2, random_state=1)

# Fit the detector.
outlier_detector.fit(features)

# Predict labels: 1 for inliers, -1 for outliers.
outlier_detector.predict(features)

array([-1, -1,  1,  1,  1,  1,  1,  1,  1,  1])

Q7. How will you handle the outliers?

Typically we have three strategies we can use to handle outliers. First, we can drop them:


In [31]:
import pandas as pd
# Toy housing data; the last row has an implausible 116 bathrooms.
houses = pd.DataFrame(
    {
        "Price": [34506, 45678, 89000, 10001],
        "Bathrooms": [1, 2, 4, 116],
        "Square_feet": [1500, 1200, 2000, 23000],
    }
)

# Drop the outlier by keeping only rows with fewer than 20 bathrooms.
has_plausible_bathrooms = houses["Bathrooms"] < 20
houses.loc[has_plausible_bathrooms]

Unnamed: 0,Price,Bathrooms,Square_feet
0,34506,1,1500
1,45678,2,1200
2,89000,4,2000


Second, we can mark them as outliers and include that flag as a feature:


In [32]:
# Keep every row but flag the outliers with an indicator column
# (0 = fewer than 20 bathrooms, 1 = otherwise).
is_inlier = houses["Bathrooms"] < 20
houses["Outliers"] = np.where(is_inlier, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outliers
0,34506,1,1500,0
1,45678,2,1200,0
2,89000,4,2000,0
3,10001,116,23000,1


Third, we can transform the feature to dampen the effect of the outlier:


In [34]:
# Log-transform the feature to dampen the effect of the extreme value.
# np.log is vectorized, so it is applied to the whole column at once
# rather than element by element in a Python loop.
houses["Log_Of_Square_Feet"] = np.log(houses["Square_feet"])

# Show data
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outliers,Log_Of_Square_Feet
0,34506,1,1500,0,7.31322
1,45678,2,1200,0,7.090077
2,89000,4,2000,0,7.600902
3,10001,116,23000,1,10.043249


Q8.You have a numerical feature and want to break it up into discrete bins

In [37]:
from sklearn.preprocessing import Binarizer
# Ages of twelve people, one observation per row.
age = np.array([[2], [4], [6], [10], [12], [14], [16], [18], [22], [30], [45], [60]])

# Values above the threshold map to 1, all others to 0.
# threshold must be passed by keyword: it is keyword-only in current scikit-learn,
# so Binarizer(25) raises a TypeError there.
binarizer = Binarizer(threshold=25)
binarizer.fit_transform(age)

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1]])