In [3]:
# transforming raw numerical data

# Rescaling features.
# This is a preprocessing task.
# The most common approach is min-max scaling.
# Use fit() followed by transform(), or call fit_transform() once.

import numpy as np
from sklearn import preprocessing

# feature vector

# Single-column feature matrix: one observation per row.
feature = np.array([[value] for value in (-500.5, -100.1, 0, 100.1)])

# Build the min-max scaler.
# feature_range=(0, 1) reproduces output #1; feature_range=(-1, 1) gives output #2.
minmax_scale = preprocessing.MinMaxScaler(feature_range=(-1, 1))  # output #2

# Fit the scaler on the feature, then rescale that same feature with it.
minmax_scale.fit(feature)
scaled_feature = minmax_scale.transform(feature)

# Display the rescaled feature.
scaled_feature


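# With feature_range=(0, 1) (output #1) all values lie between 0 and 1.

# With feature_range=(-1, 1) (output #2) the values are rescaled to lie between -1 and 1 instead, so the output changes.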



array([[-1.        ],
       [ 0.33333333],
       [ 0.66666667],
       [ 1.        ]])

In [6]:
# standardizing of features

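# Standardizing rescales a feature to have mean 0 and standard deviation 1; each value becomes the number of standard deviations it lies from the mean (also called the z-score).
# Min-max scaling is commonly used for neural networks, while standardization is the more common choice for other ML algorithms.
# If the data contains outliers, prefer RobustScaler, which centers on the median and scales by the interquartile range (25th to 75th percentile).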

import numpy as np
from sklearn import preprocessing

# feature vector

# Single-column feature matrix: one observation per row.
feature = np.array([-500.5, -100.1, 0, 100.1]).reshape(-1, 1)

# Build the standardizer.
scaler = preprocessing.StandardScaler()

# Fit on the feature first, then apply the fitted transformation to it.
standardized = scaler.fit(feature).transform(feature)

# Show the standardized values.
standardized



array([[-1.6464639 ],
       [ 0.10976426],
       [ 0.5488213 ],
       [ 0.98787834]])

In [7]:
# to print mean and standard deviation

# Summarize the standardized feature; the mean is rounded to the nearest
# whole number before it is printed.
feature_mean = round(standardized.mean())
feature_deviation = standardized.std()

print("Mean:", feature_mean)
print("Deviation:", feature_deviation)

Mean: 0.0
Deviation: 1.0


In [10]:
# RobustScaler is the better choice when the feature contains outliers,
# because outliers distort the mean and the variance (i.e. the deviation).

robust_scaler = preprocessing.RobustScaler()

# Fit the scaler on the feature, then rescale that same feature with it.
robust_scaler.fit(feature).transform(feature)



array([[-2.        ],
       [-0.22222222],
       [ 0.22222222],
       [ 0.66666667]])

In [13]:
# Normalizing: rescale each observation (row) to have unit norm. With norm='l1' the absolute values in a row sum to 1; with norm='l2' the row has Euclidean length 1.

import numpy as np
from sklearn.preprocessing import Normalizer

# One observation per row, with a single feature per observation.
features = np.array([-500.5, -100.1, 0, 100.1]).reshape(-1, 1)

# Swap in norm='l2' to rescale each row by its Euclidean norm instead.
normalizer = Normalizer(norm='l1')

# Each row holds a single value here, so every non-zero row is divided by its
# own absolute value and comes out as -1 or 1.
normalizer.transform(features)

array([[-1.],
       [-1.],
       [ 0.],
       [ 1.]])

In [14]:
# generating polynomial

# Useful when the relationship between the features and the target is nonlinear.

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Single-column feature matrix: one observation per row.
features = np.array([[value] for value in (-500.5, -100.1, 0, 100.1)])

# Degree-2 polynomial expansion without the constant (bias) column.
polynomial_interaction = PolynomialFeatures(include_bias=False, degree=2)

# Fit on the features, then generate the polynomial terms for them.
polynomial_interaction.fit(features)
polynomial_interaction.transform(features)



array([[-5.0050000e+02,  2.5050025e+05],
       [-1.0010000e+02,  1.0020010e+04],
       [ 0.0000000e+00,  0.0000000e+00],
       [ 1.0010000e+02,  1.0020010e+04]])

In [17]:
# Make a custom transformation.
#
# A custom transformation applies any user-defined operation
# (addition, subtraction, ...) to the features. Using pandas is easy:
# write a function and hand it to DataFrame.apply().

import numpy as np
import pandas as pd


def add_ten(values):
    """Return ``values`` with 10 added to every element.

    Parameters
    ----------
    values : pandas.Series
        One column of the DataFrame, as passed in by ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        The column shifted up by 10.
    """
    return values + 10


# Two-feature example matrix. It gets its own name so that the `features`
# array defined in other cells is left untouched.
custom_features = np.array([[2, 3],
                            [2, 3],
                            [2, 3]])

df = pd.DataFrame(custom_features, columns=['feature_1', 'feature_2'])

# apply() calls the function once per column and assembles the results
# into a new DataFrame; `df` itself is not modified.
df_plus_ten = df.apply(add_ten)

df_plus_ten

In [18]:
# how to detect outliers ?

# Two ways:

# 1) using EllipticEnvelope
# 2) using the interquartile range (IQR, between the 25th and 75th percentiles)


import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Simulate ten two-dimensional observations drawn around a single center.
features, _ = make_blobs(n_samples=10,
                         n_features=2,
                         centers=1,
                         random_state=1)

# Replace the first observation with extreme values so that the data
# contains one known outlier.
features[0, 0] = 100000
features[0, 1] = 200000

# contamination is the expected share of outliers (here 1 of 10 observations).
# random_state is pinned so that the robust covariance fit, which draws
# random subsets internally, gives the same result on every run.
outlier_detector = EllipticEnvelope(contamination=.1, random_state=0)

# Fit the detector.
outlier_detector.fit(features)

# Predict: -1 marks an outlier, 1 marks an inlier.
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [19]:
# Now how to handle outliers ?

import pandas as pd

# create dataframe

# Build the example listings in one step; the 500-bathroom row is the
# implausible observation that the filter below removes.
houses = pd.DataFrame({
    'Price': [1000, 2000, 3000],
    'Bathrooms': [2, 3, 500],
})

# Keep only the rows whose bathroom count is below 10.
houses.loc[houses['Bathrooms'].lt(10)]

Unnamed: 0,Price,Bathrooms
0,1000,2
1,2000,3


In [20]:
# mark as outliers

import numpy as np

# create feature 
# Flag each house: 0 when the bathroom count is below 10, 1 otherwise.
is_below_threshold = houses['Bathrooms'] < 10
houses['Outlier'] = np.where(is_below_threshold, 0, 1)

houses

# here outlier is shown as 1 and not outlier is shown as 0. It works.

Unnamed: 0,Price,Bathrooms,Outlier
0,1000,2,0
1,2000,3,0
2,3000,500,1


In [21]:
# Grouping similar observations. Also called Clustering.

import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Simulate 50 two-dimensional observations drawn around three centers.
features, _ = make_blobs(n_samples=50,
                         n_features=2,
                         centers=3,
                         random_state=1)

dataframe = pd.DataFrame(features, columns=['feature_1', 'feature_2'])

# K-Means clusterer with three groups.
# n_init is set explicitly because its default differs between scikit-learn
# releases; pinning it (together with random_state) keeps the group
# assignments the same regardless of the installed version.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=0)

clusterer.fit(features)

# Predict the group of every observation and store it next to the features.
dataframe['group'] = clusterer.predict(features)

dataframe.head()

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0
