In [13]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [3]:
# Single numeric feature stored as a 5x1 column vector (one row per observation).
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

In [4]:
# Min-max scaler: rescales a feature linearly so its values span the range [0, 1]
minmax_scale = MinMaxScaler(feature_range=(0, 1))

In [7]:
minmax_scale

MinMaxScaler(copy=True, feature_range=(0, 1))

In [8]:
scaled_feature = minmax_scale.fit_transform(feature)

In [9]:
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [11]:
MinMaxScaler(feature_range=(0, 1)).fit_transform(feature) #or you can directly do this

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [15]:
# Single feature with one extreme value (9000.9), stored as a 5x1 column vector.
x = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)

In [23]:
# Standardize the feature: the result has mean 0 and standard deviation 1
standardized=StandardScaler().fit_transform(x)

In [24]:
round(standardized.mean())

0.0

In [25]:
standardized.std()

1.0

In [None]:
#Robust scaler
#If the data contains significant outliers, it is better to rescale the feature using the median and the interquartile range

In [26]:
# Import RobustScaler explicitly: the `preprocessing` module itself is never
# imported in this notebook, so `preprocessing.RobustScaler()` raised NameError
# on a fresh kernel.
from sklearn.preprocessing import RobustScaler

# Rescale `x` using the median and quartile range, which are far less sensitive
# to the 9000.9 outlier than the mean and standard deviation.
robust_scaler = RobustScaler()
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

In [27]:
#Normalizing Observations
import numpy as np
from sklearn.preprocessing import Normalizer
features=np.array([[0.5, 0.5],
[1.1, 3.4],
[1.5, 20.2],
[1.63, 34.4],
[10.9, 3.3]])

In [28]:
normalizer = Normalizer(norm="l2")

In [29]:
normalizer.transform(features)
#Euclidean norm (l2)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [30]:
# Manhattan norm (L1): each observation (row) is rescaled so that its values sum to 1
normalizer=Normalizer(norm="l1")
normalizer.transform(features)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [32]:
#Practically, notice that norm='l1' rescales an observation’s values so they sum to 1, which can sometimes be a desirable quality

In [33]:
"""Intuitively, L2 norm can be thought of as the distance between two points in
New York for a bird (i.e., a straight line), while L1 can be thought of as the
distance for a human walking on the street (walk north one block, east one block,
north one block, east one block, etc.), which is why it is called “Manhattan
norm” or “Taxicab norm."""

'Intuitively, L2 norm can be thought of as the distance between two points in\nNew York for a bird (i.e., a straight line), while L1 can be thought of as the\ndistance for a human walking on the street (walk north one block, east one block,\nnorth one block, east one block, etc.), which is why it is called “Manhattan\nnorm” or “Taxicab norm.'

In [34]:
from sklearn.preprocessing import FunctionTransformer
def add_five(x):
    """Return ``x`` increased by 5 (element-wise when ``x`` is an array or Series)."""
    shifted = x + 5
    return shifted
# Three identical observations with two features each.
features = np.array([[2, 3]] * 3)

# Wrap the plain function so it can be used like any other scikit-learn transformer.
five_transform = FunctionTransformer(add_five)
five_transform.transform(features)

array([[7, 8],
       [7, 8],
       [7, 8]])

In [36]:
# pandas equivalent: wrap the feature matrix in a DataFrame and apply the same
# function, which adds 5 to every value.
import pandas as pd
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])
df.apply(add_five)

Unnamed: 0,feature_1,feature_2
0,7,8
1,7,8
2,7,8


In [37]:
#Detecting outliers


"""Detecting outliers is unfortunately more of an art than a science. However, a
common method is to assume the data is normally distributed and based on that
assumption “draw” an ellipse around the data, classifying any observation inside
the ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an
outlier (labeled as -1)"""

'Detecting outliers is unfortunately more of an art than a science. However, a\ncommon method is to assume the data is normally distributed and based on that\nassumption “draw” an ellipse around the data, classifying any observation inside\nthe ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an\noutlier (labeled as -1)'

In [38]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [39]:
features, _ = make_blobs(n_samples = 10,
n_features = 2,
centers = 1,
random_state = 1)

In [40]:
features

array([[-1.83198811,  3.52863145],
       [-2.76017908,  5.55121358],
       [-1.61734616,  4.98930508],
       [-0.52579046,  3.3065986 ],
       [ 0.08525186,  3.64528297],
       [-0.79415228,  2.10495117],
       [-1.34052081,  4.15711949],
       [-1.98197711,  4.02243551],
       [-2.18773166,  3.33352125],
       [-0.19745197,  2.34634916]])

In [41]:
# Overwrite both values of the first observation with an extreme number so the
# row becomes an obvious outlier.
features[0, :] = 10000

In [42]:
features

array([[ 1.00000000e+04,  1.00000000e+04],
       [-2.76017908e+00,  5.55121358e+00],
       [-1.61734616e+00,  4.98930508e+00],
       [-5.25790464e-01,  3.30659860e+00],
       [ 8.52518583e-02,  3.64528297e+00],
       [-7.94152277e-01,  2.10495117e+00],
       [-1.34052081e+00,  4.15711949e+00],
       [-1.98197711e+00,  4.02243551e+00],
       [-2.18773166e+00,  3.33352125e+00],
       [-1.97451969e-01,  2.34634916e+00]])

In [43]:
# Fit the detector, telling it to expect 10% of the observations to be outliers.
outlier_detector = EllipticEnvelope(contamination=0.1).fit(features)

# Label every observation: 1 for an inlier, -1 for an outlier.
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [44]:
# Small housing table; the last row (116 bathrooms) is the suspicious one.
houses = pd.DataFrame(
    {
        "Price": [534433, 392333, 293222, 4322032],
        "Bathrooms": [2, 3.5, 2, 116],
        "Square_Feet": [1500, 2500, 1500, 48000],
    }
)

# Show only the observations with fewer than 20 bathrooms.
houses[houses["Bathrooms"] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [45]:
# Flag each house: 0 when it has fewer than 20 bathrooms, 1 otherwise.
is_typical = houses["Bathrooms"] < 20
houses["Outlier"] = np.where(is_typical, 0, 1)

In [46]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [None]:
"""Data entry errors (human errors)
Measurement errors (instrument errors)
Experimental errors (data extraction or experiment planning/executing errors)
Intentional (dummy outliers made to test detection methods)
Data processing errors (data manipulation or data set unintended mutations)
Sampling errors (extracting or mixing data from wrong or various sources)
Natural (not an error, novelties in data)"""

In [None]:
# Standard deviation based outlier detection
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global random generator so the synthetic sample is reproducible.
# (A bare `seed(1)` raised NameError: no `seed` function is imported here.)
np.random.seed(1)
anomalies = []
# Scale and shift standard-normal draws to get 50,000 values with
# mean ~20 and standard deviation ~20.
data = np.random.randn(50000) * 20 + 20
#https://towardsdatascience.com/5-ways-to-detect-outliers-that-every-data-scientist-should-know-python-code-70a54335a623
#https://www.kaggle.com/kevinarvai/outlier-detection-practice-uni-multivariate




#see this website and code later (maybe tomorrow)

In [50]:
#discretizing features
from sklearn.preprocessing import Binarizer
# Ages of five people as a 5x1 column vector.
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)
# Pass `threshold` by keyword: current scikit-learn releases make Binarizer's
# constructor arguments keyword-only, so `Binarizer(18)` raises TypeError there.
# The keyword form works on older releases as well.
binarizer = Binarizer(threshold=18)

# Ages above the threshold become 1, the rest become 0.
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [54]:
# We can break up a numerical feature according to multiple thresholds: each value
# is replaced by the index of the bin it falls into. With the default settings a
# value equal to a bin edge goes into the higher bin (age 20 -> bin 1).
np.digitize(age, bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [52]:
# With right=True a value equal to a bin edge goes into the lower bin instead
# (age 20 -> bin 0).
np.digitize(age, bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [53]:
# A single bin edge splits the feature in two, giving the same 0/1 result as the
# Binarizer with a threshold of 18.
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

In [55]:
features = np.array([[1.1, 11.1],
[2.2, 22.2],
[3.3, 33.3],
[4.4, 44.4],
[np.nan, 55]])

In [57]:
# Mark the rows that contain at least one NaN, then keep only the other rows
# (~ negates the boolean mask).
has_missing = np.isnan(features).any(axis=1)
features[~has_missing]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [58]:
# Same clean-up with pandas: dropna() returns the rows that have no missing values
# (the result is only displayed here; `dataframe` itself is not reassigned).
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])
dataframe.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [59]:
#Missing Completely At Random (MCAR)
#Missing At Random (MAR)
#Missing Not At Random (MNAR)