# Data Standardization

In [None]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer

# Load scikit-learn's bundled breast cancer dataset. The returned object is
# accessed below via .data, .feature_names, .target and ["DESCR"].
dataset = load_breast_cancer()

In [None]:
# List the fields of the loaded dataset rather than echoing the whole object:
# the bare repr prints the raw arrays and the complete description text in one
# block, whereas the following cells inspect those pieces one at a time.
list(dataset.keys())

In [None]:
# Show the dataset's description text. print() is used on purpose: it renders
# the line breaks, while a bare expression would display the escaped repr.
print(dataset["DESCR"])

In [None]:
# Build the feature DataFrame, labelling every column with its feature name.
# `columns` is kept as its own variable because later cells reuse it.
columns = dataset.feature_names
dataset_df = pd.DataFrame(dataset.data, columns=columns)

dataset_df.head()

In [None]:
# Feature names that were assigned as the column labels of dataset_df.
columns

In [None]:
# Target values of the dataset; they are not part of dataset_df, which was
# built from dataset.data only.
dataset.target

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
dataset_df.describe()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Box plot of every feature on one shared axis, using the raw (unscaled) values.
dataset_df.boxplot()

In [None]:
# One histogram per feature column; figsize sets the size of the figure that
# holds all the subplots.
dataset_df.hist(figsize=(15, 20))

In [None]:
# A first, simple approach is to rescale a feature by its order of magnitude.
# Start from a one-column DataFrame that holds only "worst area".
df = dataset_df["worst area"].to_frame()
df

In [None]:
# Scale "worst area" down by a factor of 1000. The division is vectorised over
# the whole column instead of calling a Python lambda once per row.
df["worst area scaled"] = df["worst area"] / 1000

# Show five random rows; the fixed random_state makes the same rows appear on
# every run, so the displayed output is reproducible.
df[["worst area", "worst area scaled"]].sample(5, random_state=0)

In [None]:
# First rows of the working frame: the original column next to its scaled copy.
df.head()

In [None]:
# Histogram of each column of df. Dividing by a constant only changes the
# axis range, so the two distributions should have the same shape.
df.hist()

In [None]:
# Box plots of the original and the scaled column on one shared axis, which
# makes the difference in magnitude visible.
df[["worst area","worst area scaled"]].boxplot()

In [None]:
from sklearn.preprocessing import StandardScaler

# USING StandardScaler
#
# Standardize features by removing the mean and scaling to unit variance.
#
# Centering and scaling happen independently on each feature by computing the
# relevant statistics on the samples in the training set. Mean and standard
# deviation are then stored to be used on later data using the transform
# method.
#
# Standardization of a dataset is a common requirement for many machine
# learning estimators: they might behave badly if the individual features do
# not more or less look like standard normally distributed data (e.g. Gaussian
# with 0 mean and unit variance).
#
# (These notes are written as comments rather than as a bare string literal,
# which would be an expression statement with no effect.)

# fit() only learns the per-feature statistics; the scaled values are produced
# later by transform().
scaler = StandardScaler()
scaler.fit(dataset_df)
print(scaler)

In [None]:
# Per-feature means learned by scaler.fit().
scaler.mean_

In [None]:
# Per-feature scaling factors learned by scaler.fit().
scaler.scale_ 

In [None]:
# Apply the statistics learned by fit() to obtain the standardized values.
scaled_data = scaler.transform(dataset_df)
# The fitted scaler instance can then be reused on new data (e.g. the TEST SET!)

In [None]:
# Wrap the standardized values in a DataFrame and restore the feature names as
# column labels.
scaled_df = pd.DataFrame(scaled_data, columns=columns)

scaled_df.head()

In [None]:
# Box plot of the standardized features on one shared axis, for comparison
# with the box plot of the raw values drawn earlier.
scaled_df.boxplot()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# USING MinMaxScaler
#
# Transforms features by scaling each feature to a given range.
#
# This estimator scales and translates each feature individually such that it
# lies in the given range on the training set. Here feature_range=(-1, 1) is
# requested; the library default is (0, 1).
#
# The transformation is given by:
#
#     X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#     X_scaled = X_std * (max - min) + min
#
# where min, max = feature_range.
#
# This transformation is often used as an alternative to zero mean, unit
# variance scaling.

mm_scaler = MinMaxScaler(feature_range=(-1, 1))

# copy=False is deliberately not used: scikit-learn only guarantees in-place
# scaling when the input is already a NumPy array, so with a DataFrame the
# side effect depends on library versions. The scaled values are written back
# explicitly instead, so dataset_df (displayed in the following cells) reliably
# holds the MinMax-scaled data.
# NOTE: this overwrites the raw feature values; re-run the cell that builds
# dataset_df to get them back.
dataset_df.loc[:, :] = mm_scaler.fit_transform(dataset_df)

In [None]:
# Inspect dataset_df after the MinMaxScaler cell: if the scaling was applied to
# the frame, the values are expected to lie in [-1, 1].
dataset_df.head()

In [None]:
# Box plot of dataset_df after the MinMaxScaler cell; with the scaling applied,
# every feature should span the same [-1, 1] range.
dataset_df.boxplot()