# Preprocessing data

In [1]:
import numpy as np
from sklearn import preprocessing
import pandas as pd

# 1. Binarization

In [11]:
data = np.array([[3, -1.5,  2, -5.4], [ 0,  4,  -0.3, 2.1], [ 1,  3.3, -1.9, -4.3]])
# Put 1 if value is greater than 1.4, else 0. 1.4 will be converted to 0.
bindata = preprocessing.Binarizer(threshold=1.4).transform(data)
print ("Binarized data: \n\n", bindata)


Binarized data: 

 [[1. 0. 1. 0.]
 [0. 1. 0. 1.]
 [0. 1. 0. 0.]]


# 2. Mean Removal / Standardization

In [57]:
data = np.array([[3, -1.5,  2, -5.4], [ 0,  4,  -0.3, 2.1], [ 1,  3.3, -1.9, -4.3]])
print ("\nMean across each columns (Before): ", data.mean(axis=0))
print ("\nStandard Deviation: (Before): ", data.std(axis=0))

# Transform the data to center it by removing the mean value of each feature, 
# then scale it by dividing each features by their standard deviation. This transformation is done for transforming all the data 
# proportionally.
# Transformation also known as Gaussian with zero mean and unit variance.
scaled_data = preprocessing.scale(data)
print ("\n\nscaled_data: ", scaled_data)
                
print ("\nMean across each columns (After): ", scaled_data.mean(axis=0))
print ("\nStandard Deviation (After):  ", scaled_data.std(axis=0))


Mean across each columns (Before):  [ 1.33333333  1.93333333 -0.06666667 -2.53333333]

Standard Deviation: (Before):  [1.24721913 2.44449495 1.60069429 3.30689515]


scaled_data:  [[ 1.33630621 -1.40451644  1.29110641 -0.86687558]
 [-1.06904497  0.84543708 -0.14577008  1.40111286]
 [-0.26726124  0.55907936 -1.14533633 -0.53423728]]

Mean across each columns (After):  [ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]

Standard Deviation (After):   [1. 1. 1. 1.]


# 3. Scaling
      StandardScalar
      MinMaxScalar - Feature to be in 0 to 1 range.
      Normalizer

In [59]:
# MinMaxScalar
# X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
# X_scaled = X_std * (max - min) + min
data = np.array([[3, -1.5,  2, -5.4], [ 0,  4,  -0.3, 2.1], [ 1,  3.3, -1.9, -4.3]])
minmax_scalar = preprocessing.MinMaxScaler(feature_range=(0,1))
data_minmax = minmax_scalar.fit_transform(data)
print ("\nscaled_data: ", data_minmax)



scaled_data:  [[1.         0.         1.         0.        ]
 [0.         1.         0.41025641 1.        ]
 [0.33333333 0.87272727 0.         0.14666667]]


# 4. Normalization
   -- bringing the value of each feature vector on a common scale
   - l1 - Least absolute devaitions: Sum of absolute values on each row is 1. It is insensitive to outliers.
   - l2 - Least squares: Sum of squares on each row is 1. Takes outliers in considerations during training.
   

In [64]:
data = np.array([[3, -1.5,  2, -5.4], [ 0,  4,  -0.3, 2.1], [ 1,  3.3, -1.9, -4.3]])
data_l1 = preprocessing.normalize(data, norm = 'l1')
data_l2 = preprocessing.normalize(data, norm = 'l2')

print ("\nL1 Normalized data: ", data_l1)
print ("\nL2 Normalized data: ", data_l2)



L1 Normalized data:  [[ 0.25210084 -0.12605042  0.16806723 -0.45378151]
 [ 0.          0.625      -0.046875    0.328125  ]
 [ 0.0952381   0.31428571 -0.18095238 -0.40952381]]

L2 Normalized data:  [[ 0.45017448 -0.22508724  0.30011632 -0.81031406]
 [ 0.          0.88345221 -0.06625892  0.46381241]
 [ 0.17152381  0.56602858 -0.32589524 -0.73755239]]


# 5. One-Hot Encoding / Dummy Variables creation
- Used on categorical variables
- It replaces a categorical variable/feature with one or more feature that will take the values of either 0 or 1.
- Increses data burden
- Increses the efficiency of the process

In [94]:
# one hot encoding
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
print ("\nEncoded vector:\n", encoded_vector)


Encoded vector:
 [[0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]


In [2]:
# Load the data
df = pd.read_table("http://data.princeton.edu/wws509/datasets/salary.dat", delim_whitespace=True)
# Take a look
print (df.head())

# Encode sx column into one-hot encoding
dummy = pd.get_dummies(df['sx'])
print (dummy.head())

# Concatenate original and newly created dummy data frame
df = pd.concat([df, dummy], axis = 1)
print (df.head())

# Merge original and newly created dummy data frame
df = df.merge(dummy, left_index = True, right_index = True)
print (df.head())

       sx    rk  yr         dg  yd     sl
0    male  full  25  doctorate  35  36350
1    male  full  13  doctorate  22  35350
2    male  full  10  doctorate  23  28200
3  female  full   7  doctorate  27  26775
4    male  full  19    masters  30  33696
   female  male
0       0     1
1       0     1
2       0     1
3       1     0
4       0     1
       sx    rk  yr         dg  yd     sl  female  male
0    male  full  25  doctorate  35  36350       0     1
1    male  full  13  doctorate  22  35350       0     1
2    male  full  10  doctorate  23  28200       0     1
3  female  full   7  doctorate  27  26775       1     0
4    male  full  19    masters  30  33696       0     1
       sx    rk  yr         dg  yd     sl  female_x  male_x  female_y  male_y
0    male  full  25  doctorate  35  36350         0       1         0       1
1    male  full  13  doctorate  22  35350         0       1         0       1
2    male  full  10  doctorate  23  28200         0       1         0       1
3  f