# Basic Example

In [6]:
from sklearn import neighbors, datasets, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the iris dataset via sklearn.datasets; later cells read iris.data and iris.target.
iris = datasets.load_iris()

In [38]:
# Restrict the example to the first two iris feature columns.
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=33
)
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
# 5-nearest-neighbour classifier trained on the scaled training split.
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
# Last expression of the cell: accuracy of the predictions on the test split.
accuracy_score(y_test, y_pred)

0.631578947368421

# Loading The Data

In [80]:
import numpy as np
# Seeded, cell-local generator: without a seed the matrix changes on every run,
# so the printed values could never be reproduced. A private RandomState is used
# so NumPy's global random state is left untouched.
X = np.random.RandomState(0).random_sample((10, 5))
y = np.array(['M','M','F','F','M','F','M','M','F','F'])
# Zero out every entry below 0.7.
X[X < 0.7] = 0
# Show the first two rows, then the second row on its own.
print(X[0:2], '\n'*2, X[1])

[[0.90162383 0.71305793 0.         0.         0.        ]
 [0.88823175 0.70300404 0.94943237 0.         0.        ]] 

 [0.88823175 0.70300404 0.94943237 0.         0.        ]


# Training And Test Data

In [82]:
from sklearn.model_selection import train_test_split
# Features: first two iris columns; labels: the iris target vector.
X = iris.data[:, :2]
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)
# Shapes of the full feature matrix and of the train / test feature splits.
tuple(arr.shape for arr in (X, X_train, X_test))

((150, 2), (112, 2), (38, 2))

# Preprocessing The Data
### Standardization
Standardize features by removing the mean and scaling to unit variance.
Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set.

In [137]:
from sklearn.preprocessing import StandardScaler
# Fit on the training split so the test split is scaled with the training statistics.
scaler = StandardScaler()
scaler.fit(X_train)
std_X_train, std_X_test = (scaler.transform(part) for part in (X_train, X_test))
# Compare one raw training row with its standardized counterpart.
print(X_train[3], '\n'*2, std_X_train[3])
# Per-feature mean and standard deviation of the standardized training data.
std_X_train.mean(axis=0), std_X_train.std(axis=0)

[4.7 3.2] 

 [-1.36797986  0.34131533]


(array([ 1.21331516e-15, -4.41115398e-17]), array([1., 1.]))

### Normalization
Normalize samples individually to unit norm.

In [103]:
from sklearn.preprocessing import Normalizer
# Rescale each sample (row) to unit norm.
scaler = Normalizer()
norm_X_train = scaler.fit_transform(X_train)
norm_X_test = scaler.transform(X_test)
# First two training rows before and after normalization.
print(X_train[:2], '\n'*2, norm_X_train[:2])

[[5.9 3. ]
 [5.8 2.6]] 

 [[0.89138513 0.45324668]
 [0.91250932 0.4090559 ]]


### Binarization

In [138]:
from sklearn.preprocessing import Binarizer
# Map each value to 1 when it exceeds the 5.0 threshold and to 0 otherwise.
scaler = Binarizer(threshold=5.0)
scaler.fit(X_train)
bin_X_train, bin_X_test = map(scaler.transform, (X_train, X_test))
# First two training rows before and after binarization.
print(X_train[:2], '\n'*2, bin_X_train[:2])

[[5.9 3. ]
 [5.8 2.6]] 

 [[1. 0.]
 [1. 0.]]


# Encoding Categorical Features
### Label Encoder
[Label Encoder vs. One Hot Encoder](https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621)

In [170]:
from sklearn.preprocessing import LabelEncoder
# City names to encode; 'Toronto' appears twice and receives the same code both times.
label_y = ['Toronto', 'Ottawa', 'Montreal', 'Vancouver', 'Toronto']
# Learn the label-to-integer mapping, then apply it to the same labels.
enc = LabelEncoder().fit(label_y)
lbl_encoded_y = enc.transform(label_y)
lbl_encoded_y

array([2, 1, 0, 3, 2])

### One Hot Encoder
Matrices that contain mostly zero values are called sparse, distinct from matrices where most of the values are non-zero, called dense.
1. **Compressed Sparse Row** - represented using 3 one-dimensional arrays for non-zero values, extents of the rows and the column indexes
2. **Compressed Sparse Column** - same as CSR except that the column extents are compressed and the row indices are stored for each non-zero value, so columns are read before rows

[Introduction to Sparse Matrices](https://machinelearningmastery.com/sparse-matrices-for-machine-learning/)

In [178]:
from sklearn.preprocessing import OneHotEncoder
# OneHotEncoder takes a 2-D (n_samples, n_features) input, so reshape the
# labels into a single column.
label_col = np.array(label_y).reshape(-1, 1)
# By default fit_transform returns a SciPy sparse matrix.
hot_encoded_y_sparse = OneHotEncoder().fit_transform(label_col)
# Densify with .toarray() rather than the `sparse=False` constructor flag:
# that flag was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed
# `sparse_output`), whereas .toarray() works on every version.
hot_encoded_y_dense = hot_encoded_y_sparse.toarray()
# Both representations hold the same one-hot rows.
print(hot_encoded_y_dense, '\n'*2, hot_encoded_y_sparse.todense())


[[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]] 

 [[0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]


# Imputing Missing Values
The example below replaces missing values with the column mean (`strategy='mean'`). Other strategies: `median`, `most_frequent`, `constant`.

In [181]:
from sklearn.impute import SimpleImputer
import pandas as pd
# Toy frame in which 0 is treated as the missing-value marker.
df = pd.DataFrame({'col1': [1, 2, 0], 'col2': [3, 0, 4]})
# Replace each marked entry with the mean of the remaining values in its column.
imp = SimpleImputer(strategy='mean', missing_values=0)
imp.fit(df).transform(df)

array([[1. , 3. ],
       [2. , 3.5],
       [1.5, 4. ]])

# Generating Polynomial Features
[Why you should generate polynomial features first](https://medium.com/@samchaaa/preprocessing-why-you-should-generate-polynomial-features-first-before-standardizing-892b4326a91d)
#### Polynomial features
Polynomial features are often created when we want to include the notion that there exists a nonlinear relationship between the features and the target. For example, we might suspect that the effect of age on the probability of having a major medical condition is not constant over time but increases as age increases. We can encode that nonconstant effect in a feature, $x$, by generating that feature’s higher-order forms ($x^2$, $x^3$, etc.).
#### Interaction features
Additionally, often we run into situations where the effect of one feature is dependent on another feature. A simple example would be if we were trying to predict whether or not our coffee was sweet and we had two features: 1) whether or not the coffee was stirred and 2) if we added sugar. Individually, each feature does not predict coffee sweetness, but the combination of their effects does. That is, a coffee would only be sweet if the coffee had sugar and was stirred. The effects of each feature on the target (sweetness) are dependent on each other. We can encode that relationship by including an interaction feature that is the product of the individual features.


In [192]:
from sklearn.preprocessing import PolynomialFeatures
# Single feature with four samples.
poly_X = np.array([1, 2, 3, 4])
poly = PolynomialFeatures(degree=3)
# Reshape to one column, then expand each value x into [1, x, x^2, x^3].
poly.fit_transform(poly_X.reshape(-1, 1))

array([[ 1.,  1.,  1.,  1.],
       [ 1.,  2.,  4.,  8.],
       [ 1.,  3.,  9., 27.],
       [ 1.,  4., 16., 64.]])