<a href="https://colab.research.google.com/github/SoIllEconomist/ds4b/blob/master/python_ds4b/07_machine_learning/scikit_learn_overview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scikit-learn
Scikit-learn is an open source Python library that implements a range of machine learning, preprocessing, cross-validation and visualization algorithms using an unified interface.

## Loading Data
Your data needs to be numeric and stored as NumPy arrays or SciPy sparse
matrices. Other types that are convertible to numeric arrays, such as Pandas
DataFrame, are also acceptable.


In [0]:
import numpy as np
X = np.random.random((11,5))
y = np.array(['M','M','F','F','M','F','M','M','F','F','F'])
X[X < 0.7] = 0

## Train-Test-Split

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

## Preprocessing Data
### Standardization

In [0]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

### Normalization

In [0]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

Binarization

In [0]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)
binary_X = binarizer.transform(X)

### Encoding Categorical Features

In [0]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(y)

### Imputing Missing Values

In [22]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)

array([[0.86497596, 0.74907852, 0.86589987, 0.87571712, 0.78348697],
       [0.86497596, 0.74354756, 0.86589987, 0.88807105, 0.75515521],
       [0.95158784, 0.75460947, 0.86589987, 0.88807105, 0.96974819],
       [0.86497596, 0.74907852, 0.86589987, 0.88807105, 0.88198792],
       [0.77836408, 0.74907852, 0.86589987, 0.88807105, 0.88198792],
       [0.86497596, 0.74907852, 0.85387203, 0.88807105, 0.94367797],
       [0.86497596, 0.74907852, 0.86589987, 0.88807105, 0.93636999],
       [0.86497596, 0.74907852, 0.8779277 , 0.90042499, 0.90348922]])

### Generating Polynomial Features

In [23]:
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
poly.fit_transform(X) 

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.95158784, 0.75460947, ..., 0.        , 0.        ,
        0.85761999],
       [1.        , 0.        , 0.74354756, ..., 0.        , 0.        ,
        0.24557328],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.90880191, ..., 0.        , 0.        ,
        0.31298558],
       [1.        , 0.        , 0.        , ..., 0.36882726, 0.32998253,
        0.29522892]])

## Model Creation
### Supervised Learning Estimators
#### Linear Regression

In [0]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)

#### Support Vector Machines (SVM)

In [0]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')

#### Naive Bayes

In [0]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

#### KNN

In [0]:
from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

### Unsupervised Learning Estimators
#### Principal Component Analysis (PCA)

In [0]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)

#### K Means

In [0]:
from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)

## Model Fitting
### Supervised Learning
Fit the model to the data

In [30]:
lr.fit(X, y)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

### Unsupervised Learning
Fit the model to the data

In [31]:
k_means.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

Fit to data, then transform it

In [0]:
pca_model = pca.fit_transform(X_train)

## Prediction
### Supervised Estimators
Predict Labels

In [0]:
y_pred = svc.predict(np.random.random((2,5)))
y_pred = lr.predict(X_test)

Estimate probability of a label

In [0]:
y_pred = knn.predict_proba(X_test)

### Unsupervised Estimators
Predict labels in clustering algorithms

In [0]:
y_pred = k_means.predict(X_test)

## Evaluate Model Performance
### Classification Metrics
#### Accuracy Score
Estimator score method

In [60]:
knn.score(X_test, y_test)

  score = y_true == y_pred


0.0

Metric scoring functions

In [61]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.3333333333333333

#### Classification
Precision, recall, f1-score
and support

In [62]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0

    accuracy                           0.33         3
   macro avg       0.33      0.17      0.22         3
weighted avg       0.67      0.33      0.44         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Confusion Matrix

In [63]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[1 0 1]
 [0 0 1]
 [0 0 0]]


### Regression Metrics
#### Mean Absolute Error

In [41]:
from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mean_absolute_error(y_true, y_pred)

1.8333333333333333

#### Mean Squared Error

In [64]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

1.6666666666666667

#### $R^2$ Score

In [43]:
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

-0.7307692307692308

### Cluster Metrics
#### Adjusted Rand Index

In [44]:
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred) 

0.0

#### Homogeneity

In [45]:
from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred) 

0.5793801642856952

#### V-measure

In [51]:
from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred) 

0.7336804366512111

### Cross-Validation

In [50]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))

[0.5 0.5 0.5 0.5]
[-1.02289061 -0.54391998]


## Model Tuning
### Grid Search

In [76]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(iris.data, iris.target)
GridSearchCV(estimator=SVC(),
             param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})
sorted(clf.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_kernel',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

### Randomized Parameter Optimization

In [78]:
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn,
                             param_distributions=params,
                             cv=4,
                             n_iter=8,
                             random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)

0.5
