# Scikit-learn

Scikit-learn contains simple and efficient tools for data mining and data analysis.  It implements a wide variety of machine learning algorithms and processes to conduct advanced analytics.

Library documentation: [http://scikit-learn.org/stable/](http://scikit-learn.org/stable/)

### General

In [1]:
import numpy as np
from sklearn import datasets
from sklearn import svm

In [2]:
# load the bundled digits sample dataset and print its feature matrix
# (print() produces the plain-text, truncated array view shown below)
digits = datasets.load_digits()
print(digits.data)

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]


In [3]:
# view the target variable: the class labels that are paired with the rows of
# digits.data when fitting below
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [4]:
# fit a support vector classifier on every sample except the last one,
# which is kept out of training so it can serve as an unseen example
classifier = svm.SVC(C=100.0, gamma=0.001)
classifier.fit(
    digits.data[:-1],
    digits.target[:-1],
)

SVC(C=100.0, gamma=0.001)

In [5]:
# predict the target of the last example, the one sample held out of training;
# the [-1:] slice (rather than [-1]) keeps the input 2-D, one row per sample,
# which is the layout predict() expects
classifier.predict(digits.data[-1:])

array([8])

In [6]:
# another example with the digits data set: fit a linear-kernel SVC on all but
# the final 100 samples, then score it on those 100 held-out samples
svc = svm.SVC(kernel='linear', C=1)
svc.fit(digits.data[:-100], digits.target[:-100])
svc.score(digits.data[-100:], digits.target[-100:])

0.98

In [7]:
# import the cross-validation helpers: KFold defines the train/test splits and
# cross_val_score fits and scores an estimator once per split
from sklearn.model_selection import KFold, cross_val_score
# disabled alternative splitter (3 shuffled folds with a fixed seed):
#k_fold = KFold(n_splits=3, random_state=1, shuffle=True)

In [9]:
# apply cross-validation to the linear SVC. KFold's first argument is n_splits,
# so passing the sample count creates one single-sample test fold per row
# (equivalent to leave-one-out) and returns one score per sample.
# NOTE(review): this fits the model once per sample; if a small number of folds
# was intended, use something like KFold(n_splits=3, shuffle=True) -- confirm
kfold = KFold(n_splits=len(digits.data))
cross_val_score(svc, digits.data, digits.target, cv=kfold, n_jobs=-1)

array([1., 1., 1., ..., 1., 1., 1.])

In [12]:
# use the grid search module to optimize model parameters: try 10 log-spaced
# gamma values between 1e-6 and 1e-1, fitting on the first 1000 samples only
# NOTE(review): svc was created with kernel='linear', and scikit-learn documents
# gamma as the coefficient for the 'rbf', 'poly' and 'sigmoid' kernels, so this
# grid presumably cannot change the score -- confirm, and consider tuning C
from sklearn.model_selection import GridSearchCV
gammas = np.logspace(-6, -1, num=10)
classifier = GridSearchCV(estimator=svc, param_grid={"gamma": gammas}, n_jobs=-1)
classifier.fit(digits.data[:1000], digits.target[:1000])

GridSearchCV(estimator=SVC(C=1, kernel='linear'), n_jobs=-1,
             param_grid={'gamma': array([1.00000000e-06, 3.59381366e-06, 1.29154967e-05, 4.64158883e-05,
       1.66810054e-04, 5.99484250e-04, 2.15443469e-03, 7.74263683e-03,
       2.78255940e-02, 1.00000000e-01])})

In [13]:
# cross-validated score of the best parameter setting found by the search
classifier.best_score_

0.9460000000000001

In [14]:
# gamma value of the estimator the grid search selected
classifier.best_estimator_.gamma

1e-06

In [15]:
# run against the test set: the samples from index 1000 onward, which were not
# part of the data passed to the grid search fit
classifier.score(digits.data[1000:], digits.target[1000:])

0.9422835633626098

In [17]:
# nested cross-validation example: cross_val_score makes the outer splits, and
# because the estimator is itself a GridSearchCV, each outer training split
# runs its own inner parameter search before being scored
cross_val_score(classifier, digits.data, digits.target)

array([0.96388889, 0.91944444, 0.96657382, 0.9637883 , 0.92479109])

### Other Classifiers

In [18]:
# load the bundled iris sample dataset (used by the classifiers below)
iris = datasets.load_iris()

In [19]:
# k nearest neighbors classifier with default settings, fit on the full
# iris data (no train/test split is made here)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(iris.data, iris.target)

KNeighborsClassifier()

In [20]:
# decision tree classifier with default settings, fit on the full iris data
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [21]:
# linear classifier trained with stochastic gradient descent; the hinge loss
# and L2 penalty are spelled out for clarity, but the SGDClassifier() repr
# below shows they match the defaults
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss="hinge", penalty="l2")
sgd.fit(iris.data, iris.target)

SGDClassifier()

In [22]:
# naive bayes: fit a Gaussian naive Bayes model on iris, predict the same
# samples it was trained on, and count how many labels come out wrong
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(iris.data, iris.target)
y_pred = gnb.predict(iris.data)
print(f"Number of mislabeled points : {(iris.target != y_pred).sum():d}")

Number of mislabeled points : 6


### Regression

In [23]:
# load another sample dataset: the bundled diabetes data, used as the
# regression example below
diabetes = datasets.load_diabetes()

In [26]:
# ordinary least-squares linear regression, fit on the full diabetes data
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(diabetes.data, diabetes.target)

LinearRegression()

In [27]:
# fitted regression coefficients (the output below shows ten values)
print(regr.coef_)

[ -10.01219782 -239.81908937  519.83978679  324.39042769 -792.18416163
  476.74583782  101.04457032  177.06417623  751.27932109   67.62538639]


In [28]:
# mean squared error of the model's predictions on the data it was fit to
np.mean(np.square(regr.predict(diabetes.data) - diabetes.target))

2859.6903987680657

In [29]:
# coefficient of determination (R^2) of the prediction, computed on the same
# data the model was fit to
regr.score(diabetes.data, diabetes.target)

0.5177494254132934

In [30]:
# ridge regression (alpha=0.1); note that this rebinds regr, replacing the
# LinearRegression model fitted earlier
regr = linear_model.Ridge(alpha=.1)
regr.fit(diabetes.data, diabetes.target)

Ridge(alpha=0.1)

In [31]:
# lasso regression with default settings; regr is rebound once more
regr = linear_model.Lasso()
regr.fit(diabetes.data, diabetes.target)

Lasso()

In [32]:
# logistic regression (this is actually a classifier), fit on iris;
# the dataset is reloaded here so the cell does not depend on the earlier load.
# C is the inverse of the regularization strength, so C=1e5 regularizes weakly
iris = datasets.load_iris()
logistic = linear_model.LogisticRegression(C=1e5)
logistic.fit(iris.data, iris.target)

LogisticRegression(C=100000.0)

### Preprocessing

In [33]:
# feature scaling: build a small 3x3 example matrix and standardize it with
# preprocessing.scale
from sklearn import preprocessing
X = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
X_scaled = preprocessing.scale(X)

In [39]:
# save the scaling transform to apply to new data later: fit() stores the
# statistics learned from X inside the scaler object
scaler = preprocessing.StandardScaler().fit(X)
scaler

StandardScaler()

In [40]:
# apply the stored scaling to X
scaler.transform(X)

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [41]:
# range scaling: with default settings each column is rescaled so that its
# minimum maps to 0 and its maximum to 1 (see the output below)
min_max_scaler = preprocessing.MinMaxScaler()
X_minmax = min_max_scaler.fit_transform(X)
X_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [42]:
# instance normalization using L2 norm: each row (sample) is rescaled
# to unit length
X_normalized = preprocessing.normalize(X, norm='l2')
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [43]:
# category encoding: fit a one-hot encoder on three integer-coded categorical
# columns (2, 3 and 4 distinct values), then encode one new row; the result
# has 2 + 3 + 4 = 9 indicator columns. toarray() converts the encoder's sparse
# result into a dense array for display
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])
enc.transform([[0, 1, 3]]).toarray()

array([[1., 0., 0., 1., 0., 0., 0., 0., 1.]])

In [44]:
# binarization (thresholding): with the default settings, values greater
# than 0 become 1 and all other values become 0, as the output below shows
binarizer = preprocessing.Binarizer().fit(X)
binarizer.transform(X)

array([[1., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

### Clustering

In [45]:
# k means clustering of the iris measurements into 3 clusters (labels unused)
# NOTE(review): no random_state is passed, so cluster assignments may differ
# between runs -- confirm whether a fixed seed is wanted
from sklearn import cluster
k_means = cluster.KMeans(n_clusters=3)
k_means.fit(iris.data)

KMeans(n_clusters=3)

### Decomposition

In [46]:
# create a signal with 2 useful dimensions: the third column is the sum of the
# first two, so it adds no independent information
# (unseeded random draws: the values differ on every run)
x1 = np.random.normal(size=100)
x2 = np.random.normal(size=100)
x3 = x1 + x2
X = np.column_stack((x1, x2, x3))

In [47]:
# compute principal component analysis on the 3-column signal, keeping all
# components (no n_components is given)
from sklearn import decomposition
pca = decomposition.PCA()
pca.fit(X)

PCA()

In [48]:
# variance carried by each principal component; the third is essentially zero
# because the third input column is the sum of the other two
pca.explained_variance_

array([2.84048177e+00, 1.00459317e+00, 3.21288955e-32])

In [49]:
# only the 2 first components are useful, so reconfigure the existing PCA
# object to keep two components and refit/project the data in one step
pca.set_params(n_components=2)
X_reduced = pca.fit_transform(X)
X_reduced.shape

(100, 2)

In [50]:
# generate more sample data: two source signals sampled at 2000 points
time = np.linspace(0, 10, 2000)
s1 = np.sin(2 * time)  # signal 1 : sinusoidal signal
s2 = np.sign(np.sin(3 * time))  # signal 2 : square signal
S = np.column_stack((s1, s2))
S = S + 0.2 * np.random.normal(size=S.shape)  # Add noise
S = S / S.std(axis=0)  # standardize data (unit standard deviation per column)

In [51]:
# mix data: every observation row is a linear combination of the two sources
A = np.array([[1, 1], [0.5, 2]])  # mixing matrix
X = S @ A.T  # generate observations

In [52]:
# compute independent component analysis on the mixed observations
ica = decomposition.FastICA()
S_ = ica.fit_transform(X)  # get the estimated sources
A_ = ica.mixing_.T  # transpose of the estimated mixing matrix
# sanity check: estimated sources times estimated mixing, plus the stored
# mean, should reconstruct the observations X
np.allclose(X,  np.dot(S_, A_) + ica.mean_)

True