# An introduction to machine learning with scikit-learn

### Loading an example dataset

In [3]:
# Importing necessary libraries
from sklearn import datasets

In [4]:
# Load the iris and digits example datasets through sklearn.datasets
iris = datasets.load_iris()
digits = datasets.load_digits()

In [7]:
# Display the digits samples first, then their labels, one array per line group
print(digits.data, digits.target, sep="\n")

[[ 0.  0.  5. ...  0.  0.  0.]
 [ 0.  0.  0. ... 10.  0.  0.]
 [ 0.  0.  0. ... 16.  9.  0.]
 ...
 [ 0.  0.  1. ...  6.  0.  0.]
 [ 0.  0.  2. ... 12.  0.  0.]
 [ 0.  0. 10. ... 12.  1.  0.]]
[0 1 2 ... 8 9 8]


In [8]:
# Show the first entry of digits.images (printed below as an 8x8 array)
print(digits.images[0])

[[ 0.  0.  5. 13.  9.  1.  0.  0.]
 [ 0.  0. 13. 15. 10. 15.  5.  0.]
 [ 0.  3. 15.  2.  0. 11.  8.  0.]
 [ 0.  4. 12.  0.  0.  8.  8.  0.]
 [ 0.  5.  8.  0.  0.  9.  8.  0.]
 [ 0.  4. 11.  0.  1. 12.  7.  0.]
 [ 0.  2. 14.  5. 10. 12.  0.  0.]
 [ 0.  0.  6. 13. 10.  0.  0.  0.]]


### Learning and predicting

In [9]:
# Importing necessary libraries
from sklearn import svm

In [10]:
# Build a support vector classifier with hand-picked hyperparameters
clf = svm.SVC(
    C=100,
    gamma=0.001,
)

In [11]:
# Fit the classifier on the digits data, leaving the last sample out of training
clf.fit(digits.data[:-1], digits.target[:-1]) # All but the last entry

In [12]:
# Predict the label of the last sample only (the slice keeps the input 2-D)
clf.predict(digits.data[-1:])

array([8])

### Conventions

#### Type casting

In [13]:
# Importing necessary libraries
import numpy as np
from sklearn import kernel_approximation

In [14]:
# Draw a seeded 10 x 2000 random array and cast it to float32 in one step
rng = np.random.RandomState(0)
X = rng.rand(10, 2000).astype('float32')
X.dtype

dtype('float32')

In [16]:
# Transform the float32 data with an RBF sampler and inspect the output dtype.
# The sampler draws random weights, so it is seeded to make X_new
# reproducible from one run to the next.
# NOTE(review): the resulting dtype may depend on the installed scikit-learn
# version -- confirm against the output recorded below.
transformer = kernel_approximation.RBFSampler(random_state=0)
X_new = transformer.fit_transform(X)
X_new.dtype

dtype('float64')

In [17]:
# Importing necessary libraries
from sklearn import datasets
from sklearn.svm import SVC

In [18]:
# Load the iris dataset (rebinds the `iris` name)
iris = datasets.load_iris()

In [19]:
# Create an SVC with its default parameters (rebinds the `clf` name)
clf = SVC()

In [20]:
# Fit the classifier on the iris data with the integer class labels as targets
clf.fit(iris.data, iris.target) # Uses integer array

In [21]:
# Predict the first three samples; the labels come back as integers,
# matching the targets used for the fit
list(clf.predict(iris.data[:3]))

[0, 0, 0]

In [22]:
# Refit the same classifier, this time with string targets obtained by
# indexing the class names with the integer labels
clf.fit(iris.data, iris.target_names[iris.target]) # Uses string array

In [23]:
# Predict the first three samples again; after the refit on string targets
# the predictions are strings as well
list(clf.predict(iris.data[:3]))

['setosa', 'setosa', 'setosa']

#### Refitting and updating parameters

In [25]:
# Importing necessary libraries
import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

In [26]:
# Load the iris data directly as a (features, labels) pair
X, y = load_iris(return_X_y=True)

In [27]:
# Create an SVC with default parameters; its kernel is changed in later cells
clf = SVC()

In [29]:
# Switch the existing estimator to a linear kernel, then train it on the data
clf.set_params(kernel='linear')
clf.fit(X, y)

# Predict the first five samples
clf.predict(X[:5])

array([0, 0, 0, 0, 0])

In [31]:
# Switch the same estimator to an RBF kernel and train again;
# this second fit overwrites what the previous fit learned
clf.set_params(kernel='rbf')
clf.fit(X, y)

# Predict the first five samples
clf.predict(X[:5])

array([0, 0, 0, 0, 0])

#### Multiclass vs. multilabel fitting

In [33]:
# Importing necessary libraries
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

In [34]:
# Toy problem: five samples with two features each, and one integer
# class label (0, 1 or 2) per sample
X = [
    [1, 2],
    [2, 4],
    [4, 5],
    [3, 2],
    [3, 1],
]
y = [0, 0, 1, 1, 2]

In [35]:
# Wrap a seeded SVC in a one-vs-rest classifier
classif = OneVsRestClassifier(estimator=SVC(random_state=0))

# Train on the 1d list of class labels, then predict the training samples
classif.fit(X, y)
classif.predict(X)

array([0, 0, 1, 1, 2])

In [37]:
# Re-encode the class labels as a 2d array of binary label indicators.
# This rebinds y, so running the cell a second time would feed the
# already-binarized array back into the binarizer.
y = LabelBinarizer().fit_transform(y)

# Refit on the indicator array; predictions now come back as indicator rows
classif.fit(X, y)
classif.predict(X)

array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [38]:
# Importing necessary libraries
from sklearn.preprocessing import MultiLabelBinarizer

In [39]:
# One list of labels per sample; a sample may carry several labels at once
y = [
    [0, 1],
    [0, 2],
    [1, 3],
    [0, 2, 3],
    [2, 4],
]

In [44]:
# Convert the per-sample label lists into a binary indicator array.
# This rebinds y, so the cell is not idempotent: re-running it without first
# re-running the cell that defines the label lists would binarize the
# already-binarized rows.
y = MultiLabelBinarizer().fit_transform(y)

In [45]:
# Refit the one-vs-rest classifier on the multilabel targets, then predict
# the training samples; each output row can have several labels set
classif.fit(X, y)
classif.predict(X)

array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])