## Why use scikit-learn?

### Classification

#### Import a dataset

In [1]:
from sklearn import svm, datasets, model_selection

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import random

In [3]:
# Load the Iris dataset bundled with scikit-learn; the returned object exposes
# the .data, .target, .feature_names and .target_names attributes used below.
iris = datasets.load_iris()

#### Let's classify types of flowers based on their physical features!

In [4]:
# number of samples (rows) in the dataset
len(iris.data)

150

In [5]:
# names of the measured features (one per column of iris.data)
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:
# names of the flower species we want to predict
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

In [7]:
# distinct integer labels used to encode the classes in iris.target
np.unique(iris.target)

array([0, 1, 2])

In [8]:
# feature values of the first sample, in the same order as iris.feature_names
iris.data[0]

array([ 5.1,  3.5,  1.4,  0.2])

#### Define our feature matrix and target vector

In [9]:
# X: feature matrix (one row per sample); y: integer class label for each row
X, y = iris.data, iris.target

#### Let's split the data into training and test sets

In [15]:
# test_size=0.33 holds out about a third of the 150 samples (50) for testing;
# the remaining 100 (~67%) are used for training.
# A fixed random_state makes the shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=5,
)

#### Define our classifier

In [16]:
# Support vector classifier with a linear kernel; C is the SVM regularization
# parameter (smaller C means stronger regularization).
clf = svm.SVC(kernel='linear', C=1.0)
# fit() is the last expression in the cell, so the fitted estimator's repr is displayed
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# mean accuracy on the training data (optimistic: the model has already seen these samples)
clf.score(X_train, y_train)

0.98999999999999999

In [18]:
# predicted class label for each held-out test sample
clf.predict(X_test)

array([1, 2, 2, 0, 2, 1, 0, 2, 0, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0,
       1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 0, 1, 0, 0, 2, 0, 2, 2, 1, 0, 0, 1, 2,
       1, 2, 2, 0])

In [19]:
# mean accuracy on the held-out test set (the estimate of generalization performance)
clf.score(X_test, y_test)

0.97999999999999998