## Why use scikit-learn?
* It's a Python library with "simple and efficient tools for data mining and data analysis"

In [1]:
from sklearn import svm, datasets, model_selection
import matplotlib.pyplot as plt
import numpy as np

## Some teaser functions in sklearn 

#### Import a dataset

In [2]:
# load the iris flower dataset that ships with scikit-learn; the cells below
# read its .data, .target, .feature_names and .target_names attributes
iris = datasets.load_iris()

### Classification: Let's classify types of flowers based on their physical features!

In [3]:
# number of data points (rows) in our sample
iris.data.shape[0]

150

In [4]:
# what are our features? (names of the measurements recorded for each flower)
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
# what are we classifying? (names of the flower classes we want to predict)
iris.target_names

array(['setosa', 'versicolor', 'virginica'], 
      dtype='<U10')

In [6]:
# distinct label values that appear in iris.target
np.unique(iris.target)

array([0, 1, 2])

In [7]:
# example: the feature values of the first data point
iris.data[0]

array([ 5.1,  3.5,  1.4,  0.2])

#### Define our feature vector and target

In [8]:
# features (sepal length, sepal width, petal length, petal width) go in X;
# the flower type we want to predict goes in y
X, y = iris.data, iris.target

#### Let's split the data into training and test sets

In [9]:
# hold out about a third of the sample (test_size=0.33) for testing and train
# on the remaining two thirds; random_state pins the shuffle so the split is
# the same on every run
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=5,
)

#### Define our classifier

In [10]:
# support vector classification with a linear kernel, fitted to the training
# data in one step (fit() hands back the estimator itself)
clf = svm.SVC(C=1.0, kernel='linear').fit(X_train, y_train)
# show the fitted classifier and its parameters
clf

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

#### Predict on the test data

In [11]:
# labels the classifier predicts for the samples in X_test
# (these are what get compared against the true labels in y_test)
clf.predict(X_test)

array([1, 2, 2, 0, 2, 1, 0, 2, 0, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0,
       1, 1, 2, 1, 1, 1, 2, 0, 1, 1, 0, 1, 0, 0, 2, 0, 2, 2, 1, 0, 0, 1, 2,
       1, 2, 2, 0])

In [12]:
# accuracy on the held-out test set: share of X_test samples whose
# predicted label equals the true label in y_test
clf.score(X_test, y_test)

0.97999999999999998