# Introduction to Scikit-learn

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import sklearn
sklearn.set_config(print_changed_only=True)

In [0]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
blood = fetch_openml('blood-transfusion-service-center')

X_train, X_test, y_train, y_test = train_test_split(
    blood.data, blood.target, random_state=0)

In [30]:
X_train.shape

(561, 4)

In [31]:
import pandas as pd
pd.Series(y_train).value_counts()

1    438
2    123
dtype: int64

In [32]:
pd.Series(y_train).value_counts() / len(y_train)
#problem with using accuracy

1    0.780749
2    0.219251
dtype: float64

Really Simple API
-------------------
0) Import your model class

In [0]:
from sklearn.svm import LinearSVC

1) Instantiate an object and set the parameters

In [0]:
svm = LinearSVC()

2) Fit the model

In [35]:
svm.fit(X_train, y_train)



LinearSVC()

3) Apply / evaluate

In [36]:
print(svm.predict(X_train))
print(y_train)

['1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1'
 '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1' '1

In [37]:
svm.score(X_train, y_train)

0.7807486631016043

In [38]:
svm.score(X_test, y_test)

0.7058823529411765

And again
---------

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
rf = RandomForestClassifier()

In [41]:
rf.fit(X_train, y_train)

RandomForestClassifier()

In [42]:
rf.score(X_train, y_train)

0.9411764705882353

In [43]:
rf.score(X_test, y_test)

0.7112299465240641

# Exercises

## Exercise 1
Load the iris dataset from the ``sklearn.datasets`` module using the ``load_iris`` function.

Split it into training and test set using ``train_test_split``.

## Exercise 2
Then train an evaluate ``sklearn.neighbors.KNeighborsClassifier``, the RandomForestClassifier and  ``sklearn.linear_model.LogisticRegression`` on the iris dataset.
How do these perform on the training set vs the test set? Which one is the best on the training set, which one is the best on the test set?

## Exercise 3 (extra)
Can you construct a binary classification dataset (using np.random for example) on which ``sklearn.linear_model.LogisticRegression`` achieves an accuracy of 1? Can you construct a binary classification dataset on which it achieves accuracy 0.5?

In [0]:
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Exercise 1, loading data
iris = load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y)


In [45]:

# Exercise 2
# Training KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

print("test set score of knn: %f" % knn.score(X_test, y_test))

# Training RandomForest
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf.score(X_train, y_train)
rf.score(X_test, y_test)



test set score of knn: 1.000000


0.9736842105263158

In [46]:
# Exercise 3

# Perfect classification (accuracy=1) on easy dataset
from sklearn.linear_model import LogisticRegression
X = np.random.uniform(size=(1000, 3))
X[::2] += 1000
y = X[:, 0] > 500
X_train, X_test, y_train, y_test = train_test_split(X, y)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("score on trivial data: ", logreg.score(X_test, y_test))

# Random classification (accuracy=.5) on random data
y = np.random.normal(size=1000) > .0
X_train, X_test, y_train, y_test = train_test_split(X, y)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("score on random data: ", logreg.score(X_test, y_test))

score on trivial data:  1.0
score on random data:  0.528
