# Einführung in Machine Learning - Schnelldurchgang

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
%matplotlib inline
%pylab inline

In [None]:
import matplotlib.pylab as plt
import numpy as np

In [None]:
# distutils was removed from the standard library in Python 3.12 (PEP 632).
# Guard the import so that "Restart & Run All" does not abort on newer kernels;
# StrictVersion is only needed for the optional scikit-learn version assertion.
try:
    from distutils.version import StrictVersion
except ImportError:
    StrictVersion = None

In [None]:
import sklearn
print(sklearn.__version__)

# assert StrictVersion(sklearn.__version__ ) >= StrictVersion('0.18.1')

# Zuerst laden wir den Iris Datensatz und verschaffen uns einen ersten Eindruck
https://de.wikipedia.org/wiki/Portal:Statistik/Datensaetze#Iris

In [None]:
from sklearn.datasets import load_iris
iris = load_iris()

In [None]:
print(iris.DESCR)

In [None]:
X = iris.data
y = iris.target

In [None]:
X.shape, y.shape

In [None]:
X[0]

In [None]:
y[0]

In [None]:
X_sepal_length = X[:, 0]
X_sepal_width =  X[:, 1]
X_petal_length = X[:, 2]
X_petal_width = X[:, 3]

In [None]:
X_petal_width.shape

## Aufteilung der Daten in Training (60%) und Test (40%)
http://scikit-learn.org/stable/modules/cross_validation.html

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)

In [None]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Wir trainieren einen einfachen KNN Klassifikator und überprüfen die Ergebnisse
http://scikit-learn.org/stable/modules/neighbors.html#classification

In [None]:
from sklearn import neighbors

In [None]:
clf = neighbors.KNeighborsClassifier(1)

In [None]:
clf.fit(X_train, y_train)

In [None]:
sample_id = 32
sample_feature = X_test[sample_id]
sample_label = y_test[sample_id]

In [None]:
sample_feature

In [None]:
sample_label

In [None]:
clf.predict([sample_feature])

In [None]:
clf.predict([[6.3, 2.7, 5.5, 1.5]]) # slightly different from above, still gives 2

In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

## Um zu verstehen, was durch das Training passiert ist, zeichnen wir die Decision Boundaries ein

In [None]:
# ignore this, it is just technical code
# should come from a lib, consider it to appear magically 
# http://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

# Three saturated colours (red, green, blue); plotPrediction uses this map for the scatter points.
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
# Pastel variants of the same three colours; plotPrediction uses this map for the background mesh.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# Font size applied to the axis labels and the title in plotPrediction.
font_size=25

def meshGrid(x_data, y_data, h=.02):
    """Build a regular 2-D grid that covers both data ranges plus a margin of 1.

    Parameters
    ----------
    x_data, y_data : array-like objects providing ``min()`` and ``max()``
        Values along the horizontal and vertical axis.
    h : float, optional
        Step size of the mesh along both axes (default ``.02``).

    Returns
    -------
    tuple
        ``(xx, yy)`` coordinate matrices as produced by ``np.meshgrid``.
    """
    # Pad each axis by 1 on both sides so the outermost points are not on the border.
    x_min, x_max = x_data.min() - 1, x_data.max() + 1
    y_min, y_max = y_data.min() - 1, y_data.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return (xx, yy)
    
def plotPrediction(clf, x_data, y_data, x_label, y_label, colors, title="", mesh=True):
    """Scatter two features and optionally shade the classifier's predictions behind them.

    Parameters
    ----------
    clf : object with a ``predict`` method
        Called with a two-column array of grid coordinates; only used when
        ``mesh`` is true.
    x_data, y_data : array-like
        Feature values plotted on the horizontal and vertical axis.
    x_label, y_label : str
        Axis labels.
    colors : array-like
        Passed as ``c`` to ``plt.scatter`` and mapped through ``cmap_bold``.
    title : str, optional
        Figure title.
    mesh : bool, optional
        If true, draw the predicted class for every grid point as a
        background colour mesh using ``cmap_light``.
    """
    xx, yy = meshGrid(x_data, y_data)

    # Evaluating the classifier on the full grid is only needed for the
    # background mesh, so skip it entirely when no mesh is requested.
    Z = None
    if mesh:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Bring the flat predictions back into the grid's shape for plotting.
        Z = Z.reshape(xx.shape)

    plt.figure(figsize=(20,10))
    if Z is not None:
        plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.scatter(x_data, y_data, c=colors, cmap=cmap_bold, s=80, marker='o')
    plt.xlabel(x_label, fontsize=font_size)
    plt.ylabel(y_label, fontsize=font_size)
    plt.title(title, fontsize=font_size)

### Zuerst für die Sepal Features

In [None]:
X_train_sepal_only = X_train[:, :2]
X_test_sepal_only = X_test[:, :2]

In [None]:
clf_sepal = neighbors.KNeighborsClassifier(1)
clf_sepal.fit(X_train_sepal_only, y_train)

In [None]:
plotPrediction(clf_sepal, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1], 
               'Sepal length', 'Sepal width', y_train, mesh=False,
                title="Train Data for Sepal Features")
# plt.savefig('ML_0201.png', bbox_inches='tight')

### Scores sind gut für die Trainingsdaten, aber nicht so toll für Testdaten

In [None]:
clf_sepal.score(X_train_sepal_only, y_train)

In [None]:
clf_sepal.score(X_test_sepal_only, y_test)

### Das sieht nach Overfitting aus, das siehst du auch beim Plotting der Decision Boundaries

In [None]:
plotPrediction(clf_sepal, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1], 
               'Sepal length', 'Sepal width', y_train,
               title="Highly Fragmented Decision Boundaries for Train Data")
# plt.savefig('ML_0202.png', bbox_inches='tight')

In [None]:
plotPrediction(clf_sepal, X_test_sepal_only[:, 0], X_test_sepal_only[:, 1],
               'Sepal length', 'Sepal width', y_test,
               title="Same Decision Boundaries don't work well for Test Data")
# plt.savefig('ML_0203.png', bbox_inches='tight')

## Wir machen das Modell weniger komplex, allgemeiner

In [None]:
clf_sepal_10 = neighbors.KNeighborsClassifier(10)
clf_sepal_10.fit(X_train_sepal_only, y_train)

In [None]:
clf_sepal_10.score(X_train_sepal_only, y_train)

In [None]:
clf_sepal_10.score(X_test_sepal_only, y_test)

In [None]:
plotPrediction(clf_sepal_10, X_train_sepal_only[:, 0], X_train_sepal_only[:, 1], 
               'Sepal length', 'Sepal width', y_train,
               title="Model too simple even for Train Data")
# plt.savefig('ML_0204.png', bbox_inches='tight')

## Mit den Sepal Features werden wir immer entweder overfitten oder underfitten
## Wir versuchen es noch einmal mit den Petal Features

In [None]:
X_train_petal_only = X_train[:, 2:]
X_test_petal_only = X_test[:, 2:]

In [None]:
clf_petal_10 = neighbors.KNeighborsClassifier(10)
clf_petal_10.fit(X_train_petal_only, y_train)

In [None]:
clf_petal_10.score(X_train_petal_only, y_train)

In [None]:
clf_petal_10.score(X_test_petal_only, y_test)

In [None]:
plotPrediction(clf_petal_10, X_train_petal_only[:, 0], X_train_petal_only[:, 1], 
               'Petal length', 'Petal width', y_train,
               title="Simple model looks good for Train Data")
# plt.savefig('ML_0205.png', bbox_inches='tight')

In [None]:
plotPrediction(clf_petal_10, X_test_petal_only[:, 0], X_test_petal_only[:, 1], 
               'Petal length', 'Petal width', y_test,
               title="Simple model looks good even for Test Data")
# plt.savefig('ML_0206.png', bbox_inches='tight')

Ein deutlich besseres Ergebnis, obwohl wir wieder nur 2 Features genommen haben. Es kann also entscheidend sein, welche Features man nimmt.