In [None]:
from sklearn import datasets
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import ensemble
from sklearn import cluster

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

**Regresja**

In [None]:
# Synthetic regression problem with as many features as samples (50 x 50), of
# which only 10 are informative. The seed is fixed so that a fresh
# "Restart & Run All" reproduces the same data set and the same SSE values.
X_all, y_all = datasets.make_regression(n_samples=50, n_features=50, n_informative=10, random_state=0)

In [None]:
# Half of the samples are used for fitting, the other half are held out.
# random_state pins the shuffle so the train/test SSE values are reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X_all, y_all, train_size=0.5, random_state=0
)

In [None]:
# Baseline: ordinary least-squares regression without any penalty term.
model = linear_model.LinearRegression()

In [None]:
# Fit the current model on the training split only.
model.fit(X_train, y_train)

In [None]:
def sse(resid):
    """Return the sum of squared errors (SSE) of a collection of residuals.

    Parameters
    ----------
    resid : array-like
        Residuals (observed minus predicted values). NumPy arrays as well as
        plain Python sequences are accepted.

    Returns
    -------
    NumPy scalar
        Sum of the squares of all entries of ``resid``; 0 for empty input.
    """
    # np.asarray lets lists/tuples through; `resid ** 2` alone fails on them.
    return np.sum(np.square(np.asarray(resid)))

In [None]:
# Training residuals (observed minus predicted) of the current model and their SSE.
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train  # last expression -> shown as the cell output

In [None]:
# Same computation on the held-out split: residuals and their SSE.
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test  # last expression -> shown as the cell output

In [None]:
# Score of the fitted model on the training split (for scikit-learn regressors
# `score` is the coefficient of determination R^2).
model.score(X_train, y_train)

In [None]:
# Score of the fitted model on the held-out split, to compare with the training score.
model.score(X_test, y_test)

$min_\beta ||X\beta-y ||^2_2$

**Regularyzacja**

Ridge $min_\beta\{ ||X\beta-y ||^2_2+\alpha||\beta||_2^2\}$

Lasso $min_\beta\{ ||X\beta-y ||^2_2+\alpha||\beta||_1\}$

LassoCV

ElasticNetCV $min_\beta\{ ||X\beta-y ||^2_2+\alpha\rho||\beta||_1+\alpha(1-\rho)||\beta||_2^2\}$

In [None]:
# Ridge regression with a fixed, hand-picked regularization strength alpha.
model = linear_model.Ridge(alpha=2.5)

In [None]:
# Fit the Ridge model on the training split.
model.fit(X_train, y_train)

In [None]:
# Training residuals and SSE for the model fitted in the previous cell.
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train  # last expression -> shown as the cell output

In [None]:
# Held-out residuals and SSE for the model fitted above.
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test  # last expression -> shown as the cell output

In [None]:
# Lasso regression with a fixed regularization strength alpha.
model = linear_model.Lasso(alpha=1.0)

In [None]:
# Fit the Lasso model on the training split.
model.fit(X_train, y_train)

In [None]:
# Training residuals and SSE for the model fitted in the previous cell.
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train  # last expression -> shown as the cell output

In [None]:
# Held-out residuals and SSE for the model fitted above.
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test  # last expression -> shown as the cell output

In [None]:
# Lasso whose regularization strength is chosen by cross-validation (default settings).
model = linear_model.LassoCV()

In [None]:
# Fit (and cross-validate alpha) on the training split only. Fitting on the
# full data set would leak the held-out samples into the model, so the test
# SSE reported further down would no longer be an out-of-sample estimate.
model.fit(X_train, y_train)

In [None]:
# Regularization strength stored on the fitted estimator (trailing underscore = learned attribute).
model.alpha_

In [None]:
# Training residuals and SSE for the model fitted above.
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train  # last expression -> shown as the cell output

In [None]:
# Held-out residuals and SSE for the model fitted above.
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test  # last expression -> shown as the cell output

In [None]:
# Elastic net (combined L1 and L2 penalty) with cross-validated alpha, default settings.
model = linear_model.ElasticNetCV()

In [None]:
# Fit the elastic-net model on the training split.
model.fit(X_train, y_train)

In [None]:
# Regularization strength stored on the fitted estimator.
model.alpha_

In [None]:
# Fitted mixing parameter between the L1 and L2 penalties. The trailing
# underscore matters: `l1_ratio` is only the constructor argument (the
# candidate value or list), whereas `l1_ratio_` is the value selected by
# cross-validation, the counterpart of `alpha_`.
model.l1_ratio_

In [None]:
# Training residuals and SSE for the model fitted above.
resid_train = y_train - model.predict(X_train)
sse_train = sse(resid_train)
sse_train  # last expression -> shown as the cell output

In [None]:
# Held-out residuals and SSE for the model fitted above.
resid_test = y_test - model.predict(X_test)
sse_test = sse(resid_test)
sse_test  # last expression -> shown as the cell output

**Klasyfikacja**

In [None]:
# Load the iris data set bundled with scikit-learn; used for both the
# classification and the clustering sections below.
iris = datasets.load_iris()

In [None]:
# Names of the target classes.
iris.target_names

In [None]:
# Names of the feature columns of iris.data.
iris.feature_names

In [None]:
# Dimensions of the feature matrix.
iris.data.shape

In [None]:
# Dimensions of the label vector.
iris.target.shape

In [None]:
# 70 % of the iris samples for training, 30 % held out for evaluation.
# random_state pins the shuffle so the reports below are reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    iris.data, iris.target, train_size=0.7, random_state=0
)

In [None]:
# First classifier: logistic regression with default hyper-parameters.
classifier = linear_model.LogisticRegression()

In [None]:
# Train the classifier on the training split.
classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out samples.
y_test_pred = classifier.predict(X_test)

In [None]:
# classification_report returns a pre-formatted multi-line string, hence print()
# rather than a bare expression.
print(metrics.classification_report(y_test, y_test_pred))

In [None]:
# Confusion matrix of true labels (first argument) versus predicted labels.
metrics.confusion_matrix(y_test, y_test_pred)

In [None]:
# Second classifier: decision tree with default hyper-parameters (replaces the
# logistic-regression model bound to `classifier`).
classifier = tree.DecisionTreeClassifier()

In [None]:
# Train the decision tree on the same training split.
classifier.fit(X_train, y_train)

In [None]:
# Predicted class labels for the held-out samples (overwrites the previous predictions).
y_test_pred = classifier.predict(X_test)

In [None]:
# Confusion matrix of true versus predicted labels for the decision tree.
metrics.confusion_matrix(y_test, y_test_pred)

In [None]:
# Grid of 30 evenly spaced training-set fractions between 10 % and 90 %.
train_size_vec = np.linspace(start=0.1, stop=0.9, num=30)

In [None]:
# Classifier *classes* (not instances) to compare; each is instantiated with
# its default arguments inside the evaluation loop.
classifiers = [tree.DecisionTreeClassifier,neighbors.KNeighborsClassifier,svm.SVC,ensemble.RandomForestClassifier]

In [None]:
# Result buffer indexed as [class, training-size index, classifier index];
# the leading dimension of 3 matches the three rows written per experiment.
cm_diags = np.zeros(
    shape=(3, len(train_size_vec), len(classifiers)),
    dtype=np.float64,
)

In [None]:
# For every training-set fraction, re-split the data, train each classifier
# and store the per-class fraction of correctly classified test samples
# (confusion-matrix diagonal divided by the number of test samples per class).
n_classes = cm_diags.shape[0]
class_labels = np.arange(n_classes)
for n, train_size in enumerate(train_size_vec):
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        iris.data, iris.target, train_size=train_size
    )
    # minlength keeps one count per class even when the highest class is
    # missing from a small test split.
    class_counts = np.bincount(y_test, minlength=n_classes)
    for m, Classifier in enumerate(classifiers):
        classifier = Classifier()
        classifier.fit(X_train, y_train)
        y_test_p = classifier.predict(X_test)
        # Explicit labels guarantee an n_classes x n_classes matrix regardless
        # of which classes happen to occur in y_test / y_test_p.
        hits = metrics.confusion_matrix(y_test, y_test_p, labels=class_labels).diagonal()
        # A class absent from the test split yields NaN (a gap in the plot)
        # instead of a division-by-zero warning.
        cm_diags[:, n, m] = np.divide(
            hits,
            class_counts,
            out=np.full(n_classes, np.nan),
            where=class_counts > 0,
        )

In [None]:
# One panel per classifier: per-class fraction of correctly classified test
# samples as a function of the training-set fraction.
fig, axes = plt.subplots(1, len(classifiers), figsize=(12, 3))
for m, Classifier in enumerate(classifiers):
    # Classes are drawn in descending index order, which determines the line
    # colours and the order of the legend entries.
    for k in reversed(range(cm_diags.shape[0])):
        axes[m].plot(train_size_vec, cm_diags[k, :, m], label=iris.target_names[k])
    # The class object already carries its name; no instance needs to be built.
    axes[m].set_title(Classifier.__name__)
    axes[m].set_ylim(0, 1.1)
    axes[m].set_ylabel("classification accuracy")
    axes[m].set_xlabel("training size ratio")
    axes[m].legend(loc=4)

**Klasteryzacja**

In [None]:
# Features to cluster and the true labels, kept only to judge the clustering afterwards.
X, y = iris.data, iris.target

In [None]:
# Number of clusters to look for; later cells that unpack three index sets rely on this being 3.
n_clusters = 3

In [None]:
# K-means starts from random centroids, so both the result and the arbitrary
# numbering of the clusters can change between runs; fixing the seed makes
# the cells below reproducible.
clustering = cluster.KMeans(n_clusters=n_clusters, random_state=0)

In [None]:
# Fit the clustering on the features only; the labels y are not passed in.
clustering.fit(X)

In [None]:
# Cluster index assigned to every sample.
y_pred = clustering.predict(X)

In [None]:
# Every 8th cluster assignment, as a compact preview.
y_pred[::8]

In [None]:
# True labels at the same positions, for comparison with the preview above.
y[::8]

In [None]:
# Positions of the samples assigned to clusters 0, 1 and 2. They are captured
# before any relabelling so the later in-place edits of y_pred cannot interfere.
idx_0, idx_1, idx_2 = [np.where(y_pred == cluster_id) for cluster_id in (0, 1, 2)]

In [None]:
# K-means numbers its clusters arbitrarily, so a hard-coded permutation is
# only right for one particular initialisation. Instead, give each cluster the
# most frequent true class among its members. (If two clusters share the same
# majority class they receive the same label, which then shows up in the
# confusion matrix.)
majority_labels = [np.bincount(y[idx]).argmax() for idx in (idx_0, idx_1, idx_2)]
y_pred[idx_0], y_pred[idx_1], y_pred[idx_2] = majority_labels

In [None]:
# Preview of the relabelled cluster assignments (every 8th sample).
y_pred[::8]

In [None]:
# Confusion matrix of the true classes versus the relabelled cluster assignments.
metrics.confusion_matrix(y, y_pred)

In [None]:
# Scatter-plot matrix over all feature pairs, one marker/colour per cluster,
# with red squares around the samples whose cluster label differs from y.
N = X.shape[1]
fig, axes = plt.subplots(N, N, figsize=(12, 12), sharex=True, sharey=True)
colors = ["coral", "blue", "green"]
markers = ["^", "v", "o"]
misclassified = y != y_pred  # boolean mask, identical for every panel
for m in range(N):  # row index    -> feature on the y axis
    for n in range(N):  # column index -> feature on the x axis
        # The column feature goes on x and the row feature on y so that the
        # data agree with the axis labels set below (x label under column m,
        # y label beside row m).
        for p in range(n_clusters):
            mask = y_pred == p
            axes[m, n].scatter(X[mask, n], X[mask, m], s=30,
                               marker=markers[p], color=colors[p], alpha=0.25)
        axes[m, n].scatter(X[misclassified, n], X[misclassified, m], s=30,
                           marker="s", edgecolor="red", facecolor=(1, 1, 1, 0))
    # Outer labels only, set once per feature: bottom row for x, first column for y.
    axes[N-1, m].set_xlabel(iris.feature_names[m], fontsize=16)
    axes[m, 0].set_ylabel(iris.feature_names[m], fontsize=16)