### Импорт данных и установка random_state

In [8]:
from sklearn import datasets

RS=44

iris = datasets.load_iris()
X, y = iris.data[:, :2], iris.target

### Разделение данных на train/test

#### Случайное разбиение в нужной пропорции

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RS)

### Шкалирование: стандартизация, нормализация, бинаризация

#### Standartization (вычесть среднее, поделить на дисперсию)

In [10]:
from sklearn.preprocessing import StandardScaler
# обучаем только на train
scaler = preprocessing.StandardScaler().fit(X_train)
# далее преобразуем и train и test обученным на train оценщиком
X_train_standarted = scaler.transform(X_train)
X_test_standarted = scaler.transform(X_test)

#### Normalization (вычесть минимум, поделить на разницу между максимумом и минимумом)

In [13]:
from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

#### Binarization (по порогу)

In [14]:
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X) # fit does nothing.
X_train_binarizeded = binarizer.transform(X_train)
X_test_binarizeded = binarizer.transform(X_test)

#### Еще методы предобработки:

from sklearn.preprocessing import LabelEncoder, Imputer, PolynomialFeatures

### Модели с учителем

In [26]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

from sklearn.svm import SVC # Support Vector Machines (SVM)
svc = SVC(kernel='linear')
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

from sklearn.naive_bayes import GaussianNB # Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

from sklearn import neighbors # k near neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict_proba(X_test) # вероятности принадлежности к классу (кластеру)
y_pred = knn.predict(X_test)

### Модели без учителя

In [19]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
pca_model = pca.fit_transform(X_train)

from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=RS)
k_means.fit(X_train)
y_pred = k_means.predict(X_test)

### Оценка качества (производительности) работы алгоритмов

In [28]:
knn.score(X_test, y_test) # mean accuracy on the given test data and labels

0.7631578947368421

#### Accuracy

In [29]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7631578947368421

#### Три в одном отчете: precision, recall, f1-score

In [31]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       0.60      0.75      0.67        12
           2       0.73      0.62      0.67        13

    accuracy                           0.76        38
   macro avg       0.78      0.76      0.76        38
weighted avg       0.78      0.76      0.77        38



In [32]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[12  1  0]
 [ 0  9  3]
 [ 0  5  8]]


### Метрики качества для регрессий

#### MAE (Mean Absolute Error)

In [42]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_pred)

0.23684210526315788

#### MSE (Mean Squared Error)

In [43]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

0.23684210526315788

#### R^2 Score

\begin{equation*}
R^2 = 1 - \frac{\sum_{k=1}^m (\hat{y_{i}}-y_{i})^2}{\sum_{k=1}^m (\bar{y}-y_{i})^2}
\end{equation*}

In [44]:
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

0.6538461538461539

### Метрики качества для алгоритмов кластеризации

#### ARI (Adjusted Rand Index)

In [46]:
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_test, y_pred)

0.46931034482758616

#### Homogeneity score

Показатель однородности маркировки кластера с учетом фактов.
Результат кластеризации считается однородным, если все его кластеры содержат только точки данных, которые являются членами одного класса.

In [49]:
from sklearn.metrics import homogeneity_score
homogeneity_score(y_test, y_pred)

0.5387417757632237

#### V-measure

Соотношение отднородности и полноты. Подробнее: http://espressocode.top/ml-v-measure-for-evaluating-clustering-performance/

In [52]:
from sklearn.metrics import v_measure_score
v_measure_score(y_test, y_pred)

0.5407437229481321

### Кросс-валидация (K-fold) и комбинаторный перебор параметров (Grid search)

#### K-Fold (без соблюдения пропорций классов)

In [54]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4, scoring='accuracy'))

[0.78571429 0.85714286 0.67857143 0.92857143]


#### K-Fold (с соблюдением пропорций классов)

In [59]:
from sklearn.model_selection import StratifiedKFold
skfolds = StratifiedKFold(n_splits=4, random_state=RS, shuffle=True)

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
cross_val_score_strat_list = []

for train_index, test_index in skfolds.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cross_val_score_strat_list.append(round(acc, 8))

print(cross_val_score_strat_list)

[0.73684211, 0.73684211, 0.7027027, 0.81081081]


#### Grid Search

In [38]:
from sklearn.model_selection import GridSearchCV
import numpy as np
params = {"n_neighbors": np.arange(1,3), "metric": ["euclidean", "cityblock"]}
grid = GridSearchCV(estimator=knn,param_grid=params)
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_estimator_.n_neighbors)

0.7316205533596838
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
1


#### Randomized search

В зависимости от дозволенного количества итераций алгоритм опробует случайно выбранные параметры (а не заданные, как в Grid search).

In [41]:
from sklearn.model_selection import RandomizedSearchCV
params = {"n_neighbors": range(1,5), "weights": ["uniform", "distance"]}
rsearch = RandomizedSearchCV(estimator=knn, param_distributions=params, cv=4,n_iter=8,random_state=5)
rsearch.fit(X_train, y_train)
print(rsearch.best_score_)
print(rsearch.best_estimator_)
print(rsearch.best_estimator_.n_neighbors)

0.7678571428571428
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
3
