In [1]:
! pip install mne



In [2]:
import mne
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pywt
import matplotlib.pyplot as plt
import math as m

# 1. извлечь и преобразовать данные

## Загрузка данных экспериментов

Собираем датасет из участников

In [3]:
# кол-во участников
n = 5 + 1
# кол-во сеансов
s = 1

## Задача MATB-II. 
Для анализа субъективных оценок RSME был выполнен анализ дисперсии многократных измерений 3 × 3 (сессия: 1, 2, 3; и сложность: легко, средне, сложно). Как и в области субъективных баллов, RMANOVAS выполнялся по выполнению задач отслеживания и мониторинга системы. Это было сделано, так как это были единственные две задачи, оцененные при всех условиях. Для задачи отслеживания в качестве меры производительности использовалось среднее абсолютное расстояние от центра квадрата (среднее расстояние в квадрате корня). Для задачи мониторинга системы использовалось среднее время реакции на сигналы тревоги. Для всех анализов коррекция Greenhouse-Geisser применялась, когда предполагалось предположение о сферичности. Сердечные данные были проанализированы с помощью GLM (Фиксированные факторы: Состояние сеанса x, TOT как ковариат) на зависимых переменных HRV (SDNN или RMSS) и HR (BPM). Для анализа ЭЭГ данные были проанализированы в соответствии со сложностью задачи (Легко; Среднее; Сложно), областью коры (фронтальная и задняя) и тета- и альфа-полосами. GLM (Фиксированные факторы: сессия x Сложность) выполнялась на каждой окупаемости инвестиций (фронтальная, центральная и апостериорная).
    
НАСА разработало задание MATB-II1 для оценки возможности переключения задач и умственной рабочей нагрузки (Santiago-Espada et al., 2011). Здесь участникам представлено до 6 различных задач, которые они должны выполнить одновременно. Это обеспечивает высокореалистичную среду операционных систем, которые исследователь может контролировать для создания различной степени сложности. Использовалась адаптированная версия 2, закодированная в Matlab, но обеспечивающая те же показатели, что и оригинальная задача MATB-II (Verdière et al., 2020). Для получения полного описания первоначальной задачи и подзадач обратитесь к Santiago-Espada и др. (2011). Для этого исследования были использованы комбинации из четырех доступных подзадач MATB. В задаче отслеживания (TRACK) участникам представляется движущаяся цель внутри окна. Цель состоит в том, чтобы, используя джойстик, сохранить цель в окне. Степень сложности может быть изменена путем изменения степени и скорости, с которой движется цель. Для задачи мониторинга системы (SYSMON) участники должны отслеживать датчики и предупреждающие огни. Действие требуется при отсутствии зеленого света, наличии красного света и отклонениях четырех движущихся циферблатов указателей от средней точки. Задача мониторинга системы управляется вводом в конкретные команды клавиатуры. Степень сложности может быть адаптирована путем увеличения количества событий, на которые участник должен реагировать. В задаче связи (COMM) участники должны прослушать радиосообщения и определить, важны ли они для оператора (вызов его/ее позывного) или нет (вызов другого позывного). Если сообщение актуально, оператор должен изменить частоту радиоканала на частоту, указанную в сообщении. Последняя используемая задача - это задача управления ресурсами (RESMAN). Участникам представлен интерфейс, показывающий два основных резервуара и четыре вспомогательных резервуара, соединенные между собой через восемь насосов с различными расходами топлива. Цель состоит в том, чтобы поддерживать определенный уровень жидкости в обоих основных резервуарах. Участники могут сделать это, активировав или отключив насосы. Чтобы увеличить сложность задачи, могут быть введены такие события, как сбои насоса. В текущем исследовании участники выполнили три независимых 5-минутных пробега с тремя степенями сложности (т.е. см. сценарии 2, 3 и 4 в Кабоне и др., 2006). Для легкого состояния участники занимаются только мониторингом системы и задачей отслеживания. Для среднего состояния участники участвуют в обеих задачах, а также в задаче по управлению топливом. Для сложного состояния добавляется коммуникационная задача, а также усложняется задача отслеживания. Перед началом каждого забега участники также получили краткую инструкцию.

### Задача NASA level - beasy (MATBeasy)

In [4]:
files_MATBeasy_sn = []
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_MATBeasy_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/MATBeasy.set')
        else:
            files_MATBeasy_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/MATBeasy.set')
len(files_MATBeasy_sn)

5

### Задача NASA level - medium (MATBmed)

In [5]:
files_MATBmed_sn = []
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_MATBmed_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/MATBmed.set')
        else:
            files_MATBmed_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/MATBmed.set')
len(files_MATBmed_sn)

5

### Задача NASA level - difficult (MATBdiff)

In [6]:
files_MATBdiff_sn = []
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_MATBdiff_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/MATBdiff.set')
        else:
            files_MATBdiff_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/MATBdiff.set')
len(files_MATBdiff_sn)

5

## Данные в состоянии покая (до/после)

### Данные в состоянии покоя перед экспериментом

In [7]:
files_RS_Beg_EC_sn = []
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_RS_Beg_EC_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/RS_Beg_EC.set')
        else:
            files_RS_Beg_EC_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/RS_Beg_EC.set')
len(files_RS_Beg_EC_sn)

5

In [8]:
files_RS_Beg_EO_sn = [] 
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_RS_Beg_EO_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/RS_Beg_EO.set')
        else:
            files_RS_Beg_EO_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/RS_Beg_EO.set')
len(files_RS_Beg_EO_sn)

5

### Данные в состоянии покоя после эксперимента

In [9]:
files_RS_End_EO_sn = []
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_RS_End_EO_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/RS_End_EO.set')
        else:
            files_RS_End_EO_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/RS_End_EO.set')
len(files_RS_End_EO_sn)

5

In [10]:
files_RS_End_EC_sn = []
for j in range(s):
    for i in range(1, n):
        if len(str(i)) == 1:
            files_RS_End_EC_sn.append(f'data/sub-0{i}/ses-S{j+1}/eeg/RS_End_EC.set')
        else:
            files_RS_End_EC_sn.append(f'data/sub-{i}/ses-S{j+1}/eeg/RS_End_EC.set')
len(files_RS_End_EC_sn)

5

### Склейка данных

In [11]:
cognitive_load_1 = files_MATBdiff_sn #files_MATBeasy_sn #+ files_MATBmed_sn + files_MATBdiff_sn
cognitive_load_0 = files_RS_Beg_EC_sn + files_RS_Beg_EO_sn + files_RS_End_EO_sn + files_RS_End_EC_sn
print(len(cognitive_load_1), len(cognitive_load_0))

5 20


In [12]:
cognitive_MATBdiff_load_1 = files_MATBdiff_sn #files_MATBeasy_sn #+ files_MATBmed_sn + files_MATBdiff_sn
cognitive_MATBmed_load_1 = files_MATBmed_sn #files_MATBeasy_sn #+ files_MATBmed_sn + files_MATBdiff_sn
cognitive_MATBeasy_load_1 = files_MATBeasy_sn #files_MATBeasy_sn #+ files_MATBmed_sn + files_MATBdiff_sn

cognitive_RS_Beg_EC_load_0 = files_RS_Beg_EC_sn
cognitive_RS_Beg_EO_load_0 = files_RS_Beg_EO_sn
cognitive_RS_End_EO_load_0 = files_RS_End_EO_sn
cognitive_RS_End_EC_load_0 = files_RS_End_EC_sn


In [13]:
len(cognitive_MATBdiff_load_1), len(cognitive_MATBmed_load_1), len(cognitive_MATBeasy_load_1)

(5, 5, 5)

In [14]:
len(cognitive_RS_Beg_EC_load_0), len(cognitive_RS_Beg_EO_load_0), len(cognitive_RS_End_EC_load_0), len(cognitive_RS_End_EO_load_0)

(5, 5, 5, 5)

In [15]:
chanels =  [
    'Fp1', 'Fz', 'F3', 'F7', 'FT9', 'FC5', 'FC1', 'C3',
     'T7', 'CP5', 'CP1', 'Pz', 'P3', 'P7', 'O1', 'Oz', 'O2',
     'P4', 'P8', 'TP10', 'CP6', 'CP2', 'FCz', 'C4', 'T8', 'FT10',
     'FC6', 'FC2', 'F4', 'F8', 'Fp2', 'AF7', 'AF3', 'AFz', 'F1', 
     'F5', 'FT7', 'FC3', 'C1', 'C5', 'TP7', 'CP3', 'P1', 'P5',
     'PO7', 'PO3', 'POz', 'PO4', 'PO8', 'P6', 'P2', 'CPz', 'CP4',
     'TP8', 'C6', 'C2', 'FC4', 'FT8', 'F6', 'AF8', 'AF4', 'F2'
]

In [16]:
chanels_dict = dict()

for i in range(len(chanels)):
    chanels_dict[chanels[i]] = i

In [17]:
data = mne.io.read_raw_eeglab(cognitive_load_1[0], preload=True)


Reading /Users/aleksandrdedkov/Documents/Диссертация/data/sub-01/ses-S1/eeg/MATBdiff.fdt
Reading 0 ... 149741  =      0.000 ...   299.482 secs...


  data = mne.io.read_raw_eeglab(cognitive_load_1[0], preload=True)
  data = mne.io.read_raw_eeglab(cognitive_load_1[0], preload=True)
['ECG1']
Consider setting the channel types to be of EEG/sEEG/ECoG/DBS/fNIRS using inst.set_channel_types before calling inst.set_montage, or omit these channels when creating your montage.
  data = mne.io.read_raw_eeglab(cognitive_load_1[0], preload=True)


In [18]:
def read_data(file):
    data = mne.io.read_raw_eeglab(file, preload=True)
    data.set_eeg_reference()
    data.filter(l_freq=0.5, h_freq=45)
    epochs = mne.make_fixed_length_epochs(data, duration=5, overlap=1)
    array = epochs.get_data(picks=chanels)
    return array

In [19]:
%%capture
m1 = [len(read_data(i)) for i in tqdm(cognitive_load_1)]
m2 = [len(read_data(i)) for i in tqdm(cognitive_load_0)]

In [20]:
print(m1, m2)

[74, 74, 74, 74, 74] [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14]


In [21]:
%%capture
cognitive_MATBdiff_ep_ar = [read_data(i) for i in tqdm(cognitive_MATBdiff_load_1)]
cognitive_MATBmed_ep_ar = [read_data(i) for i in tqdm(cognitive_MATBmed_load_1)]
cognitive_MATBeasy_ep_ar = [read_data(i) for i in tqdm(cognitive_MATBeasy_load_1)]

cognitive_RS_Beg_EC_ep_ar = [read_data(i) for i in tqdm(cognitive_RS_Beg_EC_load_0)]
cognitive_RS_Beg_EO_ep_ar = [read_data(i) for i in tqdm(cognitive_RS_Beg_EO_load_0)]
cognitive_RS_End_EO_ar_ar = [read_data(i) for i in tqdm(cognitive_RS_End_EO_load_0)]
cognitive_RS_End_EC_ep_ar = [read_data(i) for i in tqdm(cognitive_RS_End_EC_load_0)]

In [22]:
print(cognitive_MATBdiff_ep_ar[0].shape, cognitive_RS_Beg_EC_ep_ar[0].shape)

(74, 62, 2500) (14, 62, 2500)


In [23]:
len(cognitive_MATBdiff_ep_ar)

5

In [24]:
ep_label_6 = [len(i) * [6] for i in cognitive_MATBdiff_ep_ar]
ep_label_5 = [len(i) * [5] for i in cognitive_MATBmed_ep_ar]
ep_label_4 = [len(i) * [4] for i in cognitive_MATBeasy_ep_ar]

ep_label_3 = [len(i) * [3] for i in cognitive_RS_End_EC_ep_ar]
ep_label_2 = [len(i) * [2] for i in cognitive_RS_End_EO_ar_ar]
ep_label_1 = [len(i) * [1] for i in cognitive_RS_Beg_EO_ep_ar]
ep_label_0 = [len(i) * [0] for i in cognitive_RS_Beg_EC_ep_ar]
print(len(ep_label_6), len(ep_label_5),len(ep_label_4), len(ep_label_3), len(ep_label_2), len(ep_label_1), len(ep_label_0))

5 5 5 5 5 5 5


In [25]:
print(2500 * 74 * 3, 2500 * 14 * 4)
print(2500 * 74 * 3 + 2500 * 14 * 4)

555000 140000
695000


In [26]:
df_X = cognitive_MATBdiff_ep_ar + cognitive_MATBmed_ep_ar + cognitive_MATBeasy_ep_ar + cognitive_RS_Beg_EC_ep_ar + cognitive_RS_Beg_EO_ep_ar + cognitive_RS_End_EO_ar_ar + cognitive_RS_End_EC_ep_ar
len(df_X)

35

In [27]:
df_Y = ep_label_0 + ep_label_1 + ep_label_2 + ep_label_3 + ep_label_4 + ep_label_5 + ep_label_6
len(df_Y)

35

In [28]:
df_group = [[i] * len(j) for i, j in enumerate(df_X)]
len(df_group)

35

In [29]:
len(df_X[0][0])

62

### 2 Создание новых признаков

In [30]:
from scipy import stats
def mean(data):
    return np.mean(data,axis=-1)
    
def std(data):
    return np.std(data,axis=-1)

def ptp(data):
    return np.ptp(data,axis=-1)

def var(data):
        return np.var(data,axis=-1)

def minim(data):
      return np.min(data,axis=-1)


def maxim(data):
      return np.max(data,axis=-1)

def argminim(data):
      return np.argmin(data,axis=-1)


def argmaxim(data):
      return np.argmax(data,axis=-1)

def mean_square(data):
      return np.mean(data**2,axis=-1)

def rms(data): #root mean square
      return  np.sqrt(np.mean(data**2,axis=-1))  

def abs_diffs_signal(data):
    return np.sum(np.abs(np.diff(data,axis=-1)),axis=-1)


def skewness(data):
    return stats.skew(data,axis=-1)

def kurtosis(data):
    return stats.kurtosis(data,axis=-1)

def concatenate_features(data):
    return np.concatenate((mean(data),std(data),ptp(data),var(data),minim(data),maxim(data),argminim(data),argmaxim(data),
                          mean_square(data),rms(data),abs_diffs_signal(data),
                          skewness(data),kurtosis(data)),axis=-1)

In [31]:
X_array = np.vstack(df_X)
Y_array = np.hstack(df_Y)
group_array = np.hstack(df_group)
X_array.shape, Y_array.shape, group_array.shape

((1390, 62, 2500), (1390,), (1390,))

In [32]:
from tqdm import tqdm_notebook
features=[]
for X in tqdm_notebook(X_array):
    features.append(concatenate_features(X))
features=np.array(features)
features.shape

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for X in tqdm_notebook(X_array):


  0%|          | 0/1390 [00:00<?, ?it/s]

(1390, 806)

In [33]:
806 / 62

13.0

In [34]:
np.corrcoef(features, rowvar=False)

array([[1.        , 0.51862746, 0.54206889, ..., 0.04420594, 0.04432906,
        0.01503327],
       [0.51862746, 1.        , 0.52863585, ..., 0.04384487, 0.04840572,
        0.02119071],
       [0.54206889, 0.52863585, 1.        , ..., 0.04866915, 0.03617768,
        0.01506785],
       ...,
       [0.04420594, 0.04384487, 0.04866915, ..., 1.        , 0.76780998,
        0.48310992],
       [0.04432906, 0.04840572, 0.03617768, ..., 0.76780998, 1.        ,
        0.66138442],
       [0.01503327, 0.02119071, 0.01506785, ..., 0.48310992, 0.66138442,
        1.        ]])

In [35]:
! pip install imblearn



In [36]:
from imblearn.under_sampling import RandomUnderSampler

# Создание экземпляра RandomUnderSampler
rus = RandomUnderSampler()

# Применение уменьшения выборки к данным
X_resampled, y_resampled = rus.fit_resample(features, Y_array)

## 3. Тест различных комбинаций моделей для выбора наиболее эффективной

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, y_resampled, 
                                                    random_state=0, 
                                                    test_size=0.2,
                                                    stratify=y_resampled)

### Обучение на новых фичах

In [38]:
from sklearn.multiclass import OneVsRestClassifier ### Один против всех
from sklearn.linear_model import SGDClassifier ### 
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsOneClassifier
from sklearn import datasets
from sklearn.multiclass import OutputCodeClassifier
from sklearn.svm import LinearSVC

SGDClassifier(loss='log') ### max(0; 1-M)

pipe_one_vs_all = Pipeline([("std_scaler", StandardScaler()),
                 ("one_vs_all", OneVsRestClassifier(SGDClassifier(random_state=42)))])

pipe_one_vs_all.fit(X_train, Y_train)

Pipeline(steps=[('std_scaler', StandardScaler()),
                ('one_vs_all',
                 OneVsRestClassifier(estimator=SGDClassifier(random_state=42)))])

### one_vs_all

In [39]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']

for i in loss:
    pipe_one_vs_all = Pipeline([("std_scaler", StandardScaler()),
                 ("one_vs_all", OneVsRestClassifier(SGDClassifier(loss=i, random_state=42)))])

    pipe_one_vs_all.fit(X_train, Y_train)
    print('-' * 5, i, '-' * 5)
    print(pipe_one_vs_all.classes_)
    print(classification_report(y_true = Y_test, y_pred = pipe_one_vs_all.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))
    print('-' * 15)

----- hinge -----
[0 1 2 3 4 5 6]
              precision    recall  f1-score   support

           0      0.786     0.786     0.786        14
           1      0.632     0.857     0.727        14
           2      0.800     0.857     0.828        14
           3      0.917     0.786     0.846        14
           4      0.545     0.429     0.480        14
           5      0.500     0.500     0.500        14
           6      0.769     0.714     0.741        14

    accuracy                          0.704        98
   macro avg      0.707     0.704     0.701        98
weighted avg      0.707     0.704     0.701        98

---------------
----- log -----
[0 1 2 3 4 5 6]
              precision    recall  f1-score   support

           0      0.765     0.929     0.839        14
           1      0.706     0.857     0.774        14
           2      0.786     0.786     0.786        14
           3      0.769     0.714     0.741        14
           4      0.455     0.357     0.400       

### all_vs_all

In [40]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']

for i in loss:
    pipe_all_all = Pipeline([("scaler", StandardScaler()),
                         ("all_vs_all", OneVsOneClassifier(SGDClassifier(loss=i, random_state=42)))])

    pipe_all_all.fit(X_train, Y_train)
    print('-' * 5, i, '-' * 5)
    print(pipe_all_all.classes_)
    print(classification_report(y_true = Y_test, y_pred = pipe_all_all.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))
    print('-' * 15)

----- hinge -----
[0 1 2 3 4 5 6]
              precision    recall  f1-score   support

           0      0.714     0.714     0.714        14
           1      0.800     0.857     0.828        14
           2      0.727     0.571     0.640        14
           3      0.857     0.857     0.857        14
           4      0.385     0.357     0.370        14
           5      0.467     0.500     0.483        14
           6      0.750     0.857     0.800        14

    accuracy                          0.673        98
   macro avg      0.671     0.673     0.670        98
weighted avg      0.671     0.673     0.670        98

---------------
----- log -----
[0 1 2 3 4 5 6]
              precision    recall  f1-score   support

           0      0.786     0.786     0.786        14
           1      0.688     0.786     0.733        14
           2      0.692     0.643     0.667        14
           3      0.833     0.714     0.769        14
           4      0.333     0.357     0.345       

### OutputCode

In [41]:
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']

for i in loss:
    pipe_OutputCode = Pipeline([("scaler", StandardScaler()),
                         ("OutputCode", OutputCodeClassifier(SGDClassifier(loss=i, random_state=42)))])

    pipe_OutputCode.fit(X_train, Y_train)
    print('-' * 5, i, '-' * 5)
    print(pipe_OutputCode.classes_)
    print(classification_report(y_true = Y_test, y_pred = pipe_OutputCode.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))
    print('-' * 15)

----- hinge -----
[0 1 2 3 4 5 6]
              precision    recall  f1-score   support

           0      0.636     1.000     0.778        14
           1      0.647     0.786     0.710        14
           2      0.857     0.857     0.857        14
           3      0.714     0.714     0.714        14
           4      1.000     0.429     0.600        14
           5      0.750     0.643     0.692        14
           6      0.846     0.786     0.815        14

    accuracy                          0.745        98
   macro avg      0.779     0.745     0.738        98
weighted avg      0.779     0.745     0.738        98

---------------
----- log -----
[0 1 2 3 4 5 6]
              precision    recall  f1-score   support

           0      0.786     0.786     0.786        14
           1      0.692     0.643     0.667        14
           2      0.750     0.643     0.692        14
           3      0.909     0.714     0.800        14
           4      0.556     0.357     0.435       

## Подбор лучших парамметров модели

### one_vs_all

In [42]:
param_grid = {
    'one_vs_all__estimator__loss': ['hinge', 'log', 'modified_huber',
                                    'squared_hinge', 'perceptron'],
    
    'one_vs_all__estimator__penalty': ['l1', 'l2', 'elasticnet'],
    
    'one_vs_all__estimator__alpha': [0.001, 0.01, 0.1, 0.5, 1, 10]
}

In [43]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [44]:
search_one_all = GridSearchCV(pipe_one_vs_all, param_grid, scoring='accuracy')
search_one_all.fit(X_train, Y_train)



GridSearchCV(estimator=Pipeline(steps=[('std_scaler', StandardScaler()),
                                       ('one_vs_all',
                                        OneVsRestClassifier(estimator=SGDClassifier(loss='perceptron',
                                                                                    random_state=42)))]),
             param_grid={'one_vs_all__estimator__alpha': [0.001, 0.01, 0.1, 0.5,
                                                          1, 10],
                         'one_vs_all__estimator__loss': ['hinge', 'log',
                                                         'modified_huber',
                                                         'squared_hinge',
                                                         'perceptron'],
                         'one_vs_all__estimator__penalty': ['l1', 'l2',
                                                            'elasticnet']},
             scoring='accuracy')

In [45]:
search_one_all.best_params_

{'one_vs_all__estimator__alpha': 0.5,
 'one_vs_all__estimator__loss': 'hinge',
 'one_vs_all__estimator__penalty': 'l2'}

In [46]:
print(classification_report(y_true = Y_train, y_pred = search_one_all.predict(X_train)))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91        56
           1       0.84      0.95      0.89        56
           2       0.80      0.91      0.85        56
           3       0.93      0.93      0.93        56
           4       1.00      0.38      0.55        56
           5       0.84      0.91      0.87        56
           6       0.90      0.96      0.93        56

    accuracy                           0.86       392
   macro avg       0.88      0.86      0.85       392
weighted avg       0.88      0.86      0.85       392



In [47]:
print(classification_report(y_true = Y_test, y_pred = search_one_all.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))

              precision    recall  f1-score   support

           0      0.778     1.000     0.875        14
           1      0.684     0.929     0.788        14
           2      0.800     0.857     0.828        14
           3      0.846     0.786     0.815        14
           4      0.600     0.214     0.316        14
           5      0.500     0.643     0.563        14
           6      1.000     0.714     0.833        14

    accuracy                          0.735        98
   macro avg      0.744     0.735     0.717        98
weighted avg      0.744     0.735     0.717        98



### all_vs_all

In [48]:
param_grid = {
    'all_vs_all__estimator__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'all_vs_all__estimator__penalty': ['l1', 'l2', 'elasticnet'],
    'all_vs_all__estimator__alpha': [0.001, 0.01, 0.1, 0.5, 1, 10]
}

In [49]:
search_all_all = GridSearchCV(pipe_all_all, param_grid, scoring='accuracy')
search_all_all.fit(X_train, Y_train)



GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('all_vs_all',
                                        OneVsOneClassifier(estimator=SGDClassifier(loss='perceptron',
                                                                                   random_state=42)))]),
             param_grid={'all_vs_all__estimator__alpha': [0.001, 0.01, 0.1, 0.5,
                                                          1, 10],
                         'all_vs_all__estimator__loss': ['hinge', 'log',
                                                         'modified_huber',
                                                         'squared_hinge',
                                                         'perceptron'],
                         'all_vs_all__estimator__penalty': ['l1', 'l2',
                                                            'elasticnet']},
             scoring='accuracy')

In [50]:
search_all_all.best_params_

{'all_vs_all__estimator__alpha': 0.5,
 'all_vs_all__estimator__loss': 'log',
 'all_vs_all__estimator__penalty': 'l2'}

In [51]:
print(classification_report(y_true = Y_train, y_pred = search_all_all.predict(X_train)))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97        56
           1       0.93      0.96      0.95        56
           2       0.90      0.93      0.91        56
           3       0.96      0.95      0.95        56
           4       0.96      0.82      0.88        56
           5       0.93      0.95      0.94        56
           6       0.98      0.98      0.98        56

    accuracy                           0.94       392
   macro avg       0.94      0.94      0.94       392
weighted avg       0.94      0.94      0.94       392



In [52]:
print(classification_report(y_true = Y_test, y_pred = search_all_all.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))

              precision    recall  f1-score   support

           0      0.824     1.000     0.903        14
           1      0.800     0.857     0.828        14
           2      0.800     0.857     0.828        14
           3      0.833     0.714     0.769        14
           4      0.615     0.571     0.593        14
           5      0.500     0.500     0.500        14
           6      0.833     0.714     0.769        14

    accuracy                          0.745        98
   macro avg      0.744     0.745     0.741        98
weighted avg      0.744     0.745     0.741        98



In [53]:
param_grid = {
    'OutputCode__estimator__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'OutputCode__estimator__penalty': ['l1', 'l2', 'elasticnet'],
    'OutputCode__estimator__alpha': [0.001, 0.01, 0.1, 0.5, 1, 10]
}

In [54]:
search_all_all = GridSearchCV(pipe_OutputCode, param_grid, scoring='accuracy')
search_all_all.fit(X_train, Y_train)



GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('OutputCode',
                                        OutputCodeClassifier(estimator=SGDClassifier(loss='perceptron',
                                                                                     random_state=42)))]),
             param_grid={'OutputCode__estimator__alpha': [0.001, 0.01, 0.1, 0.5,
                                                          1, 10],
                         'OutputCode__estimator__loss': ['hinge', 'log',
                                                         'modified_huber',
                                                         'squared_hinge',
                                                         'perceptron'],
                         'OutputCode__estimator__penalty': ['l1', 'l2',
                                                            'elasticnet']},
             scoring='accuracy')

In [55]:
search_all_all.best_params_

{'OutputCode__estimator__alpha': 0.1,
 'OutputCode__estimator__loss': 'hinge',
 'OutputCode__estimator__penalty': 'elasticnet'}

In [56]:
print(classification_report(y_true = Y_train, y_pred = search_all_all.predict(X_train)))

              precision    recall  f1-score   support

           0       0.96      0.91      0.94        56
           1       0.92      0.96      0.94        56
           2       0.96      0.86      0.91        56
           3       0.85      1.00      0.92        56
           4       0.84      0.88      0.86        56
           5       0.90      0.84      0.87        56
           6       0.94      0.91      0.93        56

    accuracy                           0.91       392
   macro avg       0.91      0.91      0.91       392
weighted avg       0.91      0.91      0.91       392



In [57]:
print(classification_report(y_true = Y_test, y_pred = search_all_all.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))

              precision    recall  f1-score   support

           0      1.000     0.786     0.880        14
           1      0.867     0.929     0.897        14
           2      0.900     0.643     0.750        14
           3      0.722     0.929     0.813        14
           4      0.545     0.857     0.667        14
           5      0.571     0.571     0.571        14
           6      1.000     0.571     0.727        14

    accuracy                          0.755        98
   macro avg      0.801     0.755     0.758        98
weighted avg      0.801     0.755     0.758        98



## Обучение стекинга

In [58]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier

pipe_dt = Pipeline([('std_scaler', StandardScaler()),
                    ('one_vs_all', OneVsRestClassifier(estimator=SGDClassifier(
                        alpha=0.1,
                        loss='log',
                        penalty='elasticnet'
                        )))])

pipe_lr = Pipeline([('std_scaler', StandardScaler()),
                    ('one_vs_all', OneVsRestClassifier(estimator=SGDClassifier(
                        alpha=0.1,
                        loss='hinge',
                        penalty='elasticnet'
                        )))])

 
estimators = [
    ("dt", pipe_dt),
    ("lr", pipe_lr)
]

final_estimator = DecisionTreeClassifier(max_depth=5)

### Base learners are fitted on the full X
### while the final estimator is trained
### using cross-validated predictions of the base learners
### using cross_val_predict.

stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=2
)

In [59]:
stacking_model.fit(X_train, Y_train)

StackingClassifier(cv=2,
                   estimators=[('dt',
                                Pipeline(steps=[('std_scaler',
                                                 StandardScaler()),
                                                ('one_vs_all',
                                                 OneVsRestClassifier(estimator=SGDClassifier(alpha=0.1,
                                                                                             loss='log',
                                                                                             penalty='elasticnet')))])),
                               ('lr',
                                Pipeline(steps=[('std_scaler',
                                                 StandardScaler()),
                                                ('one_vs_all',
                                                 OneVsRestClassifier(estimator=SGDClassifier(alpha=0.1,
                                                                             

In [60]:
train_preds = stacking_model.predict(X_train)
test_preds = stacking_model.predict(X_test)

train_error = np.mean((train_preds - Y_train)**2)
test_error = np.mean((test_preds - Y_test)**2)


print(f"Качество на трейне: {train_error.round(3)}")
print(f"Качество на тесте: {test_error.round(3)}")

Качество на трейне: 0.781
Качество на тесте: 1.49


In [61]:
print(classification_report(y_true = Y_train, y_pred = stacking_model.predict(X_train)))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92        56
           1       0.87      0.86      0.86        56
           2       0.69      0.89      0.78        56
           3       0.92      0.82      0.87        56
           4       0.00      0.00      0.00        56
           5       0.44      0.93      0.59        56
           6       1.00      0.52      0.68        56

    accuracy                           0.72       392
   macro avg       0.68      0.72      0.67       392
weighted avg       0.68      0.72      0.67       392



In [62]:
print(classification_report(y_true = Y_test, y_pred = stacking_model.predict(X_test),labels=[0, 1, 2, 3, 4, 5, 6], digits=3))

              precision    recall  f1-score   support

           0      0.824     1.000     0.903        14
           1      0.714     0.714     0.714        14
           2      0.625     0.714     0.667        14
           3      0.846     0.786     0.815        14
           4      0.000     0.000     0.000        14
           5      0.387     0.857     0.533        14
           6      1.000     0.429     0.600        14

    accuracy                          0.643        98
   macro avg      0.628     0.643     0.605        98
weighted avg      0.628     0.643     0.605        98



## catboost

In [63]:
!pip install catboost



In [64]:
from catboost import CatBoostClassifier

catboost = CatBoostClassifier(random_state=42)

catboost.fit(X_train, Y_train)

Learning rate set to 0.075436
0:	learn: 1.8817063	total: 594ms	remaining: 9m 53s
1:	learn: 1.8061744	total: 901ms	remaining: 7m 29s
2:	learn: 1.7287730	total: 1.2s	remaining: 6m 39s
3:	learn: 1.6740210	total: 1.37s	remaining: 5m 41s
4:	learn: 1.6246169	total: 1.57s	remaining: 5m 11s
5:	learn: 1.5816309	total: 1.74s	remaining: 4m 48s
6:	learn: 1.5355407	total: 1.91s	remaining: 4m 30s
7:	learn: 1.4986724	total: 2.08s	remaining: 4m 18s
8:	learn: 1.4713266	total: 2.34s	remaining: 4m 17s
9:	learn: 1.4314544	total: 2.53s	remaining: 4m 10s
10:	learn: 1.3901059	total: 2.74s	remaining: 4m 6s
11:	learn: 1.3556976	total: 2.92s	remaining: 4m
12:	learn: 1.3333898	total: 3.12s	remaining: 3m 56s
13:	learn: 1.3100160	total: 3.33s	remaining: 3m 54s
14:	learn: 1.2843080	total: 3.63s	remaining: 3m 58s
15:	learn: 1.2631236	total: 3.85s	remaining: 3m 56s
16:	learn: 1.2385926	total: 4.07s	remaining: 3m 55s
17:	learn: 1.2190160	total: 4.24s	remaining: 3m 51s
18:	learn: 1.1913501	total: 4.4s	remaining: 3m 47s

<catboost.core.CatBoostClassifier at 0x7fa9893ce8b0>

In [65]:
print(classification_report(y_true = Y_train, y_pred = catboost.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        56
           2       1.00      1.00      1.00        56
           3       1.00      1.00      1.00        56
           4       1.00      1.00      1.00        56
           5       1.00      1.00      1.00        56
           6       1.00      1.00      1.00        56

    accuracy                           1.00       392
   macro avg       1.00      1.00      1.00       392
weighted avg       1.00      1.00      1.00       392



In [66]:
print(classification_report(y_true = Y_test, y_pred = catboost.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       0.81      0.93      0.87        14
           2       0.92      0.86      0.89        14
           3       0.87      0.93      0.90        14
           4       0.82      0.64      0.72        14
           5       0.60      0.64      0.62        14
           6       0.92      0.79      0.85        14

    accuracy                           0.83        98
   macro avg       0.83      0.83      0.82        98
weighted avg       0.83      0.83      0.82        98



## xgboost

In [67]:
! pip install xgboost



In [68]:
from xgboost import XGBClassifier

xgboost = XGBClassifier(random_state=42)
xgboost.fit(X_train, Y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='multi:softprob', ...)

In [69]:
print(classification_report(y_true = Y_train, y_pred = xgboost.predict(X_train)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        56
           2       1.00      1.00      1.00        56
           3       1.00      1.00      1.00        56
           4       1.00      1.00      1.00        56
           5       1.00      1.00      1.00        56
           6       1.00      1.00      1.00        56

    accuracy                           1.00       392
   macro avg       1.00      1.00      1.00       392
weighted avg       1.00      1.00      1.00       392



In [70]:
print(classification_report(y_true = Y_test, y_pred = xgboost.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        14
           1       0.86      0.86      0.86        14
           2       0.92      0.86      0.89        14
           3       0.81      0.93      0.87        14
           4       0.69      0.64      0.67        14
           5       0.65      0.79      0.71        14
           6       1.00      0.71      0.83        14

    accuracy                           0.82        98
   macro avg       0.83      0.82      0.82        98
weighted avg       0.83      0.82      0.82        98



# Search best parammetrs

## catbost

In [71]:
from sklearn.metrics import make_scorer, f1_score

params = {'n_estimators': [298], 
          'max_depth': [3], 
          'subsample': np.linspace(0.55, 0.6, 10), 
          'l2_leaf_reg': np.linspace(3, 3.5, 5), 
          'random_strength': np.linspace(1.1, 1.2, 10), 
          'eta': np.linspace(0.09, 0.1, 10), 
          'min_data_in_leaf': [5], 
          'random_state': [777]}

# Initialize the CatBoost classifier
model = CatBoostClassifier(random_state=42)

# Define the parameter grid for Grid Search
param_grid = {
	'iterations': [100, 200],
	'learning_rate': [0.01, 0.1],
	'depth': [3, 6]
}

# Use 'f1_weighted' as the scoring metric for Grid Search
scorer = make_scorer(f1_score, average='weighted')
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, Y_train)

# Print the best hyperparameters for Grid Search
print("Grid Search - Best Hyperparameters:", grid_search.best_params_)


0:	learn: 1.9373367	total: 861ms	remaining: 1m 25s
0:	learn: 1.9371367	total: 861ms	remaining: 1m 25s
0:	learn: 1.8538278	total: 852ms	remaining: 1m 24s
0:	learn: 1.9349541	total: 852ms	remaining: 1m 24s
0:	learn: 1.9349482	total: 861ms	remaining: 1m 25s
0:	learn: 1.9363882	total: 853ms	remaining: 1m 24s
0:	learn: 1.8399517	total: 852ms	remaining: 1m 24s
0:	learn: 1.8611677	total: 861ms	remaining: 1m 25s
1:	learn: 1.9268630	total: 1.13s	remaining: 55.4s
1:	learn: 1.7910388	total: 1.27s	remaining: 1m 2s
1:	learn: 1.9217923	total: 1.31s	remaining: 1m 4s
1:	learn: 1.7910738	total: 1.42s	remaining: 1m 9s
1:	learn: 1.9290516	total: 1.43s	remaining: 1m 9s
1:	learn: 1.9287401	total: 1.42s	remaining: 1m 9s
1:	learn: 1.9290567	total: 1.44s	remaining: 1m 10s
1:	learn: 1.7728603	total: 1.47s	remaining: 1m 12s
2:	learn: 1.9149799	total: 1.7s	remaining: 55s
2:	learn: 1.6866349	total: 1.76s	remaining: 57s
2:	learn: 1.9116295	total: 1.75s	remaining: 56.7s
2:	learn: 1.7133289	total: 1.8s	remaining: 58

In [72]:
print(classification_report(y_true = Y_train, y_pred = grid_search.predict(X_train)))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        56
           1       1.00      0.98      0.99        56
           2       1.00      1.00      1.00        56
           3       1.00      1.00      1.00        56
           4       1.00      1.00      1.00        56
           5       1.00      1.00      1.00        56
           6       1.00      1.00      1.00        56

    accuracy                           1.00       392
   macro avg       1.00      1.00      1.00       392
weighted avg       1.00      1.00      1.00       392



In [73]:
print(classification_report(y_true = Y_test, y_pred = grid_search.predict(X_test)))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       0.76      0.93      0.84        14
           2       0.86      0.86      0.86        14
           3       0.80      0.86      0.83        14
           4       0.73      0.57      0.64        14
           5       0.69      0.79      0.73        14
           6       1.00      0.71      0.83        14

    accuracy                           0.82        98
   macro avg       0.82      0.82      0.81        98
weighted avg       0.82      0.82      0.81        98



In [97]:
model = XGBClassifier()

pipeline = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('model', model)
])

param_grid = {
    'model__max_depth': [2, 3, 5, 7, 10],
    'model__n_estimators': [10, 100, 500],
}

grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='roc_auc')
grid.fit(X_train, Y_train)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 349, in _score
    raise ValueError("{0} format is not suppo

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('standard_scaler', StandardScaler()),
                                       ('model',
                                        XGBClassifier(base_score=None,
                                                      booster=None,
                                                      callbacks=None,
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      device=None,
                                                      early_stopping_rounds=None,
                                                      enable_categorical=False,
                                                      eval_metric=None,
                                                      feature_types=None,
                                      

In [103]:
print(classification_report(y_true = Y_train, y_pred = grid.predict(X_train)))

              precision    recall  f1-score   support

           0       0.88      0.99      0.93       112
           1       0.89      0.93      0.91       112
           2       0.84      0.90      0.87       112
           3       0.77      0.90      0.83       112
           4       0.89      0.52      0.66       112
           5       0.85      0.94      0.89       112
           6       0.95      0.86      0.90       112

    accuracy                           0.86       784
   macro avg       0.87      0.86      0.86       784
weighted avg       0.87      0.86      0.86       784



In [102]:
print(classification_report(y_true = Y_test, y_pred = grid.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92        28
           1       0.71      0.79      0.75        28
           2       0.85      0.79      0.81        28
           3       0.62      0.93      0.74        28
           4       0.57      0.29      0.38        28
           5       0.59      0.68      0.63        28
           6       0.70      0.50      0.58        28

    accuracy                           0.70       196
   macro avg       0.70      0.70      0.69       196
weighted avg       0.70      0.70      0.69       196



In [100]:
precision_and_confusion_matrix(X, y, CatBoostClassifier(verbose=False))

NameError: name 'precision_and_confusion_matrix' is not defined