In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import numpy as np

In [2]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

In [4]:
#загрузка датасета
df = pd.read_csv("breathes.csv")
df = df.drop(columns=['Unnamed: 0', 'Date_Of_Birth', 'Begin_Of_Covid', 'End_Of_Covid'])
df.head()

Unnamed: 0,ID,Sex,Had_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,Amplitude,DominatorFreq,DominatorFreqPower,...,Sin_Period23,Sin_Amp23/Sin_Amp13,Sin_Amp12/Sin_Amp13,Sin_Amp12/Sin_Amp23,Sin_Omega23/Sin_Omega13,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2
0,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_2_3,0.145303,...,-0.178712,0.002273,-0.001832,-0.032392,-0.156378,-0.155622,-0.156309,-0.088206,-0.063808,-0.059907
1,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_2,0.00505,...,-0.183415,0.002486,-0.001808,-0.029759,-0.151883,-0.154821,-0.159698,-0.088211,-0.063809,-0.059906
2,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_2,-0.002082,...,-0.177844,0.002607,-0.001792,-0.030606,-0.157144,-0.156177,-0.155971,-0.088215,-0.063811,-0.059905
3,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_3,2.362165,...,-0.182131,0.002219,-0.002349,-0.030553,-0.152623,-0.155988,-0.16006,-0.088219,-0.063815,-0.059905
4,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_3,2.536107,...,-0.191648,0.003473,-0.001519,-0.031048,-0.165326,-0.156451,-0.137226,-0.088204,-0.063811,-0.059908


In [5]:
#целевой признак
y = df['Breathing_Type']

In [6]:
#категориальные переменные
categorical_cols = ['Sex', 'Had_Covid', 'Lung_Damage', 'DominatorFreq']
X_categorical = df[categorical_cols]

In [7]:
#численные переменные
non_numerical_cols = categorical_cols + ['ID', 'Breathing_Type']
X_numeric = df.drop(columns=non_numerical_cols)

In [8]:
#трансформация категориальных переменных
le = LabelEncoder()
for col in X_categorical.columns:
    X_categorical[col]= le.fit_transform(X_categorical[col])
le = LabelEncoder()
y = le.fit_transform(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a c

In [9]:
#все входные признаки
X = np.concatenate((X_numeric, X_categorical), axis=1)
X_df = pd.concat([X_numeric, X_categorical], axis=1)

In [10]:
#разбиение на выборки
X_train_numeric, X_test_numeric, X_train_categorical, X_test_categorical, y_train, y_test = train_test_split(X_numeric, X_categorical, y, test_size=0.2, random_state=42)

In [11]:
#обучающая и тестовые выборки
X_train = np.concatenate((X_train_numeric, X_train_categorical), axis=1)
X_test = np.concatenate((X_test_numeric, X_test_categorical), axis=1)

In [12]:
#расчет метрик по тесовой выборке
def calculate_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1-score: ", f1)
    report = classification_report(y_test, y_pred)
    print(report)

In [13]:
#кросс-валидация
def calculate_cv_metrics(model, X, y):
    num_folds = 5
    kfold = KFold(n_splits=num_folds)
    scores = ('accuracy', 'recall_weighted', 'precision_weighted', 'f1_weighted', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted')
    cv_results = cross_validate(model, X, y, cv=kfold, scoring=scores)
    print("Mean accuracy:", cv_results['test_accuracy'].mean())
    print("Mean recall:", cv_results['test_recall_weighted'].mean())
    print("Mean precision", cv_results['test_precision_weighted'].mean())
    print("Mean f1-score", cv_results['test_f1_weighted'].mean())
    print("Mean ROC-AUC_OVR", cv_results['test_roc_auc_ovr_weighted'].mean())
    print("Mean ROC-AUC-OVO", cv_results['test_roc_auc_ovo_weighted'].mean())

In [14]:
#расчет знамости признаков
def calc_feature_importances(model, X_df):
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    return feature_importance

In [15]:
# def calculate_feature_importances(model, X_df):
#     importance = model.feature_importances_
#     feature_names = X_df.columns
#     features_importances = dict()
#
#     for i, column in enumerate(feature_names):
#         features_importances[column] = importance[i]
#         i += 1
#
#     sorted_importances = dict(sorted(features_importances.items(), key=lambda x: x[1], reverse=True))
#     return sorted_importances

<h3>XGBoost</h3>

In [16]:
import xgboost as xgb

In [50]:
model_xgb = xgb.XGBClassifier()

In [51]:
res = model_xgb.fit(X_train, y_train)

In [52]:
y_pred = model_xgb.predict(X_test)

In [20]:
calculate_metrics(y_test, y_pred)

Accuracy:  0.8404255319148937
Recall:  0.8404255319148937
Precision:  0.8426440062272963
F1-score:  0.8403787211494961
              precision    recall  f1-score   support

           0       0.83      0.87      0.85        95
           1       0.80      0.86      0.83        77
           2       0.88      0.80      0.84       110

    accuracy                           0.84       282
   macro avg       0.84      0.84      0.84       282
weighted avg       0.84      0.84      0.84       282



In [21]:
calculate_cv_metrics(model_xgb, X, y)

Mean accuracy: 0.5021377552308119
Mean recall: 0.5021377552308119
Mean precision 0.5168937300282517
Mean f1-score 0.49776123888171037
Mean ROC-AUC_OVR 0.6982402649361484
Mean ROC-AUC-OVO 0.6984264645053975


In [54]:
feature_importances = calc_feature_importances(model_xgb, X_df)
feature_importances

Unnamed: 0,Feature,Importance
26,Sin_Amp12/Sin_Amp23,0.074987
0,Damage_Percent,0.07368
23,Sin_Period23,0.062662
1,Frequency,0.037759
19,Sin_Freq23,0.036476
2,Amplitude,0.036238
12,Sin_Amp13,0.036169
11,Sin_Period12,0.034872
4,D_1_2_Avg,0.032819
29,Sin_Omega12/Sin_Omega23,0.032704


In [56]:
feature_importances.to_excel('export/xgboost_features.xlsx')

<h3>LogisticRegression</h3>

In [57]:
from sklearn.linear_model import LogisticRegression

model_lgr = LogisticRegression()
res = model_lgr.fit(X_train, y_train)
y_pred = model_lgr.predict(X_test)

In [58]:
calculate_metrics(y_test, y_pred)

Accuracy:  0.39361702127659576
Recall:  0.39361702127659576
Precision:  0.4198728406789364
F1-score:  0.39712187135472543
              precision    recall  f1-score   support

           0       0.45      0.46      0.46        95
           1       0.28      0.40      0.33        77
           2       0.49      0.33      0.39       110

    accuracy                           0.39       282
   macro avg       0.41      0.40      0.39       282
weighted avg       0.42      0.39      0.40       282



In [25]:
calculate_cv_metrics(model_lgr, X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Mean accuracy: 0.3777769364730824
Mean recall: 0.3777769364730824
Mean precision 0.38041261416382527
Mean f1-score 0.3693456017869673
Mean ROC-AUC_OVR 0.5701073228889493
Mean ROC-AUC-OVO 0.5695541138729046


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [59]:
coefficients = model_lgr.coef_

avg_importance = np.mean(np.abs(coefficients), axis=0)
feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': avg_importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)
feature_importance

Unnamed: 0,Feature,Importance
31,(Sin_Offset12/Sin_Offset13)^2,0.504797
32,(Sin_Offset12/Sin_Offset23)^2,0.470238
2,Amplitude,0.22025
10,Sin_Offset12,0.216395
5,D_2_3_Avg,0.202655
1,Frequency,0.181025
30,(Sin_Offset23/Sin_Offset13)^2,0.164516
26,Sin_Amp12/Sin_Amp23,0.161682
17,Sin_Period13,0.150123
36,DominatorFreq,0.136122


In [60]:
feature_importance.to_excel('export/logistic_regression_features.xlsx')

<h3>SVM</h3>

In [61]:
from sklearn.svm import SVC

model_svc = SVC(kernel='linear')
res = model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)

In [62]:
calculate_metrics(y_test, y_pred)

Accuracy:  0.4078014184397163
Recall:  0.4078014184397163
Precision:  0.44600352566520796
F1-score:  0.40657199742031386
              precision    recall  f1-score   support

           0       0.48      0.47      0.48        95
           1       0.30      0.51      0.38        77
           2       0.52      0.28      0.36       110

    accuracy                           0.41       282
   macro avg       0.43      0.42      0.41       282
weighted avg       0.45      0.41      0.41       282



In [29]:
calculate_cv_metrics(model_svc, X, y)

Traceback (most recent call last):
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 76, in _cached_call
    return cache[method]
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 78, in _cached_call
    result = getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\utils\_

Mean accuracy: 0.3791979001034804
Mean recall: 0.3791979001034804
Mean precision 0.38409740583278923
Mean f1-score 0.37078961112039355
Mean ROC-AUC_OVR nan
Mean ROC-AUC-OVO nan


Traceback (most recent call last):
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 76, in _cached_call
    return cache[method]
KeyError: 'predict_proba'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 115, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 316, in _score
    y_pred = method_caller(clf, "predict_proba", X)
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\metrics\_scorer.py", line 78, in _cached_call
    result = getattr(estimator, method)(*args, **kwargs)
  File "C:\Users\admin\PycharmProjects\TimeSeriesLibraries\venv\lib\site-packages\sklearn\utils\_

In [63]:
coefficients = model_svc.coef_

avg_importance = np.mean(np.abs(coefficients), axis=0)
feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': avg_importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

feature_importance

Unnamed: 0,Feature,Importance
32,(Sin_Offset12/Sin_Offset23)^2,0.659548
1,Frequency,0.582531
5,D_2_3_Avg,0.51534
31,(Sin_Offset12/Sin_Offset13)^2,0.510258
10,Sin_Offset12,0.478584
25,Sin_Amp12/Sin_Amp13,0.476403
6,Sin_Amp12,0.438355
35,Lung_Damage,0.425711
2,Amplitude,0.418275
4,D_1_2_Avg,0.375023


In [64]:
feature_importance.to_excel('export/svm_features.xlsx')

<h3>Random Forest</h3>

In [65]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
res = model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

In [32]:
calculate_metrics(y_test, y_pred)

Accuracy:  0.624113475177305
Recall:  0.624113475177305
Precision:  0.633563082961098
F1-score:  0.6185056974542374
              precision    recall  f1-score   support

           0       0.65      0.79      0.71        95
           1       0.54      0.62      0.58        77
           2       0.69      0.48      0.57       110

    accuracy                           0.62       282
   macro avg       0.62      0.63      0.62       282
weighted avg       0.63      0.62      0.62       282



In [33]:
calculate_cv_metrics(model_rf, X, y)

Mean accuracy: 0.5163272002221044
Mean recall: 0.5163272002221044
Mean precision 0.5178875006829264
Mean f1-score 0.5088821749115707
Mean ROC-AUC_OVR 0.7132711983710022
Mean ROC-AUC-OVO 0.7130293883724432


In [66]:
feature_importances = calc_feature_importances(model_rf, X_df)
feature_importances

Unnamed: 0,Feature,Importance
26,Sin_Amp12/Sin_Amp23,0.055893
2,Amplitude,0.049238
6,Sin_Amp12,0.04771
12,Sin_Amp13,0.043905
25,Sin_Amp12/Sin_Amp13,0.041226
24,Sin_Amp23/Sin_Amp13,0.041168
18,Sin_Amp23,0.037119
32,(Sin_Offset12/Sin_Offset23)^2,0.034107
1,Frequency,0.03347
16,Sin_Offset13,0.032198


In [67]:
feature_importances.to_excel('export/random_forest_features.xlsx')

<h3>LightGBM</h3>

In [68]:
import lightgbm as lgb

In [69]:
train_data = lgb.Dataset(X_train, label=y_train)

In [70]:
params = {
    'objective': 'multiclass',  # Многоклассовая классификация
    'num_classes': 3,  # Количество классов
    'metric': 'multi_logloss'  # Метрика оценки
}

In [71]:
model = lgb.train(params, train_data, num_boost_round=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7711
[LightGBM] [Info] Number of data points in the train set: 1126, number of used features: 37
[LightGBM] [Info] Start training from score -1.070589
[LightGBM] [Info] Start training from score -1.075784
[LightGBM] [Info] Start training from score -1.151496


In [72]:
y_pred = model.predict(X_test)
y_pred_class = y_pred.argmax(axis=1)  # Преобразование вероятностей в классы

calculate_metrics(y_test, y_pred_class)

Accuracy:  0.8794326241134752
Recall:  0.8794326241134752
Precision:  0.883262310366997
F1-score:  0.8794076845264543
              precision    recall  f1-score   support

           0       0.88      0.91      0.89        95
           1       0.83      0.92      0.87        77
           2       0.93      0.83      0.87       110

    accuracy                           0.88       282
   macro avg       0.88      0.88      0.88       282
weighted avg       0.88      0.88      0.88       282



In [40]:
from sklearn.model_selection import StratifiedKFold

# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# cv_data = lgb.Dataset(X, label=y)
# cv_results = lgb.cv(params, cv_data, num_boost_round=100, folds=kfold, early_stopping_rounds=10)

In [73]:
importance = model.feature_importance()

feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

feature_importance

Unnamed: 0,Feature,Importance
2,Amplitude,1351
1,Frequency,699
16,Sin_Offset13,502
32,(Sin_Offset12/Sin_Offset23)^2,405
26,Sin_Amp12/Sin_Amp23,381
30,(Sin_Offset23/Sin_Offset13)^2,372
6,Sin_Amp12,346
12,Sin_Amp13,335
10,Sin_Offset12,331
31,(Sin_Offset12/Sin_Offset13)^2,324


In [74]:
feature_importance.to_excel('export/lightGBM_features.xlsx')

<h3>CatBoost</h3>

In [43]:
import catboost as cb

In [75]:
model_cb = cb.CatBoostClassifier()

In [76]:
model_cb.fit(X_train, y_train)

Learning rate set to 0.079607
0:	learn: 1.0883671	total: 24.6ms	remaining: 24.5s
1:	learn: 1.0702876	total: 46.1ms	remaining: 23s
2:	learn: 1.0591107	total: 68.8ms	remaining: 22.9s
3:	learn: 1.0512725	total: 91.2ms	remaining: 22.7s
4:	learn: 1.0403130	total: 113ms	remaining: 22.4s
5:	learn: 1.0285724	total: 135ms	remaining: 22.4s
6:	learn: 1.0188972	total: 157ms	remaining: 22.3s
7:	learn: 1.0088297	total: 181ms	remaining: 22.4s
8:	learn: 1.0014987	total: 213ms	remaining: 23.5s
9:	learn: 0.9932085	total: 235ms	remaining: 23.3s
10:	learn: 0.9856672	total: 270ms	remaining: 24.3s
11:	learn: 0.9782041	total: 297ms	remaining: 24.5s
12:	learn: 0.9696593	total: 320ms	remaining: 24.3s
13:	learn: 0.9626921	total: 341ms	remaining: 24s
14:	learn: 0.9548047	total: 365ms	remaining: 23.9s
15:	learn: 0.9480126	total: 392ms	remaining: 24.1s
16:	learn: 0.9401724	total: 418ms	remaining: 24.2s
17:	learn: 0.9297811	total: 441ms	remaining: 24.1s
18:	learn: 0.9239710	total: 468ms	remaining: 24.2s
19:	learn: 

<catboost.core.CatBoostClassifier at 0x237ee3ea260>

In [77]:
y_pred = model_cb.predict(X_test)
calculate_metrics(y_test, y_pred)

Accuracy:  0.8085106382978723
Recall:  0.8085106382978723
Precision:  0.8142740537473083
F1-score:  0.8076821531640951
              precision    recall  f1-score   support

           0       0.83      0.92      0.87        95
           1       0.72      0.79      0.75        77
           2       0.87      0.73      0.79       110

    accuracy                           0.81       282
   macro avg       0.81      0.81      0.81       282
weighted avg       0.81      0.81      0.81       282



In [47]:
calculate_cv_metrics(model, X, y)

Learning rate set to 0.079607
0:	learn: 1.0867358	total: 26.9ms	remaining: 26.8s
1:	learn: 1.0681019	total: 49.7ms	remaining: 24.8s
2:	learn: 1.0567033	total: 70.8ms	remaining: 23.5s
3:	learn: 1.0433578	total: 92.9ms	remaining: 23.1s
4:	learn: 1.0319768	total: 115ms	remaining: 22.9s
5:	learn: 1.0190510	total: 136ms	remaining: 22.6s
6:	learn: 1.0102735	total: 158ms	remaining: 22.4s
7:	learn: 1.0014006	total: 192ms	remaining: 23.8s
8:	learn: 0.9908546	total: 231ms	remaining: 25.5s
9:	learn: 0.9838347	total: 262ms	remaining: 25.9s
10:	learn: 0.9762503	total: 294ms	remaining: 26.4s
11:	learn: 0.9697891	total: 316ms	remaining: 26s
12:	learn: 0.9600565	total: 348ms	remaining: 26.4s
13:	learn: 0.9510292	total: 369ms	remaining: 26s
14:	learn: 0.9431737	total: 394ms	remaining: 25.8s
15:	learn: 0.9370325	total: 415ms	remaining: 25.5s
16:	learn: 0.9276251	total: 439ms	remaining: 25.4s
17:	learn: 0.9179336	total: 462ms	remaining: 25.2s
18:	learn: 0.9117249	total: 491ms	remaining: 25.3s
19:	learn: 

In [79]:
importance = model_cb.get_feature_importance()

feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

feature_importance

Unnamed: 0,Feature,Importance
2,Amplitude,12.850299
1,Frequency,8.323705
26,Sin_Amp12/Sin_Amp23,8.081231
6,Sin_Amp12,5.088439
16,Sin_Offset13,5.052605
32,(Sin_Offset12/Sin_Offset23)^2,4.437788
12,Sin_Amp13,3.627446
25,Sin_Amp12/Sin_Amp13,3.4988
24,Sin_Amp23/Sin_Amp13,3.306765
31,(Sin_Offset12/Sin_Offset13)^2,3.093323


In [80]:
feature_importance.to_excel('export/catboost_features.xlsx')