In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score

In [45]:
#загрузка датасета
df = pd.read_csv("breathes.csv")
df = df.drop(columns=['Unnamed: 0', 'Date_Of_Birth', 'Begin_Of_Covid', 'End_Of_Covid'])
df.head()

Unnamed: 0,ID,Sex,Had_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,Amplitude,DominatorFreq,DominatorFreqPower,...,Sin_Period23,Sin_Amp23/Sin_Amp13,Sin_Amp12/Sin_Amp13,Sin_Amp12/Sin_Amp23,Sin_Omega23/Sin_Omega13,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2
0,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_2_3,0.145303,...,-0.178712,0.002273,-0.001832,-0.032392,-0.156378,-0.155622,-0.156309,-0.088206,-0.063808,-0.059907
1,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_2,0.00505,...,-0.183415,0.002486,-0.001808,-0.029759,-0.151883,-0.154821,-0.159698,-0.088211,-0.063809,-0.059906
2,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_2,-0.002082,...,-0.177844,0.002607,-0.001792,-0.030606,-0.157144,-0.156177,-0.155971,-0.088215,-0.063811,-0.059905
3,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_3,2.362165,...,-0.182131,0.002219,-0.002349,-0.030553,-0.152623,-0.155988,-0.16006,-0.088219,-0.063815,-0.059905
4,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_3,2.536107,...,-0.191648,0.003473,-0.001519,-0.031048,-0.165326,-0.156451,-0.137226,-0.088204,-0.063811,-0.059908


In [46]:
#целевой признак
y = df['Breathing_Type']
#категориальные переменные
categorical_cols = ['Sex', 'Had_Covid', 'Lung_Damage', 'DominatorFreq']
X_categorical = df[categorical_cols]
#численные переменные
non_numerical_cols = categorical_cols + ['ID', 'Breathing_Type']
X_numeric = df.drop(columns=non_numerical_cols)

In [47]:
X_numeric_calc = X_numeric
X_numeric['Min'] = np.min(X_numeric_calc, axis=1)
X_numeric['Max'] = np.max(X_numeric_calc, axis=1)
X_numeric['Std'] = np.std(X_numeric_calc, axis=1)
X_numeric['Median'] = np.median(X_numeric_calc, axis=1)
X_numeric['Mean'] = np.mean(X_numeric_calc, axis=1)
X_numeric['Var'] = np.var(X_numeric_calc, axis=1)
X_numeric['Sum'] = np.sum(X_numeric_calc, axis=1)

In [48]:
#трансформация категориальных переменных
le = LabelEncoder()
for col in X_categorical.columns:
    X_categorical[col]= le.fit_transform(X_categorical[col])
le = LabelEncoder()
y = le.fit_transform(y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a c

In [49]:
#все входные признаки
X = np.concatenate((X_numeric, X_categorical), axis=1)
X_df = pd.concat([X_numeric, X_categorical], axis=1)

In [50]:
#разбиение на выборки
X_train_numeric, X_test_numeric, X_train_categorical, X_test_categorical, y_train, y_test = train_test_split(X_numeric, X_categorical, y, test_size=0.2, random_state=42)
#обучающая и тестовые выборки
X_train = np.concatenate((X_train_numeric, X_train_categorical), axis=1)
X_test = np.concatenate((X_test_numeric, X_test_categorical), axis=1)

In [51]:
#расчет метрик по тесовой выборке
def calculate_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1-score: ", f1)
    report = classification_report(y_test, y_pred)
    print(report)

In [52]:
#расчет знамости признаков
def calc_feature_importances(model, X_df):
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    return feature_importance

<h3>Testing</h3>

In [53]:
import xgboost as xgb

In [54]:
model_xgb = xgb.XGBClassifier()

In [55]:
res = model_xgb.fit(X_train, y_train)

In [56]:
y_pred = model_xgb.predict(X_test)

In [57]:
calculate_metrics(y_test, y_pred)

Accuracy:  0.8404255319148937
Recall:  0.8404255319148937
Precision:  0.8410303520607064
F1-score:  0.8399090396919352
              precision    recall  f1-score   support

           0       0.85      0.78      0.81        95
           1       0.82      0.87      0.84        77
           2       0.85      0.87      0.86       110

    accuracy                           0.84       282
   macro avg       0.84      0.84      0.84       282
weighted avg       0.84      0.84      0.84       282



In [58]:
feature_importances = calc_feature_importances(model_xgb, X_df)
feature_importances

Unnamed: 0,Feature,Importance
26,Sin_Amp12/Sin_Amp23,0.062389
23,Sin_Period23,0.050756
41,Had_Covid,0.042089
0,Damage_Percent,0.035972
2,Amplitude,0.033972
1,Frequency,0.033959
19,Sin_Freq23,0.033805
12,Sin_Amp13,0.032527
29,Sin_Omega12/Sin_Omega23,0.031487
6,Sin_Amp12,0.02889
