In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder

<h3>Загрузка и предобработка датасета</h3>

In [43]:
#загрузка датасета
df = pd.read_csv('coords_data_numerated.csv')
df = df.drop(columns=['Unnamed: 0', 'GSR'])
df.head()

Unnamed: 0,id,BreathingType,TimeStamp_sec,FirstMarkerXCoord,FirstMarkerYCoord,FirstMarkerZCoord,SecondMarkerXCoord,SecondMarkerYCoord,SecondMarkerZCoord,ThirdMarkerXCoord,ThirdMarkerYCoord,ThirdMarkerZCoord
0,1,0,0.058895,0.689,-1.453,4.417,0.881,-1.544,4.547,0.718,-1.607,4.369
1,1,0,0.111889,0.687,-1.452,4.418,0.881,-1.544,4.547,0.715,-1.607,4.371
2,1,0,0.158371,0.687,-1.451,4.419,0.881,-1.544,4.545,0.714,-1.607,4.371
3,1,0,0.262516,0.685,-1.45,4.42,0.882,-1.544,4.541,0.711,-1.608,4.373
4,1,0,0.29562,0.684,-1.449,4.42,0.883,-1.544,4.54,0.709,-1.607,4.37


In [44]:
#Определение выходного признака и входных признаков
y = df['BreathingType']
X = df.drop(columns=['BreathingType','TimeStamp_sec'])
X.tail()

Unnamed: 0,id,FirstMarkerXCoord,FirstMarkerYCoord,FirstMarkerZCoord,SecondMarkerXCoord,SecondMarkerYCoord,SecondMarkerZCoord,ThirdMarkerXCoord,ThirdMarkerYCoord,ThirdMarkerZCoord
230921,258,1.063,-1.649,3.843,1.093,-1.839,3.624,1.03,-1.867,3.859
230922,258,1.062,-1.648,3.842,1.094,-1.841,3.625,1.03,-1.867,3.857
230923,258,1.062,-1.648,3.842,1.093,-1.841,3.624,1.03,-1.868,3.855
230924,258,1.063,-1.647,3.842,1.094,-1.841,3.624,1.029,-1.868,3.853
230925,258,1.063,-1.647,3.842,1.094,-1.841,3.623,1.03,-1.868,3.853


In [45]:
#переименование столбцов
dict_renames = {
    'FirstMarkerXCoord' : 'FMX',
    'FirstMarkerYCoord' : 'FMY',
    'FirstMarkerZCoord' : 'FMZ',
    'SecondMarkerXCoord' :  'SMX',
    'SecondMarkerYCoord' : 'SMY',
    'SecondMarkerZCoord' : 'SMZ',
    'ThirdMarkerXCoord' : 'TMX',
    'ThirdMarkerYCoord' : 'TMY',
    'ThirdMarkerZCoord' : 'TMZ'
}
X = X.rename(columns=dict_renames)
X.head()

Unnamed: 0,id,FMX,FMY,FMZ,SMX,SMY,SMZ,TMX,TMY,TMZ
0,1,0.689,-1.453,4.417,0.881,-1.544,4.547,0.718,-1.607,4.369
1,1,0.687,-1.452,4.418,0.881,-1.544,4.547,0.715,-1.607,4.371
2,1,0.687,-1.451,4.419,0.881,-1.544,4.545,0.714,-1.607,4.371
3,1,0.685,-1.45,4.42,0.882,-1.544,4.541,0.711,-1.608,4.373
4,1,0.684,-1.449,4.42,0.883,-1.544,4.54,0.709,-1.607,4.37


<h3>Извлечение признаков с помощью преобразования Фурье</h3>

In [52]:
def calc_phases(ts):
    return np.angle(ts)

def calc_amplitudes(ts):
    return np.abs(ts)

def calc_stats_features(prefix, values):
    stats_features = pd.DataFrame()
    stats_features[prefix+'mean'] = [np.mean(values)]
    stats_features[prefix+'median'] = [np.median(values)]
    stats_features[prefix+'diff']  = [np.mean(np.diff(values))]
    stats_features[prefix+'std'] = [np.std(values)]
    stats_features[prefix+'max'] = [np.max(values)]
    stats_features[prefix+'min'] = [np.min(values)]
    stats_features[prefix+'p25'] = [np.percentile(values, 25)]
    stats_features[prefix+'p75'] = [np.percentile(values, 75)]
    return stats_features

def calc_phases_features(prefix, phases):
    prefix += '_phases_'
    phases_features = calc_stats_features(prefix, phases)
    return phases_features

def calc_amplitude_features(prefix, amplitudes):
    prefix += '_amp_'
    amp_features = calc_stats_features(prefix, amplitudes)
    #мощность
    amp_features[prefix+'power'] = np.sum(amplitudes ** 2)
    #спектральная плотность мощности
    amp_features[prefix+'power_density'] = np.sum(amplitudes ** 2) / len(amplitudes)
    return amp_features

In [53]:
#расчет для всех 9 столбцов
numeric_cols = X.drop(columns='id')

In [58]:
#расчет новых признаков
new_features = pd.DataFrame()
num_ts = 259
for i in range(1, num_ts):
    ts_features = pd.DataFrame()
    #расчет статистических показателей по каждому столбцу
    for col in numeric_cols:
        col_values = X[X['id'] == i] [col]
        prefix = col

        #преобразование фурье
        fourier_ts = np.fft.fft(col_values)

        #расчет фаз частот и амплитуд
        phases = calc_phases(fourier_ts)
        amplitudes = calc_amplitudes(fourier_ts)

        phases_features = calc_phases_features(prefix, phases)
        amp_features = calc_amplitude_features(prefix, amplitudes)

        ts_features = pd.concat([ts_features, phases_features, amp_features], axis=1)

    new_features = pd.concat([new_features,ts_features], axis=0, ignore_index=True)

In [59]:
new_features.head()

Unnamed: 0,FMX_phases_mean,FMX_phases_median,FMX_phases_diff,FMX_phases_std,FMX_phases_max,FMX_phases_min,FMX_phases_p25,FMX_phases_p75,FMX_amp_mean,FMX_amp_median,...,TMZ_amp_mean,TMZ_amp_median,TMZ_amp_diff,TMZ_amp_std,TMZ_amp_max,TMZ_amp_min,TMZ_amp_p25,TMZ_amp_p75,TMZ_amp_power,TMZ_amp_power_density
0,3.154463e-16,0.0,0.000319,0.985981,2.740332,-2.740332,-0.749555,0.749555,0.736157,0.019573,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
1,1.261785e-16,0.0,0.001638,0.963013,2.934504,-2.934504,-0.832137,0.832137,0.763412,0.033113,...,4.443864,0.021628,-4.364635,130.827955,3929.278,0.001029,0.01343,0.044879,15439270.0,17135.701632
2,-6.308926e-17,0.0,-0.002362,1.162453,3.120591,-3.120591,-0.940972,0.940972,0.740657,0.022448,...,4.457085,0.021126,-4.367949,130.939655,3932.639,0.001064,0.012213,0.047521,15465720.0,17165.058955
3,-6.308926e-17,0.0,0.001501,1.136111,3.096238,-3.096238,-0.886534,0.886534,0.927711,0.019978,...,4.350862,0.044459,-4.186469,125.653479,3773.917,0.002379,0.02954,0.086832,14242760.0,15807.726859
4,0.0,0.0,0.003,2.22057,3.138254,-3.138254,-2.201429,2.201429,0.700369,0.015132,...,4.255412,0.021429,-4.176523,125.203759,3760.364,0.001801,0.012707,0.042025,14140370.0,15694.089816


In [None]:
# ts = X[X['id'] == 1]['TMX']
# fourier_ts = np.fft.fft(ts)
# phases = np.angle(fourier_ts)
# phases

In [68]:
new_features.to_csv('01_07/fourier_features.csv')

<h3>Добавление новых признаков в датасет</h3>

In [60]:
#загрузка датасета
data = pd.read_csv('breathes.csv')
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,ID,Date_Of_Birth,Sex,Had_Covid,Begin_Of_Covid,End_Of_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,...,Sin_Period23,Sin_Amp23/Sin_Amp13,Sin_Amp12/Sin_Amp13,Sin_Amp12/Sin_Amp23,Sin_Omega23/Sin_Omega13,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2
0,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.178712,0.002273,-0.001832,-0.032392,-0.156378,-0.155622,-0.156309,-0.088206,-0.063808,-0.059907
1,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.183415,0.002486,-0.001808,-0.029759,-0.151883,-0.154821,-0.159698,-0.088211,-0.063809,-0.059906
2,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.177844,0.002607,-0.001792,-0.030606,-0.157144,-0.156177,-0.155971,-0.088215,-0.063811,-0.059905
3,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.182131,0.002219,-0.002349,-0.030553,-0.152623,-0.155988,-0.16006,-0.088219,-0.063815,-0.059905
4,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.191648,0.003473,-0.001519,-0.031048,-0.165326,-0.156451,-0.137226,-0.088204,-0.063811,-0.059908


In [61]:
#добавление новых признаков в датасет
i = 0
new_features_cols = new_features.columns
for id in range(1,87):
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
    i += 1
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'брюшное') & (data['ID'] == id), col] = new_features.loc[i][col]
    i += 1
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'смешанное') & (data['ID'] == id), col] = new_features.loc[i][col]
    i += 1

  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_f

In [62]:
data.head(15)

Unnamed: 0,ID,Date_Of_Birth,Sex,Had_Covid,Begin_Of_Covid,End_Of_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,...,TMZ_amp_mean,TMZ_amp_median,TMZ_amp_diff,TMZ_amp_std,TMZ_amp_max,TMZ_amp_min,TMZ_amp_p25,TMZ_amp_p75,TMZ_amp_power,TMZ_amp_power_density
0,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
1,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
2,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
3,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
4,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
5,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,4.446874,0.019364,-4.36528,130.863794,3930.354,0.000598,0.011952,0.042236,15447740.0,17145.107302
6,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,4.443864,0.021628,-4.364635,130.827955,3929.278,0.001029,0.01343,0.044879,15439270.0,17135.701632
7,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,4.443864,0.021628,-4.364635,130.827955,3929.278,0.001029,0.01343,0.044879,15439270.0,17135.701632
8,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,4.443864,0.021628,-4.364635,130.827955,3929.278,0.001029,0.01343,0.044879,15439270.0,17135.701632
9,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,4.443864,0.021628,-4.364635,130.827955,3929.278,0.001029,0.01343,0.044879,15439270.0,17135.701632


<h3>Подготовка датасета для моделей машинного обучения</h3>

In [63]:
#отбрасывание ненужных столбцов
drop_columns = ['Date_Of_Birth', 'Begin_Of_Covid', 'End_Of_Covid']
df = data.drop(columns=drop_columns)
#определение численных и категориальных признаков
y = df['Breathing_Type']
categorical_cols = ['Sex', 'Had_Covid', 'Lung_Damage', 'DominatorFreq']
X_categorical = df[categorical_cols]
non_numerical_cols = categorical_cols + ['ID', 'Breathing_Type']
X_numeric = df.drop(columns=non_numerical_cols)
#трансформация категориальных признаков
le = LabelEncoder()
for col in X_categorical.columns:
    X_categorical[col]= le.fit_transform(X_categorical[col])
le = LabelEncoder()
y = le.fit_transform(y)
#все входные признаки
X = np.concatenate((X_numeric, X_categorical), axis=1)
X_df = pd.concat([X_numeric, X_categorical], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_categorical[col]= le.fit_transform(X_categorical[col])
A value is trying to be set on a c

In [64]:
#разделение на обучающую(70%) и тестовую (30%) выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [65]:
from sklearn.metrics import recall_score, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
#расчет метрик по тесовой выборке
def calculate_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1-score: ", f1)
    report = classification_report(y_test, y_pred)
    print(report)

#расчет знамости признаков
def calc_feature_importances(model, X_df):
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    return feature_importance

<h3>Прямой отбор признаков</h3>

In [None]:
from sklearn.tree import DecisionTreeClassifier
from mlxtend.feature_selection import SequentialFeatureSelector


model_tree = DecisionTreeClassifier()
sfs = SequentialFeatureSelector(estimator=model_tree,
                                k_features=15,
                                forward=True,
                                scoring='accuracy',
                                cv=5)
sfs.fit(X_df, y)

In [113]:
# Выбранные признаки
selected_features = sfs.k_feature_names_

In [114]:
selected_features

In [None]:
selected_features.to_excel('sequential_features.xlsx')

<h3>Отбор признаков с помощью BorutaPy</h3>

In [94]:
from boruta import BorutaPy

In [104]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

In [None]:
feat_selector.fit(X, y)

In [106]:
feat_selector.support_

array([False,  True,  True, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,

In [107]:
feat_selector.ranking_

array([11,  1,  1,  6,  1,  1,  1,  1,  8,  1,  1,  1,  1,  1,  8,  1,  1,
        1,  1,  1,  8,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, 14,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, 14,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 14,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  4,  5, 11, 10])

In [109]:
feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Rank': feat_selector.ranking_, 'Support': feat_selector.support_})
feature_importance = feature_importance.sort_values('Rank', ascending=True)
feature_importance

Unnamed: 0,Feature,Rank,Support
99,SMX_amp_max,1,True
122,SMY_amp_power_density,1,True
124,SMZ_phases_median,1,True
125,SMZ_phases_diff,1,True
126,SMZ_phases_std,1,True
...,...,...,...
197,Lung_Damage,11,False
0,Damage_Percent,11,False
109,SMY_phases_max,14,False
55,FMY_phases_max,14,False


In [110]:
feature_importance.to_excel('01_07/boruta_features.xlsx')

<h3>XGBoost</h3>

In [66]:
import xgboost as xgb
model_xgb = xgb.XGBClassifier()
details = model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)
calculate_metrics(y_test, y_pred)

Accuracy:  0.9905437352245863
Recall:  0.9905437352245863
Precision:  0.9906042795368738
F1-score:  0.9905428945182585
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       149
           1       1.00      1.00      1.00       123
           2       0.98      0.99      0.99       151

    accuracy                           0.99       423
   macro avg       0.99      0.99      0.99       423
weighted avg       0.99      0.99      0.99       423



In [67]:
feature_importances = calc_feature_importances(model_xgb, X_df)
feature_importances

Unnamed: 0,Feature,Importance
26,Sin_Amp12/Sin_Amp23,0.031446
105,SMY_phases_mean,0.030445
124,SMZ_phases_median,0.029663
142,TMX_phases_median,0.027863
48,FMX_amp_p75,0.021484
...,...,...
169,TMY_amp_diff,0.000000
85,FMZ_amp_power,0.000000
171,TMY_amp_max,0.000000
81,FMZ_amp_max,0.000000


In [116]:
feature_importance.to_excel('01_07/xgboost_features.xlsx')