In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import  LabelEncoder

In [3]:
import pywt

<h3>Загрузка и предобработка датасета</h3>

In [4]:
#загрузка датасета
df = pd.read_csv('coords_data_numerated.csv')
df = df.drop(columns=['Unnamed: 0', 'GSR'])
df.head()

Unnamed: 0,id,BreathingType,TimeStamp_sec,FirstMarkerXCoord,FirstMarkerYCoord,FirstMarkerZCoord,SecondMarkerXCoord,SecondMarkerYCoord,SecondMarkerZCoord,ThirdMarkerXCoord,ThirdMarkerYCoord,ThirdMarkerZCoord
0,1,0,0.058895,0.689,-1.453,4.417,0.881,-1.544,4.547,0.718,-1.607,4.369
1,1,0,0.111889,0.687,-1.452,4.418,0.881,-1.544,4.547,0.715,-1.607,4.371
2,1,0,0.158371,0.687,-1.451,4.419,0.881,-1.544,4.545,0.714,-1.607,4.371
3,1,0,0.262516,0.685,-1.45,4.42,0.882,-1.544,4.541,0.711,-1.608,4.373
4,1,0,0.29562,0.684,-1.449,4.42,0.883,-1.544,4.54,0.709,-1.607,4.37


In [5]:
#Определение выходного признака и входных признаков
y = df['BreathingType']
X = df.drop(columns=['BreathingType','TimeStamp_sec'])
X.tail()

Unnamed: 0,id,FirstMarkerXCoord,FirstMarkerYCoord,FirstMarkerZCoord,SecondMarkerXCoord,SecondMarkerYCoord,SecondMarkerZCoord,ThirdMarkerXCoord,ThirdMarkerYCoord,ThirdMarkerZCoord
230921,258,1.063,-1.649,3.843,1.093,-1.839,3.624,1.03,-1.867,3.859
230922,258,1.062,-1.648,3.842,1.094,-1.841,3.625,1.03,-1.867,3.857
230923,258,1.062,-1.648,3.842,1.093,-1.841,3.624,1.03,-1.868,3.855
230924,258,1.063,-1.647,3.842,1.094,-1.841,3.624,1.029,-1.868,3.853
230925,258,1.063,-1.647,3.842,1.094,-1.841,3.623,1.03,-1.868,3.853


In [59]:
#переименование столбцов
dict_renames = {
    'FirstMarkerXCoord' : 'FMX',
    'FirstMarkerYCoord' : 'FMY',
    'FirstMarkerZCoord' : 'FMZ',
    'SecondMarkerXCoord' :  'SMX',
    'SecondMarkerYCoord' : 'SMY',
    'SecondMarkerZCoord' : 'SMZ',
    'ThirdMarkerXCoord' : 'TMX',
    'ThirdMarkerYCoord' : 'TMY',
    'ThirdMarkerZCoord' : 'TMZ'
}
X = X.rename(columns=dict_renames)
X.head()

Unnamed: 0,id,FMX,FMY,FMZ,SMX,SMY,SMZ,TMX,TMY,TMZ
0,1,0.689,-1.453,4.417,0.881,-1.544,4.547,0.718,-1.607,4.369
1,1,0.687,-1.452,4.418,0.881,-1.544,4.547,0.715,-1.607,4.371
2,1,0.687,-1.451,4.419,0.881,-1.544,4.545,0.714,-1.607,4.371
3,1,0.685,-1.45,4.42,0.882,-1.544,4.541,0.711,-1.608,4.373
4,1,0.684,-1.449,4.42,0.883,-1.544,4.54,0.709,-1.607,4.37


<h3>Извлечение признаков с помощью PyWavelets</h3>

In [145]:
#расчет общей энергии вейвлет-преобразования
def total_energy(prefix,all_ts):
    energy_df = pd.DataFrame()
    flatten_ts = [item for sub_list in all_ts for item in sub_list]
    energy_df[prefix+'_total_energy'] = [np.sum(np.square(flatten_ts))]
    return energy_df

#расчет статистических величин вейвлет-преобразования
def calc_features(prefix, ts):
    features = pd.DataFrame()
    #энергия
    features[prefix+'energy'] = [np.sum(np.square(ts))]
    features[prefix+'mean'] = [np.mean(ts)]
    features[prefix+'std'] = [np.std(ts)]
    features[prefix+'median'] = [np.median(ts)]
    #коэффициент ассиметрии
    features[prefix+'diff'] = [np.mean(np.diff(ts))]
    features[prefix+'max'] = [np.max(ts)]
    features[prefix+'min'] = [np.min(ts)]
    #нижний квантиль
    features[prefix+'p25'] = [np.percentile(ts, 25)]
    #верхний квантиль
    features[prefix+'p75'] = [np.percentile(ts, 75)]
    return features

#расчет стат. величин коэффициентов аппроксимации
def approximation_features(prefix, approx_ts):
    prefix += '_approx_'
    approx_features = calc_features(prefix, approx_ts)
    return approx_features

#расчет стат.величин коэффициентов уровней детализации
def detailed_features(prefix, detailed_tss):
    detailed_features = pd.DataFrame()
    prefix += "_d"
    for i, detail_ts in enumerate(detailed_tss):
        j = i + 1
        curr_prefix = prefix+str(j)+"_"
        curr_detailed_features = calc_features(curr_prefix, detail_ts)
        #detailed_features = detailed_features.append(curr_detailed_features, ignore_index=True)
        detailed_features = pd.concat([detailed_features, curr_detailed_features], axis=1)
    return detailed_features

In [146]:
#рассматриваемые семейства вейвлетов
wavelets = ['db8', 'sym8', 'coif6', 'bior6.8', 'haar']
#уровень детализации
level = 3

In [136]:
#расчет для всех 9 столбцов
numeric_cols = X.drop(columns='id')

In [None]:
#расчет новых признаков
new_features = pd.DataFrame()
for i in range(1, 259):
    ts_features = pd.DataFrame()
    #расчет статистических показателей по каждому столбцу
    for col in numeric_cols:
        col_values = X[X['id'] == i] [col]
        for wavelet in wavelets:
            coeffs = pywt.wavedec(col_values, wavelet=wavelet, level=level)
            prefix = col+"_"+wavelet
            energy_features = total_energy(prefix, coeffs)
            approx_features = approximation_features(prefix, coeffs[0])
            detail_features  = detailed_features(prefix, coeffs[1:])
            ts_features = pd.concat([ts_features, energy_features, approx_features, detail_features], axis=1)
    new_features = pd.concat([new_features,ts_features], axis=0, ignore_index=True)

In [149]:
#всего больше 1500 признаков
new_features.to_csv('pywave_features.csv')

<h3>Добавление новых признаков в датасет</h3>

In [150]:
#загрузка датасета
data = pd.read_csv('breathes.csv')
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,ID,Date_Of_Birth,Sex,Had_Covid,Begin_Of_Covid,End_Of_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,...,Sin_Period23,Sin_Amp23/Sin_Amp13,Sin_Amp12/Sin_Amp13,Sin_Amp12/Sin_Amp23,Sin_Omega23/Sin_Omega13,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2
0,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.178712,0.002273,-0.001832,-0.032392,-0.156378,-0.155622,-0.156309,-0.088206,-0.063808,-0.059907
1,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.183415,0.002486,-0.001808,-0.029759,-0.151883,-0.154821,-0.159698,-0.088211,-0.063809,-0.059906
2,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.177844,0.002607,-0.001792,-0.030606,-0.157144,-0.156177,-0.155971,-0.088215,-0.063811,-0.059905
3,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.182131,0.002219,-0.002349,-0.030553,-0.152623,-0.155988,-0.16006,-0.088219,-0.063815,-0.059905
4,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.191648,0.003473,-0.001519,-0.031048,-0.165326,-0.156451,-0.137226,-0.088204,-0.063811,-0.059908


In [None]:
#добавление новых признаков в датасет
i = 0
new_features_cols = new_features.columns
for id in range(1,87):
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
    i += 1
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'брюшное') & (data['ID'] == id), col] = new_features.loc[i][col]
    i += 1
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'смешанное') & (data['ID'] == id), col] = new_features.loc[i][col]
    i += 1

In [None]:
data.head(20)

<h3>Подготовка датасета для моделей машинного обучения</h3>

In [None]:
#отбрасывание ненужных столбцов
drop_columns = ['Date_Of_Birth', 'Begin_Of_Covid', 'End_Of_Covid']
df = data.drop(columns=drop_columns)
#определение численных и категориальных признаков
y = df['Breathing_Type']
categorical_cols = ['Sex', 'Had_Covid', 'Lung_Damage', 'DominatorFreq']
X_categorical = df[categorical_cols]
non_numerical_cols = categorical_cols + ['ID', 'Breathing_Type']
X_numeric = df.drop(columns=non_numerical_cols)
#трансформация категориальных признаков
le = LabelEncoder()
for col in X_categorical.columns:
    X_categorical[col]= le.fit_transform(X_categorical[col])
le = LabelEncoder()
y = le.fit_transform(y)
#все входные признаки
X = np.concatenate((X_numeric, X_categorical), axis=1)
X_df = pd.concat([X_numeric, X_categorical], axis=1)

In [154]:
#разделение на обучающую(70%) и тестовую (30%) выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [155]:
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
#расчет метрик по тесовой выборке
def calculate_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1-score: ", f1)
    report = classification_report(y_test, y_pred)
    print(report)

#расчет знамости признаков
def calc_feature_importances(model, X_df):
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    return feature_importance

<h3>Метод взаимной информации (Mutual Info)</h3>

In [173]:
from sklearn.feature_selection import mutual_info_classif

importances = mutual_info_classif(X, y)

In [174]:
feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importances})
feature_importance = feature_importance.sort_values('Importance', ascending=False)
best_features = feature_importance[:100]

In [175]:
best_features.head(15)

Unnamed: 0,Feature,Importance
130,FMX_coif6_d2_diff,1.096671
184,FMX_haar_approx_std,1.095743
245,FMY_db8_d2_p75,1.09454
84,FMX_sym8_d1_diff,1.093345
356,FMY_bior6.8_d2_p75,1.093282
142,FMX_coif6_d3_p25,1.092791
236,FMY_db8_d1_p75,1.092651
453,FMZ_sym8_d1_median,1.092033
262,FMY_sym8_approx_min,1.091828
250,FMY_db8_d3_diff,1.091806


In [176]:
best_features.to_excel('mutual_best_features.xlsx')

<h3>F-Test (Тест Фишера)</h3>

In [177]:
from sklearn.feature_selection import f_classif

#расчет f-scores
f_scores, p_values = f_classif(X, y)

  f = msb / msw


In [179]:
feature_importance = pd.DataFrame({'Feature': X_df.columns, 'F-score': f_scores, 'p':p_values})
feature_importance = feature_importance.sort_values('F-score', ascending=False)
best_features = feature_importance[:100]

In [180]:
best_features.head(15)

Unnamed: 0,Feature,F-score,p
384,FMY_haar_d1_p75,132.165305,2.550422e-53
383,FMY_haar_d1_p25,121.973832,1.428204e-49
378,FMY_haar_d1_std,119.782672,9.261814e-49
387,FMY_haar_d2_std,99.818216,2.920937e-41
392,FMY_haar_d2_p25,95.189741,1.7008120000000002e-39
401,FMY_haar_d3_p25,94.631534,2.7811710000000003e-39
369,FMY_haar_approx_std,88.544459,6.066229e-37
393,FMY_haar_d2_p75,88.330651,7.334857e-37
221,FMY_db8_approx_std,86.155189,5.079398e-36
332,FMY_bior6.8_approx_std,85.18729,1.203579e-35


In [181]:
best_features.to_excel('ftest_best_features.xlsx')

<h3>Decision Tree</h3>

In [182]:
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
details = model_tree.fit(X_train, y_train)
y_pred = model_tree.predict(X_test)

In [183]:
calculate_metrics(y_test, y_pred)

Accuracy:  0.9905437352245863
Recall:  0.9905437352245863
Precision:  0.9906042795368738
F1-score:  0.9905428945182585
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       149
           1       1.00      1.00      1.00       123
           2       0.98      0.99      0.99       151

    accuracy                           0.99       423
   macro avg       0.99      0.99      0.99       423
weighted avg       0.99      0.99      0.99       423



In [184]:
feature_importances = calc_feature_importances(model_tree, X_df)
best_features = feature_importances[:100]

In [185]:
best_features.head(15)

Unnamed: 0,Feature,Importance
378,FMY_haar_d1_std,0.097624
143,FMX_coif6_d3_p75,0.059502
1303,TMX_haar_d1_std,0.057595
1277,TMX_bior6.8_d2_diff,0.054836
1085,SMZ_bior6.8_d1_min,0.044514
1195,TMX_sym8_d1_max,0.042823
1605,TMZ_coif6_d1_p75,0.041956
140,FMX_coif6_d3_max,0.040577
1651,TMZ_bior6.8_d2_p75,0.039582
1197,TMX_sym8_d1_p25,0.03646


In [186]:
best_features.to_excel('decisiontree_best_features.xlsx')