In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [2]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import LocallyLinearEmbedding

In [3]:
#загрузка датасета
df = pd.read_csv("breathes.csv")
df = df.drop(columns=['Unnamed: 0', 'Date_Of_Birth', 'Begin_Of_Covid', 'End_Of_Covid'])
df.head()

Unnamed: 0,ID,Sex,Had_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,Amplitude,DominatorFreq,DominatorFreqPower,...,Sin_Period23,Sin_Amp23/Sin_Amp13,Sin_Amp12/Sin_Amp13,Sin_Amp12/Sin_Amp23,Sin_Omega23/Sin_Omega13,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2
0,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_2_3,0.145303,...,-0.178712,0.002273,-0.001832,-0.032392,-0.156378,-0.155622,-0.156309,-0.088206,-0.063808,-0.059907
1,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_2,0.00505,...,-0.183415,0.002486,-0.001808,-0.029759,-0.151883,-0.154821,-0.159698,-0.088211,-0.063809,-0.059906
2,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_2,-0.002082,...,-0.177844,0.002607,-0.001792,-0.030606,-0.157144,-0.156177,-0.155971,-0.088215,-0.063811,-0.059905
3,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_3,2.362165,...,-0.182131,0.002219,-0.002349,-0.030553,-0.152623,-0.155988,-0.16006,-0.088219,-0.063815,-0.059905
4,1,M,No,No,-0.207005,грудное,-0.144237,0.147009,D_1_3,2.536107,...,-0.191648,0.003473,-0.001519,-0.031048,-0.165326,-0.156451,-0.137226,-0.088204,-0.063811,-0.059908


In [4]:
#целевой признак
y = df['Breathing_Type']
#категориальные переменные
categorical_cols = ['Sex', 'Had_Covid', 'Lung_Damage', 'DominatorFreq']
X_categorical = df[categorical_cols]
#численные переменные
non_numerical_cols = categorical_cols + ['ID', 'Breathing_Type']
X_numeric = df.drop(columns=non_numerical_cols)

In [None]:
#трансформация категориальных переменных
le = LabelEncoder()
for col in X_categorical.columns:
    X_categorical[col]= le.fit_transform(X_categorical[col])
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
#все входные признаки
X = np.concatenate((X_numeric, X_categorical), axis=1)
X_df = pd.concat([X_numeric, X_categorical], axis=1)

In [12]:
def calc_feature_importances(coeffs):
    feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': coeffs})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    return feature_importance

<h3>PCA</h3>

In [23]:
# Создание объекта PCA
pca = PCA()
# Применение PCA к данным
res = pca.fit(X)

In [24]:
# Получение вклада каждой компоненты (explained variance ratio)
explained_variance = pca.explained_variance_ratio_
df_features = calc_feature_importances(explained_variance)

In [25]:
# Получение модулей собственных чисел
singular_values = pca.singular_values_
df_features['explained_variance'] = df_features['Importance']
df_features['singular_values'] = calc_feature_importances(singular_values)['Importance']

In [27]:
df_features.drop(columns=['Importance'])

Unnamed: 0,Feature,explained_variance,singular_values
0,Damage_Percent,0.1561671,77.69965
1,Frequency,0.09620651,60.98549
2,Amplitude,0.07380878,53.41685
3,DominatorFreqPower,0.0615431,48.77688
4,D_1_2_Avg,0.0570602,46.9668
5,D_2_3_Avg,0.04997057,43.95228
6,Sin_Amp12,0.04611448,42.2224
7,Sin_Freq12,0.04479772,41.61522
8,Sin_Phase12,0.0385721,38.61543
9,Sin_Omega12,0.03768146,38.16701


In [None]:
df_features.to_excel('export/pca_features.xlsx')

<h3> Linear Discriminant Analysis (LDA)</h3>

In [40]:
# Создание объекта LDA
lda = LinearDiscriminantAnalysis()
# Применение LDA к данным
lda.fit(X, y)
# Получение весов признаков
feature_weights = lda.coef_
feature_weights = np.abs(np.mean(feature_weights, axis=0))

In [52]:
#значимость признаков для определения каждого из классов
#брюшное - 0, грудное - 1, смешанное - 2
feature_weights = lda.coef_
df_weight = pd.DataFrame(feature_weights)
names = ['брюшное', 'грудное', 'смешанное']
df_weight.columns = X_df.columns
df_weight['BreathType'] = names
df_weight

Unnamed: 0,Damage_Percent,Frequency,Amplitude,DominatorFreqPower,D_1_2_Avg,D_2_3_Avg,Sin_Amp12,Sin_Freq12,Sin_Phase12,Sin_Omega12,...,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2,Sex,Had_Covid,Lung_Damage,DominatorFreq,BreathType
0,0.029682,0.360408,0.383974,-0.033394,-0.135442,-0.024299,-0.030017,0.004581,-0.031435,0.004581,...,0.010028,0.000191,-0.03141,-0.023935,-0.070905,-0.007889,0.024991,-0.19271,0.172797,брюшное
1,0.051385,-0.265699,-0.175037,-0.089972,0.108872,-0.098143,0.039801,0.010048,0.020672,0.010048,...,0.111337,0.030097,-0.034764,0.298299,-0.265084,0.025337,-0.036025,-0.328117,-0.086905,грудное
2,-0.08147,-0.109161,-0.223175,0.123476,0.032099,0.122171,-0.008391,-0.014668,0.011996,-0.014668,...,-0.120493,-0.029971,0.066813,-0.270393,0.335428,-0.016923,0.009843,0.523509,-0.092386,смешанное


In [54]:
feature_weights = np.abs(np.mean(feature_weights, axis=0))
df_features = calc_feature_importances(feature_weights)
df_features

Unnamed: 0,Feature,Importance
1,Frequency,0.004817
2,Amplitude,0.004746
36,DominatorFreq,0.002165
4,D_1_2_Avg,0.001843
18,Sin_Amp23,0.001818
11,Sin_Period12,0.001701
17,Sin_Period13,0.001698
31,(Sin_Offset12/Sin_Offset13)^2,0.001324
20,Sin_Phase23,0.001204
27,Sin_Omega23/Sin_Omega13,0.001121


In [None]:
df_features.to_excel('export/lda_features.xlsx')

<h3> Locally Linear Embedding (LLE)</h3>

In [57]:
# Создание объекта LLE
lle = LocallyLinearEmbedding(n_components=37)
# Применение LLE к данным
res = lle.fit(X)

In [63]:
# Получение матрицы весов
feature_weights = lle.embedding_
feature_weights = np.abs(np.mean(feature_weights, axis=0))
df_features = calc_feature_importances(feature_weights)

In [64]:
df_features

Unnamed: 0,Feature,Importance
3,DominatorFreqPower,0.01520589
2,Amplitude,0.009051585
0,Damage_Percent,0.001537832
1,Frequency,0.001451064
4,D_1_2_Avg,4.637935e-09
5,D_2_3_Avg,6.148765e-12
9,Sin_Omega12,1.229686e-12
7,Sin_Freq12,6.414675e-13
13,Sin_Freq13,6.072594e-13
11,Sin_Period12,5.652743e-13


In [None]:
df_features.to_excel('export/lle_features.xlsx')