In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  LabelEncoder

<h3>Загрузка и предобработка датасета</h3>

In [3]:
#загрузка датасета
df = pd.read_csv('coords_data_numerated.csv')
df = df.drop(columns=['Unnamed: 0', 'GSR'])
df.head()

Unnamed: 0,id,BreathingType,TimeStamp_sec,FirstMarkerXCoord,FirstMarkerYCoord,FirstMarkerZCoord,SecondMarkerXCoord,SecondMarkerYCoord,SecondMarkerZCoord,ThirdMarkerXCoord,ThirdMarkerYCoord,ThirdMarkerZCoord
0,1,0,0.058895,0.689,-1.453,4.417,0.881,-1.544,4.547,0.718,-1.607,4.369
1,1,0,0.111889,0.687,-1.452,4.418,0.881,-1.544,4.547,0.715,-1.607,4.371
2,1,0,0.158371,0.687,-1.451,4.419,0.881,-1.544,4.545,0.714,-1.607,4.371
3,1,0,0.262516,0.685,-1.45,4.42,0.882,-1.544,4.541,0.711,-1.608,4.373
4,1,0,0.29562,0.684,-1.449,4.42,0.883,-1.544,4.54,0.709,-1.607,4.37


In [4]:
#Определение выходного признака и входных признаков
y = df['BreathingType']
X = df.drop(columns=['BreathingType'])
X.tail()

Unnamed: 0,id,TimeStamp_sec,FirstMarkerXCoord,FirstMarkerYCoord,FirstMarkerZCoord,SecondMarkerXCoord,SecondMarkerYCoord,SecondMarkerZCoord,ThirdMarkerXCoord,ThirdMarkerYCoord,ThirdMarkerZCoord
230921,258,59.76925,1.063,-1.649,3.843,1.093,-1.839,3.624,1.03,-1.867,3.859
230922,258,59.80188,1.062,-1.648,3.842,1.094,-1.841,3.625,1.03,-1.867,3.857
230923,258,59.90696,1.062,-1.648,3.842,1.093,-1.841,3.624,1.03,-1.868,3.855
230924,258,59.92954,1.063,-1.647,3.842,1.094,-1.841,3.624,1.029,-1.868,3.853
230925,258,59.99477,1.063,-1.647,3.842,1.094,-1.841,3.623,1.03,-1.868,3.853


In [5]:
#переименование столбцов
dict_renames = {
    'FirstMarkerXCoord' : 'FMX',
    'FirstMarkerYCoord' : 'FMY',
    'FirstMarkerZCoord' : 'FMZ',
    'SecondMarkerXCoord' :  'SMX',
    'SecondMarkerYCoord' : 'SMY',
    'SecondMarkerZCoord' : 'SMZ',
    'ThirdMarkerXCoord' : 'TMX',
    'ThirdMarkerYCoord' : 'TMY',
    'ThirdMarkerZCoord' : 'TMZ'
}
X = X.rename(columns=dict_renames)
X.head()

Unnamed: 0,id,TimeStamp_sec,FMX,FMY,FMZ,SMX,SMY,SMZ,TMX,TMY,TMZ
0,1,0.058895,0.689,-1.453,4.417,0.881,-1.544,4.547,0.718,-1.607,4.369
1,1,0.111889,0.687,-1.452,4.418,0.881,-1.544,4.547,0.715,-1.607,4.371
2,1,0.158371,0.687,-1.451,4.419,0.881,-1.544,4.545,0.714,-1.607,4.371
3,1,0.262516,0.685,-1.45,4.42,0.882,-1.544,4.541,0.711,-1.608,4.373
4,1,0.29562,0.684,-1.449,4.42,0.883,-1.544,4.54,0.709,-1.607,4.37


<h3>Извлечение признаков с помощью библиотеки TSFresh</h3>

In [6]:
import tsfresh

In [7]:
settings = tsfresh.feature_extraction.settings.ComprehensiveFCParameters()

In [8]:
extracted_features = tsfresh.extract_features(X, column_sort='TimeStamp_sec', column_id='id', default_fc_parameters=settings)

Feature Extraction: 100%|██████████| 30/30 [07:27<00:00, 14.92s/it]


In [12]:
extracted_features.head()

Unnamed: 0,SMX__variance_larger_than_standard_deviation,SMX__has_duplicate_max,SMX__has_duplicate_min,SMX__has_duplicate,SMX__sum_values,SMX__abs_energy,SMX__mean_abs_change,SMX__mean_change,SMX__mean_second_derivative_central,SMX__median,...,FMZ__fourier_entropy__bins_5,FMZ__fourier_entropy__bins_10,FMZ__fourier_entropy__bins_100,FMZ__permutation_entropy__dimension_3__tau_1,FMZ__permutation_entropy__dimension_4__tau_1,FMZ__permutation_entropy__dimension_5__tau_1,FMZ__permutation_entropy__dimension_6__tau_1,FMZ__permutation_entropy__dimension_7__tau_1,FMZ__query_similarity_count__query_None__threshold_0.0,FMZ__mean_n_absolute_max__number_of_maxima_7
1,0.0,1.0,1.0,1.0,786.339,686.379751,0.000353,-4e-06,0.0,0.875,...,0.204871,0.367289,0.929162,1.472701,2.41964,3.438875,4.449709,5.273135,,4.435429
2,0.0,1.0,1.0,1.0,802.607,715.117611,0.000334,-4.1e-05,-5.561735e-07,0.891,...,0.183378,0.310337,0.842774,1.562161,2.638883,3.795032,4.872082,5.708663,,4.417429
3,0.0,0.0,1.0,1.0,773.539,664.234557,0.000439,-2.6e-05,-5.561735e-07,0.856,...,0.136002,0.215617,0.758199,1.515849,2.540378,3.625466,4.63189,5.442175,,4.445286
4,0.0,1.0,1.0,1.0,894.241,887.645061,0.000349,-7e-06,-1.112347e-06,0.995,...,0.090729,0.090729,0.442198,1.338456,2.129546,2.938742,3.717525,4.405405,,4.298143
5,0.0,1.0,1.0,1.0,749.086,622.802878,0.00031,1.4e-05,0.0,0.831,...,0.249958,0.433549,1.201067,1.343022,2.17533,3.074689,4.002324,4.82314,,4.238


In [29]:
unique_pairs = df[['id', 'BreathingType']].drop_duplicates()
unique_pairs = unique_pairs.reset_index(drop=True)
y = unique_pairs
y.index = y['id']
y = y.drop(columns=['id'])
y = pd.Series(y['BreathingType'])

In [30]:
y

id
1      0
2      1
3      2
4      0
5      1
      ..
254    1
255    2
256    0
257    1
258    2
Name: BreathingType, Length: 258, dtype: int64

In [31]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y, multiclass=True)

In [32]:
features_filtered.head()

Unnamed: 0,"FMY__agg_autocorrelation__f_agg_""var""__maxlag_40","FMY__agg_linear_trend__attr_""intercept""__chunk_len_50__f_agg_""var""",FMY__partial_autocorrelation__lag_3,FMY__fourier_entropy__bins_10,"FMY__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""mean""","FMY__linear_trend__attr_""stderr""","FMY__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""mean""","FMY__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""max""","FMY__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""min""","FMY__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""max""",...,"FMZ__fft_coefficient__attr_""real""__coeff_37","TMX__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""var""","TMX__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""max""","TMX__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""var""","TMX__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.8","TMX__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""","TMX__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.8","TMX__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.8","TMX__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""var""",TMX__ar_coefficient__coeff_1__k_10
1,0.203342,1.3e-05,-0.222898,0.136002,1.2e-05,4.290971e-07,5e-06,5e-06,5e-06,1.3e-05,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
2,0.368654,5.5e-05,-0.325654,0.136002,3.1e-05,1.024434e-06,1.1e-05,1.1e-05,1.1e-05,3.2e-05,...,0.109409,3.231085e-07,2.8e-05,8.509713e-09,0.000358,7.471823e-10,3.678488e-07,2.400601e-07,7.830371e-07,0.936582
3,0.368037,5.2e-05,-0.381678,0.136002,3.5e-05,1.141785e-06,1.3e-05,1.3e-05,1.3e-05,3.4e-05,...,0.180152,1.314077e-06,5.5e-05,2.155637e-08,0.000715,2.00641e-09,1.060953e-06,5.501077e-07,2.354984e-06,0.937611
4,0.029265,3e-06,-0.126742,0.280179,9e-06,2.89439e-07,3e-06,3e-06,3e-06,9e-06,...,0.124672,7.638346e-07,3.6e-05,1.435572e-08,0.000398,1.644779e-09,4.077468e-07,2.497918e-07,7.3603e-07,1.090171
5,0.254175,7e-06,-0.265776,0.170467,1e-05,3.4656e-07,4e-06,4e-06,4e-06,1.1e-05,...,0.123733,2.256864e-07,1.4e-05,2.989264e-09,0.000254,3.413762e-10,2.613493e-07,1.969553e-07,1.301934e-07,0.831451


In [33]:
features_filtered.to_csv('features_filtered.csv')

In [34]:
new_features = features_filtered.reset_index()

<h3>Добавление новых признаков в датасет</h3>

In [35]:
#загрузка датасета
data = pd.read_csv('breathes.csv')
data = data.drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,ID,Date_Of_Birth,Sex,Had_Covid,Begin_Of_Covid,End_Of_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,...,Sin_Period23,Sin_Amp23/Sin_Amp13,Sin_Amp12/Sin_Amp13,Sin_Amp12/Sin_Amp23,Sin_Omega23/Sin_Omega13,Sin_Omega12/Sin_Omega13,Sin_Omega12/Sin_Omega23,(Sin_Offset23/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset13)^2,(Sin_Offset12/Sin_Offset23)^2
0,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.178712,0.002273,-0.001832,-0.032392,-0.156378,-0.155622,-0.156309,-0.088206,-0.063808,-0.059907
1,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.183415,0.002486,-0.001808,-0.029759,-0.151883,-0.154821,-0.159698,-0.088211,-0.063809,-0.059906
2,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.177844,0.002607,-0.001792,-0.030606,-0.157144,-0.156177,-0.155971,-0.088215,-0.063811,-0.059905
3,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.182131,0.002219,-0.002349,-0.030553,-0.152623,-0.155988,-0.16006,-0.088219,-0.063815,-0.059905
4,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,-0.191648,0.003473,-0.001519,-0.031048,-0.165326,-0.156451,-0.137226,-0.088204,-0.063811,-0.059908


In [36]:
#добавление новых признаков в датасет
i = 0
new_features_cols = new_features.columns
for id in range(1,87):
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
    i += 1
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'брюшное') & (data['ID'] == id), col] = new_features.loc[i][col]
    i += 1
    for col in new_features_cols:
        data.loc[(data['Breathing_Type'] == 'смешанное') & (data['ID'] == id), col] = new_features.loc[i][col]
    i += 1

  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_f

  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_f

  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_f

  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_features.loc[i][col]
  data.loc[(data['Breathing_Type'] == 'грудное') & (data['ID'] == id), col]= new_f

In [37]:
data.head(15)

Unnamed: 0,ID,Date_Of_Birth,Sex,Had_Covid,Begin_Of_Covid,End_Of_Covid,Lung_Damage,Damage_Percent,Breathing_Type,Frequency,...,"FMZ__fft_coefficient__attr_""real""__coeff_37","TMX__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""var""","TMX__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""max""","TMX__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""var""","TMX__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.8","TMX__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""","TMX__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.8","TMX__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.8","TMX__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""var""",TMX__ar_coefficient__coeff_1__k_10
0,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
1,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
2,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
3,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
4,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
5,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,грудное,-0.144237,...,0.156939,1.293342e-06,4.5e-05,2.11418e-08,0.000635,2.22945e-09,9.175544e-07,5.300205e-07,1.498184e-06,1.075671
6,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,0.109409,3.231085e-07,2.8e-05,8.509713e-09,0.000358,7.471823e-10,3.678488e-07,2.400601e-07,7.830371e-07,0.936582
7,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,0.109409,3.231085e-07,2.8e-05,8.509713e-09,0.000358,7.471823e-10,3.678488e-07,2.400601e-07,7.830371e-07,0.936582
8,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,0.109409,3.231085e-07,2.8e-05,8.509713e-09,0.000358,7.471823e-10,3.678488e-07,2.400601e-07,7.830371e-07,0.936582
9,1,04.07.2003,M,No,00.00.0000,00.00.0000,No,-0.207005,брюшное,-0.4641,...,0.109409,3.231085e-07,2.8e-05,8.509713e-09,0.000358,7.471823e-10,3.678488e-07,2.400601e-07,7.830371e-07,0.936582


<h3>Подготовка датасета для тестирования</h3>

In [38]:
#отбрасывание ненужных столбцов
categorical_cols = ['Sex', 'Had_Covid', 'Lung_Damage', 'DominatorFreq']
drop_columns = ['Date_Of_Birth', 'Begin_Of_Covid', 'End_Of_Covid'] + categorical_cols
df = data.drop(columns=drop_columns)
#целевой признак
y = df['Breathing_Type']
#определение численных признаков
non_numerical_cols = ['ID', 'Breathing_Type']
X = df.drop(columns=non_numerical_cols)
#трансформация категориальных признаков
le = LabelEncoder()
y = le.fit_transform(y)

In [39]:
#разделение на обучающую(70%) и тестовую (30%) выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
from sklearn.metrics import recall_score, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report
#расчет метрик по тесовой выборке
def calculate_metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Accuracy: ", accuracy)
    print("Recall: ", recall)
    print("Precision: ", precision)
    print("F1-score: ", f1)
    report = classification_report(y_test, y_pred)
    print(report)

#расчет знамости признаков
def calc_feature_importances(model, X_df):
    importance = model.feature_importances_
    feature_importance = pd.DataFrame({'Feature': X_df.columns, 'Importance': importance})
    feature_importance = feature_importance.sort_values('Importance', ascending=False)
    return feature_importance

<h2>Тестирование новых признаков</h2>

<h3>BorutaPy</h3>

In [63]:
from boruta import BorutaPy

In [64]:
X_np = X.to_numpy()

In [65]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=1)

In [66]:
X_new = feat_selector.fit_transform(X_np, y)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	253
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	187
Tentative: 	45
Rejected: 	21
Iteration: 	9 / 100
Confirmed: 	187
Tentative: 	45
Rejected: 	21
Iteration: 	10 / 100
Confirmed: 	187
Tentative: 	45
Rejected: 	21
Iteration: 	11 / 100
Confirmed: 	187
Tentative: 	45
Rejected: 	21
Iteration: 	12 / 100
Confirmed: 	199
Tentative: 	33
Rejected: 	21
Iteration: 	13 / 100
Confirmed: 	199
Tentative: 	33
Rejected: 	21
Iteration: 	14 / 100
Confirmed: 	199
Tentative: 	33
Rejected: 	21
Iteration: 	15 / 100
Confirmed: 	199
Tentative: 	33
Rejected: 	21
Iteration: 	16 / 100
Confirmed: 	

In [75]:
feature_importance = pd.DataFrame({'Feature': X.columns, 'Rank': feat_selector.ranking_, 'Support': feat_selector.support_})
#feature_importance = feature_importance.sort_values('Rank', ascending=True)
bad_features = feature_importance[feature_importance['Support'] == False]
drop_columns = list(bad_features['Feature'])
drop_columns

['Damage_Percent',
 'DominatorFreqPower',
 'Sin_Phase12',
 'Sin_Amp13',
 'Sin_Phase13',
 'Sin_Phase23',
 'Sin_Omega23/Sin_Omega13',
 'Sin_Omega12/Sin_Omega23',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.4',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.8__ql_0.6',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.8__ql_0.4',
 'FMY__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"var"',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.8__ql_0.2',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.2',
 'FMY__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"var"',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.4',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.2',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.8__ql_0.0',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_1.0__ql_0.6',
 'FMY__change_quantiles__f_agg_"var"__isabs_True__qh_0.6__ql_0.0',
 'FMY__change_quantiles__f_agg_"var

In [76]:
X2 = X.drop(columns=drop_columns)
X2.head()

Unnamed: 0,Frequency,Amplitude,D_1_2_Avg,D_2_3_Avg,Sin_Amp12,Sin_Freq12,Sin_Omega12,Sin_Offset12,Sin_Period12,Sin_Freq13,...,"SMX__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.2","TMX__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""max""","FMZ__fft_coefficient__attr_""real""__coeff_37","TMX__agg_linear_trend__attr_""intercept""__chunk_len_5__f_agg_""var""","TMX__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""max""","TMX__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.8","TMX__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.8","TMX__change_quantiles__f_agg_""var""__isabs_True__qh_1.0__ql_0.8","TMX__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""var""",TMX__ar_coefficient__coeff_1__k_10
0,-0.144237,0.147009,-0.124027,0.00422,-0.034874,-0.346438,-0.346438,0.027031,-0.179298,-0.320825,...,0.000335,1.6e-05,0.156939,1e-06,4.5e-05,0.000635,9.175544e-07,5.300205e-07,1e-06,1.075671
1,-0.144237,0.147009,-0.063376,0.028864,-0.035017,-0.287699,-0.287699,0.027561,-0.180414,-0.308442,...,0.000335,1.6e-05,0.156939,1e-06,4.5e-05,0.000635,9.175544e-07,5.300205e-07,1e-06,1.075671
2,-0.144237,0.147009,-0.083593,0.028864,-0.034989,-0.37041,-0.37041,0.027551,-0.178809,-0.314792,...,0.000335,1.6e-05,0.156939,1e-06,4.5e-05,0.000635,9.175544e-07,5.300205e-07,1e-06,1.075671
3,-0.144237,0.147009,-0.10381,0.028864,-0.032681,-0.39397,-0.39397,0.027388,-0.178306,-0.345404,...,0.000335,1.6e-05,0.156939,1e-06,4.5e-05,0.000635,9.175544e-07,5.300205e-07,1e-06,1.075671
4,-0.144237,0.147009,-0.10381,0.06583,-0.033667,3.960226,3.960226,0.027337,-0.19695,3.686541,...,0.000335,1.6e-05,0.156939,1e-06,4.5e-05,0.000635,9.175544e-07,5.300205e-07,1e-06,1.075671


<h3>Generic Univariate Select (Тест-фишера)</h3>

In [77]:
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

selector = GenericUnivariateSelect(f_classif, mode='k_best', param=50)

In [78]:
X_new = selector.fit_transform(X2, y)

In [80]:
importance = selector.scores_
feature_importance = pd.DataFrame({'Feature': X2.columns, 'Scores': importance})
feature_importance = feature_importance.sort_values('Scores', ascending=False)
feature_importance

Unnamed: 0,Feature,Scores
26,"FMY__agg_autocorrelation__f_agg_""var""__maxlag_40",206.868797
28,FMY__partial_autocorrelation__lag_3,186.130765
29,FMY__fourier_entropy__bins_10,166.267675
38,FMY__fourier_entropy__bins_100,163.495489
33,"FMY__agg_linear_trend__attr_""stderr""__chunk_le...",147.095832
...,...,...
19,Sin_Amp12/Sin_Amp13,0.312955
17,Sin_Period23,0.221363
22,(Sin_Offset23/Sin_Offset13)^2,0.139079
11,Sin_Offset13,0.136538


In [83]:
feature_importance.to_excel('ftest_features.xlsx')

<h3>Generic Univariate Select (Mutual Info)</h3>

In [95]:
from sklearn.feature_selection import mutual_info_classif
selector = GenericUnivariateSelect(mutual_info_classif, mode='k_best', param=50)

In [96]:
X_new = selector.fit_transform(X2, y)

In [97]:
importance = selector.scores_
feature_importance = pd.DataFrame({'Feature': X2.columns, 'Scores': importance, 'Support': selector.get_support()})
feature_importance = feature_importance.sort_values('Scores', ascending=False)
feature_importance

Unnamed: 0,Feature,Scores,Support
210,"FMZ__fft_coefficient__attr_""real""__coeff_37",1.092985,True
100,FMY__spkt_welch_density__coeff_2,1.090960,True
51,"FMY__fft_coefficient__attr_""abs""__coeff_12",1.090916,True
64,"FMY__fft_coefficient__attr_""abs""__coeff_14",1.090313,True
78,FMY__partial_autocorrelation__lag_4,1.088674,True
...,...,...,...
14,Sin_Freq23,0.030357,False
15,Sin_Omega23,0.030357,False
17,Sin_Period23,0.028520,False
126,FMX__large_standard_deviation__r_0.15000000000...,0.027014,False


In [98]:
feature_importance.to_excel('mutual_features.xlsx')

<h3>LightGBM</h3>

In [85]:
!pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5


In [99]:
import lightgbm as lgb

In [100]:
#разделение на обучающую(70%) и тестовую (30%) выборки
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=42)

In [101]:
train_data = lgb.Dataset(X_train, label=y_train)

In [102]:
params = {
    'objective': 'multiclass',  # Многоклассовая классификация
    'num_classes': 3,  # Количество классов
    'metric': 'multi_logloss'  # Метрика оценки
}

In [103]:
model = lgb.train(params, train_data, num_boost_round=100)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10929
[LightGBM] [Info] Number of data points in the train set: 985, number of used features: 50
[LightGBM] [Info] Start training from score -1.087507
[LightGBM] [Info] Start training from score -1.069596
[LightGBM] [Info] Start training from score -1.140069


In [104]:
y_pred = model.predict(X_test)
y_pred_class = y_pred.argmax(axis=1)  # Преобразование вероятностей в классы

calculate_metrics(y_test, y_pred_class)

Accuracy:  1.0
Recall:  1.0
Precision:  1.0
F1-score:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       149
           1       1.00      1.00      1.00       123
           2       1.00      1.00      1.00       151

    accuracy                           1.00       423
   macro avg       1.00      1.00      1.00       423
weighted avg       1.00      1.00      1.00       423



In [105]:
importance = model.feature_importance()

feature_importance = pd.DataFrame({'Feature': selector.get_feature_names_out(), 'Importance': importance})
feature_importance = feature_importance.sort_values('Importance', ascending=False)

feature_importance

Unnamed: 0,Feature,Importance
39,"TMZ__agg_autocorrelation__f_agg_""var""__maxlag_40",370
0,"FMY__agg_autocorrelation__f_agg_""var""__maxlag_40",343
1,"FMY__agg_linear_trend__attr_""intercept""__chunk...",309
48,"FMZ__fft_coefficient__attr_""real""__coeff_37",307
40,TMX__partial_autocorrelation__lag_2,300
43,"TMX__agg_autocorrelation__f_agg_""median""__maxl...",298
31,"FMX__fft_coefficient__attr_""abs""__coeff_40",292
37,"TMX__change_quantiles__f_agg_""mean""__isabs_Tru...",289
17,"FMY__fft_coefficient__attr_""abs""__coeff_14",287
21,FMY__partial_autocorrelation__lag_4,279


In [106]:
feature_importance.to_excel('lightGBM_mutual_features.xlsx')