In [2]:
import sys
import os
sys.path.append(os.path.abspath("../.."))

from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


from sklearn.decomposition import PCA


### Carregando os dados integralmente

In [13]:
# Load the aggregated, encoded 20-window binary dataset in full.
#
# 'Heart Sound Type_Normal' is deliberately kept in `df`: the cells that build
# the per-recording labels read that column from `df`, so deleting it here makes
# them raise KeyError on Restart Kernel -> Run All. Exclude the column
# explicitly at the point where a feature matrix is built instead of mutating
# the loaded frame.
df = pd.read_excel(
    '../../datasets/base_vertical/intermediate/HS_agregado_codificado_20windows_Binary.xlsx')


### **dividindo os grupos (Heart Sound ID) em treino/validação/teste**

#### Passo 1: obter a classe dominante (Normal ou não) por 'Heart Sound ID'

Assumimos: se o valor mais frequente (moda) de 'Heart Sound Type_Normal' no grupo for 1 -> Normal; senão -> Anormal

In [5]:

# Cast the recording identifier to str before grouping on it.
df['Heart Sound ID'] = df['Heart Sound ID'].astype(str)

# One row per recording: the label is the first entry of the mode of
# 'Heart Sound Type_Normal' over that recording's windows, cast to int.
normal_flag_by_recording = df.groupby('Heart Sound ID')['Heart Sound Type_Normal']
group_labels_df = normal_flag_by_recording.agg(
    lambda window_flags: int(window_flags.mode()[0])
).reset_index()

# Parallel Series (same row order) consumed by the stratified split.
group_labels = group_labels_df['Heart Sound Type_Normal']
group_ids = group_labels_df['Heart Sound ID']


#### Passo 2: dividindo os grupos em treino+val e teste com estratificação


In [6]:

# Hold out 20% of the recordings (not windows) as the test set, stratified on
# the per-recording label; the seed is fixed so the split is reproducible.
group_trainval, group_test = train_test_split(
    group_ids,
    stratify=group_labels,
    test_size=0.2,
    random_state=42,
)


#### Passo 3: Obtendo labels para o subset de treino+validação

In [7]:

# Boolean mask of the recordings that ended up in train+val (kept for reuse).
trainval_mask = group_labels_df['Heart Sound ID'].isin(group_trainval)

# Fetch the labels BY ID, in the exact order of `group_trainval`.
# train_test_split returns its outputs shuffled and consumes `stratify`
# positionally, so labels taken in `group_labels_df` row order (via the boolean
# mask) would be paired with the wrong recordings and the train/val split would
# not actually be stratified.
group_labels_trainval = (
    group_labels_df
    .set_index('Heart Sound ID')['Heart Sound Type_Normal']
    .loc[list(group_trainval)]
)


#### Passo 4: Dividir treino e validação com estratificação


In [8]:

# Carve the validation set out of train+val: 25% of the remaining recordings,
# stratified on their labels, with a fixed seed.
group_train, group_val = train_test_split(
    group_trainval,
    stratify=group_labels_trainval,
    test_size=0.25,
    random_state=42,
)


#### Passo 5: Filtrar os DataFrames com base nas divisões


In [9]:

# Select, for each split, every window whose recording ID belongs to that
# split; .copy() makes each result an independent frame.
recording_id = df['Heart Sound ID']

df_train = df.loc[recording_id.isin(group_train)].copy()
df_val = df.loc[recording_id.isin(group_val)].copy()
df_test = df.loc[recording_id.isin(group_test)].copy()


##### Salvando

In [10]:

# Write the three splits as CSV files under the dataset root.
output_dir = '../../datasets/base_vertical/'

train_path = os.path.join(output_dir, "train/train_stratified.csv")
val_path = os.path.join(output_dir, "val/val_stratified.csv")
test_path = os.path.join(output_dir, "test/test_stratified.csv")

for split_df, split_path in (
    (df_train, train_path),
    (df_val, val_path),
    (df_test, test_path),
):
    # Create the split's own sub-folder (train/, val/, test/), not just the
    # dataset root; otherwise to_csv fails when the sub-folder does not exist.
    os.makedirs(os.path.dirname(split_path), exist_ok=True)
    split_df.to_csv(split_path, index=False)

# Display the written locations as the cell output.
train_path, val_path, test_path

('../../datasets/base_vertical/train/train_stratified.csv',
 '../../datasets/base_vertical/val/val_stratified.csv',
 '../../datasets/base_vertical/test/test_stratified.csv')

### **Divisão em folds estratificados por grupo (StratifiedKFold sobre os Heart Sound IDs)**

#### criando df com rótulo por grupo

In [11]:
# Per-recording label: first entry of the mode of 'Heart Sound Type_Normal'
# over the recording's windows, cast to int.
normal_flag_by_group = df.groupby("Heart Sound ID")["Heart Sound Type_Normal"]
group_label_df = normal_flag_by_group.agg(
    lambda window_flags: int(window_flags.mode()[0])
).reset_index()

# Plain arrays (note: these rebind group_ids / group_labels) so the fold
# indices below can index them positionally.
group_labels = group_label_df["Heart Sound Type_Normal"].values
group_ids = group_label_df["Heart Sound ID"].values

#### StratifiedKFold sobre os grupos

In [12]:
# Stratified 5-fold split over recordings (one entry per Heart Sound ID), so
# all windows of a recording land on the same side of a fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_paths = []

for fold, (train_group_idx, val_group_idx) in enumerate(skf.split(group_ids, group_labels)):
    train_ids = group_ids[train_group_idx]
    val_ids = group_ids[val_group_idx]

    # NOTE: this rebinds df_train / df_val (and train_path / val_path); after
    # the loop they refer to the last fold.
    df_train = df[df["Heart Sound ID"].isin(train_ids)].copy()
    df_val = df[df["Heart Sound ID"].isin(val_ids)].copy()

    # Build the directory from components so the OS-appropriate separator is
    # used; a hard-coded backslash path only works on Windows.
    fold_dir = os.path.join(
        "..", "..", "datasets", "base_vertical", "groupkfold", f"fold_{fold+1}")
    os.makedirs(fold_dir, exist_ok=True)

    train_path = os.path.join(fold_dir, "train.csv")
    val_path = os.path.join(fold_dir, "val.csv")

    df_train.to_csv(train_path, index=False)
    df_val.to_csv(val_path, index=False)

    fold_paths.append((train_path, val_path))

# Display the (train, val) file pair written for each fold.
fold_paths

[('..\\..\\datasets\\base_vertical\\groupkfold\\fold_1\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_1\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_2\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_2\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_3\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_3\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_4\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_4\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_5\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_5\\val.csv')]

### Normalizando os dados

In [None]:
# Partition the columns into 0/1 indicator ("dummy") columns and the rest.
# A column counts as dummy when all of its non-null values are 0 or 1 and it
# has at most two distinct values.
dummy_cols = []
non_dummy_cols = []
for col in df.columns:
    column_values = df[col]
    if column_values.dropna().isin([0, 1]).all() and column_values.nunique() <= 2:
        dummy_cols.append(col)
    else:
        non_dummy_cols.append(col)




# NOTE(review): disabled draft of the scaling step. It cannot run as written:
# train_test_split() is called with no arguments, and X_val / y_val are never
# assigned in the visible cells. Either finish it or remove it.
# X_train, X_test, y_train, y_test = train_test_split()

# sca = StandardScaler()

# X_train_scaled = sca.fit_transform(X_train[non_dummy_cols])
# X_train = pd.DataFrame(X_train_scaled, columns=non_dummy_cols)


# X_val_scaled = sca.transform(X_val[non_dummy_cols])
# X_val = pd.DataFrame(X_val_scaled, columns=non_dummy_cols)


# X_train.to_excel(
#     "../../datasets/base_vertical/train/train_binario_normalized.xlsx", index=False)
# X_val.assign(Heart_Sound_Type_Normal=y_val).to_excel(
#     "../../datasets/base_vertical/val/val_binario_normalized.xlsx", index=False)

In [16]:
# Presumably splits the frame into features X and target y, using
# 'Heart Sound Type_Abnormal' as the target column - confirm against the helper.
# The complementary one-hot column 'Heart Sound Type_Normal' is dropped here
# (if it is still present) so it can never end up in X and leak the label;
# errors='ignore' makes this a no-op when the column is already gone.
# NOTE(review): get_splited_data is not defined or imported in the visible
# cells - confirm where it comes from so the notebook runs on a fresh kernel.
X, y = get_splited_data(
    df.drop(columns=['Heart Sound Type_Normal'], errors='ignore'),
    'Heart Sound Type_Abnormal',
)

In [19]:
# Display the X frame returned by the previous cell.
X

Unnamed: 0,Heart Sound ID,window_id,freq_mean,centro_x_mean,centro_y_mean,raio_medio_mean,raio_std_mean,raio_max_mean,raio_min_mean,simetria_x_mean,...,auto_intersecoes_std,rms_energy_std,Gender_F,Gender_M,Location_Apex,Location_LC,Location_LLSB,Location_LUSB,Location_RC,Location_RUSB
0,F_AF_A,0,164.354167,-0.000004,0.000024,0.030488,0.029553,0.221206,0.000223,0.029517,...,349.096316,0,1,0,1,0,0,0,0,0
1,F_AF_A,1,165.967851,-0.000125,0.000078,0.029203,0.027863,0.197124,0.000237,0.029405,...,384.229543,0,1,0,1,0,0,0,0,0
2,F_AF_A,2,134.653533,0.000016,0.000108,0.039275,0.039114,0.466807,0.000273,0.038954,...,358.621247,0,1,0,1,0,0,0,0,0
3,F_AF_A,3,128.112793,-0.000062,-0.000403,0.042433,0.039972,0.466564,0.000424,0.040655,...,334.691566,0,1,0,1,0,0,0,0,0
4,F_AF_A,4,148.798077,-0.000023,-0.000161,0.050374,0.046386,0.466689,0.000442,0.048990,...,170.214114,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,M_T_LUSB,15,72.182513,-0.000078,-0.000060,0.074462,0.100202,0.706487,0.000553,0.074324,...,464.743882,0,0,1,0,0,0,1,0,0
996,M_T_LUSB,16,84.000000,-0.000307,-0.000269,0.089283,0.124158,0.705161,0.000713,0.096586,...,590.804500,0,0,1,0,0,0,1,0,0
997,M_T_LUSB,17,94.170673,-0.003138,0.004882,0.101705,0.141314,0.702215,0.000741,0.101515,...,116.964978,0,0,1,0,0,0,1,0,0
998,M_T_LUSB,18,89.746094,0.000609,-0.003273,0.102482,0.143831,0.699830,0.000990,0.109614,...,188.629266,0,0,1,0,0,0,1,0,0


In [None]:
# Correlation of every feature with the target, strongest positive first.
# The target is attached to a temporary copy via .assign() rather than written
# into X_train, so the feature matrix never carries its own label into later
# cells. Constant columns show up as NaN.
# NOTE(review): X_train / y_train are not assigned in the visible cells -
# confirm which cell defines them and that y_train really is the "Normal" flag.
train_with_target = X_train.assign(Heart_Sound_Type_Normal=y_train)
train_with_target.corr()["Heart_Sound_Type_Normal"].sort_values(ascending=False)

Heart_Sound_Type_Normal      1.000000
variacao_curvatura_std       0.071450
comprimento_curva_std        0.059091
mudanca_media_direcao_std    0.058131
mfcc_1_mean                  0.057869
                               ...   
zero_crossing_rate_mean     -0.072901
centro_y_mean               -0.123374
rqa_fs_mean                       NaN
rqa_embedding_dim_mean            NaN
rqa_delay_mean                    NaN
Name: Heart_Sound_Type_Normal, Length: 67, dtype: float64

In [23]:
# Summary statistics for the three RQA columns whose correlation with the
# target came out as NaN.
rqa_nan_corr_cols = ["rqa_fs_mean", "rqa_embedding_dim_mean", "rqa_delay_mean"]
X_train.loc[:, rqa_nan_corr_cols].describe()

Unnamed: 0,rqa_fs_mean,rqa_embedding_dim_mean,rqa_delay_mean
count,800.0,800.0,800.0
mean,0.0,0.0,0.0
std,0.0,0.0,0.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,0.0,0.0,0.0
