In [31]:
import sys
import os
sys.path.append(os.path.abspath("../.."))

from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.stats import f_oneway
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split


from sklearn.decomposition import PCA


### Carregando os dados integralmente

In [None]:
# Read the whole workbook at this relative path into `df_agg_cod`;
# every later cell in this notebook derives its data from this frame.
hs_windows_xlsx = '../../datasets/base_vertical/intermediate/HS_agregado_codificado_20windows.xlsx'
df_agg_cod = pd.read_excel(hs_windows_xlsx)

### Dividindo os grupos (Heart Sound ID) em treino/validação/teste

#### Passo 1: obter a classe dominante (Normal ou não) por 'Heart Sound ID'

Assumimos: se 'Heart Sound Type_Normal' == 1 -> Normal; senão -> Anormal

In [14]:
# Work on a separate frame whose group key is a string, so `df_agg_cod`
# itself is left unmodified.
df = df_agg_cod.astype({'Heart Sound ID': str})


def _dominant_label(labels):
    """Return the most frequent value in `labels` as an int.

    `Series.mode()` returns its values sorted, so on a tie the smallest
    value is the one picked.
    """
    return int(labels.mode()[0])


# One row per 'Heart Sound ID' with that group's dominant
# 'Heart Sound Type_Normal' value.
group_labels_df = (
    df.groupby('Heart Sound ID')['Heart Sound Type_Normal']
    .agg(_dominant_label)
    .reset_index()
)

# Two Series sharing the same row order/index: group ids and their labels.
group_ids, group_labels = (
    group_labels_df[column]
    for column in ('Heart Sound ID', 'Heart Sound Type_Normal')
)


#### Passo 2: dividindo os grupos em treino+val e teste com estratificação


In [15]:

# Hold out 20% of the groups as the test set, stratified on the per-group
# label. Splitting group ids (not rows) keeps every row of a group on the
# same side of the split. The seed is fixed for reproducibility.
trainval_test_parts = train_test_split(
    group_ids,
    stratify=group_labels,
    test_size=0.2,
    random_state=42,
)
group_trainval, group_test = trainval_test_parts


#### Passo 3: Obtendo labels para o subset de treino+validação

In [16]:

# Labels for the train+validation groups, in the SAME ORDER as `group_trainval`.
#
# `train_test_split` returns its subsets shuffled, and it consumes `stratify`
# positionally (no index alignment). Filtering `group_labels_df` with a boolean
# mask keeps the original row order, so the labels would not line up with the
# shuffled `group_trainval` and the next split would be stratified on the
# wrong labels. Selecting by `group_trainval.index` keeps both in lock-step.
trainval_mask = group_labels_df['Heart Sound ID'].isin(group_trainval)
group_labels_trainval = group_labels_df.loc[
    group_trainval.index, 'Heart Sound Type_Normal'
]

# Guard: the label at position i must belong to the group at position i.
assert (
    group_labels_df.loc[group_trainval.index, 'Heart Sound ID'].to_numpy()
    == group_trainval.to_numpy()
).all(), "group_labels_trainval is not aligned with group_trainval"
# The old mask-based selection covered exactly the same set of groups.
assert int(trainval_mask.sum()) == len(group_labels_trainval)


#### Passo 4: Dividir treino e validação com estratificação


In [17]:

# Carve the validation groups out of the train+validation pool: 25% of the
# pool goes to validation, stratified on `group_labels_trainval`, with the
# seed fixed for reproducibility.
train_val_parts = train_test_split(
    group_trainval,
    stratify=group_labels_trainval,
    test_size=0.25,
    random_state=42,
)
group_train, group_val = train_val_parts


#### Passo 5: Filtrar os DataFrames com base nas divisões


In [18]:

# Materialise one independent row-level frame per split by keeping the rows
# whose 'Heart Sound ID' belongs to that split's set of group ids.
df_train, df_val, df_test = (
    df[df['Heart Sound ID'].isin(split_ids)].copy()
    for split_ids in (group_train, group_val, group_test)
)


##### Salvando

In [19]:

# Persist the three stratified splits as CSV files under `output_dir`.
output_dir = '../../datasets/base_vertical/'

train_path = os.path.join(output_dir, "train/train_stratified.csv")
val_path = os.path.join(output_dir, "val/val_stratified.csv")
test_path = os.path.join(output_dir, "test/test_stratified.csv")

for split_frame, split_path in (
    (df_train, train_path),
    (df_val, val_path),
    (df_test, test_path),
):
    # Each file lives in its own sub-folder (train/, val/, test/). Creating
    # only `output_dir` is not enough: `to_csv` does not create missing parent
    # directories. `makedirs` also creates `output_dir` itself as a parent.
    os.makedirs(os.path.dirname(split_path), exist_ok=True)
    split_frame.to_csv(split_path, index=False)

# Displayed as the cell output so the written locations are visible.
train_path, val_path, test_path

('../../datasets/base_vertical/train/train_stratified.csv',
 '../../datasets/base_vertical/val/val_stratified.csv',
 '../../datasets/base_vertical/test/test_stratified.csv')

### Usando GroupKFold

In [None]:

# Write a grouped K-fold version of the data: for each fold, one train CSV and
# one validation CSV, with all rows of a 'Heart Sound ID' kept on one side.
n_splits = 5

gkf = GroupKFold(n_splits=n_splits)

groups = df['Heart Sound ID']
X = df.drop(columns=["Heart Sound ID"])
# Used only as a placeholder for the split call; another label column can be
# substituted depending on the task.
y = df["Heart Sound Type_Normal"]

# Built with os.path.join so the folders are created correctly on every OS.
# A literal "..\\..\\datasets\\..." is a single, oddly named directory on
# POSIX systems instead of a nested path.
groupkfold_dir = os.path.join("..", "..", "datasets", "base_vertical", "groupkfold")

fold_paths = []
for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups)):
    # Fold-local names: do NOT reuse df_train / df_val / train_path / val_path,
    # which hold the stratified split produced by the earlier cells.
    fold_train = df.iloc[train_idx]
    fold_val = df.iloc[val_idx]

    fold_dir = os.path.join(groupkfold_dir, f"fold_{fold+1}")
    os.makedirs(fold_dir, exist_ok=True)

    fold_train_path = os.path.join(fold_dir, "train.csv")
    fold_val_path = os.path.join(fold_dir, "val.csv")

    fold_train.to_csv(fold_train_path, index=False)
    fold_val.to_csv(fold_val_path, index=False)

    fold_paths.append((fold_train_path, fold_val_path))

# Displayed as the cell output: list of (train_csv, val_csv) per fold.
fold_paths

[('..\\..\\datasets\\base_vertical\\groupkfold\\fold_1\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_1\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_2\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_2\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_3\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_3\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_4\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_4\\val.csv'),
 ('..\\..\\datasets\\base_vertical\\groupkfold\\fold_5\\train.csv',
  '..\\..\\datasets\\base_vertical\\groupkfold\\fold_5\\val.csv')]

### Normalizando os dados

In [None]:
# Standardise the non-binary feature columns and save the scaled train and
# validation tables.
#
# NOTE(review): X_train, X_val and y_val are not defined in any cell visible
# above this one - confirm which cell creates them so the notebook survives
# "Restart Kernel -> Run All".

# A column counts as a dummy when its non-null values are all 0/1 and it has
# at most two distinct values; every other column is treated as continuous.
dummy_cols = [col for col in df_agg_cod.columns
              if df_agg_cod[col].dropna().isin([0, 1]).all() and df_agg_cod[col].nunique() <= 2]
non_dummy_cols = [col for col in df_agg_cod.columns if col not in dummy_cols]

sca = StandardScaler()

# Fit the scaler on the training rows only, then reuse it for validation.
# The scaler returns a bare array, so the original row index is passed back
# explicitly. Without it the frames get a fresh RangeIndex, and any later
# index-aligned assignment of labels (`assign(...=y_val)` below, or
# `X_train[...] = y_train`) pairs rows with the wrong labels or yields NaN.
# NOTE(review): only `non_dummy_cols` are kept, so the dummy columns are
# absent from the scaled frames - confirm this is intended.
X_train_scaled = sca.fit_transform(X_train[non_dummy_cols])
X_train = pd.DataFrame(X_train_scaled, columns=non_dummy_cols, index=X_train.index)


X_val_scaled = sca.transform(X_val[non_dummy_cols])
X_val = pd.DataFrame(X_val_scaled, columns=non_dummy_cols, index=X_val.index)


# NOTE(review): the train file is written without the label column while the
# validation file includes it - confirm the asymmetry is intended.
X_train.to_excel(
    "../../datasets/base_vertical/train/train_binario_normalized.xlsx", index=False)
X_val.assign(Heart_Sound_Type_Normal=y_val).to_excel(
    "../../datasets/base_vertical/val/val_binario_normalized.xlsx", index=False)

In [None]:
# Attach the target to the training features, then rank every column by its
# correlation with that target (highest first; NaN entries sort last).
X_train["Heart_Sound_Type_Normal"] = y_train
target_corr = X_train.corr().loc[:, "Heart_Sound_Type_Normal"]
target_corr.sort_values(ascending=False)

Heart_Sound_Type_Normal      1.000000
variacao_curvatura_std       0.071450
comprimento_curva_std        0.059091
mudanca_media_direcao_std    0.058131
mfcc_1_mean                  0.057869
                               ...   
zero_crossing_rate_mean     -0.072901
centro_y_mean               -0.123374
rqa_fs_mean                       NaN
rqa_embedding_dim_mean            NaN
rqa_delay_mean                    NaN
Name: Heart_Sound_Type_Normal, Length: 67, dtype: float64

In [23]:
# Summary statistics for the three RQA columns selected below.
rqa_cols_to_inspect = ["rqa_fs_mean", "rqa_embedding_dim_mean", "rqa_delay_mean"]
X_train.loc[:, rqa_cols_to_inspect].describe()

Unnamed: 0,rqa_fs_mean,rqa_embedding_dim_mean,rqa_delay_mean
count,800.0,800.0,800.0
mean,0.0,0.0,0.0
std,0.0,0.0,0.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,0.0,0.0,0.0
