# CHB dataset 2

Gerando dataset `chb01dwt.csv`

> `neg` e `pos` em variáveis se referem às classes (`target`):
    
- 0: negativo
- 1: positivo

## Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd

from pywt import wavedec
from zipfile import ZipFile
from statsmodels.robust.scale import mad as medianAD

## Definindo função para extração de _features_

In [2]:
# Colunas do Dataset
decomposition_level = 5

channels = [column for column in range(18)]

dwt_coefs = [f'A{decomposition_level}'] + [f'D{level}' for level in range(1, decomposition_level + 1)]

stats = ['std', 'mean', 'skew', 'kurt', 'meanAD', 'medianAD', 'energy']

dataset_labels = [f'{feat}-{coef}-{column}' for column in channels for coef in dwt_coefs for feat in stats] + ['target']

def energy(arr:np.ndarray):
    return (arr ** 2).sum()

def extract_statistical_features(matrix: np.ndarray, target: np.float64):
    '''
    É esperado como entrada uma matrix de dimensões 18 x 512.

    Durante a execução, para cada uma das 18 colunas,
    é gerado uma lista de 6 arrays de tamanhos variados
    em função da wavedec.
    De cada um dos 6 arrays é extraido 7 features.

    um dataframe statsDF de dimensões 1 x 757 é retornado
    com as features calculadas.
    '''
    statsDF = pd.DataFrame(columns=dataset_labels)
    
    for i, column in enumerate(matrix):
        wavelet_coefs = wavedec(data=column, wavelet='db2', level=decomposition_level)

        for coef, coef_label in zip(wavelet_coefs, dwt_coefs):
            coefDF = pd.DataFrame(data=coef, dtype=np.float64)

            statsDF[f'std-{coef_label}-{i}'] = coefDF.std()
            statsDF[f'mean-{coef_label}-{i}'] = coefDF.mean()
            statsDF[f'skew-{coef_label}-{i}'] = coefDF.skew()
            statsDF[f'kurt-{coef_label}-{i}'] = coefDF.kurt()
            statsDF[f'meanAD-{coef_label}-{i}'] = coefDF.mad()
            statsDF[f'medianAD-{coef_label}-{i}'] = medianAD(coefDF)
            statsDF[f'energy-{coef_label}-{i}'] = energy(coef)

    statsDF['target'] = target
    return statsDF

## Carregando matrizes de arquivo zip

In [3]:
with ZipFile('chb01.zip', 'r') as data:
    # Cria uma lista com os nomes dos arquivos dentro do zip e os ordena
    file_list = data.namelist()
    file_list.sort()

    pos_list = [pos for pos in file_list if ('chb01/positive/' in pos)]
    neg_list = [file_list[i] for i in range(len(pos_list))]

    print(f'pos_list: {len(pos_list)}\tneg_list: {len(neg_list)}')

    pos_space, neg_space = [], []
    
    # Cada arquivo é uma matriz que será salva nas listas {pos, neg}_space
    for pos_file, neg_file in zip(pos_list, neg_list):
        with data.open(name=pos_file, mode='r') as pos, data.open(name=neg_file, mode='r') as neg:
            pos_space.append(np.load(pos))
            neg_space.append(np.load(neg))

    # Convertendo listas para arrays
    pos_space = np.array(pos_space, dtype=np.float64)
    neg_space = np.array(neg_space, dtype=np.float64)
    
    print(f'pos_space.shape: {pos_space.shape}\tneg_space.shape: {neg_space.shape}')
    print(f'pos_space.dtype: {pos_space.dtype}\tneg_space.dtype: {neg_space.dtype}')

pos_list: 216	neg_list: 216
pos_space.shape: (216, 18, 512)	neg_space.shape: (216, 18, 512)
pos_space.dtype: float64	neg_space.dtype: float64


## Extraindo atributos e gerando dataset

In [4]:
dataset = pd.DataFrame(columns=dataset_labels)

for neg_matrix in neg_space:
    dataset = dataset.append(
        extract_statistical_features(matrix=neg_matrix, target=0),
        ignore_index=True
    )

for pos_matrix in pos_space:
    dataset = dataset.append(
        extract_statistical_features(matrix=pos_matrix, target=1),
        ignore_index=True
    )

dataset

Unnamed: 0,std-A5-0,mean-A5-0,skew-A5-0,kurt-A5-0,meanAD-A5-0,medianAD-A5-0,energy-A5-0,std-D1-0,mean-D1-0,skew-D1-0,...,medianAD-D4-17,energy-D4-17,std-D5-17,mean-D5-17,skew-D5-17,kurt-D5-17,meanAD-D5-17,medianAD-D5-17,energy-D5-17,target
0,3.593364,0.216540,-0.094793,-0.433591,2.926054,3.511545,220.352545,1.649358,-0.241962,1.336603,...,0.246915,12.573883,0.181974,0.003167,9.724373,133.300933,0.082040,0.083934,8.479898,0
1,1.816498,-0.295746,-0.291568,-0.089431,1.388092,1.470678,57.668709,1.314164,-0.060795,-0.387001,...,0.292721,12.146831,0.092132,-0.003614,-0.218151,0.624082,0.071081,0.080088,2.176368,0
2,1.033265,-0.222589,-0.128894,-0.019169,0.798095,1.126226,19.041665,0.550211,-0.093055,0.467804,...,0.235618,7.855882,0.076636,-0.002947,-0.147173,0.124414,0.061075,0.076758,1.505739,0
3,6.742059,-0.973794,1.032822,3.348853,4.153082,2.823792,789.810078,2.680230,-0.281228,-0.385256,...,0.238205,6.185847,0.068992,0.001586,-0.118231,0.113237,0.054806,0.066955,1.219180,0
4,2.674359,0.606153,1.014332,0.611186,2.028158,1.912554,128.200885,2.433295,0.470605,0.662752,...,0.333727,32.457713,0.226774,-0.003516,1.130033,13.504922,0.138429,0.124622,13.168395,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,11.641523,-1.442873,-0.672817,-0.150385,9.404927,11.411076,2341.399848,3.168720,-0.095671,0.046638,...,0.169242,3.472446,0.082743,0.000016,-0.765109,5.477428,0.059849,0.063605,1.752697,1
428,5.727706,0.690229,-0.314147,-0.232164,4.526871,5.207138,566.287925,4.525284,-0.027240,0.359718,...,0.211461,5.144885,0.074138,0.001392,-0.153234,0.346508,0.056782,0.063714,1.407589,1
429,6.815891,-0.970531,0.039953,0.297670,5.250815,5.782513,806.713046,3.926995,-0.258531,0.015235,...,0.209638,5.621385,0.109079,0.003742,2.761773,23.929858,0.071894,0.077642,3.049523,1
430,4.284559,0.094329,-0.121168,-1.153871,3.681980,5.208136,312.236815,5.096354,0.381901,0.742932,...,0.189870,4.830896,0.086006,0.001491,-0.061883,0.680067,0.065126,0.069663,1.894228,1


## Salvando dataset em arquivo csv

In [5]:
dataset.to_csv(path_or_buf='chb01dwt.csv', index=False)