# CHB dataset

Gerando dataset `chb01.csv`

> Nos nomes das variáveis, `neg` e `pos` se referem às classes (`target`):
    
- 0: negativo
- 1: positivo

## Importando bibliotecas

In [1]:
import numpy as np
import pandas as pd

from zipfile import ZipFile
from statsmodels.robust.scale import mad as medianAD

## Definindo função para extração de _features_

In [2]:
# Dataset column names: for each index 0..17 the seven statistic names in
# turn ('std-0', 'mean-0', ..., 'energy-17'), followed by 'target' last.
labels = [
    f'{feature}-{column}'
    for column in range(18)
    for feature in ('std', 'mean', 'skew', 'kurt', 'meanAD', 'medianAD', 'energy')
]
labels.append('target')

def extract_statistical_features(matrix: np.ndarray, target: np.float64) -> pd.DataFrame:
    '''
    Build a one-row feature DataFrame from a single matrix.

    The input is expected to be an 18 x 512 matrix. Seven statistics
    (std, mean, skew, kurt, mean absolute deviation, median absolute
    deviation and energy) are computed along each row of the matrix,
    producing an intermediate ``statsDF`` of shape 18 x 7.

    Parameters
    ----------
    matrix : np.ndarray
        Expected shape 18 x 512.
    target : np.float64
        Class label stored in the last column (0: negative, 1: positive).

    Returns
    -------
    pd.DataFrame
        A DataFrame with 1 row and 127 columns: 126 (18 * 7, from
        ``statsDF``) + 1 (the target), named after the module-level
        ``labels`` list.
    '''

    # Transpose so that every row of ``matrix`` becomes a DataFrame column;
    # the column-wise pandas reductions below then give one value per row.
    matrixDF = pd.DataFrame(data=matrix).transpose()
    statsDF = pd.DataFrame()

    statsDF['std'] = matrixDF.std()
    statsDF['mean'] = matrixDF.mean()
    statsDF['skew'] = matrixDF.skew()
    statsDF['kurt'] = matrixDF.kurt()
    # DataFrame.mad() was removed in pandas 2.0; this expression is the
    # equivalent mean absolute deviation around the mean.
    statsDF['meanAD'] = (matrixDF - matrixDF.mean()).abs().mean()
    # Median absolute deviation as implemented by statsmodels' ``mad``.
    statsDF['medianAD'] = medianAD(matrixDF)
    # Energy: sum of squared samples along each row of the matrix.
    statsDF['energy'] = (matrix ** 2).sum(axis=1)

    # Row-major flattening yields all seven statistics of row 0, then of
    # row 1, ... which is the order used by ``labels``.
    column = list(statsDF.values.ravel()) + [target]

    rowDF = pd.DataFrame(data=column, index=labels).transpose()

    return rowDF

## Carregando Matrizes de arquivo zip

In [3]:
with ZipFile('chb01.zip', 'r') as data:
    # Sorted list of archive members. Directory entries (names ending in '/')
    # are skipped because they cannot be loaded as arrays.
    file_list = sorted(name for name in data.namelist() if not name.endswith('/'))

    pos_list = [name for name in file_list if 'chb01/positive/' in name]
    # Negatives are chosen by path (everything outside the positive folder)
    # instead of by position in the sorted list, so a positive file can never
    # be loaded as a negative. Only as many negatives as positives are kept.
    neg_list = [name for name in file_list if 'chb01/positive/' not in name][:len(pos_list)]

    print(f'pos_list: {len(pos_list)}\tneg_list: {len(neg_list)}')

    pos_space, neg_space = [], []

    # Each file is a matrix that is stored in the {pos, neg}_space lists.
    # The two lists are loaded independently so that a length mismatch does
    # not silently drop files from the longer one.
    for names, space in ((pos_list, pos_space), (neg_list, neg_space)):
        for name in names:
            with data.open(name=name, mode='r') as member:
                space.append(np.load(member))

    # Convert the lists to arrays
    pos_space = np.array(pos_space, dtype=np.float64)
    neg_space = np.array(neg_space, dtype=np.float64)

    print(f'pos_space.shape: {pos_space.shape}\tneg_space.shape: {neg_space.shape}')
    print(f'pos_space.dtype: {pos_space.dtype}\tneg_space.dtype: {neg_space.dtype}')

pos_list: 216	neg_list: 216
pos_space.shape: (216, 18, 512)	neg_space.shape: (216, 18, 512)
pos_space.dtype: float64	neg_space.dtype: float64


## Extraindo atributos

In [4]:
# One feature row per matrix: negatives first (target=0), then positives
# (target=1).
rows = [extract_statistical_features(matrix=neg_matrix, target=0) for neg_matrix in neg_space]
rows += [extract_statistical_features(matrix=pos_matrix, target=1) for pos_matrix in pos_space]

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; concatenate all rows once instead. pd.concat raises on an empty
# list, so fall back to an empty frame that still has the expected columns.
dataset = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=labels)

dataset

Unnamed: 0,std-0,mean-0,skew-0,kurt-0,meanAD-0,medianAD-0,energy-0,std-1,mean-1,skew-1,...,medianAD-16,energy-16,std-17,mean-17,skew-17,kurt-17,meanAD-17,medianAD-17,energy-17,target
0,0.627303,0.172795,0.769000,1.204490,0.495133,0.565023,216.370324,0.693649,0.203381,-0.461872,...,0.940461,569.233117,1.270521,-0.259197,-0.541244,0.361269,0.965853,0.970435,859.265535,0.0
1,0.620640,-0.002412,0.586425,1.930250,0.468183,0.552915,196.837180,0.685201,-0.063472,0.348192,...,0.914337,463.816666,1.044232,-0.077918,0.025368,-0.525171,0.850698,1.122932,560.313689,0.0
2,0.373727,-0.000243,0.162980,0.222483,0.292714,0.355157,71.372244,0.512936,0.089147,0.361823,...,1.188638,787.382472,0.943828,0.073044,0.194259,-0.468731,0.751064,0.914981,457.936278,0.0
3,1.180971,0.026585,2.207182,7.923325,0.636369,0.411660,713.049792,0.495783,0.011315,-0.638179,...,1.175576,539.007678,1.047116,-0.131575,0.152097,-0.600635,0.846984,1.122932,569.150561,0.0
4,1.367195,0.083176,1.601121,7.791932,0.813831,0.669956,958.715201,2.371494,-0.086806,-0.012601,...,0.914337,625.666169,0.977431,0.120894,-0.348281,0.281433,0.766334,0.914981,495.677609,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
427,1.734864,0.189973,-0.434409,-0.196803,1.413317,1.785240,1556.462688,1.348079,0.162067,-0.468730,...,1.933745,2382.989550,1.613827,0.083167,0.047172,-0.046523,1.271779,1.606071,1334.409583,1.0
428,1.389863,0.119624,0.235493,-0.321001,1.131378,1.542033,994.435246,2.138321,-0.062255,0.139719,...,1.395307,1450.315373,1.115923,-0.266775,-0.394510,0.305225,0.855908,0.918414,672.778310,1.0
429,1.494870,-0.068467,0.029137,0.312264,1.157461,1.366097,1144.299163,1.730368,-0.099793,0.248734,...,2.257966,2693.343104,1.252646,-0.415135,-0.405169,-0.385152,1.011867,1.329162,890.058502,1.0
430,1.363770,-0.082891,0.571057,1.272426,1.059180,1.262605,953.910072,1.603137,-0.282140,-0.225029,...,1.557418,1267.710183,1.562439,-0.118513,0.256041,-0.342590,1.281636,1.679914,1254.652481,1.0


## Salvando dataset em arquivo csv

In [5]:
# Write the assembled dataset to disk, leaving the row index out of the file.
dataset.to_csv('chb01.csv', index=False)