# Preprocessing

In [1]:
import os
import sys
import math
import random

import numpy as np
import pandas as pd
import scipy.io as sio
from scipy.io import loadmat

from sklearn import preprocessing
from scipy.signal import butter, lfilter

from tensorflow.keras.utils import to_categorical

In [2]:
# 巴特沃斯滤波器
def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter.

    Args:
        lowcut: Lower cutoff frequency, in the same unit as ``fs``.
        highcut: Upper cutoff frequency, in the same unit as ``fs``.
        fs: Sampling frequency of the signal that will be filtered.
        order: Filter order handed to ``scipy.signal.butter``.

    Returns:
        The ``(b, a)`` numerator/denominator coefficients of the filter.
    """
    nyquist = 0.5 * fs
    # scipy expects the band edges normalised by the Nyquist frequency.
    normalized_band = [lowcut / nyquist, highcut / nyquist]
    return butter(order, normalized_band, btype='band')

def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    The coefficients come from ``butter_bandpass`` and are applied with
    ``scipy.signal.lfilter`` (a single forward pass).

    Args:
        data: Signal to filter.
        lowcut: Lower cutoff frequency, in the same unit as ``fs``.
        highcut: Upper cutoff frequency, in the same unit as ``fs``.
        fs: Sampling frequency of ``data``.
        order: Filter order forwarded to ``butter_bandpass``.

    Returns:
        The filtered signal as returned by ``lfilter``.
    """
    numerator, denominator = butter_bandpass(lowcut, highcut, fs, order=order)
    return lfilter(numerator, denominator, data)

In [3]:
# 读取文件
def read_file(file):
    """Load the raw ``data`` array from a MATLAB ``.mat`` file.

    The file is read once; the previous implementation loaded it a second
    time only to build an unused debugging value.

    NOTE: a later cell in this file defines another ``read_file`` that
    returns four arrays. Whichever definition ran last is the one in effect.

    Args:
        file: Path of the ``.mat`` file to read.

    Returns:
        The array stored under the ``'data'`` key.

    Raises:
        KeyError: If the file has no ``'data'`` variable.
    """
    return sio.loadmat(file)['data']

In [4]:
def compute_DE(signal):
    """Return the differential entropy estimate of ``signal``.

    The estimate is ``0.5 * ln(2 * pi * e * variance)``, i.e. the
    differential entropy of a Gaussian with the signal's variance.

    Args:
        signal: Samples of one window of the signal.

    Returns:
        The differential entropy as a float.
    """
    # ddof=1 selects the unbiased sample variance.
    sample_variance = np.var(signal, ddof=1)
    gaussian_scale = 2 * math.pi * math.e
    return 0.5 * math.log(gaussian_scale * sample_variance)

In [5]:
def decompose(file):
    """Extract differential-entropy (DE) features from one subject file.

    For each of the 40 trials and each of the first 32 (EEG) channels, the
    signal is split into a 3 s baseline (first 384 samples at 128 Hz) and
    the trial signal that follows. Both parts are band-pass filtered into
    theta (4-8 Hz), alpha (8-14 Hz), beta (14-31 Hz) and gamma (31-45 Hz),
    and the DE of every 0.5 s (64-sample) window is computed.

    Fix: the alpha, beta and gamma baseline averages used to take their
    last three windows from the theta-filtered baseline (a copy-paste
    slip). Every band now averages its own six baseline windows.

    NOTE: this relies on the ``read_file`` variant that returns the raw
    ``data`` array indexed as trial x channel x sample.

    Args:
        file: Path of the subject's ``.mat`` file.

    Returns:
        A tuple ``(base_DE, decomposed_de)``:
        ``base_DE`` has shape (40, 128) and holds the mean baseline DE;
        ``decomposed_de`` has shape (40, 120, 128) and holds the DE of each
        trial window. The 128 features are laid out band by band
        (theta, alpha, beta, gamma), with 32 channels inside each band.
    """
    print("decompose:")

    frequency = 128      # sampling rate in Hz
    window = 64          # 0.5 s analysis window
    start_index = 384    # 3 s pre-trial baseline: 128 Hz x 3 s
    n_trials = 40        # one trial per video
    n_channels = 32      # the first 32 channels are EEG
    n_windows = 120      # 60 s of trial signal / 0.5 s
    bands = ((4, 8), (8, 14), (14, 31), (31, 45))  # theta, alpha, beta, gamma

    data = read_file(file)
    n_features = len(bands) * n_channels
    n_base_windows = start_index // window

    # Preallocate instead of growing arrays with np.append / np.vstack.
    base_DE = np.empty((n_trials, n_features))
    decomposed_de = np.empty((n_trials, n_windows, n_features))

    for trial in range(n_trials):
        for channel in range(n_channels):
            base_signal = data[trial, channel, :start_index]
            trial_signal = data[trial, channel, start_index:]
            for band_index, (lowcut, highcut) in enumerate(bands):
                # Band-major layout: all channels of theta, then alpha, ...
                column = band_index * n_channels + channel

                base_band = butter_bandpass_filter(
                    base_signal, lowcut, highcut, frequency, order=3)
                base_windows = _windowed_de(base_band, window, n_base_windows)
                base_DE[trial, column] = sum(base_windows) / n_base_windows

                trial_band = butter_bandpass_filter(
                    trial_signal, lowcut, highcut, frequency, order=3)
                decomposed_de[trial, :, column] = _windowed_de(
                    trial_band, window, n_windows)
        # Progress output, one line per finished trial.
        print("base_DE:", (trial + 1, n_features))

    print("base_DE shape:", base_DE.shape)
    print("trial_DE shape:", decomposed_de.shape)
    print("")
    return base_DE, decomposed_de


def _windowed_de(band_signal, window, count):
    """Return the DE of the first ``count`` consecutive windows of a signal."""
    return [
        compute_DE(band_signal[index * window:(index + 1) * window])
        for index in range(count)
    ]

In [6]:
def compute_PSD(signal):
    """Return the energy of ``signal``: the sum of its squared samples.

    The former ``if True: ... else: ...`` wrapper is gone; its ``else``
    branch (a differential-entropy formula) could never run.

    Args:
        signal: Samples of one window of a band-filtered signal.

    Returns:
        The sum of squared samples as a NumPy scalar.
    """
    return np.sum(np.square(signal))

In [7]:
def decompose_PSD(file):
    """Extract band-energy ("PSD") features from one subject file.

    Same pipeline as the DE extraction, but every window is summarised with
    ``compute_PSD``. For each of the 40 trials and each of the first 32
    (EEG) channels, the 3 s baseline (first 384 samples at 128 Hz) and the
    following trial signal are band-pass filtered into theta (4-8 Hz),
    alpha (8-14 Hz), beta (14-31 Hz) and gamma (31-45 Hz) and split into
    0.5 s (64-sample) windows.

    Fix: the alpha, beta and gamma baseline averages used to take their
    last three windows from the theta-filtered baseline (a copy-paste
    slip). Every band now averages its own six baseline windows.

    NOTE: this relies on the ``read_file`` variant that returns the raw
    ``data`` array indexed as trial x channel x sample.

    Args:
        file: Path of the subject's ``.mat`` file.

    Returns:
        A tuple ``(base_psd, trial_psd)``:
        ``base_psd`` has shape (40, 128) and holds the mean baseline value;
        ``trial_psd`` has shape (40, 120, 128) and holds the value of each
        trial window. The 128 features are laid out band by band
        (theta, alpha, beta, gamma), with 32 channels inside each band.
    """
    print("decompose:")

    frequency = 128      # sampling rate in Hz
    window = 64          # 0.5 s analysis window
    start_index = 384    # 3 s pre-trial baseline: 128 Hz x 3 s
    n_trials = 40        # one trial per video
    n_channels = 32      # the first 32 channels are EEG
    n_windows = 120      # 60 s of trial signal / 0.5 s
    bands = ((4, 8), (8, 14), (14, 31), (31, 45))  # theta, alpha, beta, gamma

    data = read_file(file)
    n_features = len(bands) * n_channels
    n_base_windows = start_index // window

    # Preallocate instead of growing arrays with np.append / np.vstack.
    base_psd = np.empty((n_trials, n_features))
    trial_psd = np.empty((n_trials, n_windows, n_features))

    for trial in range(n_trials):
        for channel in range(n_channels):
            base_signal = data[trial, channel, :start_index]
            trial_signal = data[trial, channel, start_index:]
            for band_index, (lowcut, highcut) in enumerate(bands):
                # Band-major layout: all channels of theta, then alpha, ...
                column = band_index * n_channels + channel

                base_band = butter_bandpass_filter(
                    base_signal, lowcut, highcut, frequency, order=3)
                base_windows = _windowed_psd(base_band, window, n_base_windows)
                base_psd[trial, column] = sum(base_windows) / n_base_windows

                trial_band = butter_bandpass_filter(
                    trial_signal, lowcut, highcut, frequency, order=3)
                trial_psd[trial, :, column] = _windowed_psd(
                    trial_band, window, n_windows)

    print("base_PSD shape:", base_psd.shape)
    print("trial_PSD shape:", trial_psd.shape)
    print("")
    return base_psd, trial_psd


def _windowed_psd(band_signal, window, count):
    """Return ``compute_PSD`` of the first ``count`` consecutive windows."""
    return [
        compute_PSD(band_signal[index * window:(index + 1) * window])
        for index in range(count)
    ]

In [None]:
def get_labels(file):
    """Build per-window binary arousal and valence labels for one subject.

    The ``labels`` matrix of the file is read once (columns: 0 valence,
    1 arousal, 2 dominance, 3 liking). A rating strictly greater than 5
    becomes 1, anything else 0, and each trial's label is repeated for its
    120 windows.

    Args:
        file: Path of the subject's ``.mat`` file.

    Returns:
        A tuple ``(arousal_labels, valence_labels)`` of float arrays, each of
        shape (40, 120). Note the order: arousal first.

    Raises:
        IndexError: If the file holds fewer than 40 trials.
    """
    n_trials, n_windows = 40, 120

    # Load once; the original parsed the file separately for each column.
    ratings = sio.loadmat(file)["labels"]
    if ratings.shape[0] < n_trials:
        raise IndexError(
            "expected at least %d trials, found %d" % (n_trials, ratings.shape[0]))

    valence_labels = (ratings[:n_trials, 0] > 5).astype(int)
    arousal_labels = (ratings[:n_trials, 1] > 5).astype(int)

    final_valence_labels = np.empty((n_trials, n_windows))
    final_arousal_labels = np.empty((n_trials, n_windows))
    # Broadcast each trial's label across all of its windows.
    final_valence_labels[:] = valence_labels[:, np.newaxis]
    final_arousal_labels[:] = arousal_labels[:, np.newaxis]

    print("get_labels:")
    print("labels:", final_arousal_labels.shape)
    return final_arousal_labels, final_valence_labels

In [9]:
# Write one DE feature file per subject.
dataset_dir = "raw_data"

result_dir = "preprocess_step1/de/"
# exist_ok replaces the separate isdir check.
os.makedirs(result_dir, exist_ok=True)

# Sorted for a deterministic processing order.
for file in sorted(os.listdir(dataset_dir)):
    # Skip entries that are not MATLAB files (stray folders, other formats).
    if not file.endswith(".mat"):
        continue
    print("processing: ", file, "......")
    file_path = os.path.join(dataset_dir, file)
    base_DE, trial_DE = decompose(file_path)
    arousal_labels, valence_labels = get_labels(file_path)
    sio.savemat(
        os.path.join(result_dir, "DE_" + file),
        {
            "base_data": base_DE,
            "data": trial_DE,
            "valence_labels": valence_labels,
            "arousal_labels": arousal_labels,
        },
    )

processing:  s01.mat ......
decompose:
base_DE: (1, 128)
base_DE: (2, 128)
base_DE: (3, 128)
base_DE: (4, 128)
base_DE: (5, 128)
base_DE: (6, 128)
base_DE: (7, 128)
base_DE: (8, 128)
base_DE: (9, 128)
base_DE: (10, 128)
base_DE: (11, 128)
base_DE: (12, 128)
base_DE: (13, 128)
base_DE: (14, 128)
base_DE: (15, 128)
base_DE: (16, 128)
base_DE: (17, 128)
base_DE: (18, 128)
base_DE: (19, 128)
base_DE: (20, 128)
base_DE: (21, 128)
base_DE: (22, 128)
base_DE: (23, 128)
base_DE: (24, 128)
base_DE: (25, 128)
base_DE: (26, 128)
base_DE: (27, 128)
base_DE: (28, 128)
base_DE: (29, 128)
base_DE: (30, 128)
base_DE: (31, 128)
base_DE: (32, 128)
base_DE: (33, 128)
base_DE: (34, 128)
base_DE: (35, 128)
base_DE: (36, 128)
base_DE: (37, 128)
base_DE: (38, 128)
base_DE: (39, 128)
base_DE: (40, 128)
base_DE shape: (40, 128)
trial_DE shape: (40, 120, 128)



KeyboardInterrupt: 

In [None]:
# Write one PSD feature file per subject.
dataset_dir = "E:/dataset/deap_dataset/data_preprocessed_matlab/"

result_dir = "E:/dataset/deap_dataset/preprocessing_data/psd/"
# exist_ok replaces the separate isdir check.
os.makedirs(result_dir, exist_ok=True)

# Sorted for a deterministic processing order.
for file in sorted(os.listdir(dataset_dir)):
    # Skip entries that are not MATLAB files (stray folders, other formats).
    if not file.endswith(".mat"):
        continue
    print("processing: ", file, "......")
    file_path = os.path.join(dataset_dir, file)
    base_PSD, trial_PSD = decompose_PSD(file_path)
    arousal_labels, valence_labels = get_labels(file_path)
    sio.savemat(
        os.path.join(result_dir, "PSD_" + file),
        {
            "base_data": base_PSD,
            "data": trial_PSD,
            "valence_labels": valence_labels,
            "arousal_labels": arousal_labels,
        },
    )

In [2]:
# data(40*120*128)
# base_data(40*128)
# arousal_labels(40*120)
# valence_labels(40*120)
def read_file(file):
    """Load one per-subject feature file.

    NOTE: an earlier cell in this file defines a ``read_file`` that returns
    only the raw ``data`` array; this definition replaces it once executed.

    Args:
        file: Path of the ``.mat`` feature file.

    Returns:
        A tuple ``(trial_data, base_data, arousal_labels, valence_labels)``
        read from the keys ``data``, ``base_data``, ``arousal_labels`` and
        ``valence_labels``. Per the notes above this cell the expected shapes
        are (40, 120, 128), (40, 128), (40, 120) and (40, 120).
    """
    contents = sio.loadmat(file)
    return (
        contents['data'],
        contents["base_data"],
        contents["arousal_labels"],
        contents["valence_labels"],
    )

##### 计算试验数据和基准数据之间的偏移

In [3]:
def get_vector_deviation(vector1, vector2):
    """Return the difference ``vector1 - vector2``."""
    deviation = vector1 - vector2
    return deviation

def get_dataset_deviation(trial_data, base_data):
    """Subtract each trial's baseline vector from all of its windows.

    The per-window Python loop and repeated ``np.vstack`` are replaced by a
    single broadcast subtraction. The feature length is no longer tied to
    128, and inputs with fewer than 40 trials or 120 windows now work.

    Args:
        trial_data: Array indexed as (trial, window, feature).
        base_data: Array indexed as (trial, feature).

    Returns:
        An array of ``trial_data[i, j] - base_data[i]`` covering at most the
        first 40 trials and 120 windows, with a dtype of at least float64.
        For (40, 120, 128) input the result is (40, 120, 128).
    """
    # Keep the historical limits: only the first 40 trials / 120 windows.
    trials = np.asarray(trial_data)[:40, :120]
    baselines = np.asarray(base_data)[:40]

    # Insert a window axis so one baseline broadcasts over its trial.
    deviation = trials - baselines[:, np.newaxis, :]

    # The old implementation stacked onto a float64 array, so integer input
    # came back as float64; keep that promotion.
    target_dtype = np.result_type(deviation.dtype, np.float64)
    return deviation.astype(target_dtype, copy=False)

In [4]:
def data_1Dto2D(data, Y=8, X=9):
    """Scatter a 32-element vector onto a ``Y`` x ``X`` grid.

    Each of the 32 input values is written to one fixed grid cell; all other
    cells are zero. Presumably the grid mimics the electrode positions on the
    scalp (TODO confirm against the channel order of the dataset).

    Args:
        data: Indexable sequence with at least 32 values.
        Y: Number of grid rows. The layout fills rows 0-7.
        X: Number of grid columns. The layout rows are 9 wide.

    Returns:
        A float array of shape (Y, X); (8, 9) with the defaults.
    """
    blank = None
    # Source index for every grid cell; ``blank`` cells stay at zero.
    layout = (
        (blank, blank, 1, 0, blank, 16, 17, blank, blank),
        (3, blank, 2, blank, 18, blank, 19, blank, 20),
        (blank, 4, blank, 5, blank, 22, blank, 21, blank),
        (7, blank, 6, blank, 23, blank, 24, blank, 25),
        (blank, 8, blank, 9, blank, 27, blank, 26, blank),
        (11, blank, 10, blank, 15, blank, 28, blank, 29),
        (blank, blank, blank, 12, blank, 30, blank, blank, blank),
        (blank, blank, blank, 13, 14, 31, blank, blank, blank),
    )

    grid = np.zeros([Y, X])
    for row, sources in enumerate(layout):
        grid[row] = tuple(0 if src is blank else data[src] for src in sources)
    return grid

In [5]:
def pre_process(path, y_n):
    """Turn one subject's feature file into a stack of 2D band maps.

    Steps: load the file, optionally subtract the per-trial baseline,
    standardise every 128-value feature vector, then map the 32 values of
    each of the 4 bands onto an 8 x 9 grid with ``data_1Dto2D``.

    Fix: the branch without baseline removal reshaped with ``data.shape``
    before ``data`` existed, so every call with ``y_n != "yes"`` raised
    ``UnboundLocalError``. Both branches now share one scaling path.

    Args:
        path: Path of the per-subject ``.mat`` feature file.
        y_n: ``"yes"`` to subtract the baseline; any other value skips it.

    Returns:
        A tuple ``(data_3D, arousal_labels, valence_labels)``. ``data_3D`` is
        indexed as (trial, window, band, 8, 9); for (40, 120, 128) input that
        is (40, 120, 4, 8, 9). The labels are returned as loaded.
    """
    n_bands = 4
    sub_vector_len = 32  # features (channels) per band
    trial_data, base_data, arousal_labels, valence_labels = read_file(path)

    if y_n == "yes":
        data = get_dataset_deviation(trial_data, base_data)
    else:
        data = trial_data

    # Flatten to one feature vector per row, standardise each row (axis=1),
    # then restore the (trial, window, feature) layout.
    flat = data.reshape(-1, data.shape[-1])
    scaled = preprocessing.scale(flat, axis=1, with_mean=True, with_std=True, copy=True)
    data = scaled.reshape(data.shape)

    # Preallocate the output instead of growing it with repeated np.vstack.
    n_trials, n_windows = data.shape[0], data.shape[1]
    data_3D = np.empty((n_trials, n_windows, n_bands, 8, 9))
    for trial_index in range(n_trials):
        for window_index in range(n_windows):
            feature_vector = data[trial_index, window_index]
            for band in range(n_bands):
                band_values = feature_vector[band * sub_vector_len:(band + 1) * sub_vector_len]
                data_3D[trial_index, window_index, band] = data_1Dto2D(band_values)
    return data_3D, arousal_labels, valence_labels

DE数据40个文件，PSD数据40个文件，最终的文件夹当中应当有80个文件

In [8]:
# Regroup the per-subject DE features so the leading axis is the video.
dataset_dir = "E:/dataset/deap_dataset/preprocessing_data/de"
use_baseline = "yes"
if use_baseline == "yes":
    result_dir = "E:/dataset/deap_dataset/preprocessing_data/with_base_0.5/"
else:
    result_dir = "E:/dataset/deap_dataset/preprocessing_data/without_base_0.5/"
# One call covers both branches; exist_ok replaces the isdir check.
os.makedirs(result_dir, exist_ok=True)

# Collect per-subject arrays and join them once at the end instead of
# re-copying the growing result with np.vstack on every iteration.
# The empty seed arrays keep the result well-defined (and float64) even when
# the directory contains no feature files.
per_subject_data = [np.empty((40, 0, 4, 8, 9))]
per_subject_arousal = [np.empty((40, 0))]
per_subject_valence = [np.empty((40, 0))]
# Sorted for a deterministic subject order.
for file in sorted(os.listdir(dataset_dir)):
    # Skip entries that are not MATLAB files (stray folders, other formats).
    if not file.endswith(".mat"):
        continue
    print("processing: ", file, "......")
    file_path = os.path.join(dataset_dir, file)
    data, arousal_labels, valence_labels = pre_process(file_path, use_baseline)
    per_subject_data.append(data)
    per_subject_arousal.append(arousal_labels)
    per_subject_valence.append(valence_labels)

# Joining along axis 1 puts every subject's 120 windows side by side per
# video: final_data is (40, n_subjects * 120, 4, 8, 9) and the label arrays
# are (40, n_subjects * 120).
final_data = np.concatenate(per_subject_data, axis=1)
final_arousal_labels = np.concatenate(per_subject_arousal, axis=1)
final_valence_labels = np.concatenate(per_subject_valence, axis=1)

# NOTE(review): saving is disabled in this cell, so no DE_video files are
# written even though result_dir is prepared above. The PSD cell performs the
# equivalent save. Re-enable the loop below if the DE files are wanted.
# for video in range(0, 40):
#     print("DE_video", str(video + 1).zfill(2), "is saving ......")
#     sio.savemat(result_dir + "DE_video" + str(video + 1).zfill(2) + ".mat",
#                 {"data": final_data[video], "valence_labels": final_valence_labels[video], "arousal_labels": final_arousal_labels[video]})

processing:  DE_s01.mat ......
processing:  DE_s02.mat ......
processing:  DE_s03.mat ......
processing:  DE_s04.mat ......
processing:  DE_s05.mat ......
processing:  DE_s06.mat ......
processing:  DE_s07.mat ......
processing:  DE_s08.mat ......
processing:  DE_s09.mat ......
processing:  DE_s10.mat ......
processing:  DE_s11.mat ......
processing:  DE_s12.mat ......
processing:  DE_s13.mat ......
processing:  DE_s14.mat ......
processing:  DE_s15.mat ......
processing:  DE_s16.mat ......
processing:  DE_s17.mat ......
processing:  DE_s18.mat ......
processing:  DE_s19.mat ......
processing:  DE_s20.mat ......
processing:  DE_s21.mat ......
processing:  DE_s22.mat ......
processing:  DE_s23.mat ......
processing:  DE_s24.mat ......
processing:  DE_s25.mat ......
processing:  DE_s26.mat ......
processing:  DE_s27.mat ......
processing:  DE_s28.mat ......
processing:  DE_s29.mat ......
processing:  DE_s30.mat ......
processing:  DE_s31.mat ......
processing:  DE_s32.mat ......


KeyboardInterrupt: 

In [None]:
# Regroup the per-subject PSD features by video and save one file per video.
dataset_dir = "E:/dataset/deap_dataset/preprocessing_data/psd"
use_baseline = "yes"
if use_baseline == "yes":
    result_dir = "E:/dataset/deap_dataset/preprocessing_data/with_base_0.5/"
else:
    result_dir = "E:/dataset/deap_dataset/preprocessing_data/without_base_0.5/"
# One call covers both branches; exist_ok replaces the isdir check.
os.makedirs(result_dir, exist_ok=True)

# Collect per-subject arrays and join them once at the end instead of
# re-copying the growing result with np.vstack on every iteration.
# The empty seed arrays keep the result well-defined (and float64) even when
# the directory contains no feature files.
per_subject_data = [np.empty((40, 0, 4, 8, 9))]
per_subject_arousal = [np.empty((40, 0))]
per_subject_valence = [np.empty((40, 0))]
# Sorted for a deterministic subject order.
for file in sorted(os.listdir(dataset_dir)):
    # Skip entries that are not MATLAB files (stray folders, other formats).
    if not file.endswith(".mat"):
        continue
    print("processing: ", file, "......")
    file_path = os.path.join(dataset_dir, file)
    data, arousal_labels, valence_labels = pre_process(file_path, use_baseline)
    per_subject_data.append(data)
    per_subject_arousal.append(arousal_labels)
    per_subject_valence.append(valence_labels)

# Joining along axis 1 puts every subject's 120 windows side by side per
# video: final_data is (40, n_subjects * 120, 4, 8, 9) and the label arrays
# are (40, n_subjects * 120).
final_data = np.concatenate(per_subject_data, axis=1)
final_arousal_labels = np.concatenate(per_subject_arousal, axis=1)
final_valence_labels = np.concatenate(per_subject_valence, axis=1)

for video in range(0, 40):
    print("PSD_video", str(video + 1).zfill(2), "is saving ......")
    sio.savemat(
        os.path.join(result_dir, "PSD_video" + str(video + 1).zfill(2) + ".mat"),
        {
            "data": final_data[video],
            "valence_labels": final_valence_labels[video],
            "arousal_labels": final_arousal_labels[video],
        },
    )