In [1]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import os
import logging

In [2]:
# Объявление функций оперирующих с df
def add_valence_word(x):
    if x == 0:
        return 'neutral'
    elif x == 1:
        return 'negative'
    elif x == 2:
        return 'positive'
    else:
        return None

def add_veracity_word(x):
    if x == 1:
        return 'true'
    elif x == 2:
        return 'false'
    else:
        return None
    
def add_iat_results2(x):
    if x>=0:
        return 'positive'
    elif x < 0:
        return 'negative'
    else:
        return None
    
def add_exp2(x):
    if x >= 2.5:
        return 'positive'
    elif x < 2.5:
        return 'negative'
    else:
        return None
    
def add_CB(row, column, match_values=('negative', 'positive'), mode=2):
    valence = row['Valence']
    result = row[column]
    
    if valence == 0:
        return 'neutral'
    
    match_map = {
        (1, match_values[0]): 'match_negative' if mode == 4 else 'match',
        (2, match_values[1]): 'match_positive' if mode == 4 else 'match',
        (1, match_values[1]): 'mismatch_positive' if mode == 4 else 'mismatch',
        (2, match_values[0]): 'mismatch_negative' if mode == 4 else 'mismatch'
    }
    
    return match_map.get((valence, result), None)


def get_lists(data_path, participant, folder1, folder2):
    # Формируем путь к директории Vac_3
    vac_3_path = os.path.join(data_path, participant, 'Vac_3')
    
    # Находим единственную папку внутри Vac_3
    subdirectories = [d for d in os.listdir(vac_3_path) if os.path.isdir(os.path.join(vac_3_path, d))]
    if len(subdirectories) != 1:
        raise ValueError(f"Ожидается одна папка в {vac_3_path}, найдено {len(subdirectories)}")
    
    # Получаем имя папки X
    X = subdirectories[0]
    
    # Формируем путь к папке с FixationList
    fixation_list_path = os.path.join(vac_3_path, X, 'Main', folder1, folder2)
    
    # Находим все CSV файлы в FixationList
    csv_files = [os.path.join(fixation_list_path, f) for f in os.listdir(fixation_list_path) if f.endswith('.csv')]
    
    # Читаем все CSV файлы в список DataFrame
    df_list = [pd.read_csv(file) for file in csv_files]

    return df_list

def Vac2Dataset(df, fixation_lists, saccade_lists):
    df = df.copy()
    # Создание списка для хранения строк
    rows = []
    if df.shape[0] != 111:
        return None
    for idx in tqdm(range(df.shape[0]), total=df.shape[0]):
        # Получаем текущие данные Fixation и Saccade
        fixation_df = fixation_lists[idx]
        saccade_df = saccade_lists[idx]

        # Получаем текущую строку Base_Report
        base_row = df.iloc[[idx]].copy()  # Копируем строку как DataFrame

        # Определяем количество строк в Fixation и Saccade для данного индекса
        fixation_rows = fixation_df if not fixation_df.empty else pd.DataFrame(columns=fixation_df.columns)
        saccade_rows = saccade_df if not saccade_df.empty else pd.DataFrame(columns=saccade_df.columns)

        # Максимум строк для этого индекса
        max_rows_local = max(len(fixation_rows), len(saccade_rows))

        # Если ни Fixation, ни Saccade не имеют строк, просто добавляем строку Base_Report
        if max_rows_local == 0:
            rows.append(base_row)

        for i in range(max_rows_local):
            # Создаем копию строки Base_Report для каждой итерации
            new_row = base_row.copy()

            # Добавляем данные из Fixation, если они есть
            if i < len(fixation_rows):
                for col in fixation_df.columns:
                    new_row[f"f_{col}"] = fixation_rows.iloc[i][col]
            else:
                for col in fixation_df.columns:
                    new_row[f"f_{col}"] = None

            # Добавляем данные из Saccade, если они есть
            if i < len(saccade_rows):
                for col in saccade_df.columns:
                    new_row[f"s_{col}"] = saccade_rows.iloc[i][col]
            else:
                for col in saccade_df.columns:
                    new_row[f"s_{col}"] = None

            # Добавляем строку в список
            rows.append(new_row)
    [row for row in rows if row.isnull().all(axis=None) or row.empty]
    # Конкатенируем все строки в один DataFrame
    final_df = pd.concat(rows, ignore_index=True)
    return final_df

def find_header_row(file, expected_columns):
    data = pd.read_csv(file, header=None)  # Читаем файл без заголовков
    for i, row in enumerate(data.values):
        # Проверяем, что все ожидаемые столбцы присутствуют в этой строке
        if all(col in row for col in expected_columns):
            return i
    return None

In [3]:
def get_path_baseReport(data_path, participant):
    # Формируем путь к директории Vac_3
    vac_3_path = os.path.join(data_path, participant, 'Vac_3')
    # Находим единственную папку внутри Vac_3
    subdirectories = [d for d in os.listdir(vac_3_path) if os.path.isdir(os.path.join(vac_3_path, d))]
    if len(subdirectories) != 1:
        raise ValueError(f"Ожидается одна папка в {vac_3_path}, найдено {len(subdirectories)}")
    # Получаем имя папки X
    X = subdirectories[0]
    return os.path.join(data_path, participant, 'Vac_3', X, 'BaseReport.csv')

logging.basicConfig(filename='log.txt', level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s', encoding='utf-8')

data_path = 'EEG_CB_IAT'
pivot_df = pd.read_excel('CB_EEG_RSF.xlsx')

columns_order = ['Row_num', 'Valence', 'Veracity', 'Performance Rate1', 'Main_ Answer', 
                'Valence_word', 'Veracity_word', 'IAT_results', 'IAT_streght','EXP_consp', 'EXP_covid', 
                'IAT_results2', 'EXP_general2', 'EXP_consp2', 'EXP_covid2', 
                'CB_IAT2', 'CB_IAT4', 'CB_EXP_general2', 'CB_EXP_general4', 'CB_EXP_covid2', 'CB_EXP_covid4', 'CB_EXP_consp2', 'CB_EXP_consp4', 
                'f_Index', 'f_Center X', 'f_Center Y', 'f_Start Time', 'f_Duration', 'f_End Time', 
                's_Index', 's_Start Time', 's_Start X', 's_Start Y', 's_Duration', 's_Amplitude', 's_Distance',
                's_Peak Velocity', 's_Average Velocity', 's_End Time', 's_End X', 's_End Y']

expected_columns = ['Row_num', 'Valence', 'Veracity', 'Performance Rate1', 'Main_ Answer']

In [4]:
temp_dfs_list = []
name_list = []
temp_avg_dfs_list = []
for participant in os.listdir(data_path):
    #if participant in ['EEGCB12002',]:
    #    continue
    baseReport_path = get_path_baseReport(data_path, participant)
    skip_i = 8#find_header_row(baseReport_path, expected_columns)
    df = pd.read_csv(baseReport_path, skiprows=skip_i)
    df['Valence_word'] = df['Valence'].apply(add_valence_word)
    df['Veracity_word'] = df['Veracity'].apply(add_veracity_word)
    df['Main_ Answer'] = df['Main_ Answer'].replace(5, pd.NA)
    participant_data = pivot_df[pivot_df['ID'] == participant]
    if not participant_data.empty:
        df['IAT_results'] = participant_data['IAT_results'].values[0]
        df['IAT_streght'] = participant_data['IAT_streght'].values[0]
        df['EXP_general'] = participant_data['EXP1 (general)'].values[0]
        df['EXP_consp'] = participant_data['EXP2 (consp)'].values[0]
        df['EXP_covid'] = participant_data['EXP3 (covid)'].values[0]

    # Добавление переменных дублёров
    df['IAT_results2'] = df['IAT_results'].apply(add_iat_results2)
    df['EXP_general2'] = df['EXP_general'].apply(add_exp2)
    df['EXP_consp2'] = df['EXP_consp'].apply(add_exp2)
    df['EXP_covid2'] = df['EXP_covid'].apply(add_exp2)

    # Добавление CB переменных по формулам
    df['CB_IAT2'] = df.apply(lambda row: add_CB(row, 'IAT_results2', mode=2), axis=1)
    df['CB_IAT4'] = df.apply(lambda row: add_CB(row, 'IAT_results2', mode=4), axis=1)
    df['CB_EXP_general2'] = df.apply(lambda row: add_CB(row, 'EXP_general2', mode=2), axis=1)
    df['CB_EXP_general4'] = df.apply(lambda row: add_CB(row, 'EXP_general2', mode=4), axis=1)
    df['CB_EXP_covid2'] = df.apply(lambda row: add_CB(row, 'EXP_covid2', mode=2), axis=1)
    df['CB_EXP_covid4'] = df.apply(lambda row: add_CB(row, 'EXP_covid2', mode=4), axis=1)
    df['CB_EXP_consp2'] = df.apply(lambda row: add_CB(row, 'EXP_consp2', mode=2), axis=1)
    df['CB_EXP_consp4'] = df.apply(lambda row: add_CB(row, 'EXP_consp2', mode=4), axis=1)

    # Get fication lists and saccade lists
    fix_list = get_lists(data_path, participant, 'FD_Screen', 'FixationList')
    sd_list = get_lists(data_path, participant, 'SD_Screen', 'SaccadeList')

    final_data = Vac2Dataset(df, fix_list, sd_list)
    if final_data is None:
        print(f'{participant} data ERROR')
    final_data = final_data[columns_order]
    
    final_data.reset_index(drop=True, inplace=True)

    agg_funcs = {
        'f_Duration': ['count', 'first', 'mean', 'sum'],
        's_Duration': ['count', 'mean', 'sum'],
        's_Amplitude': ['mean', 'sum'],
        's_Distance': ['mean', 'sum'],
        's_Peak Velocity': ['mean', 'sum']
    }
    # Агрегация по метрикам
    metrics = final_data.groupby('Row_num').agg(agg_funcs)
    # Переименование столбцов для удобства
    metrics.columns = [
        'fixation_count', 'first_FD', 'mean_FD', 'total_FD',
        'saccades_count', 'mean_SD', 'total_SD',
        'mean_SA', 'total_SA',
        'mean_SDist', 'total_SDist',
        'mean_PV', 'total_PV'
    ]
    # Сброс индекса, чтобы сохранить trial как столбец
    metrics = metrics.reset_index()
    metrics = pd.merge(df, metrics, on='Row_num', how='left')

    print(f'{participant} is done!')
    temp_dfs_list.append(final_data)
    temp_avg_dfs_list.append(metrics)
    name_list.append(participant)

100%|██████████| 111/111 [00:11<00:00,  9.77it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11005 is done!


100%|██████████| 111/111 [00:09<00:00, 11.43it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11006 is done!


100%|██████████| 111/111 [00:11<00:00,  9.70it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11008 is done!


100%|██████████| 111/111 [00:08<00:00, 12.95it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11010 is done!


100%|██████████| 111/111 [00:10<00:00, 10.14it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11013 is done!


100%|██████████| 111/111 [00:11<00:00,  9.80it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11014 is done!


100%|██████████| 111/111 [00:10<00:00, 11.06it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11016 is done!


100%|██████████| 111/111 [00:24<00:00,  4.52it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11020 is done!


100%|██████████| 111/111 [00:12<00:00,  9.11it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11021 is done!


100%|██████████| 111/111 [00:10<00:00, 10.64it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11025 is done!


100%|██████████| 111/111 [00:16<00:00,  6.83it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB11029 is done!


100%|██████████| 111/111 [00:16<00:00,  6.93it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12002 is done!


100%|██████████| 111/111 [00:11<00:00,  9.32it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12003 is done!


100%|██████████| 111/111 [00:10<00:00, 10.92it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12007 is done!


100%|██████████| 111/111 [00:11<00:00, 10.02it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12009 is done!


100%|██████████| 111/111 [00:14<00:00,  7.56it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12011 is done!


100%|██████████| 111/111 [00:07<00:00, 14.36it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12012 is done!


100%|██████████| 111/111 [00:09<00:00, 12.06it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12015 is done!


100%|██████████| 111/111 [00:12<00:00,  9.21it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12017 is done!


100%|██████████| 111/111 [00:06<00:00, 18.49it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12018 is done!


100%|██████████| 111/111 [00:15<00:00,  7.26it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12019 is done!


100%|██████████| 111/111 [00:09<00:00, 11.43it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12023 is done!


100%|██████████| 111/111 [00:08<00:00, 12.64it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12024 is done!


100%|██████████| 111/111 [00:09<00:00, 12.04it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12026 is done!


100%|██████████| 111/111 [00:15<00:00,  7.38it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12027 is done!


100%|██████████| 111/111 [00:08<00:00, 13.41it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12028 is done!


100%|██████████| 111/111 [00:10<00:00, 11.01it/s]
  final_df = pd.concat(rows, ignore_index=True)


EEGCB12030 is done!


In [5]:
def create_dataset(temp_dfs_list, name_list):
    for i, df in enumerate(temp_dfs_list):
        df['Participant'] = os.path.splitext(name_list[i])[0]
        cols = df.columns.tolist()
        cols = ['Participant'] + [col for col in cols if col != 'Participant']
        temp_dfs_list[i] = df[cols]

        
        temp_avg_dfs_list[i]['Participant'] = os.path.splitext(name_list[i])[0]
        cols = temp_avg_dfs_list[i].columns.tolist()
        cols = ['Participant'] + [col for col in cols if col != 'Participant']
        temp_avg_dfs_list[i] = temp_avg_dfs_list[i][cols]

    # Объединяем все таблицы в одну
    combined_df = pd.concat(temp_dfs_list, ignore_index=True)
    combined_metrics = pd.concat(temp_avg_dfs_list, ignore_index=True)
    return combined_df, combined_metrics
print(f'Список предобработанных участников для создания датасета: {name_list}')
dataset, dataset_metrics = create_dataset(temp_dfs_list, name_list)

Список предобработанных участников для создания датасета: ['EEGCB11005', 'EEGCB11006', 'EEGCB11008', 'EEGCB11010', 'EEGCB11013', 'EEGCB11014', 'EEGCB11016', 'EEGCB11020', 'EEGCB11021', 'EEGCB11025', 'EEGCB11029', 'EEGCB12002', 'EEGCB12003', 'EEGCB12007', 'EEGCB12009', 'EEGCB12011', 'EEGCB12012', 'EEGCB12015', 'EEGCB12017', 'EEGCB12018', 'EEGCB12019', 'EEGCB12023', 'EEGCB12024', 'EEGCB12026', 'EEGCB12027', 'EEGCB12028', 'EEGCB12030']


In [6]:
dataset.to_excel('EYE_RAW_MAIN.xlsx', index=False)
dataset_metrics.to_excel('EYE_MAIN.xlsx', index=False)