Loads basic information from EEG reviews and saves two outputs: an intermediate data frame with time stamps (kept for future use) and summary statistics.
 

In [2]:
import numpy as np 
import pandas as pd 
import os
import re

In [2]:
# Relative path to the project's data directory.
data_path = '../../../../../data'

# Only reviews that have been reviewed twice (the "2nd Pass" folder) are loaded.
reviews_data_path = f'{data_path}/raw/EEG/Completed Reviews/2nd Pass'


In [18]:
# Names of every entry in the 2nd-pass review directory. os.listdir applies no
# extension filter and returns the names in arbitrary order.
file_list=os.listdir(reviews_data_path)


In [37]:
def read_eeg_review(file_name):
    '''
    Returns a subject ID, an event dataframe and a comments dataframe for one review file.

    Parameters
    ----------
    file_name : str
        Name of a workbook inside ``reviews_data_path``. It must contain a run
        of four digits; the subject ID is that run plus up to five characters
        before it (e.g. ``3_17_0007``).

    Returns
    -------
    subject_id : str
        ID cut out of ``file_name``. When the name has several 4-digit runs the
        last one is used.
    event_df : pandas.DataFrame
        Columns A:C of the sheet joined (on the row index) with columns D,K,
        M,V and Y,AH, one row per event.
    comments : pandas.DataFrame
        Columns L, X, AI, AJ, AK renamed to "Seizure", "PDs", "RDAs",
        "General" and "Surprise", one row per event.

    Raises
    ------
    ValueError
        If ``file_name`` contains no run of four digits.
    '''

    # Work out the subject ID first so a badly named file fails before any workbook is opened.
    id_matches=list(re.finditer(r"\d{4}",file_name))
    if not id_matches:
        raise ValueError(f"Cannot find a 4 digit subject number in file name: {file_name!r}")
    last_match=id_matches[-1]
    # Clamp at 0: a negative start would make the slice count from the end of the name.
    id_start=max(last_match.start(0)-5,0)
    subject_id=file_name[id_start:last_match.end(0)]

    file_path=f'{reviews_data_path}/{file_name}'

    #needs openpyxl and pandas 1.2 to load .xlsx files

    #loading file into separate dataframes to deal with document structure
    # Rows with a missing value in A:C are dropped; the rows that remain define the event count.
    event_info=pd.read_excel(file_path,usecols="A:C",skiprows=1).dropna().reset_index(drop=True)
    number_of_events=len(event_info)

    # The remaining column groups keep only their last number_of_events rows
    # (assumes the event rows are the final rows of the sheet - TODO confirm).
    seizure_info=pd.read_excel(file_path,usecols="D,K",skiprows=2).iloc[-number_of_events:].reset_index(drop=True)
    per_dis_info=pd.read_excel(file_path,usecols="M,V",skiprows=2).iloc[-number_of_events:].reset_index(drop=True)
    rda_info=pd.read_excel(file_path,usecols="Y,AH",skiprows=2).iloc[-number_of_events:].reset_index(drop=True)

    comments=pd.read_excel(file_path,usecols="L,X,AI,AJ,AK",skiprows=2,names=["Seizure","PDs","RDAs","General","Surprise"]).iloc[-number_of_events:].reset_index(drop=True)

    # All pieces were re-indexed from 0, so the join lines them up row by row.
    event_df=event_info.join([seizure_info,per_dis_info,rda_info])

    return subject_id,event_df,comments


In [60]:
def save_eeg_review(subject_id,event_df,comments):
    ''' Writes a subject's events and review comments to the intermediate EEG folder as two CSV files. '''
    # (dataframe, destination) pairs, written in this order: events first, then comments
    outputs = [
        (event_df, f'{data_path}/intermediate/EEG/{subject_id}.csv'),
        (comments, f'{data_path}/intermediate/EEG/{subject_id}_comments.csv'),
    ]
    for frame, destination in outputs:
        frame.to_csv(destination)
    

In [53]:
def flag_files_events(subject_id,event_df,comments):
    '''
    Flags a file if there are comments. Currently not used below.

    Prints "<subject_id>," when at least one cell of ``comments`` is not null;
    prints nothing when every cell is null (or the frame is empty).
    ``event_df`` is accepted but not used. Returns None.
    '''
    # Logical `not` rather than bitwise `~`: `~` only negates NumPy booleans,
    # on a plain Python bool it yields -1/-2, which are both truthy.
    has_comment = not comments.isnull().values.all()
    if has_comment:
        print(f"{subject_id},")


In [26]:
def make_save_eeg_features(subject_id,event_df):
    '''
    Builds the first round of EEG based features for one subject and saves them.

    Every column of event_df is summed into a single "Total" column (one row
    per original column); the "Event Identified On" row is removed if present.
    The intended features are the total number and total duration of seizures,
    PDs, and RDAs. The result is written to the processed EEG folder as
    <subject_id>.csv.
    '''

    column_sums = event_df.sum()
    totals = pd.DataFrame(column_sums, columns=['Total'])
    # errors="ignore" turns the drop into a no-op when that row does not exist
    totals = totals.drop(index="Event Identified On", errors="ignore")
    totals.to_csv(f'{data_path}/processed/EEG/{subject_id}.csv')


In [62]:
# Read every listed review file and save its events and comments as intermediate CSVs.
for file_name in file_list:
    subject_id,event_df,comments=read_eeg_review(file_name)
    save_eeg_review(subject_id,event_df,comments)
    # NOTE(review): feature generation is switched off here, but the cell below reads
    # data/processed/EEG/<Subject>.csv, the path make_save_eeg_features writes to -
    # presumably those files already exist from an earlier run; confirm before re-running.
    #make_save_eeg_features(subject_id,event_df)

In [29]:
# Build one feature table: each row of the label file plus that subject's event totals.
label_df = pd.read_csv('../../../../../data/raw/EEG/EEG_labels.csv')

event_types = ["Seizure", "PDs", "RDAs"]
# Duration rows that are deliberately left out for now:
#   "Duration of Seizure", "Total Duration (waxing waning PDs)", "Total Duration (waxing waning RDAs)"
#3_17_0007 and 3_17_0012 don't have the full set of duration measurements, so I'm ignoring duration measurements for the initial classifier

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
# NOTE(review): building the frame in one go lets pandas infer integer dtypes, so
# values that append() used to upcast to float may now be written as ints - confirm
# downstream readers do not depend on the old formatting.
feature_rows = []

for _, row in label_df.iterrows():

    # Per-subject totals file; its first (unnamed) column holds the event names.
    feature_df = pd.read_csv(f'../../../../../data/processed/EEG/{row["Subject"]}.csv')
    row_dict = row.to_dict()
    for event in event_types:
        # First "Total" value whose row name matches the event; IndexError if the event is missing.
        row_dict[event] = int(feature_df["Total"].loc[feature_df["Unnamed: 0"] == event].iat[0])
    feature_rows.append(row_dict)

eeg_features = pd.DataFrame(feature_rows)

eeg_features.to_csv("../../../../../data/processed/EEG/EEG_features_v0.csv")