In [None]:
# Data labelling and balancing

## Import necessary libraries
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle

## Define your folders
# INPUT_CSV_FOLDER is a list of folders to scan for CSV files.  The setting
# (environment variable of the same name, or the default below) is written as
# whitespace-separated paths, so it is split here; iterating over the raw
# string would yield single characters instead of folder paths.
# NOTE: because of the whitespace split, individual paths must not contain
# spaces.
INPUT_CSV_FOLDER = os.environ.get(
    "INPUT_CSV_FOLDER", "C:/data/CSV/seizure C:/data/CSV/normal"
).split()
# Default destination folder for the pickled train/test arrays.
OUTPUT_PICKLE_FOLDER = "C:/data/pickle"

# Define variables
random_seed = 42    # random_state used for the train/test split
n_time_steps = 75   # number of consecutive rows that make up one segment
step = 32           # stride (in rows) between the starts of two segments
n_classes = 2       # number of classes for the one-hot encoded labels

## Load every CSV file found in the input folders into a single DataFrame

# The folder setting may be a whitespace-separated string of paths or an
# already-split sequence; normalise it so the loop below always receives
# whole folder paths (iterating a plain string would yield characters).
if isinstance(INPUT_CSV_FOLDER, str):
    input_folders = INPUT_CSV_FOLDER.split()
else:
    input_folders = list(INPUT_CSV_FOLDER)

dfs = []

for folder in input_folders:
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the concatenated DataFrame is reproducible between runs.
    csv_files = sorted(file for file in os.listdir(folder) if file.endswith('.csv'))

    for csv_file in csv_files:
        # Load the CSV file and keep it for the final concatenation
        data = pd.read_csv(os.path.join(folder, csv_file))
        dfs.append(data)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail early with a message that names the folders that were searched.
if not dfs:
    raise FileNotFoundError(f"No CSV files found in: {input_folders}")

## Concatenate all DataFrames in the list
df = pd.concat(dfs, ignore_index=True)

## The "category" column of df is the target; every other column is a feature
X = df.drop('category', axis=1)
y = df['category']

## Apply SMOTE to balance the classes
# random_state is fixed so that the synthetic minority samples (and therefore
# everything derived from them) are reproducible between runs.
# NOTE(review): SMOTE resamples individual rows, and the windowing done later
# treats consecutive rows as a time series - confirm that windows containing
# synthetic rows are meaningful for this data.
smote = SMOTE(sampling_strategy='minority', random_state=random_seed)
X_sm, y_sm = smote.fit_resample(X, y)

## Create a DataFrame with the rebalanced data
# The label is placed first so that 'category' is column 0 and the feature
# columns follow it.
balanced_df = pd.concat([pd.DataFrame(y_sm, columns=['category']), pd.DataFrame(X_sm)], axis=1)

# Cut the balanced data into overlapping fixed-length segments.
#
# Each segment is n_time_steps consecutive rows; a new segment starts every
# `step` rows.  A segment is labelled 1 when any of its rows has category
# 'seizure', otherwise 0.

# Extract the data once instead of indexing the DataFrame row by row.
# Column 0 is 'category'; columns 1..321 are the 321 feature columns.
feature_matrix = balanced_df.iloc[:, 1:322].to_numpy()
category_values = balanced_df['category'].to_numpy()
n_rows = balanced_df.shape[0]

# Initialize lists to store segments and labels
segments = []
labels = []

# The last valid start index is n_rows - n_time_steps (inclusive), so a window
# that ends exactly on the final row is kept.  When there are fewer rows than
# n_time_steps the range is empty and no segment is produced.
for start in range(0, n_rows - n_time_steps + 1, step):
    stop = start + n_time_steps

    segments.append(feature_matrix[start:stop])

    # Label 1 if 'seizure' occurs anywhere in this segment, otherwise 0
    has_seizure = bool(np.any(category_values[start:stop] == 'seizure'))
    labels.append(1 if has_seizure else 0)

# Convert segments and labels to NumPy arrays
reshaped_segments = np.array(segments, dtype=np.float32)
labels_binary = np.array(labels, dtype=np.float32)
# Plot the per-category density of the '0_x' column of the original
# (un-balanced) data.
sns.set_palette('Set1', desat=0.80)
facetgrid = sns.FacetGrid(df, hue='category', aspect=2)
# sns.distplot is deprecated; with hist=False it only drew the KDE curve,
# which is what sns.kdeplot draws.
facetgrid.map(sns.kdeplot, '0_x')
facetgrid.add_legend()
plt.show()

# Number of classes used for the one-hot encoding below
n_classes = 2

# Hold out 20% of the segments for testing; the split is seeded for
# reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    reshaped_segments,
    labels_binary,
    test_size=0.2,
    random_state=random_seed,
)

# One-hot encode the train and test labels
y_train, y_test = (
    tf.keras.utils.to_categorical(split_labels, n_classes)
    for split_labels in (y_train, y_test)
)

# Save X_train, X_test, y_train, y_test to pickle files
# The OUTPUT_PICKLE_FOLDER environment variable overrides the destination;
# when it is unset or empty, fall back to the OUTPUT_PICKLE_FOLDER constant
# (os.environ.get alone would return None and os.makedirs(None) would raise).
pickle_folder = os.environ.get("OUTPUT_PICKLE_FOLDER") or OUTPUT_PICKLE_FOLDER
os.makedirs(pickle_folder, exist_ok=True)

# Each array is written to '<name>.pkl' inside pickle_folder.
arrays_to_save = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}

for array_name, array in arrays_to_save.items():
    with open(os.path.join(pickle_folder, f'{array_name}.pkl'), 'wb') as f:
        pickle.dump(array, f)