# Setup

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import time

sys.path.append(os.getcwd())

import python_utils

START_TIME = time.time()
WINDOW_SIZE = 6616 # From 01_data_reformatting.ipynb
TEST_WINDOW_OVERLAP = 0.5
TRAIN_PROPORTION = 0.66

np.random.seed(42)

# Create Train-Test Split

In [3]:
datafiles = [x[:-4] for x in os.listdir(os.path.join("data", "preprocessed_data", "segments_x"))]
train_number_of_files = round(TRAIN_PROPORTION * len(datafiles))
train_files = np.random.choice(datafiles, train_number_of_files).tolist()
test_files = [x for x in datafiles if x not in train_files]

print("Train Files:", train_files)
print("Test Files:", test_files)

Train Files: ['PXL_20250307_005644623.TS_1', 'PXL_20250222_013140200.TS_1', 'PXL_20250320_200642835.TS_1', 'PXL_20250307_005733826.TS_1', 'PXL_20250223_164321190.TS_1', 'PXL_20250320_200642835.TS_1', 'PXL_20250223_164250127.TS_1', 'PXL_20250307_005644623.TS_1', 'PXL_20250320_200743743.TS_1']
Test Files: ['PXL_20250312_215759199.TS_1', 'PXL_20250320_200905850.TS_2', 'PXL_20250222_012947893.TS_1', 'PXL_20250223_164224211.TS_1', 'PXL_20250320_200905850.TS_1', 'PXL_20250222_012947893.TS_2']


# Load and Process Data

In [4]:
def combine_datafiles(files_list: list, data_categories: list = ["full_array_x", "full_array_y", "segments_x", "summary_y"]) -> dict:
    data = {data_category: pd.DataFrame() for data_category in data_categories}
    for _file in files_list:
        print(f"Reading {_file} files...")
        for data_category in data:
            data[data_category] = pd.concat([
                data[data_category],
                pd.read_csv(
                    os.path.join("data", "preprocessed_data", data_category, _file + ".csv"),
                    header=None,
                )
            ])
        print("Done")
    return data

In [6]:
print("Loading train data...")
data_train = combine_datafiles(files_list=train_files, data_categories=["segments_x", "summary_y"])

print("Loading test data...")
data_test = dict()
for data_category in ["full_array_x", "full_array_y"]:
    data_test[data_category] = []
    # Load test data (individual files)
    for test_file in test_files:
        print(f"Reading {test_file} files...")
        temp = np.squeeze(pd.read_csv(
            os.path.join("data", "preprocessed_data", data_category, test_file + ".csv"),
            header=None,
        ).values)
        # Segment them with test overlap (50%)
        temp2 = python_utils.segment_array(
            array_to_segment=temp,
            window_size=WINDOW_SIZE,
            window_overlap=TEST_WINDOW_OVERLAP,
        )
        for i in range(temp2.shape[0]):
            data_test[data_category].append(temp2[i])
        print("Done")
    data_test[data_category] = pd.DataFrame(data_test[data_category])
    if data_category == "full_array_y":
        data_test[data_category] = data_test[data_category].max(axis=1).to_frame()

Loading train data...
Reading PXL_20250307_005644623.TS_1 files...


Done
Reading PXL_20250222_013140200.TS_1 files...
Done
Reading PXL_20250320_200642835.TS_1 files...
Done
Reading PXL_20250307_005733826.TS_1 files...
Done
Reading PXL_20250223_164321190.TS_1 files...
Done
Reading PXL_20250320_200642835.TS_1 files...
Done
Reading PXL_20250223_164250127.TS_1 files...
Done
Reading PXL_20250307_005644623.TS_1 files...
Done
Reading PXL_20250320_200743743.TS_1 files...
Done
Loading test data...
Reading PXL_20250312_215759199.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_2 files...
Done
Reading PXL_20250222_012947893.TS_1 files...
Done
Reading PXL_20250223_164224211.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_1 files...
Done
Reading PXL_20250222_012947893.TS_2 files...
Done
Reading PXL_20250312_215759199.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_2 files...
Done
Reading PXL_20250222_012947893.TS_1 files...
Done
Reading PXL_20250223_164224211.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_1 files...
Done
Reading PXL_20250222_012

In [7]:
data_train["segments_x"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6606,6607,6608,6609,6610,6611,6612,6613,6614,6615
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-5.6e-05,-0.001261,-0.001319,-0.000914,-0.001018,-0.000767,-0.000648,-0.000161,-0.0011,-0.000636
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.000146,0.000218,-2.2e-05,0.000435,-0.000282,0.000226,-0.000678,-0.001838,-0.00132,-0.000941
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000751,-0.000401,-0.000624,-0.001431,-0.001754,-0.001696,-0.002016,-0.001749,-0.001306,-0.001332
3,2e-05,2.1e-05,1.7e-05,1.5e-05,2.4e-05,1.1e-05,2.8e-05,1.6e-05,3e-06,4e-06,...,-0.001338,-0.002432,-0.002045,-0.002329,-0.001347,-0.00055,0.000458,0.001015,0.001188,0.000824
4,-0.001755,-0.000824,-0.00109,-0.000684,-0.00115,-0.000924,-0.001274,-0.001736,-0.001363,-0.0016,...,-0.000362,-0.000163,-0.000575,-0.0009,-0.001017,-0.00095,-0.000902,-7.9e-05,-6.1e-05,-0.00025


In [8]:
data_train["summary_y"].head()

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [9]:
data_test["full_array_x"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6606,6607,6608,6609,6610,6611,6612,6613,6614,6615
0,-7.081498e-12,3.023198e-12,-4.807648e-13,5.150886e-12,-9.863128e-12,-1.635877e-11,-1.718166e-11,-1.611888e-11,2.926109e-13,-8.349654e-12,...,-0.000801,-0.000809,-0.000719,-0.000568,-0.000373,-0.000154,-0.00031,-0.00073,-0.00089,-0.001302
1,-0.0001180445,-9.090206e-05,-4.473932e-05,-8.19936e-06,4.844622e-05,6.966181e-05,8.276513e-05,0.0001221657,9.334985e-05,6.694806e-05,...,0.00018,0.000117,9.3e-05,7.6e-05,0.000122,8.1e-05,9.9e-05,2e-05,3.7e-05,1.3e-05
2,-0.001290956,-0.001098792,-0.0007605386,-0.0006580441,-0.0006196516,-0.0003201471,-0.0001974843,9.233471e-05,4.130124e-05,0.0002850668,...,-0.000516,-0.000366,-0.000552,5.6e-05,-0.000458,0.000124,0.000334,-0.000393,0.000316,-0.000138
3,0.0001361324,3.857104e-05,9.772467e-05,4.131103e-05,9.039197e-05,0.0001373085,7.410397e-05,0.0001828057,9.585278e-05,0.0002145263,...,0.000226,0.000178,8.1e-05,0.000181,0.000263,0.000245,0.000267,0.000212,4.3e-05,-1.3e-05
4,-1.379376e-06,0.001027835,0.0007862512,0.001164297,0.0009587529,0.0008629552,0.000614072,-0.0002450478,-0.0003455326,-0.0003960984,...,0.000123,-3.1e-05,-4.4e-05,0.000187,0.000218,0.000273,6.9e-05,-6e-06,0.000175,0.000268


In [11]:
data_test["full_array_y"].head()

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


# Save Train-Test Split Data

In [12]:
data_train["segments_x"].to_csv(
    os.path.join("data", "train_test_split", "train_x.csv"),
    header=False,
    index=False,
)
data_train["summary_y"].to_csv(
    os.path.join("data", "train_test_split", "train_y.csv"),
    header=False,
    index=False,
)
data_test["full_array_x"].to_csv(
    os.path.join("data", "train_test_split", "test_x.csv"),
    header=False,
    index=False,
)
data_test["full_array_y"].to_csv(
    os.path.join("data", "train_test_split", "test_y.csv"),
    header=False,
    index=False,
)

print(time.time() - START_TIME)

149.50476241111755
