# Setup

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import time

sys.path.append(os.getcwd())

import python_utils

START_TIME = time.time()
WINDOW_SIZE = 6616 # From 01_data_reformatting.ipynb
TEST_WINDOW_OVERLAP = 0.5
TRAIN_PROPORTION = 0.66

np.random.seed(42)

# Create Train-Test Split

In [3]:
datafiles = [x[:-4] for x in os.listdir(os.path.join("data", "preprocessed_data", "segments_x"))]
train_number_of_files = round(TRAIN_PROPORTION * len(datafiles))
train_files = np.random.choice(datafiles, train_number_of_files).tolist()
test_files = [x for x in datafiles if x not in train_files]

print("Train Files:", train_files)
print("Test Files:", test_files)

Train Files: ['PXL_20250320_200642835.TS_1', 'PXL_20250222_013140200.TS_1', 'PXL_20250320_200905850.TS_1', 'PXL_20250307_005644623.TS_1', 'PXL_20250312_215759199.TS_1', 'PXL_20250320_200905850.TS_1', 'PXL_20250223_164224211.TS_1', 'PXL_20250320_200642835.TS_1', 'PXL_20250222_012947893.TS_2']
Test Files: ['PXL_20250320_200905850.TS_2', 'PXL_20250320_200743743.TS_1', 'PXL_20250223_164321190.TS_1', 'PXL_20250222_012947893.TS_1', 'PXL_20250223_164250127.TS_1', 'PXL_20250307_005733826.TS_1']


# Load and Process Data

In [4]:
def combine_datafiles(files_list: list, data_categories: list = ["full_array_x", "full_array_y", "segments_x", "summary_y"]) -> dict:
    data = {data_category: pd.DataFrame() for data_category in data_categories}
    for _file in files_list:
        print(f"Reading {_file} files...")
        for data_category in data:
            data[data_category] = pd.concat([
                data[data_category],
                pd.read_csv(
                    os.path.join("data", "preprocessed_data", data_category, _file + ".csv"),
                    header=None,
                )
            ])
        print("Done")
    return data

In [5]:
print("Loading train data...")
data_train = combine_datafiles(files_list=train_files, data_categories=["segments_x", "summary_y"])

print("Loading test data...")
data_test = dict()
for data_category in ["full_array_x", "full_array_y"]:
    data_test[data_category] = []
    # Load test data (individual files)
    for test_file in test_files:
        print(f"Reading {test_file} files...")
        temp = np.squeeze(pd.read_csv(
            os.path.join("data", "preprocessed_data", data_category, test_file + ".csv"),
            header=None,
        ).values)
        # Segment them with test overlap (50%)
        temp2 = python_utils.segment_array(
            array_to_segment=temp,
            window_size=WINDOW_SIZE,
            window_overlap=TEST_WINDOW_OVERLAP,
        )
        for i in range(temp2.shape[0]):
            data_test[data_category].append(temp2[i])
        print("Done")
    data_test[data_category] = pd.DataFrame(data_test[data_category])
    if data_category == "full_array_y":
        data_test[data_category] = data_test[data_category].max(axis=1).to_frame()

Loading train data...
Reading PXL_20250320_200642835.TS_1 files...
Done
Reading PXL_20250222_013140200.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_1 files...
Done
Reading PXL_20250307_005644623.TS_1 files...
Done
Reading PXL_20250312_215759199.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_1 files...
Done
Reading PXL_20250223_164224211.TS_1 files...
Done
Reading PXL_20250320_200642835.TS_1 files...
Done
Reading PXL_20250222_012947893.TS_2 files...
Done
Loading test data...
Reading PXL_20250320_200905850.TS_2 files...
Done
Reading PXL_20250320_200743743.TS_1 files...
Done
Reading PXL_20250223_164321190.TS_1 files...
Done
Reading PXL_20250222_012947893.TS_1 files...
Done
Reading PXL_20250223_164250127.TS_1 files...
Done
Reading PXL_20250307_005733826.TS_1 files...
Done
Reading PXL_20250320_200905850.TS_2 files...
Done
Reading PXL_20250320_200743743.TS_1 files...
Done
Reading PXL_20250223_164321190.TS_1 files...
Done
Reading PXL_20250222_012947893.TS_1 files...
Done
Reading

In [6]:
data_train["segments_x"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6606,6607,6608,6609,6610,6611,6612,6613,6614,6615
0,2.390049e-11,1.094514e-11,2.578337e-11,1.06885e-11,1.664369e-11,3.649222e-12,-9.793982e-11,3.747538e-11,-3.783489e-11,-2.102719e-13,...,0.000319,0.000309,0.000219,0.000182,5.5e-05,-0.000176,-0.000473,-0.000654,-0.000779,-0.000718
1,-0.001358019,-0.001485531,-0.001534951,-0.001438814,-0.001297557,-0.001221835,-0.001170019,-0.001193255,-0.001112021,-0.001107959,...,0.000321,0.000714,0.000962,0.000677,-0.000585,0.000789,0.000256,6.5e-05,-0.000398,-0.000559
2,3.444898e-05,-6.758863e-05,-0.0002535971,0.0001123806,-0.0005720851,-0.0004451933,-0.0005045872,-0.0004424039,-0.000986521,-0.0004427924,...,0.000735,0.000815,0.000661,0.000497,0.000395,0.000164,7.1e-05,0.000187,0.000259,5e-05
3,0.000808653,0.0005033203,0.000250774,0.0001297907,5.712712e-05,1.192964e-05,-2.909437e-05,-5.321015e-05,-8.823019e-05,1.482633e-05,...,-0.00091,-0.001065,-0.001275,-0.001498,-0.001632,-0.001715,-0.001791,-0.001817,-0.001896,-0.001898
4,-0.0006016486,-0.0005067146,-0.0003587177,-0.0004051589,-0.0003088365,-0.0002002909,-0.0001407144,-0.000133518,-0.0001934257,-0.0001032648,...,-0.000679,-0.000757,-0.000808,-0.00083,-0.000828,-0.00067,-0.000444,-0.000301,-0.000167,-6.7e-05


In [7]:
data_train["summary_y"].head()

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [8]:
data_test["full_array_x"].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6606,6607,6608,6609,6610,6611,6612,6613,6614,6615
0,-0.000692,-0.000786,-0.00035,-0.000245,-7e-06,-3.4e-05,-0.000226,-0.00015,-7.7e-05,1e-06,...,0.000375,-0.000111,-0.000622,-0.001109,-0.001437,-0.001646,-0.001724,-0.001669,-0.001627,-0.001567
1,0.001357,0.001216,0.001132,0.001078,0.001023,0.000994,0.000872,0.000734,0.000578,0.000413,...,0.003386,0.003645,0.00388,0.004166,0.004499,0.004788,0.004988,0.005147,0.00515,0.005105
2,-0.001551,-0.001497,-0.001317,-0.001037,-0.00077,-0.000608,-0.000514,-0.000511,-0.000532,-0.000597,...,-0.001881,-0.002028,-0.002342,-0.002334,-0.002555,-0.003181,-0.003317,-0.003347,-0.003597,-0.003646
3,0.005151,0.004975,0.004552,0.004036,0.003581,0.003179,0.002771,0.002463,0.002225,0.002,...,0.000872,0.000293,-0.000159,0.00108,0.001646,0.001307,0.00068,0.000271,0.000937,0.000513
4,-0.003485,-0.003434,-0.003068,-0.002507,-0.001982,-0.00148,-0.001233,-0.001251,-0.001468,-0.001486,...,-0.000924,-0.000891,-0.000737,-0.000454,-0.000371,-0.000263,0.00012,0.000596,0.000946,0.001263


In [9]:
data_test["full_array_y"].head()

Unnamed: 0,0
0,0.0
1,0.0
2,1.0
3,1.0
4,0.0


# Save Train-Test Split Data

In [10]:
data_train["segments_x"].to_csv(
    os.path.join("data", "train_test_split", "train_x.csv"),
    header=False,
    index=False,
)
data_train["summary_y"].to_csv(
    os.path.join("data", "train_test_split", "train_y.csv"),
    header=False,
    index=False,
)
data_test["full_array_x"].to_csv(
    os.path.join("data", "train_test_split", "test_x.csv"),
    header=False,
    index=False,
)
data_test["full_array_y"].to_csv(
    os.path.join("data", "train_test_split", "test_y.csv"),
    header=False,
    index=False,
)

print(time.time() - START_TIME)

36.43812704086304
