# Preliminaries

> Get the data ready for the challenge (not for participants)

In [1]:
import glob
import os
import pandas as pd
from fastcore.basics import Path, AttrDict
from sklearn.model_selection import train_test_split
import utils
import shutil
from fastcore.all import *
from utils import *

In [2]:
config = AttrDict(
    nodes_dir = Path('../labels').expanduser(),
    data_dirs = Path('../dataset').expanduser(),
    test_ratio = 0.3, # random test set ratio with respect to the whole dataset
    output_dir = Path('../output'),
    test_ids = None
)


In [3]:
# List all the files in the directories (there are more than one data dir)
nodes_files = [os.path.join(config.nodes_dir, f) for f in os.listdir(config.nodes_dir) if f.endswith(".csv")]
data_filepaths = [os.path.join(config.data_dirs, f) for f in os.listdir(config.data_dirs) if f.endswith(".csv")]

# data_filepaths = []
# for data_dir in config.data_dirs:
#     data_filepaths += [os.path.join(data_dir, f) for f in os.listdir(data_dir)]

# Print the number of files and some file names for verification (verbose printing)
print("Number of files in nodes directory: ", len(nodes_files))
print("Number of files in data directory: ", len(data_filepaths))
print("Some files in nodes directory: ", nodes_files[:5])
print("Some files in data directory: ", data_filepaths[:5])

Number of files in nodes directory:  5
Number of files in data directory:  5
Some files in nodes directory:  ['../labels/001.csv', '../labels/003.csv', '../labels/002.csv', '../labels/005.csv', '../labels/004.csv']
Some files in data directory:  ['../dataset/001.csv', '../dataset/003.csv', '../dataset/002.csv', '../dataset/005.csv', '../dataset/004.csv']


In [4]:
# Read all the ground truth files and concatenate them into a single DataFrame
# Also add a new column 'ObjectID' which is derived from the filename
ground_truth_data = []
for file_path in nodes_files:
    df = pd.read_csv(file_path)
    oid_s = os.path.basename(file_path).split('.')[0]  # Extract ObjectID from filename
    df['ObjectID'] = int(oid_s)
    ground_truth_data.append(df)

ground_truth_data = pd.concat(ground_truth_data)
ground_truth_data = ground_truth_data[['ObjectID'] + 
                                      list(ground_truth_data.columns[:-1])]
ground_truth_data.head()

Unnamed: 0,ObjectID,Time Index,Direction,Node,Type
0,1,0,EW,SS,HK
1,1,0,NS,SS,NK
2,1,568,NS,IK,HK
3,1,2172,ES,ES,ES
0,3,0,EW,SS,EK


In [5]:
# Usage
ground_truth_data     

Unnamed: 0,ObjectID,Time Index,Direction,Node,Type
0,1,0,EW,SS,HK
1,1,0,NS,SS,NK
2,1,568,NS,IK,HK
3,1,2172,ES,ES,ES
0,3,0,EW,SS,EK
1,3,0,NS,SS,EK
2,3,855,EW,ID,NK
3,3,855,NS,ID,NK
4,3,882,EW,AD,NK
5,3,982,EW,AD,NK


In [6]:
# Split the ground truth data into training and testing sets
# Ensure that the same 'ObjectID' does not appear in both sets
object_ids = ground_truth_data['ObjectID'].unique()
if config.test_ids is None:
    train_ids, test_ids = train_test_split(object_ids, test_size=config.test_ratio, 
                                        random_state=42)
else:
    train_ids = list(set(object_ids) - set(config.test_ids))
    test_ids = config.test_ids

ground_truth_train = ground_truth_data[
    ground_truth_data['ObjectID'].isin(train_ids)
].copy()
ground_truth_test = ground_truth_data[
    ground_truth_data['ObjectID'].isin(test_ids)
].copy()

# Check if there are any labels in the test set that are not present in the 
# training set. Labels are considered as a combination of 'Node' and 'Type'
ground_truth_train.loc[:, 'label'] = ground_truth_train['Node'] + '-' + \
                                        ground_truth_train['Type']
ground_truth_test.loc[:, 'label'] = ground_truth_test['Node'] + '-' + \
                                        ground_truth_test['Type']

# Split the labels by Directions 'EW' and 'NS' and check if all labels in the test set are present in the training set
ew_train_labels = set(ground_truth_train[ground_truth_train['Direction'] == 'EW']['label'].unique())
ns_train_labels = set(ground_truth_train[ground_truth_train['Direction'] == 'NS']['label'].unique())

ew_test_labels = set(ground_truth_test[ground_truth_test['Direction'] == 'EW']['label'].unique())
ns_test_labels = set(ground_truth_test[ground_truth_test['Direction'] == 'NS']['label'].unique())

ew_test_not_in_train = ew_test_labels - ew_train_labels
ns_test_not_in_train = ns_test_labels - ns_train_labels

ew_test_not_in_train, ns_test_not_in_train



({'IK-CK', 'IK-EK', 'SS-EK'}, {'ID-NK', 'IK-EK', 'SS-EK'})

In [7]:
# Save the ground truth files to CSV
ground_truth_train_path = config.output_dir / "ground_truth_train.csv"
ground_truth_test_path = config.output_dir / "ground_truth_test.csv"
train_data_dir = config.output_dir / "train_data"
test_data_dir = config.output_dir / "test_data"

# Create directories for all the output
if os.path.exists(config.output_dir):
    shutil.rmtree(config.output_dir)
os.makedirs(config.output_dir, exist_ok=False)
os.makedirs(train_data_dir, exist_ok=False)
os.makedirs(test_data_dir, exist_ok=False)

# Drop the 'label' column before saving the ground truth
ground_truth_train.drop(columns='label').to_csv(ground_truth_train_path, index=False)
ground_truth_test.drop(columns='label').to_csv(ground_truth_test_path, index=False)

# Move the data files to the appropriate directories
for file_path in data_filepaths:
    object_id_s = Path(file_path).stem
    object_id = int(object_id_s)
    if object_id in object_ids:
        df = pd.read_csv(file_path)

        # Write the file to the appropriate directory (train or test)
        if object_id in train_ids:
            new_csv_file_path = train_data_dir / f'{object_id}.csv'
        elif object_id in test_ids:
            new_csv_file_path = test_data_dir / f'{object_id}.csv'
            
        df.to_csv(new_csv_file_path, index=False)

ground_truth_train_path, ground_truth_test_path


(Path('../output/ground_truth_train.csv'),
 Path('../output/ground_truth_test.csv'))

In [8]:
# Print train and test ids, and sizes
print(train_ids, test_ids)
print(len(train_ids), len(test_ids))

[2 1 5] [3 4]
3 2
