# Setup

In [9]:
import pandas as pd
import numpy as np
import toolbox  # my own little package I made to help with curation work
from datetime import datetime

# Date stamp (YYYYMMDD) embedded in the names of the files this notebook writes.
today = f"{datetime.now():%Y%m%d}"

## Dataset Creation

In [3]:
# Build the modelling dataset from the cleaned annotation export:
#   1. load only the annotation columns of interest,
#   2. expand comma-separated ASSAY values into one row per value,
#   3. discard folder records,
#   4. separate complete rows (df_full) from rows with any missing value,
#      which are written to a dated CSV.
csv_path = "../data/cleaned-data-20231215.csv"

# Feature selection
feature_columns = [
    "ID",
    "TYPE",
    "TABLE",
    "NAME",
    "FILEFORMAT",
    "STUDY",
    "ASSAY",
    "DATATYPE",
    "DATASUBTYPE",
    "RESOURCETYPE",
]

# Parse only the needed columns. `usecols` ignores the order of the list, so
# the columns are re-selected afterwards to keep the order declared above.
df = pd.read_csv(csv_path, usecols=feature_columns, low_memory=False).loc[
    :, feature_columns
]

# Explode list columns into rows.
# Note: explode() repeats the source row's index label on every row it
# produces, so index labels are no longer unique after this step.
df["ASSAY"] = df["ASSAY"].str.split(",")
df = df.explode("ASSAY")

# Captured before the folder filter below, so the "rows removed" figure counts
# folder rows as well as rows dropped for missing values.
og_shape = df.shape

# focusing on file annotations first
df = df[df["TYPE"] != "folder"]

# drop any missing values to develop training/test sets
df_full = df.dropna(how="any")
new_shape = df_full.shape

# Save the rows that were dropped for having at least one missing value.
# They are selected with a row-wise null mask rather than by index-label
# membership, because the index labels are not unique after explode().
df[df.isna().any(axis=1)].to_csv(
    f"../data/testing-dataset-withNulls-{today}.csv"
)

rows_removed = og_shape[0] - new_shape[0]
# Guard the ratio so an empty input reports 0% instead of dividing by zero.
pct_removed = round(rows_removed / og_shape[0] * 100, 2) if og_shape[0] else 0.0
print(
    f"Rows removed: {rows_removed}\n"
    f"Percentage of original dataframe {pct_removed}%"
)

Rows removed: 375598         
Percentage of original dataframe 71.62%


In [4]:
# Show the (rows, columns) dimensions of the dataset left after dropping rows with missing values.
df_full.shape

(148805, 10)

In [7]:
# split dataset into train, validation, test sets
training_percent = 0.6
validation_percent = training_percent + 0.2
# test set is remaining amount

# Shuffle all rows once with a fixed seed so the split is reproducible.
shuffled = df_full.sample(frac=1, random_state=42)

# Positional boundaries: [0, train_end) -> train, [train_end, val_end) -> val,
# [val_end, end) -> test.
train_end = int(training_percent * len(df_full))
val_end = int(validation_percent * len(df_full))

# Slice with .iloc rather than np.split(): np.split on a DataFrame relies on
# DataFrame.swapaxes, which pandas deprecated in 2.1. The slices below are the
# same three pieces np.split would return for these boundaries.
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# Every row must land in exactly one split.
assert len(train) + len(val) + len(test) == len(df_full)

In [10]:
# Write each split to its own dated file under ../data.
# NOTE(review): these paths carry no ".csv" extension, unlike the withNulls
# export written earlier in this notebook; confirm what downstream readers expect.
split_outputs = [
    (train, f"../data/training-set-{today}"),
    (val, f"../data/val-set-{today}"),
    (test, f"../data/test-set-{today}"),
]
for split_frame, out_path in split_outputs:
    split_frame.to_csv(out_path)