# Setup

In [11]:
import pandas as pd
import numpy as np
import toolbox  # my own little package I made to help with curation work
from datetime import datetime

# Date stamp (day, abbreviated month, year) embedded in exported file names.
today = f"{datetime.now():%d%b%Y}"

## Dataset Creation

In [15]:
# Build the modelling dataset: load the curated export, keep the annotation
# columns, give every assay its own row, then keep only fully annotated files.
csv_path = "../data/cleaned-data-20231214.csv"

# Feature selection
feature_columns = [
    "ID",
    "TYPE",
    "TABLE",
    "NAME",
    "FILEFORMAT",
    "STUDY",
    "ASSAY",
    "DATATYPE",
    "DATASUBTYPE",
    "RESOURCETYPE",
]

df_exploded = (
    pd.read_csv(csv_path, low_memory=False)
    .loc[:, feature_columns]
    # Explode list columns into rows: ASSAY is split on "," and each piece
    # becomes its own row (the original row index is repeated by explode).
    .assign(ASSAY=lambda frame: frame["ASSAY"].str.split(","))
    .explode("ASSAY")
)
og_shape = df_exploded.shape

# focusing on file annotations first, and only on rows without any missing
# value so they can be used to develop training/test sets
is_file = df_exploded["TYPE"] != "folder"
is_complete = df_exploded.notna().all(axis=1)
keep = is_file & is_complete

# Export every row that is excluded from the cleaned dataset.
# NOTE(review): this export previously filtered on `df_full`, a name that is
# never defined in this notebook (NameError on a fresh kernel). It is assumed
# here that `df_full` was the cleaned frame, so the export is "all rows that
# are not kept" -- confirm whether folder rows should be part of this file.
# A row mask is used instead of index matching because explode() leaves
# duplicate index labels.
df_exploded.loc[~keep].to_csv(
    f"../data/testing-dataset-withNulls-{today}.csv"
)

df = df_exploded.loc[keep]
new_shape = df.shape

# Report how much of the exploded dataset was filtered out.
rows_removed = og_shape[0] - new_shape[0]
# Guard against an empty input file so the summary never divides by zero.
percent_removed = round(rows_removed / og_shape[0] * 100, 2) if og_shape[0] else 0.0
print(
    f"Rows removed: {rows_removed} \
        \nPercentage of original dataframe {percent_removed}%"
)

Rows removed: 375598         
Percentage of original dataframe 71.62%


In [18]:
# split dataset into train, validation, test sets
training_percent = 0.6
validation_percent = training_percent + 0.2
# test set is remaining amount

train, validate, test = np.split(
    df.sample(frac=1, random_state=42),
    [int(training_percent * len(df)), int(validation_percent * len(df))],
)

In [19]:
# Preview the training partition created by the split cell.
train

Unnamed: 0,ID,TYPE,TABLE,NAME,FILEFORMAT,STUDY,ASSAY,DATATYPE,DATASUBTYPE,RESOURCETYPE
166907,syn30142960,file,SAGE.PORTAL_RAW.AD,29487-DLPFC.recalibrated.haplotypeCalls.annota...,TBI,AMP-AD_DiverseCohorts,whole genome sequencing assay,genomicVariants,SNPvariants,experimentalData
152505,syn27271731,file,SAGE.PORTAL_RAW.AD,ROSMAP.SM-CJFO3.BreakSeq.vcf.gz,GZ,SV_xQTL,whole genome sequencing assay,genomicVariants,processed,analysis
11827,syn7385610,file,SAGE.PORTAL_RAW.AD,5815381006_R06C01_3-Swath1_Grn.jpg,JPG,ROSMAP,methylation array,epigenetics,raw,experimentalData
199931,syn51720070,file,SAGE.PORTAL_RAW.AD,18207_CN.FCHMK7HDSX2_L4_ICCGCGGTT-AGCGCTAG.bam...,BAI,AMP-AD_DiverseCohorts,RNA-seq assay,geneExpression,raw,experimentalData
110045,syn21301535,file,SAGE.PORTAL_RAW.AD,R2393217-PCC.final.bam,BAM,ROSMAP,RNA-seq assay,geneExpression,raw,experimentalData
...,...,...,...,...,...,...,...,...,...,...
149419,syn27268633,file,SAGE.PORTAL_RAW.AD,MSBB.71737.BreakSeq.vcf.gz,GZ,SV_xQTL,whole genome sequencing assay,genomicVariants,processed,analysis
187719,syn47443979,file,SAGE.PORTAL_RAW.AD,11323_TCX.FCHMLG2DSX2_L2_IATGAGGCC-GTTAATTG.bam,BAM,AMP-AD_DiverseCohorts,RNA-seq assay,geneExpression,raw,experimentalData
519118,syn23541635,file,SAGE.PORTAL_RAW.PSYCHENCODE,LIBD313_R2.fastq.gz,GZ,LIBD-WGBS-SCZD,bisulfite sequencing assay,epigenetics,processed,experimentalData
173048,syn31114639,file,SAGE.PORTAL_RAW.AD,SQ_MX2024-9_S01_L003_R1_001.fastq.gz,GZ,SEA-AD,10xmultiome,geneExpression,raw,experimentalData


# Feature Engineering

In [None]:
# tokenizer