In [18]:
import numpy as np
import pandas as pd

# Get YRI test genes

In [32]:
# Pull the "yri_test_genes" array out of the genes archive and re-save it as a
# standalone .npy file.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only use this with a genes.npz from a trusted source.
genes = np.load(file="genes.npz", allow_pickle=True)
yri_test_genes = genes["yri_test_genes"]
np.save(file="yri_test_genes.npy", arr=yri_test_genes)

# Get YRI test samples

In [16]:
# Input file locations used by the loading cell below.
# NOTE(review): both are absolute, machine-specific paths; the notebook will
# only run where this directory layout exists.

# Tab-separated metadata file (read with sep="\t").
metadata_path = "/data/yosef3/users/ruchir/pgp_uq/data/E-GEUV-1.sdrf.txt"

# Gzip-compressed CSV indexed by its "our_gene_name" column.
counts_path = "/data/yosef3/users/ruchir/finetuning-enformer/process_geuvadis_data/tpm/tpm_pca_annot.csv.gz"

In [29]:
# Load the counts table keyed by gene name and, in the same expression, discard
# every row whose index label is missing.
counts_df = pd.read_csv(counts_path, index_col="our_gene_name").loc[
    lambda frame: frame.index.notna()
]

# Sanity check: every YRI test gene must be present in the counts table.
assert np.isin(yri_test_genes, counts_df.index).all()

# Metadata is tab-separated; its first column becomes the index.
metadata_df = pd.read_csv(metadata_path, index_col=0, sep="\t")

In [30]:
# Split samples into two groups based on which archive pairs them with a YRI
# test gene: pairs from sample_train.npz feed `non_yri_samples`, pairs from
# sample_test.npz feed `yri_samples`.
train_data = np.load("sample_train.npz")
test_data = np.load("sample_test.npz")

# Build a hash set once. Testing `gene in yri_test_genes` against the NumPy
# array directly rescans the whole array for every (sample, gene) pair.
yri_test_gene_set = set(yri_test_genes.ravel().tolist())

# Samples paired with a YRI test gene in the training archive.
non_yri_samples = {
    sample
    for sample, gene in zip(train_data["samples"], train_data["genes"])
    if gene in yri_test_gene_set
}

# Samples paired with a YRI test gene in the test archive.
yri_samples = {
    sample
    for sample, gene in zip(test_data["samples"], test_data["genes"])
    if gene in yri_test_gene_set
}

# The two groups must be disjoint; report the offenders if they are not.
overlapping_samples = yri_samples & non_yri_samples
assert len(overlapping_samples) == 0, (
    f"{len(overlapping_samples)} sample(s) appear in both groups, "
    f"e.g. {sorted(overlapping_samples)[:5]}"
)
print("# YRI samples:", len(yri_samples))
print("# Non-YRI samples:", len(non_yri_samples))

# YRI samples: 77
# Non-YRI samples: 344


In [31]:
# Write one row per sample with its population label, YRI rows first.
# Sets of strings iterate in an order that changes between interpreter runs
# (hash randomisation), so each group is sorted to make samples.csv
# reproducible byte-for-byte.
yri_sample_list = sorted(yri_samples)
non_yri_sample_list = sorted(non_yri_samples)

samples_df = pd.DataFrame(
    {
        "sample": yri_sample_list + non_yri_sample_list,
        "population": ["YRI"] * len(yri_sample_list)
        + ["Non-YRI"] * len(non_yri_sample_list),
    }
)
samples_df.to_csv("samples.csv", index=False)