In [1]:
import pandas as pd
import numpy as np
import os
import parameters
from tqdm import tqdm


In [28]:
# Display the configured neighbour count; parameters.KNN sizes the third axis of the
# feature arrays allocated in the cells below.
parameters.KNN

5

In [None]:
# Column names applied to the headerless, tab-separated bed files read below.
header = ["chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak"]

# Chromosome partition: B and C are carved out of chr1..chr22 and A is the remainder.
# NOTE(review): presumably train/validation/test folds; they are not used in this cell — confirm.
all_chroms = {f"chr{i}" for i in range(1, 23)}
B = {"chr14", "chr19"}
C = {"chr1"}
A = all_chroms - B - C

data_train = {}
sorted_indexes = {}

# Number of nearest-peak positions cached per gene and signal; the first
# parameters.KNN of them are turned into features.
n_sorted_indexes = 100


for cell_type in ["X1", "X2"]:
    train_df = pd.read_csv(f"Data/CAGE-train/{cell_type}_train_info.tsv", sep="\t")

    # Feature tensor indexed as (gene, signal, neighbour, feature); this cell fills
    # feature 0 (signalValue) and feature 1 (absolute distance to TSS_start).
    data_train[cell_type] = np.zeros((len(train_df), parameters.N_BEDS, parameters.KNN, parameters.N_FEATURES_BED))
    # Positions of the nearest peaks within the gene's chromosome subset of the bed
    # file, closest first; -1 marks slots left unfilled when fewer peaks exist.
    sorted_indexes[cell_type] = -np.ones((len(train_df), parameters.N_BEDS, n_sorted_indexes), dtype=int)

    # NOTE(review): os.listdir returns entries in arbitrary order, and signal_index is
    # derived from it. The cell that rebuilds features from the saved indexes enumerates
    # the directory the same way, so both must observe the same order.
    for signal_index, signal in enumerate(os.listdir("Data/bed/")):
        bed = pd.read_csv(f"Data/bed/{signal}/{cell_type}.bed", names=header, sep="\t")
        bed["center"] = (bed["start"] + bed["end"]) // 2

        # Split the peaks by chromosome once per bed file instead of filtering and
        # copying the whole frame for every gene. Row order inside each group is the
        # file order, so positions match those of `bed[bed["chr"] == chrom]`.
        peaks_by_chrom = {
            chrom: (peaks["center"].to_numpy(), peaks["signalValue"].to_numpy())
            for chrom, peaks in bed.groupby("chr", sort=False)
        }
        no_peaks = (np.empty(0, dtype=int), np.empty(0, dtype=float))

        gene_locations = zip(train_df["chr"], train_df["TSS_start"])
        for row_index, (chrom, tss_start) in enumerate(tqdm(gene_locations, total=len(train_df))):
            centers, signal_values = peaks_by_chrom.get(chrom, no_peaks)
            distances = np.abs(centers - tss_start)
            closest_indexes = np.argsort(distances)[:n_sorted_indexes]
            knn_indexes = closest_indexes[:parameters.KNN]

            # Fail with a readable message instead of an opaque broadcasting error
            # when the chromosome does not carry enough peaks to fill every slot.
            if len(knn_indexes) < parameters.KNN:
                raise ValueError(
                    f"Found only {len(knn_indexes)} of the {parameters.KNN} nearest peaks required "
                    f"on {chrom!r} in Data/bed/{signal}/{cell_type}.bed"
                )

            data_train[cell_type][row_index, signal_index, :, 0] = signal_values[knn_indexes]
            data_train[cell_type][row_index, signal_index, :, 1] = distances[knn_indexes]
            sorted_indexes[cell_type][row_index, signal_index, :len(closest_indexes)] = closest_indexes

        


100%|██████████| 14310/14310 [01:41<00:00, 141.68it/s]
100%|██████████| 14310/14310 [01:21<00:00, 174.69it/s]
100%|██████████| 14310/14310 [01:52<00:00, 127.32it/s]
100%|██████████| 14310/14310 [01:03<00:00, 225.61it/s]
100%|██████████| 14310/14310 [00:50<00:00, 285.45it/s]
100%|██████████| 14310/14310 [00:41<00:00, 345.02it/s]
100%|██████████| 14310/14310 [01:19<00:00, 179.56it/s]
100%|██████████| 14310/14310 [01:35<00:00, 150.01it/s]
100%|██████████| 14310/14310 [03:27<00:00, 69.02it/s]
100%|██████████| 14310/14310 [03:26<00:00, 69.43it/s]
100%|██████████| 14310/14310 [01:02<00:00, 228.40it/s]
100%|██████████| 14310/14310 [00:47<00:00, 304.18it/s]
100%|██████████| 14310/14310 [01:53<00:00, 125.63it/s]
100%|██████████| 14310/14310 [01:35<00:00, 149.60it/s]


In [26]:
# Collect the "gex" column of each cell type's label file as a standalone array.
y_train = {}
for cell_type in ("X1", "X2"):
    labels_path = f"Data/CAGE-train/{cell_type}_train_y.tsv"
    train_df = pd.read_csv(labels_path, sep="\t")
    gex_values = train_df["gex"].values
    # np.array copies, so the stored labels do not alias the DataFrame's buffer.
    y_train[cell_type] = np.array(gex_values)

In [35]:
# Persist the X1/X2 feature arrays together with their labels in a single archive.
# np.savez does not create missing parent directories, so make sure the target exists.
os.makedirs("Data/processed", exist_ok=True)
np.savez(
    "Data/processed/data_train.npz",
    X1=data_train["X1"],
    X2=data_train["X2"],
    labels_X1=y_train["X1"],
    labels_X2=y_train["X2"],
)

In [None]:
# Cache the nearest-peak positions so the feature cell further down can reuse them
# without repeating the search. np.savez_compressed does not create missing parent
# directories, so make sure the target exists.
os.makedirs("Data/processed", exist_ok=True)
np.savez_compressed(
    "Data/processed/sorted_indexes.npz",
    X1=sorted_indexes["X1"],
    X2=sorted_indexes["X2"],
)


In [None]:


# Persist the X1/X2 label arrays on their own.
# np.savez does not create missing parent directories, so make sure the target exists.
os.makedirs("Data/processed", exist_ok=True)
np.savez("Data/processed/labels_train.npz", X1=y_train["X1"], X2=y_train["X2"])

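Create the training data from the previously saved sorted indexes. Computing those indexes is the computationally demanding step, so they are loaded from disk here instead of being recomputed.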

In [33]:
# Column names of the headerless bed files. Defined here as well so this cell runs on a
# fresh kernel without first executing the expensive nearest-peak search cell.
header = ["chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak"]
# Gene coordinates the peak centres are expressed relative to; their order fixes
# feature slots 1..4 (slot 0 holds signalValue).
anchor_columns = ["TSS_start", "TSS_end", "gene_start", "gene_end"]

sorted_indexes = np.load("Data/processed/sorted_indexes.npz")
data_train = {}
for cell_type in ["X1", "X2"]:
    indexes = sorted_indexes[cell_type]
    train_df = pd.read_csv(f"Data/CAGE-train/{cell_type}_train_info.tsv", sep="\t")
    # Feature tensor indexed as (gene, signal, neighbour, feature).
    data_train[cell_type] = np.zeros((len(train_df), parameters.N_BEDS, parameters.KNN, 1 + len(anchor_columns)))

    gene_chroms = train_df["chr"].to_numpy()
    gene_anchors = train_df[anchor_columns].to_numpy()

    # NOTE(review): signal_index follows os.listdir order, which is arbitrary; it must
    # match the order in effect when the sorted indexes were computed.
    for signal_index, signal in enumerate(os.listdir("Data/bed/")):
        bed = pd.read_csv(f"Data/bed/{signal}/{cell_type}.bed", names=header, sep="\t")
        bed["center"] = (bed["start"] + bed["end"]) // 2

        # Split the peaks by chromosome once per bed file instead of filtering and
        # copying the whole frame for every gene. Row order inside each group is the
        # file order, so the cached positions address the same peaks as
        # `bed[bed["chr"] == chrom].iloc[...]`.
        peaks_by_chrom = {
            chrom: (peaks["center"].to_numpy(), peaks["signalValue"].to_numpy())
            for chrom, peaks in bed.groupby("chr", sort=False)
        }
        no_peaks = (np.empty(0, dtype=int), np.empty(0, dtype=float))

        for row_index in tqdm(range(len(train_df))):
            knn_indx = indexes[row_index, signal_index, :parameters.KNN]
            # -1 is the "no peak" padding of the cached indexes; used as a position it
            # would silently select the last peak of the chromosome.
            if (knn_indx < 0).any():
                raise ValueError(
                    f"Cached indexes for row {row_index} of {cell_type}, signal {signal!r}, "
                    f"hold fewer than {parameters.KNN} peaks"
                )

            centers, signal_values = peaks_by_chrom.get(gene_chroms[row_index], no_peaks)
            knn_centers = centers[knn_indx]

            data_train[cell_type][row_index, signal_index, :, 0] = signal_values[knn_indx]
            # Broadcast (KNN, 1) - (1, 4): one column of relative positions per anchor.
            data_train[cell_type][row_index, signal_index, :, 1:] = (
                knn_centers[:, None] - gene_anchors[row_index][None, :]
            )


            
    

100%|██████████| 14310/14310 [01:55<00:00, 123.85it/s]
100%|██████████| 14310/14310 [01:29<00:00, 159.72it/s]
100%|██████████| 14310/14310 [02:05<00:00, 114.39it/s]
100%|██████████| 14310/14310 [01:17<00:00, 184.49it/s]
100%|██████████| 14310/14310 [01:01<00:00, 234.54it/s]
100%|██████████| 14310/14310 [00:49<00:00, 291.17it/s]
100%|██████████| 14310/14310 [01:38<00:00, 145.51it/s]
100%|██████████| 14310/14310 [01:54<00:00, 125.07it/s]
100%|██████████| 14310/14310 [03:39<00:00, 65.34it/s]
100%|██████████| 14310/14310 [02:44<00:00, 86.84it/s] 
100%|██████████| 14310/14310 [01:07<00:00, 213.43it/s]
100%|██████████| 14310/14310 [00:58<00:00, 245.61it/s]
100%|██████████| 14310/14310 [02:06<00:00, 112.89it/s]
100%|██████████| 14310/14310 [01:54<00:00, 125.26it/s]


In [None]:
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute
# arbitrary code when the file comes from an untrusted source. Presumably unnecessary if
# the archive only holds plain numeric arrays — confirm and drop the flag if so.
# NOTE(review): this reads "Data/train_data.npz", not the "Data/processed/data_train.npz"
# archive written earlier in this notebook — confirm the path is intended.
train_data = np.load("Data/train_data.npz", allow_pickle=True)

In [None]:
X = train_data["x"]
# A cell only displays its last expression, so a bare `.mean()` on an earlier line is
# computed and thrown away. Return both sanity-check values together instead:
# the mean of X[:, :, 0] and the number of entries along the first axis.
X[:, :, 0].mean(), len(X)

32568