In [1]:
import pandas as pd
import numpy as np
import os
import parameters
from tqdm import tqdm


In [28]:
parameters.KNN

5

In [None]:
header = ["chr", "start", "end", "name", "score", "strand", "signalValue", "pValue", "qValue", "peak"]
all_chroms = {f"chr{i}" for i in range(1, 23)}
B = {"chr14", "chr19"}
C = {"chr1"}
A = all_chroms - B - C


def bed_dataset_statistics(train_or_val):
    data = {}
    sorted_indexes = {}

    n_sorted_indexes = 100


    for cell_type in ["X1", "X2"]:
        df = pd.read_csv(f"Data/CAGE-train/{cell_type}_{train_or_val}_info.tsv", sep="\t") #, index_col="gene_name")

        mask = df['strand'] == '-'
        df.loc[mask, ['TSS_start', 'TSS_end']] = df.loc[mask, ['TSS_end', 'TSS_start']].values
        df = df.drop(columns=['strand'])

        data[cell_type] = np.zeros((len(df), parameters.N_BEDS, parameters.KNN, parameters.N_FEATURES_BED))
        sorted_indexes[cell_type] = -np.ones((len(df), parameters.N_BEDS, n_sorted_indexes), dtype=int)
        
        for signal_index, signal in enumerate(os.listdir("Data/bed/")):
            bed = pd.read_csv(f"Data/bed/{signal}/{cell_type}.bed", names=header, sep="\t")
            bed["center"] = (bed["start"] + bed["end"]) // 2

            # joined_df = df.merge(bed, on="chr", how="inner", suffixes=('_train', '_bed'))
            # joined_df["distance"] = np.abs(joined_df["center"] - joined_df["TSS_start"])
            # joined_df = joined_df.groupby("gene_name").apply(lambda x: x.nsmallest(parameters.KNN, "distance"))
            # print(joined_df.head())

            for row_index, row in tqdm(df.iterrows(), total=len(df)):
                chr = row["chr"]
                TSS_start = row["TSS_start"]
                same_chromosome = bed[bed["chr"] == chr].copy()
                same_chromosome["rel_pos_TSS_start"] = np.abs(same_chromosome["center"] - TSS_start)
                closest_indexes = np.argsort(same_chromosome["rel_pos_TSS_start"])[:n_sorted_indexes]
                knn = same_chromosome.iloc[closest_indexes[:parameters.KNN]]
                knn["rel_pos_TSS_end"] = knn["center"] - row["TSS_end"]
                knn["rel_pos_gene_start"] = knn["center"] - row["gene_start"]
                knn["rel_pos_gene_end"] = knn["center"] - row["gene_end"]

                data[cell_type][row_index, signal_index, :, 0] = knn["signalValue"].values
                data[cell_type][row_index, signal_index, :, 1] = knn["rel_pos_TSS_start"].values
                data[cell_type][row_index, signal_index, :, 2] = knn["rel_pos_TSS_end"].values
                data[cell_type][row_index, signal_index, :, 3] = knn["rel_pos_gene_start"].values
                data[cell_type][row_index, signal_index, :, 4] = knn["rel_pos_gene_end"].values
                sorted_indexes[cell_type][row_index, signal_index, :len(closest_indexes)] = closest_indexes
                
    return data, sorted_indexes
            


In [78]:
val_data, val_sorted_indexes = bed_dataset_statistics("val")
np.savez_compressed("preprocessed/val_bed_data.npz", X1=val_data["X1"], X2=val_data["X2"],
                    X1_sorted_indexes=val_sorted_indexes["X1"], X2_sorted_indexes=val_sorted_indexes["X2"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn["rel_pos_TSS_end"] = knn["center"] - row["TSS_end"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn["rel_pos_gene_start"] = knn["center"] - row["gene_start"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  knn["rel_pos_gene_end"] = knn["center"] - row["gene_end"]
A value is trying to be set o

KeyboardInterrupt: 

In [26]:
y_train = {}
for cell_type in ["X1", "X2"]:
    train_df = pd.read_csv(f"Data/CAGE-train/{cell_type}_train_y.tsv", sep="\t") #, index_col="gene_name")
    y_train[cell_type] = np.array(train_df["gex"].values)

In [35]:
np.savez(f"Data/processed/data_train.npz", X1=data_train["X1"], X2=data_train["X2"], labels_X1=y_train["X1"], labels_X2=y_train["X2"])

In [None]:
np.savez_compressed(f"Data/processed/sorted_indexes.npz", X1=sorted_indexes["X1"], X2=sorted_indexes["X2"])


In [None]:


np.savez(f"Data/processed/labels_train.npz", X1=y_train["X1"], X2=y_train["X2"])

Create the train data based on already sorted indexes (which is computationally demanding).

In [None]:
def process_data(train_or_val):
    sorted_indexes = np.load(f"Data/processed/sorted_indexes.npz")
    data = {}
    for cell_type in ["X1", "X2"]:
        indexes = sorted_indexes[cell_type]
        df = pd.read_csv(f"Data/CAGE-train/{cell_type}_{train_or_val}_info.tsv", sep="\t") #, index_col="gene_name")
        data[cell_type] = np.zeros((len(df), parameters.N_BEDS, parameters.KNN, 5))

        for signal_index, signal in enumerate(os.listdir("Data/bed/")):
            bed = pd.read_csv(f"Data/bed/{signal}/{cell_type}.bed", names=header, sep="\t")
            bed["center"] = (bed["start"] + bed["end"]) // 2
            for row_index, row in tqdm(df.iterrows(), total=len(df)):
                knn_indx = indexes[row_index, signal_index, :parameters.KNN]
                same_chromosome = bed[bed["chr"] == row["chr"]].copy()
                knn = same_chromosome.iloc[knn_indx].copy()
                knn["rel_pos_TSS_start"] = knn["center"] - row["TSS_start"]
                knn["rel_pos_TSS_end"] = knn["center"] - row["TSS_end"]
                knn["rel_pos_gene_start"] = knn["center"] - row["gene_start"]
                knn["rel_pos_gene_end"] = knn["center"] - row["gene_end"]

                data[cell_type][row_index, signal_index, :, 0] = knn["signalValue"].values
                data[cell_type][row_index, signal_index, :, 1] = knn["rel_pos_TSS_start"].values
                data[cell_type][row_index, signal_index, :, 2] = knn["rel_pos_TSS_end"].values
                data[cell_type][row_index, signal_index, :, 3] = knn["rel_pos_gene_start"].values
                data[cell_type][row_index, signal_index, :, 4] = knn["rel_pos_gene_end"].values
        


                
        

100%|██████████| 14310/14310 [01:55<00:00, 123.85it/s]
100%|██████████| 14310/14310 [01:29<00:00, 159.72it/s]
100%|██████████| 14310/14310 [02:05<00:00, 114.39it/s]
100%|██████████| 14310/14310 [01:17<00:00, 184.49it/s]
100%|██████████| 14310/14310 [01:01<00:00, 234.54it/s]
100%|██████████| 14310/14310 [00:49<00:00, 291.17it/s]
100%|██████████| 14310/14310 [01:38<00:00, 145.51it/s]
100%|██████████| 14310/14310 [01:54<00:00, 125.07it/s]
100%|██████████| 14310/14310 [03:39<00:00, 65.34it/s]
100%|██████████| 14310/14310 [02:44<00:00, 86.84it/s] 
100%|██████████| 14310/14310 [01:07<00:00, 213.43it/s]
100%|██████████| 14310/14310 [00:58<00:00, 245.61it/s]
100%|██████████| 14310/14310 [02:06<00:00, 112.89it/s]
100%|██████████| 14310/14310 [01:54<00:00, 125.26it/s]


In [None]:
train_data = np.load("Data/train_data.npz", allow_pickle=True)

In [None]:
X = train_data["x"]
X[:, :, 0].mean()
len(X)

32568

# CNN

In [None]:
parameters.SIGNALS_CNN



['DNase', 'H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K36me3']

In [52]:
def process_dataframe(df_x, df_y):
    # Merge x and y on gene_name
    df = pd.merge(df_x, df_y, on='gene_name')

    # Invert given the strand
    mask = df['strand'] == '-'
    df.loc[mask, ['TSS_start', 'TSS_end']] = df.loc[mask, ['TSS_end', 'TSS_start']].values
    df = df.drop(columns=['strand'])

    # Replace chromosome with numeric values
    def chr_to_num(chr_val):
        chr_val = chr_val.replace('chr','')
        return int(chr_val)
    df['chr'] = df['chr'].apply(chr_to_num)
    
    # Sort by chromosome
    df = df.sort_values(by='chr')

    # Split into list of DataFrames by chromosome
    xs_list = [group for _, group in df.groupby('chr')]

    gene_lists = []
    ys_list = []
    xs_numpy = []
    ys_numpy = []
    chroms = []

    for i, df_chr in enumerate(xs_list):
        gene_list = df_chr.pop('gene_name').tolist()  # extract gene_name
        y = df_chr.pop('gex').tolist()     
        chroms.append(df_chr['chr'].iloc[0])         # extract target values
        df_chr = df_chr.drop(columns=['chr'])         # remove chr column
        xs_list[i] = df_chr                            # update the list
        gene_lists.append(gene_list)                  # store gene names
        ys_list.append(y)                              # store target values

        # Convert DataFrame to numpy array (TSS Start and TSS End only)
        xs_numpy.append(df_chr.to_numpy())
        # Convert target list to numpy array
        ys_numpy.append(np.array(y))

    return xs_numpy, ys_numpy, gene_lists, chroms

#Cage data
cage_path = r'Data/CAGE-train/'  # raw string or forward slashes

train_x1 = pd.read_csv(cage_path + 'X1_train_info.tsv', sep='\t', usecols=[0,1,4,5,6])
train_y1 = pd.read_csv(cage_path + 'X1_train_y.tsv', sep='\t')
valid_x1 = pd.read_csv(cage_path + 'X1_val_info.tsv', sep='\t', usecols=[0,1,4,5,6])
valid_y1 = pd.read_csv(cage_path + 'X1_val_y.tsv', sep='\t')
train_x2 = pd.read_csv(cage_path + 'X2_train_info.tsv', sep='\t', usecols=[0,1,4,5,6])
train_y2 = pd.read_csv(cage_path + 'X2_train_y.tsv', sep='\t')
valid_x2 = pd.read_csv(cage_path + 'X2_val_info.tsv', sep='\t', usecols=[0,1,4,5,6])
valid_y2 = pd.read_csv(cage_path + 'X2_val_y.tsv', sep='\t')
test_x = pd.read_csv(cage_path + 'X3_test_info.tsv', sep='\t', usecols=[0,1,4,5,6])

t_x1, t_y1, t_names1, t_chroms1 = process_dataframe(train_x1, train_y1)
t_x2, t_y2, t_names2, t_chroms2 = process_dataframe(train_x2, train_y2)

train_x = t_x1 + t_x2
train_y = t_y1 + t_y2
train_names = t_names1 + t_names2

v_x1, v_y1, v_names1, v_chroms1 = process_dataframe(valid_x1, valid_y1)
v_x2, v_y2, v_names2, v_chroms2 = process_dataframe(valid_x2, valid_y2)

valid_x = v_x1 + v_x2
valid_y = v_y1 + v_y2
valid_names = v_names1 + v_names2


In [19]:
import pyBigWig
import glob

In [72]:
half_window = int(parameters.SIGNAL_CNN_WINDOW // 2)
chromstrs = []
ranges = []


def get_signals_bins(df, cell_type):
    df["neg_strand"] = mask = df['strand'] == '-'
    df.loc[mask, ['TSS_start', 'TSS_end']] = df.loc[mask, ['TSS_end', 'TSS_start']].values
    df["center"] = ((df["TSS_start"] + df["TSS_end"]) // 2).astype(int)
    df["window_start"] = df["center"] - half_window
    df["window_end"] = df["center"] + half_window
    # df.loc[mask, ['window_start', 'window_end']] = df.loc[mask, ['window_end', 'window_start']].values
    # df.drop(columns=['strand', 'TSS_start', 'TSS_end', 'neg_strand', 'center'], inplace=True)

    bins_signal_gene = np.zeros((len(df), len(parameters.SIGNALS_CNN), parameters.CNN_N_BINS))
    for i, signal in enumerate(parameters.SIGNALS_CNN):
        print("Processing signal:", signal, f"{i+1}/{len(parameters.SIGNALS_CNN)}")

        for j, (chromstr, window_start, window_end, neg_strand) in enumerate(tqdm(df[["chr", "window_start", "window_end", "neg_strand"]].itertuples(index=False), total=len(df))):
            fname = glob.glob(f"Data/bigwig/{signal}-bigwig/{cell_type}*")[0]
            bw = pyBigWig.open(fname)
            # print(chromstr, ranges[0][0], ranges[0][1])
            bins = bw.stats(chromstr, window_start, window_end, type="mean", nBins=parameters.CNN_N_BINS)
            if neg_strand:
                bins = bins[::-1]
            bins_signal_gene[j, i] = bins
            # print(chr_i + 1, x, y)
            bw.close()
            
    return bins_signal_gene

In [73]:
df = pd.read_csv('./Data/CAGE-train/X2_train_info.tsv', sep='\t', usecols=[0,1,4,5,6])
signal_bins_X2_train = get_signals_bins(df, "X2")
np.save('Data/processed/cnn_input_X2_train.npy', signal_bins_X2_train)

Processing signal: DNase 1/5


100%|██████████| 14310/14310 [00:48<00:00, 294.14it/s]


Processing signal: H3K4me1 2/5


100%|██████████| 14310/14310 [04:34<00:00, 52.09it/s]


Processing signal: H3K4me3 3/5


100%|██████████| 14310/14310 [04:39<00:00, 51.21it/s] 


Processing signal: H3K27ac 4/5


100%|██████████| 14310/14310 [03:09<00:00, 75.39it/s] 


Processing signal: H3K36me3 5/5


100%|██████████| 14310/14310 [06:12<00:00, 38.37it/s]


In [74]:
df = pd.read_csv('./Data/CAGE-train/X1_val_info.tsv', sep='\t', usecols=[0,1,4,5,6])
signal_bins_X1_val = get_signals_bins(df, "X1")
np.save('Data/processed/cnn_input_X1_val.npy', signal_bins_X1_val)


Processing signal: DNase 1/5


100%|██████████| 1974/1974 [00:03<00:00, 514.25it/s]


Processing signal: H3K4me1 2/5


100%|██████████| 1974/1974 [00:10<00:00, 183.28it/s]


Processing signal: H3K4me3 3/5


100%|██████████| 1974/1974 [00:14<00:00, 137.05it/s]


Processing signal: H3K27ac 4/5


100%|██████████| 1974/1974 [00:19<00:00, 101.97it/s]


Processing signal: H3K36me3 5/5


100%|██████████| 1974/1974 [00:19<00:00, 103.35it/s]


In [75]:

df = pd.read_csv('./Data/CAGE-train/X2_val_info.tsv', sep='\t', usecols=[0,1,4,5,6])
signal_bins_X2_val = get_signals_bins(df, "X2")
np.save('Data/processed/cnn_input_X2_val.npy', signal_bins_X2_val)

Processing signal: DNase 1/5


100%|██████████| 1974/1974 [00:08<00:00, 233.62it/s]


Processing signal: H3K4me1 2/5


100%|██████████| 1974/1974 [00:32<00:00, 61.26it/s] 


Processing signal: H3K4me3 3/5


100%|██████████| 1974/1974 [00:42<00:00, 46.52it/s] 


Processing signal: H3K27ac 4/5


100%|██████████| 1974/1974 [00:19<00:00, 99.90it/s] 


Processing signal: H3K36me3 5/5


100%|██████████| 1974/1974 [00:23<00:00, 82.96it/s] 


In [76]:
y_val = {}
for cell_type in ["X1", "X2"]:
    val_df = pd.read_csv(f"Data/CAGE-train/{cell_type}_val_y.tsv", sep="\t") #, index_col="gene_name")
    values = np.array(val_df["gex"].values)
    np.save(f"Data/processed/{cell_type}_val_y.npy", values)
