In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
import os
import copy
import math

In [26]:
files_list = os.listdir('../data/01_raw/HiCCUPS_looplist/')
files_list = [x for x in files_list if x[0] != '.']
files_list = ['../data/01_raw/HiCCUPS_looplist/' + x for x in files_list]
cell_types = [x.split('_')[-3] for x in files_list]

In [28]:
def read_HiCCUPS_file(path: str, cell_type: str) -> pd.DataFrame:
    """
    Read HiCCUPS file and return a dataframe with the following columns:
        x, y, chr, cell_type
    """
    df = pd.read_csv(path, sep='\t')
    f1 = lambda x: x['chr1'].split("chr")[-1]
    f2 = lambda x: x['chr2'].split("chr")[-1]
    df["chr1"] = df.apply(f1, axis=1)
    df["chr2"] = df.apply(f2, axis=1)
    assert len(df[df['chr1']!=df['chr2']]) == 0
    df.rename(columns={'chr1': 'chr'}, inplace=True)
    df['x'] = round((df['x1'] + df['x2'])/2)
    df['y'] = round((df['y1'] + df['y2'])/2)
    df['x_len'] = df['x2'] - df['x1']
    df['y_len'] = df['y2'] - df['y1']
    df['distance'] = df['y1'] - df['x2']
    df['cell_type'] = cell_type
    df = df[df.columns.intersection(['x', 'y', 'chr', 'cell_type', 'x1', 'x2', 'y1', 'y2', 'x_len', 'y_len', 'distance'])]
    df = df.sort_values(by=['x'])

    return df

In [33]:
loops = os.listdir('../data/01_raw/HiC_loops_annotations/')
values = {}
for file in loops:
    if not file.startswith('.'):
        df = read_HiCCUPS_file('../data/01_raw/HiC_loops_annotations/'+file, 'Non')
        values[file] = (np.unique(df['y_len']), np.unique(df['x_len']), np.median(df['distance']))

values

{'GSE63525_GM12878_primary+replicate_HiCCUPS_looplist.txt': (array([ 5000, 10000]),
  array([ 5000, 10000]),
  270000.0),
 'GSE63525_K562_HiCCUPS_looplist.txt': (array([ 5000, 10000, 25000]),
  array([ 5000, 10000, 25000]),
  250000.0),
 'GSE63525_IMR90_HiCCUPS_looplist.txt': (array([ 5000, 10000]),
  array([ 5000, 10000]),
  210000.0),
 'GSE63525_HUVEC_HiCCUPS_looplist.txt': (array([ 5000, 10000, 25000]),
  array([ 5000, 10000, 25000]),
  240000.0),
 'GSE63525_HMEC_HiCCUPS_looplist.txt': (array([ 5000, 10000]),
  array([ 5000, 10000]),
  145000.0),
 'GSE63525_NHEK_HiCCUPS_looplist.txt': (array([ 5000, 10000, 25000]),
  array([ 5000, 10000, 25000]),
  250000.0),
 'GSE63525_KBM7_HiCCUPS_looplist.txt': (array([ 5000, 10000, 25000]),
  array([ 5000, 10000, 25000]),
  250000.0),
 'GSE63525_HeLa_HiCCUPS_looplist.txt': (array([ 5000, 10000, 25000]),
  array([ 5000, 10000, 25000]),
  200000.0)}

In [51]:
dfs = [read_HiCCUPS_file(x, y) for x, y in zip(files_list, cell_types)]


In [38]:
def concat_dfs(dfs: list) -> pd.DataFrame:
    """
    Concatenate a list of dataframes
    """
    df = pd.concat(dfs)
    df = df.sort_values(by=['x'])
    return df

In [40]:
df = concat_dfs(dfs)

In [47]:
def add_labels_and_save(df: pd.DataFrame) -> None:
    """
    Creates as many file as cell types in the dataframe.
    """
    cell_types = df['cell_type'].unique()
    for cell_type in cell_types:
        f = lambda x: 1 if x['cell_type'] == cell_type else 0
        df['label'] = df.apply(f, axis=1)
        df.to_csv('../data/02_intermediate/HiCCUPS_looplist/'+cell_type+'.csv', index=False)

In [48]:
add_labels_and_save(df)

In [11]:
def add_negative_samples(positive_df, negative_df, path, n=50000):
    
    positive_df['centroid1_interval'] = positive_df['centroid1'].apply(lambda x: pd.Interval(x-n, x+n, closed='both'))
    negative_df['centroid1_interval'] = negative_df['centroid1'].apply(lambda x: pd.Interval(x-n, x+n, closed='both'))
    positive_df['centroid2_interval'] = positive_df['centroid2'].apply(lambda x: pd.Interval(x-n, x+n, closed='both'))
    negative_df['centroid2_interval'] = negative_df['centroid2'].apply(lambda x: pd.Interval(x-n, x+n, closed='both'))

    positive_df['label'] = 1
    negative_df['label'] = 0

    final_df = copy.deepcopy(positive_df)

    pos_groups = positive_df.groupby(['chr'])
    neg_groups = negative_df.groupby(['chr'])

    for chr in tqdm(pos_groups.groups.keys()):
        pos_chr = pos_groups.get_group(chr)
        neg_chr = neg_groups.get_group(chr)

        for i in tqdm(range(len(neg_chr))):
             for j in range(len(pos_chr)):
                if pos_chr.iloc[j]['centroid1_interval'].left > neg_chr.iloc[i]['centroid1_interval'].right:
                    break
                
            


    return final_df


In [12]:
df1 = read_HiCCUPS_file('../data/01_raw/HiCCUPS_looplist/'+files_list[0])
df2 = read_HiCCUPS_file('../data/01_raw/HiCCUPS_looplist/'+files_list[1])

df = add_negative_samples(df1, df2, '../data/01_raw/HiCCUPS_looplist/'+files_list[0])
df

  0%|          | 0/6057 [00:00<?, ?it/s]

0


  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  final_df = final_df.append(negative_df.iloc[i])
  1%|          | 46/6057 [01:06<2:24:12,  1.44s/it]


KeyboardInterrupt: 

In [5]:
dataframes = [read_HiCCUPS_file('../data/01_raw/HiCCUPS_looplist/'+file) for file in files_list]

for i in tqdm(range(len(dataframes))):
    positive_df = dataframes[i]
    negative_df = pd.concat(dataframes[:i]+dataframes[i+1:])
    df = add_negative_samples(positive_df, negative_df, '../data/01_raw/HiCCUPS_looplist/'+files_list[i])
    break

df

  0%|          | 0/8 [04:06<?, ?it/s]


: 

: 