From bd411f3699915572246a3ee7af3b6838784a9eeb Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Wed, 15 Jan 2025 19:40:23 -0800 Subject: [PATCH 1/7] adding few functions for right and left legs collision --- .../looptools-checkpoint.py | 38 +++++++++++++++++++ looplib/looptools.py | 38 +++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/looplib/.ipynb_checkpoints/looptools-checkpoint.py b/looplib/.ipynb_checkpoints/looptools-checkpoint.py index e926953..299775b 100644 --- a/looplib/.ipynb_checkpoints/looptools-checkpoint.py +++ b/looplib/.ipynb_checkpoints/looptools-checkpoint.py @@ -5,6 +5,7 @@ import networkx as nx import collections +from scipy.spatial import distance #import pyximport; pyximport.install( # setup_args={"include_dirs":np.get_include()}, @@ -294,3 +295,40 @@ def calc_percolation(LEF_array, r=1, tol=.01): return len(clusters[0]) / float(num_LEFs) + +def find_nearby_legs(lefs): + """ + Finds pairs of left and right leg sites with a distance of 1 between them. + + Parameters: + lefs (ndarray): A 2D array where the first column contains left leg sites + and the second column contains right leg sites. + + Returns: + ndarray: A flattened array of site pairs that have a distance of 1. + """ + l_legs=lefs[:,0] + r_legs = lefs[:,1] + l_legs_2d = l_legs[:, np.newaxis] + r_legs_2d = r_legs[:, np.newaxis] + + distances = distance.cdist(r_legs_2d, l_legs_2d) + indices = np.argwhere(distances ==1) + result = [(r_legs[i], l_legs[j]) for i, j in indices] + return np.array(result).flatten() + +def collision_at_barriers(lefs, ctcf_list): + """ + Filters collision sites that overlap with specific CTCF sites. + + Parameters: + lefs (ndarray): A 2D array where the first column contains left leg sites + and the second column contains right leg sites. + ctcf_list (ndarray): A 1D array of CTCF sites to filter against. + + Returns: + ndarray: An array of collision sites that match the provided CTCF sites. + """ + col_sites = find_nearby_legs(lefs) + col_at_ctcf = np.intersect1d(col_sites, ctcf_list) + return col_at_ctcf diff --git a/looplib/looptools.py b/looplib/looptools.py index e926953..299775b 100644 --- a/looplib/looptools.py +++ b/looplib/looptools.py @@ -5,6 +5,7 @@ import networkx as nx import collections +from scipy.spatial import distance #import pyximport; pyximport.install( # setup_args={"include_dirs":np.get_include()}, @@ -294,3 +295,40 @@ def calc_percolation(LEF_array, r=1, tol=.01): return len(clusters[0]) / float(num_LEFs) + +def find_nearby_legs(lefs): + """ + Finds pairs of left and right leg sites with a distance of 1 between them. + + Parameters: + lefs (ndarray): A 2D array where the first column contains left leg sites + and the second column contains right leg sites. + + Returns: + ndarray: A flattened array of site pairs that have a distance of 1. + """ + l_legs=lefs[:,0] + r_legs = lefs[:,1] + l_legs_2d = l_legs[:, np.newaxis] + r_legs_2d = r_legs[:, np.newaxis] + + distances = distance.cdist(r_legs_2d, l_legs_2d) + indices = np.argwhere(distances ==1) + result = [(r_legs[i], l_legs[j]) for i, j in indices] + return np.array(result).flatten() + +def collision_at_barriers(lefs, ctcf_list): + """ + Filters collision sites that overlap with specific CTCF sites. + + Parameters: + lefs (ndarray): A 2D array where the first column contains left leg sites + and the second column contains right leg sites. + ctcf_list (ndarray): A 1D array of CTCF sites to filter against. + + Returns: + ndarray: An array of collision sites that match the provided CTCF sites. + """ + col_sites = find_nearby_legs(lefs) + col_at_ctcf = np.intersect1d(col_sites, ctcf_list) + return col_at_ctcf From 3ebff22bc29aaa25332e06e1d177fc8dcb52cfac Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Sun, 9 Feb 2025 12:49:33 -0800 Subject: [PATCH 2/7] Remove .ipynb_checkpoints from tracking --- .../looptools-checkpoint.py | 334 ------------------ 1 file changed, 334 deletions(-) delete mode 100644 looplib/.ipynb_checkpoints/looptools-checkpoint.py diff --git a/looplib/.ipynb_checkpoints/looptools-checkpoint.py b/looplib/.ipynb_checkpoints/looptools-checkpoint.py deleted file mode 100644 index 299775b..0000000 --- a/looplib/.ipynb_checkpoints/looptools-checkpoint.py +++ /dev/null @@ -1,334 +0,0 @@ -from __future__ import division, print_function -from scipy.spatial import KDTree - -import numpy as np -import networkx as nx - -import collections -from scipy.spatial import distance - -#import pyximport; pyximport.install( -# setup_args={"include_dirs":np.get_include()}, -# reload_support=True) - -from .looptools_c import get_parent_loops, get_stationary_loops - - -def convert_loops_to_sites(loops, r_sites=None): - """ - Convert a list of loops defined by tuples (left_site, right_side) into - two np.arrays left_sites, right_sites. - """ - if r_sites is None: - if (issubclass(type(loops), list) - and issubclass(type(loops[0]), tuple)): - l_sites, r_sites = np.array([i for i,j in loops]), np.array([j for i,j in loops]) - elif (issubclass(type(loops), np.ndarray) and (loops.ndim == 2)): - if (loops.shape[0] == 2): - return loops[0], loops[1] - elif (loops.shape[1] == 2): - return loops[:,0], loops[:,1] - else: - raise Exception('Unknown format of loop array') - else: - raise Exception('Unknown format of loop array') - else: - assert ((issubclass(type(loops), list) - or issubclass(type(loops), np.ndarray)) - and (issubclass(type(r_sites), list) - or issubclass(type(r_sites), np.ndarray))) - l_sites = np.array(loops, copy=False) - r_sites = np.array(r_sites, copy=False) - - return l_sites, r_sites - - -def get_roots(loops): - """Return the indices of root loops (i.e. loops not nested into - other loops). - """ - - l_sites, r_sites = convert_loops_to_sites(loops) - - try: - parent_loops = get_parent_loops(l_sites, r_sites) - except Exception as e: - print('Cannot find root loops: ', e.message) - return [] - - return (parent_loops == -1) - - -def get_root_births_deaths(prev_lsites, new_lsites, delta): - """ - Estimate the number of root loops that divided or died between the two time - frames. This number is estimated by finding groups of root loops with - stationary outer borders and measuring the change in the number of roots - within these groups. - """ - sync_boundaries = get_stationary_loops( - np.sort(prev_lsites), np.sort(new_lsites), delta) - births, deaths = 0,0 - for i,j in zip(sync_boundaries[1:], sync_boundaries[:-1]): - n_loop_change = i[0] - j[0] - i[1] + j[1] - births += max(0, n_loop_change) - deaths += max(0, -n_loop_change) - - return births, deaths - - -def avg_num_branching_points(parents): - sameparent = (parents[:, None] == parents[None, :]) - np.fill_diagonal(sameparent, False) - numsisters = (sameparent.sum(axis=1)).astype('float') - numsisters[parents == -1] = 0 # root loops don't have sisters - # normalize to the number of sisters - numsisters[numsisters != -1] /= (numsisters[numsisters != -1] + 1.0) - numbranches = numsisters.sum() - # normalize to the number of roots - avgnumbranches = numbranches / float((parents == -1).sum()) - return avgnumbranches - - -def get_loop_branches(parents, loops=None): - '''Get the list of list of daughter loops. If `loops` is provided, - sort the daughter loops according to their position along the loop. - ''' - nloops = len(parents) - children = [np.where(parents==i)[0] for i in range(nloops)] - if not (loops is None): - for i in range(nloops): - children[i] = children[i][np.argsort(loops[:,0][children[i]])] - return children - - -def stack_lefs(loops): - """Identify groups of stacked LEFs (i.e. tightly nested LEFs) - """ - - l_sites, r_sites = convert_loops_to_sites(loops) - - order = np.argsort(l_sites) - n_lefs = np.ones(l_sites.size) - parent_i, i = 0,0 - while True: - if ((l_sites[order[i+1]]==l_sites[order[i]]+1) - and (r_sites[order[i+1]]==r_sites[order[i]]-1)): - n_lefs[order[i+1]] -= 1 - n_lefs[order[parent_i]] += 1 - else: - parent_i = i+1 - i += 1 - if i >= l_sites.size-1: - break - return n_lefs - - -def get_backbone(loops, rootsMask=None, N=None, include_tails=True): - """Find the positions between the root loops aka the backbone. - """ - backboneidxs = [] - if rootsMask is None: - rootsMask = get_roots(loops) - - rootsSorted = np.where(rootsMask)[0][np.argsort(loops[:,0][rootsMask])] - - if include_tails and (N is None): - raise Exception('If you want to include tails, please specify the length of the chain') - - for i in range(len(rootsSorted)-1): - backboneidxs.append( - np.arange(loops[:,1][rootsSorted[i]], - loops[:,0][rootsSorted[i+1]]+1, - dtype=np.int)) - - if include_tails: - backboneidxs.insert(0, - np.arange(0, loops[:,0][rootsSorted[0]] + 1, dtype=np.int)) - backboneidxs.append( - np.arange(loops[:,1][rootsSorted[-1]], N, dtype=np.int)) - - backboneidxs = np.concatenate(backboneidxs) - return backboneidxs - - -def get_n_leafs(idx, children): - if isinstance(idx, collections.Iterable): - return np.array([get_n_leafs(i, children) for i in idx]) - else: - if len(children[idx])==0: - return 1 - else: - return sum([get_n_leafs(child, children) - for child in children[idx]]) - - - -def expand_boundary_positions(boundary_positions, offsets=[0]): - """ - Expand boundary_positions by offsets. - - Args: - boundary_positions : np.array - List of boundary locations. - offsets : np.array (optional) - List of window sizes. Defaults to [0], the positon of the boundaries. - For symmetric windows around boundaries, use np.arange(-window_size, (window_size) + 1) - - Returns: - np.ndarray: Array containing peak positions. - """ - expanded_positions = np.array([]) - - for offset in offsets: - inds_to_add = [boundary + offset for boundary in boundary_positions] - expanded_positions = np.hstack((expanded_positions, inds_to_add)) - - return expanded_positions.astype(int) - - - - -def FRiP(number_of_sites, lef_positions, boundary_positions): - """ - Calculate the Fraction of Reads in Peaks (FRiP). - - FRiP is calculated as the sum of reads falling into peak positions - divided by the total number of LEF positions. - - Args: - number_of_sites (int): The total number of sites. - extruders_positions (list): List of LEF positions. - peak_positions (list): List of peak positions. - - Returns: - float: The Fraction of Reads in Peaks (FRiP). - """ - - hist, bin_edges = np.histogram(lef_positions, np.arange(number_of_sites + 1)) - return np.sum(hist[boundary_positions]) / len(lef_positions) - - -def calc_coverage(LEF_array, L): - """ - Calculate the average coverage (fraction of polymer covered by at least one loop) - - Args: - LEF_array (ndarray): Nx2 numpy array of extruder positions - L (int): Total polymer length. - - Returns: - float: Loop coverage. - """ - - coverage = np.zeros(int(L)) - - for p in LEF_array: - coverage[p[0]:p[1]+1] = 1 - - return coverage.sum() / float(L) - - -def _get_collided_pairs(LEF_array, r=1, tol=.01): - """ - Locate LEF pairs with leg-leg separation distance of r sites or less - - Args: - LEF_array (np.ndarray): Nx2 numpy array of extruder positions - r (int): (1D) distance cutoff - tol (float): tolerance parameter for distance calculations - - Returns: - np.ndarray: Mx2 array of collided extruder indices - """ - - tree = KDTree(LEF_array.reshape(-1,1)) - pairs = tree.query_pairs(r=r+tol, output_type='ndarray') // 2 - - distinct_pairs = np.diff(pairs).astype(bool) - collided_pairs = pairs[distinct_pairs.flatten()] - - return collided_pairs - - -def calc_collisions(LEF_array, r=1, tol=.01): - """ - Calculate the number of collided LEFs (with at least one leg adjacent to another LEF) - - Args: - LEF_array (np.ndarray): Nx2 numpy array of extruder positions - r (int): (1D) distance cutoff - tol (float): tolerance parameter for distance calculations - - Returns: - float: Collided extruder fraction - """ - - num_LEFs = LEF_array.shape[0] - - pairs = _get_collided_pairs(LEF_array, r, tol) - collided_LEFs = np.unique(pairs.flatten()) - - return len(collided_LEFs) / float(num_LEFs) - - -def calc_percolation(LEF_array, r=1, tol=.01): - """ - Calculate the size of the largest collided LEF cluster - - Args: - LEF_array (np.ndarray): Nx2 numpy array of extruder positions - r (int): (1D) distance cutoff - tol (float): tolerance parameter for distance calculations - - Returns: - float: Fraction of extruders comprising the largest collided cluster - """ - - num_LEFs = LEF_array.shape[0] - - pairs = _get_collided_pairs(LEF_array, r, tol) - graph = nx.Graph(pairs.tolist()) - - clusters = nx.connected_components(graph) - clusters = sorted(clusters, key=len, reverse=True) - - return len(clusters[0]) / float(num_LEFs) - - -def find_nearby_legs(lefs): - """ - Finds pairs of left and right leg sites with a distance of 1 between them. - - Parameters: - lefs (ndarray): A 2D array where the first column contains left leg sites - and the second column contains right leg sites. - - Returns: - ndarray: A flattened array of site pairs that have a distance of 1. - """ - l_legs=lefs[:,0] - r_legs = lefs[:,1] - l_legs_2d = l_legs[:, np.newaxis] - r_legs_2d = r_legs[:, np.newaxis] - - distances = distance.cdist(r_legs_2d, l_legs_2d) - indices = np.argwhere(distances ==1) - result = [(r_legs[i], l_legs[j]) for i, j in indices] - return np.array(result).flatten() - -def collision_at_barriers(lefs, ctcf_list): - """ - Filters collision sites that overlap with specific CTCF sites. - - Parameters: - lefs (ndarray): A 2D array where the first column contains left leg sites - and the second column contains right leg sites. - ctcf_list (ndarray): A 1D array of CTCF sites to filter against. - - Returns: - ndarray: An array of collision sites that match the provided CTCF sites. - """ - col_sites = find_nearby_legs(lefs) - col_at_ctcf = np.intersect1d(col_sites, ctcf_list) - return col_at_ctcf From f9596abed8f36a67bf897c7bcf557b8e2cfddf00 Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Sun, 9 Feb 2025 12:55:29 -0800 Subject: [PATCH 3/7] Ensure .ipynb_checkpoints is ignored --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 92aa1e2..a3b191c 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ dist/ *.backup looplib.egg-info/* .ipynb_checkpoints/* +.ipynb_checkpoints/ From eb0e28aa77708482da2e05951aeccb33e6313ec5 Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Sun, 9 Feb 2025 14:13:10 -0800 Subject: [PATCH 4/7] adding function for finding convergent pairs between barriers --- looplib/looptools.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/looplib/looptools.py b/looplib/looptools.py index 299775b..43ecf9f 100644 --- a/looplib/looptools.py +++ b/looplib/looptools.py @@ -332,3 +332,37 @@ def collision_at_barriers(lefs, ctcf_list): col_sites = find_nearby_legs(lefs) col_at_ctcf = np.intersect1d(col_sites, ctcf_list) return col_at_ctcf + +def find_convergent_pairs(ctcf_right, ctcf_left, elements_between_pairs): + """ + Finds pairs of convergent CTCF binding sites with exactly `elements_between_pairs` barrier elements between them. + + Parameters: + ---------- + ctcf_right : list of int + List of positions for CTCF binding sites on the right (right-facing CTCF sites). + ctcf_left : list of int + List of positions for CTCF binding sites on the left (left-facing CTCF sites). + elements_between_pairs : int + The number of barrier elements between the convergent CTCF pairs. + For example, `elements_between_pairs=1` finds directly connected CTCF sites, + `elements_between_pairs=2` finds CTCF sites with one intervening barrier element between them, etc. + + Returns: + ------- + list of list of int + A list of pairs of convergent CTCF sites, where each pair consists of one + left-facing CTCF site and one right-facing CTCF site, separated by exactly `elements_between_pairs` barriers. + """ + # Combine and sort CTCF positions from both directions + ctot = np.sort(np.array(ctcf_right + ctcf_left)) + + # Find pairs of convergent CTCF elements with exactly `elements_between_pairs` barriers between them + convergent_pairs = [ + [ctot[i], ctot[i + elements_between_pairs]] + for i in range(len(ctot) - elements_between_pairs) + if ctot[i] in ctcf_left and ctot[i + elements_between_pairs] in ctcf_right + ] + + return convergent_pairs + From c8e6dc68f0aa365544b0742ae2b7d2fc7321bea9 Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Sun, 9 Feb 2025 18:32:25 -0800 Subject: [PATCH 5/7] adding a new file to include functions that relies on barriers, including FRiP --- looplib/barriertools.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 looplib/barriertools.py diff --git a/looplib/barriertools.py b/looplib/barriertools.py new file mode 100644 index 0000000..dd734be --- /dev/null +++ b/looplib/barriertools.py @@ -0,0 +1,18 @@ +def peak_positions(boundary_list, window_sizes=[1]): + """ + Calculate peak positions based on a boundary_list within window_sizes. + + Args: + boundary_list (list): List of boundary values. + window_sizes (list, optional): List of window sizes. Defaults to [1]. + + Returns: + np.ndarray: Array containing peak positions. + """ + peak_monomers = np.array([]) + + for i in window_sizes: + inds_to_add = [boundary + i for boundary in boundary_list] + peak_monomers = np.hstack((peak_monomers, inds_to_add)) + + return peak_monomers.astype(int) From 8f00e9cd306c11abe9e7d64832a0a13040782f74 Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Sun, 9 Feb 2025 18:37:43 -0800 Subject: [PATCH 6/7] adding a new file to include functions that relies on barriers, including FRiP --- looplib/barriertools.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/looplib/barriertools.py b/looplib/barriertools.py index dd734be..b9e6aa0 100644 --- a/looplib/barriertools.py +++ b/looplib/barriertools.py @@ -16,3 +16,28 @@ def peak_positions(boundary_list, window_sizes=[1]): peak_monomers = np.hstack((peak_monomers, inds_to_add)) return peak_monomers.astype(int) + + +def FRiP(num_sites_t, lef_positions, peak_positions): + """ + Calculate the Fraction of Reads in Peaks (FRiP) score. + + The FRiP score is a measure of how many loop-extruding factor (LEF) positions + fall within predefined peak regions, relative to the total number of LEF positions. + + Parameters: + ----------- + num_sites_t : int + Total number of genomic sites. + lef_positions : array-like + Positions of loop extruding factors (LEFs) along the genome. + peak_positions : array-like + Indices corresponding to peak regions (CTCFs). + + Returns: + -------- + float + The fraction of LEF positions that fall within peak regions. + """ + hist, edges = np.histogram(lef_positions, np.arange(num_sites_t + 1)) + return np.sum(hist[peak_positions]) / len(lef_positions) \ No newline at end of file From 44c896c9b04c903b175a71c9ba38a3d4ad569fb4 Mon Sep 17 00:00:00 2001 From: Hadi Rahmaninejad Date: Sun, 9 Feb 2025 19:05:51 -0800 Subject: [PATCH 7/7] adding a new file to include functions that relies on barriers, including FRiP --- looplib/barriertools.py | 36 +++++++++++++++++++++++++++++++++++- looplib/looptools.py | 31 ------------------------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/looplib/barriertools.py b/looplib/barriertools.py index b9e6aa0..c590d53 100644 --- a/looplib/barriertools.py +++ b/looplib/barriertools.py @@ -40,4 +40,38 @@ def FRiP(num_sites_t, lef_positions, peak_positions): The fraction of LEF positions that fall within peak regions. """ hist, edges = np.histogram(lef_positions, np.arange(num_sites_t + 1)) - return np.sum(hist[peak_positions]) / len(lef_positions) \ No newline at end of file + return np.sum(hist[peak_positions]) / len(lef_positions) + + +def find_convergent_pairs(ctcf_right, ctcf_left, elements_between_pairs): + """ + Finds pairs of convergent CTCF binding sites with exactly `elements_between_pairs` barrier elements between them. + + Parameters: + ---------- + ctcf_right : list of int + List of positions for CTCF binding sites on the right (right-facing CTCF sites). + ctcf_left : list of int + List of positions for CTCF binding sites on the left (left-facing CTCF sites). + elements_between_pairs : int + The number of barrier elements between the convergent CTCF pairs. + For example, `elements_between_pairs=1` finds directly connected CTCF sites, + `elements_between_pairs=2` finds CTCF sites with one intervening barrier element between them, etc. + + Returns: + ------- + list of list of int + A list of pairs of convergent CTCF sites, where each pair consists of one + left-facing CTCF site and one right-facing CTCF site, separated by exactly `elements_between_pairs` barriers. + """ + # Combine and sort CTCF positions from both directions + ctot = np.sort(np.array(ctcf_right + ctcf_left)) + + # Find pairs of convergent CTCF elements with exactly `elements_between_pairs` barriers between them + convergent_pairs = [ + [ctot[i], ctot[i + elements_between_pairs]] + for i in range(len(ctot) - elements_between_pairs) + if ctot[i] in ctcf_left and ctot[i + elements_between_pairs] in ctcf_right + ] + + return convergent_pairs \ No newline at end of file diff --git a/looplib/looptools.py b/looplib/looptools.py index 43ecf9f..89f547e 100644 --- a/looplib/looptools.py +++ b/looplib/looptools.py @@ -333,36 +333,5 @@ def collision_at_barriers(lefs, ctcf_list): col_at_ctcf = np.intersect1d(col_sites, ctcf_list) return col_at_ctcf -def find_convergent_pairs(ctcf_right, ctcf_left, elements_between_pairs): - """ - Finds pairs of convergent CTCF binding sites with exactly `elements_between_pairs` barrier elements between them. - - Parameters: - ---------- - ctcf_right : list of int - List of positions for CTCF binding sites on the right (right-facing CTCF sites). - ctcf_left : list of int - List of positions for CTCF binding sites on the left (left-facing CTCF sites). - elements_between_pairs : int - The number of barrier elements between the convergent CTCF pairs. - For example, `elements_between_pairs=1` finds directly connected CTCF sites, - `elements_between_pairs=2` finds CTCF sites with one intervening barrier element between them, etc. - Returns: - ------- - list of list of int - A list of pairs of convergent CTCF sites, where each pair consists of one - left-facing CTCF site and one right-facing CTCF site, separated by exactly `elements_between_pairs` barriers. - """ - # Combine and sort CTCF positions from both directions - ctot = np.sort(np.array(ctcf_right + ctcf_left)) - - # Find pairs of convergent CTCF elements with exactly `elements_between_pairs` barriers between them - convergent_pairs = [ - [ctot[i], ctot[i + elements_between_pairs]] - for i in range(len(ctot) - elements_between_pairs) - if ctot[i] in ctcf_left and ctot[i + elements_between_pairs] in ctcf_right - ] - - return convergent_pairs