Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
284 changes: 168 additions & 116 deletions README.md

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "ModDotPlot"
version = "0.9.5"
version = "0.9.6"
requires-python = ">= 3.7"
dependencies = [
"pysam",
Expand All @@ -21,6 +21,7 @@ dependencies = [
"cairosvg",
"pygenometracks",
"svgutils",
"cooler",
]
authors = [
{name = "Alex Sweeten", email = "alex.sweeten@nih.gov"},
Expand All @@ -30,7 +31,7 @@ authors = [
maintainers = [
{name = "Alex Sweeten", email = "alex.sweeten@nih.gov"}
]
readme = {file = "README.txt", content-type = "text/markdown"}
readme = {file = "README.md", content-type = "text/markdown"}
license = {file = "LICENSE"}
keywords = ["dotplot", "sketching", "modimizer", "heatmap"]

Expand Down
2 changes: 1 addition & 1 deletion src/moddotplot/const.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = "0.9.5"
VERSION = "0.9.6"
COLS = [
"#query_name",
"query_start",
Expand Down
74 changes: 56 additions & 18 deletions src/moddotplot/estimate_identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from palettable import colorbrewer
from typing import List, Set, Dict, Tuple
import mmh3
import pandas as pd
import cooler

from moddotplot.parse_fasta import printProgressBar

Expand Down Expand Up @@ -206,31 +208,67 @@ def convertMatrixToBed(
return bed


def divide_into_chunks(lst: List[int], res: int) -> List[List[int]]:
def convertMatrixToCool(
matrix,
window_size,
id_threshold,
x_name,
y_name,
self_identity,
x_offset,
y_offset,
chromsizes,
output_cool,
):
"""
Divide a list into approximately equal-sized chunks.
Convert a matrix into a .cool file.

Args:
lst (List[int]): The input list to be divided.
res (int): The desired number of result chunks.

Returns:
List[List[int]]: A list of lists, where each inner list contains elements from the input list.
matrix (ndarray): 2D numpy array.
window_size (int): Bin/window size in bp.
id_threshold (float): Percent identity threshold (0-100).
x_name (str): Chromosome name for rows.
y_name (str): Chromosome name for columns.
self_identity (bool): Whether to include self and upper triangle only.
x_offset (int): Genomic offset for x axis (start position).
y_offset (int): Genomic offset for y axis (start position).
chromsizes (dict): Dict of chromosome lengths, e.g. {"chr1": 248956422}.
output_cool (str): Path to save cooler file.
"""
chunk_size = len(lst) // res # Calculate the target chunk size
remainder = len(lst) % res # Calculate the remainder
rows, cols = matrix.shape

# ---- build bin table ----
bins = []
for i in range(rows):
bins.append(
(x_name, i * window_size + x_offset, (i + 1) * window_size + x_offset)
)
for j in range(cols):
bins.append(
(y_name, j * window_size + y_offset, (j + 1) * window_size + y_offset)
)

bins = pd.DataFrame(bins, columns=["chrom", "start", "end"])

# ---- build pixel table ----
pixels = []
for x in range(rows):
for y in range(cols):
value = matrix[x, y]
if (not self_identity) or (self_identity and x <= y):
if value >= id_threshold / 100:
bin1_id = x
bin2_id = rows + y # offset y bins after x bins
pixels.append((bin1_id, bin2_id, float(value)))

chunks = []
start = 0
pixels = pd.DataFrame(pixels, columns=["bin1_id", "bin2_id", "count"])

for i in range(res):
end = (
start + chunk_size + (1 if i < remainder else 0)
) # Adjust chunk size for remainder
chunks.append(lst[start:end])
start = end
# ---- write cooler ----
cooler.create_cooler(output_cool, bins=bins, pixels=pixels, ordered=True)
print(bins)
print(pixels)

return chunks
return output_cool


def binomial_distance(containment_value: float, kmer_value: int) -> float:
Expand Down
112 changes: 98 additions & 14 deletions src/moddotplot/moddotplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
selfContainmentMatrix,
pairwiseContainmentMatrix,
convertMatrixToBed,
convertMatrixToCool,
createSelfMatrix,
createPairwiseMatrix,
partitionOverlaps,
Expand All @@ -27,6 +28,7 @@
import numpy as np
import pickle
import os
import cooler


def get_parser():
Expand Down Expand Up @@ -270,6 +272,10 @@ def get_parser():
help="Order in which sequences appear in the comparative plot. Default is 'sequential': First file on x-axis, second file on y-axis. Another option is 'size': The larger sequence on the x-axis and the smaller on y-axis.",
)

static_parser.add_argument(
"--cooler", action="store_true", help="Output matrix to cooler file."
)

static_parser.add_argument(
"--no-bedpe", action="store_true", help="Skip output of paired-end bed file."
)
Expand Down Expand Up @@ -924,7 +930,7 @@ def main():

# Create output directory, if doesn't exist:
if (args.output_dir) and not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
os.makedirs(args.output_dir, exist_ok=True)
# -----------COMPUTE SELF-IDENTITY PLOTS-----------
if not args.compare_only:
for i in range(len(sequences)):
Expand Down Expand Up @@ -991,14 +997,45 @@ def main():
grid_val_singles.append(bed)
grid_val_single_names.append(seq_name)

if args.cooler:
try:
cooler_path = "."
if not args.output_dir:
cooler_path = os.path.join(cooler_path, seq_name)
else:
cooler_path = os.path.join(args.output_dir, seq_name)
os.makedirs(cooler_path, exist_ok=True)
cooler_output = os.path.join(cooler_path, seq_name + ".cooler")
convertMatrixToCool(
matrix=self_mat,
window_size=win,
id_threshold=args.identity,
x_name=seq_name,
y_name=seq_name,
self_identity=True,
x_offset=seq_start_pos,
y_offset=seq_start_pos,
chromsizes=seq_length,
output_cool=cooler_output,
)
print(
f"Saved self-identity matrix as a cooler file to {cooler_output}\n"
)
except Exception as e:
print(f"Error creating cooler file: {e}")

if not args.no_bedpe:
# Log saving bed file
bedpe_path = "."
if not args.output_dir:
bedfile_output = seq_name + ".bedpe"
bedpe_path = os.path.join(bedpe_path, seq_name)
os.makedirs(bedpe_path, exist_ok=True)
bedfile_output = os.path.join(seq_name, seq_name + ".bedpe")
else:
bedfile_output = os.path.join(
args.output_dir, seq_name + ".bedpe"
)
bedpe_path = os.path.join(args.output_dir, seq_name)
os.makedirs(bedpe_path, exist_ok=True)
bedfile_output = os.path.join(bedpe_path, seq_name + ".bedpe")

with open(bedfile_output, "w") as bedfile:
for row in bed:
bedfile.write("\t".join(map(str, row)) + "\n")
Expand All @@ -1009,7 +1046,7 @@ def main():
if (not args.no_plot) and (not args.grid_only):
create_plots(
sdf=[bed],
directory=args.output_dir if args.output_dir else ".",
directory=bedpe_path,
name_x=seq_name,
name_y=seq_name,
palette=args.palette,
Expand Down Expand Up @@ -1110,12 +1147,48 @@ def main():
f"The pairwise identity matrix for {sequences[i][0]} and {sequences[j][0]} is empty. Skipping.\n"
)
else:
if args.cooler:
try:
cooler_path = "."
if not args.output_dir:
cooler_path = os.path.join(
cooler_path,
f"{larger_seq_name}_{smaller_seq_name}",
)
else:
cooler_path = os.path.join(
args.output_dir,
f"{larger_seq_name}_{smaller_seq_name}",
)
os.makedirs(cooler_path, exist_ok=True)
cooler_output = os.path.join(
cooler_path,
f"{larger_seq_name}_{smaller_seq_name}.cooler",
)
convertMatrixToCool(
matrix=pair_mat,
window_size=win,
id_threshold=args.identity,
x_name=larger_seq_name,
y_name=smaller_seq_name,
self_identity=False,
x_offset=larger_seq_start_pos,
y_offset=smaller_seq_start_pos,
chromsizes=larger_length,
output_cool=cooler_output,
)
print(
f"Saved comparative matrix as a cooler file to {cooler_output}\n"
)
except Exception as e:
print(f"Error creating pairwise cooler file: {e}")
bed = convertMatrixToBed(
pair_mat,
win,
args.identity,
seq_list[i],
seq_list[j],
# check if this is correct
larger_seq_name,
smaller_seq_name,
False,
larger_seq_start_pos,
smaller_seq_start_pos,
Expand All @@ -1128,16 +1201,27 @@ def main():
xlim_val_grid = max(larger_length, xlim_val_grid)
if not args.no_bedpe:
# Log saving bed file
bedpe_path = "."
if not args.output_dir:
bedfile_prefix = (
larger_seq_name + "_" + smaller_seq_name
)
bedpe_path = os.path.join(bedpe_path, bedfile_prefix)
os.makedirs(bedpe_path, exist_ok=True)
bedfile_output = (
larger_seq_name
+ "_"
+ smaller_seq_name
+ "_COMPARE.bedpe"
bedpe_path,
bedfile_prefix + "_COMPARE.bedpe",
)
else:
bedfile_prefix = (
larger_seq_name + "_" + smaller_seq_name
)
bedpe_path = os.path.join(
args.output_dir, bedfile_prefix
)
os.makedirs(bedpe_path, exist_ok=True)
bedfile_output = os.path.join(
args.output_dir,
bedpe_path,
larger_seq_name
+ "_"
+ smaller_seq_name
Expand All @@ -1153,7 +1237,7 @@ def main():
if (not args.no_plot) and (not args.grid_only):
create_plots(
sdf=[bed],
directory=args.output_dir if args.output_dir else ".",
directory=bedpe_path,
name_x=larger_seq_name,
name_y=smaller_seq_name,
palette=args.palette,
Expand Down
Loading