Skip to content

Commit

Permalink
Merge pull request #23 from marbl/develop
Browse files Browse the repository at this point in the history
v0.8.2 release
  • Loading branch information
alexsweeten committed Apr 22, 2024
2 parents c1388eb + 992572b commit c1d945a
Show file tree
Hide file tree
Showing 12 changed files with 114 additions and 59 deletions.
19 changes: 19 additions & 0 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
name: black

# Controls when the action will run.
on:
push:
branches: [main, develop]
pull_request:
branches: [main, develop]

workflow_dispatch:

jobs:
black:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: psf/black@stable
with:
options: ". --check --verbose"
14 changes: 14 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@

@article{Sweeten2024.04.15.589623,
abstract = {Motivation: A common method for analyzing genomic repeats is to produce a sequence similarity matrix visualized via a dot plot. Innovative approaches such as StainedGlass have improved upon this classic visualization by rendering dot plots as a heatmap of sequence identity, enabling researchers to better visualize multi-megabase tandem repeat arrays within centromeres and other heterochromatic regions of the genome. However, computing the similarity estimates for heatmaps requires high computational overhead and can suffer from decreasing accuracy. Results: In this work we introduce ModDotPlot, an interactive and alignment-free dot plot viewer. By approximating average nucleotide identity via a k-mer-based containment index, ModDotPlot produces accurate plots orders of magnitude faster than StainedGlass. We accomplish this through the use of a hierarchical modimizer scheme that can visualize the full 128 Mbp genome of Arabidopsis thaliana in under 5 minutes on a laptop. ModDotPlot is implemented in Python with a graphical user interface supporting real-time interactive navigation of entire chromosomes. Availability and Implementation: ModDotPlot is available at https://github.com/marbl/ModDotPlot. Competing Interest Statement: The authors have declared no competing interest.},
author = {Alexander P. Sweeten and Michael C. Schatz and Adam M. Phillippy},
doi = {10.1101/2024.04.15.589623},
elocation-id = {2024.04.15.589623},
eprint = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623.full.pdf},
journal = {bioRxiv},
publisher = {Cold Spring Harbor Laboratory},
title = {ModDotPlot - Rapid and interactive visualization of complex repeats},
url = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623},
year = {2024},
bdsk-url-1 = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623},
bdsk-url-2 = {https://doi.org/10.1101/2024.04.15.589623}}
10 changes: 6 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ Finally, confirm that the installation was installed correctly by running `moddo
| | | | (_) | (_| | | |__| | (_) | |_ | | | | (_) | |_
|_| |_|\___/ \__,_| |_____/ \___/ \__| |_| |_|\___/ \__|
v0.8.1
v0.8.2
usage: moddotplot [-h] {interactive,static} ...
Expand Down Expand Up @@ -234,7 +234,7 @@ $ moddotplot interactive -f sequences/Chr1_cen.fa
| | | | (_) | (_| | | |__| | (_) | |_ | | | | (_) | |_
|_| |_|\___/ \__,_| |_____/ \___/ \__| |_| |_|\___/ \__|
v0.8.1
v0.8.2
Running ModDotPlot in interactive mode
Expand Down Expand Up @@ -322,7 +322,7 @@ $ moddotplot static -c config/config.json
| | | | (_) | (_| | | |__| | (_) | |_ | | | | (_) | |_
|_| |_|\___/ \__,_| |_____/ \___/ \__| |_| |_|\___/ \__|
v0.8.1
v0.8.2
Running ModDotPlot in static mode
Expand Down Expand Up @@ -387,4 +387,6 @@ For bug reports or general usage questions, please raise a GitHub issue, or emai

## Cite

Publication in progress! (almost there :D)
Alexander P. Sweeten, Michael C. Schatz, Adam M. Phillippy. ModDotPlot - Rapid and interactive visualization of complex repeats.
bioRxiv 2024.04.15.589623; doi: https://doi.org/10.1101/2024.04.15.589623

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "ModDotPlot"
version = "0.8.1"
version = "0.8.2"
requires-python = ">= 3.7"
dependencies = [
"pysam",
Expand Down
1 change: 0 additions & 1 deletion sequences/chr1_cen.fa.fai

This file was deleted.

4 changes: 2 additions & 2 deletions src/moddotplot/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@
import setproctitle

# Set the process title to a custom name
setproctitle.setproctitle('ModDotPlot')
setproctitle.setproctitle("ModDotPlot")

sys.exit(main())
sys.exit(main())
2 changes: 1 addition & 1 deletion src/moddotplot/const.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = "0.8.1"
VERSION = "0.8.2"
COLS = [
"#query_name",
"query_start",
Expand Down
1 change: 0 additions & 1 deletion src/moddotplot/estimate_identity.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,7 +510,6 @@ def findValueInRange(integer: int, range_dict: dict) -> int:
if key[0] >= integer >= key[1]:
return value
return highest_value



def setZoomLevels(axis_length, sparsity_layers):
Expand Down
37 changes: 22 additions & 15 deletions src/moddotplot/interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from moddotplot.estimate_identity import (
getInteractiveColor,
getMatchingColors,
verifyModimizers,
setZoomLevels,
makeDifferencesEqual,
generateDictionaryFromList,
findValueInRange,
Expand Down Expand Up @@ -76,30 +74,39 @@ def run_dash(matrices, metadata, axes, sparsity, identity, port_number, output_d
titles = []
for i in range(len(metadata)):
titles.append(metadata[i]["title"])

# Get zooming thresholds, adjust sparsity respectively.
def halving_sequence(size, start):
    """Return a list that begins with ``start`` and halves at every step.

    The list holds ``size`` entries, except that it always contains at least
    the initial ``start`` value (so ``size`` of 0 or 1 both give ``[start]``).
    Every entry after the first is produced by true division and is
    therefore a float when ``start`` is a plain number.
    """
    halved = [start]
    current = start
    # The first entry is already in place; add the remaining size - 1 halvings.
    for _ in range(size - 1):
        current /= 2
        halved.append(current)
    return halved
#print(current_metadata)
mod_thresholds_list = halving_sequence(len(current_metadata["sparsities"]), current_metadata["x_size"])
#print(mod_thresholds_list)

#print(current_metadata["min_window_size"]* current_metadata["resolution"])
#print(current_metadata["max_window_size"])
numo = round(math.log2(current_metadata['max_window_size']/current_metadata['min_window_size']) + 1)
#print(numo)

# print(current_metadata)
mod_thresholds_list = halving_sequence(
len(current_metadata["sparsities"]), current_metadata["x_size"]
)
# print(mod_thresholds_list)

# print(current_metadata["min_window_size"]* current_metadata["resolution"])
# print(current_metadata["max_window_size"])
numo = round(
math.log2(
current_metadata["max_window_size"] / current_metadata["min_window_size"]
)
+ 1
)
# print(numo)
important = generateDictionaryFromList(mod_thresholds_list)
#print(f"this is imprtant: {important}")
# print(f"this is imprtant: {important}")

main_level = image_pyramid[0]
main_x_axis = axes[0][0]
main_y_axis = axes[0][1]
main_x_axis_np = np.array(main_x_axis)

#TODO: modify value here
# TODO: modify value here
main_x_axis_np += 3000

# Modify text so that hover shows interval format
Expand Down Expand Up @@ -261,9 +268,9 @@ def halving_sequence(size, start):
{"label": f"{title}", "value": f"{title}"}
for title in titles # Iterate over each title
],
value=titles[0]
if titles
else None, # Set default value based on the length of matrices
value=(
titles[0] if titles else None
), # Set default value based on the length of matrices
clearable=False, # Prevent dropdown from clearing values,
style={"width": "300px"},
),
Expand Down
27 changes: 13 additions & 14 deletions src/moddotplot/moddotplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -502,7 +502,7 @@ def main():
isValidFasta(i)
headers = getInputHeaders(i)
if len(headers) > 1:
print(f"File {i} contains multiple fasta entries: \n")
print(f"File {i} contains multiple fasta entries. \n")
counter = 1
for j in headers:
counter += 1
Expand Down Expand Up @@ -541,9 +541,7 @@ def main():
max_window_size = math.ceil(hgi / args.resolution)
# If the sequence is too small, report an error.
if max_window_size < 10:
print(
f"Error: sequence too small for analysis.\n"
)
print(f"Error: sequence too small for analysis.\n")
print(
f"ModDotPlot requires a minimum window size of 10. Sequences less than 10Kbp will not work with ModDotPlot under normal resolution. We recommend rerunning ModDotPlot with --r {math.ceil(hgi / 10)}.\n"
)
Expand Down Expand Up @@ -772,10 +770,11 @@ def main():
pickle.dump(metadata, f)
# Check if no plot arg is used
if args.no_plot:
print(f"Saved matrices to {folder_path}. Thank you for using ModDotPlot!\n")
print(
f"Saved matrices to {folder_path}. Thank you for using ModDotPlot!\n"
)
sys.exit(0)


# Before running dash, change into intervals...
axes = []
for matrices_set, meta in zip(matrices, metadata):
Expand All @@ -799,8 +798,8 @@ def main():
# -----------SETUP STATIC MODE-----------
elif args.command == "static":
# -----------SET SPARSITY VALUE-----------
# TODO: this is not sorting correctly
sequences = list(zip(seq_list, k_list))
sequences.sort(key=lambda x: len(x[1]), reverse=True)

# Create output directory, if doesn't exist:
if (args.output_dir) and not os.path.exists(args.output_dir):
Expand All @@ -816,10 +815,12 @@ def main():
res = math.ceil(seq_length / args.window)
else:
win = math.ceil(seq_length / args.resolution)

if win < args.modimizer:
raise ValueError(
args.modimizer = win
"""raise ValueError(
"Window size must be greater than or equal to the modimizer sketch size"
)
)"""

seq_sparsity = round(win / args.modimizer)
if seq_sparsity <= args.modimizer:
Expand All @@ -845,7 +846,7 @@ def main():
# print(f"\tSparsity value s: {seq_sparsity}\n")
print(f"\tSequence length n: {len(k_list[i]) + args.kmer - 1}\n")
print(f"\tWindow size w: {win}\n")
print(f"\tModimizer sketch value: {expectation}\n")
print(f"\tModimizer sketch size: {expectation}\n")
print(f"\tPlot Resolution r: {res}\n")
self_mat = createSelfMatrix(
seq_length,
Expand Down Expand Up @@ -905,9 +906,7 @@ def main():
else:
win = math.ceil(len(sequences[0][1]) / args.resolution)
if win < args.modimizer:
raise ValueError(
"Window size must be greater than or equal to the modimizer sketch size"
)
args.modimizer = win

seq_sparsity = round(win / args.modimizer)
if seq_sparsity <= args.modimizer:
Expand Down Expand Up @@ -935,7 +934,7 @@ def main():
f"\tSequence length {sequences[j][0]}: {smaller_length + args.kmer - 1}\n"
)
print(f"\tWindow size w: {win}\n")
print(f"\tModimizer sketch value: {expectation}\n")
print(f"\tModimizer sketch size: {expectation}\n")
print(f"\tPlot Resolution r: {res}\n")

pair_mat = createPairwiseMatrix(
Expand Down
36 changes: 24 additions & 12 deletions src/moddotplot/parse_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,37 @@

tab_b = bytes.maketrans(b"ACTG", b"TGAC")


def generateKmersFromFasta(seq: Sequence[str], k: int, quiet: bool) -> Iterable[int]:
    """Yield one strand-independent hash per k-mer of ``seq``.

    For every window ``seq[i : i + k]`` the k-mer and its reverse complement
    are both hashed with ``mmh3.hash`` and the smaller of the two hashes is
    yielded, so a k-mer and its reverse complement produce the same value.

    Args:
        seq: The sequence to slide over.
        k: Length of each k-mer.
        quiet: When false, a progress bar is drawn via ``printProgressBar``
            while the k-mers are generated.

    Yields:
        ``min(hash(kmer), hash(reverse_complement(kmer)))`` for each of the
        ``len(seq) - k + 1`` windows, in order.
    """
    n = len(seq)
    total = n - k + 1
    if not quiet:
        # Redraw the bar roughly 77 times over the whole run. Clamp to 1:
        # round(n / 77) is 0 for sequences of 38 characters or fewer, which
        # made the modulo below raise ZeroDivisionError.
        progress_thresholds = max(1, round(n / 77))
        printProgressBar(0, total, prefix="Progress:", suffix="Complete", length=40)

    for i in range(total):
        if not quiet:
            if i % progress_thresholds == 0:
                printProgressBar(
                    i, total, prefix="Progress:", suffix="Complete", length=40
                )
            if i == n - k:
                # Last window: draw the bar at 100%.
                printProgressBar(
                    total,
                    total,
                    prefix="Progress:",
                    suffix="Completed",
                    length=40,
                )

        kmer = seq[i : i + k]
        fh = mmh3.hash(kmer)

        # Reverse the k-mer and complement it through the module-level
        # translation table (A<->T, C<->G; other characters are unchanged).
        rc = mmh3.hash(kmer[::-1].translate(tab_b))

        yield fh if fh < rc else rc


def isValidFasta(file_path):
try:
with open(file_path, "r") as file:
Expand All @@ -52,6 +62,7 @@ def isValidFasta(file_path):
print(f"An error occurred: {str(e)}")
sys.exit(6)


def extractFiles(folder_path):
# Check to see at least one compressed numpy matrix, and one metadata pickle are included
metadata = []
Expand All @@ -61,15 +72,15 @@ def extractFiles(folder_path):
file_path = os.path.join(folder_path, filename) # Full path to the file
if filename.endswith(".pkl"):
with open(file_path, "rb") as f:
metadata = pickle.load(f) # Append loaded data to the metadata list
metadata = pickle.load(f) # Append loaded data to the metadata list

for filename in os.listdir(folder_path):
file_path = os.path.join(folder_path, filename)
if filename.endswith(".npz"):
pattern = rf'_(\d+)\.npz' # Using f-string to include the value of i in the regex pattern
pattern = rf"_(\d+)\.npz" # Using f-string to include the value of i in the regex pattern
tmp2 = re.split(pattern, filename, maxsplit=1)
ff = np.load(file_path,allow_pickle=True)
tmp.append((tmp2[0],tmp2[1], ff))
ff = np.load(file_path, allow_pickle=True)
tmp.append((tmp2[0], tmp2[1], ff))
sorted_list = sorted(tmp, key=lambda x: (x[0], x[1]))

unique_lists = {}
Expand All @@ -84,15 +95,16 @@ def extractFiles(folder_path):

# Convert dictionary values to lists
result_lists = list(unique_lists.values())
sorted_result_lists = [lst for title in metadata for lst in result_lists if lst[0][0] == title['title']]
sorted_result_lists = [
lst for title in metadata for lst in result_lists if lst[0][0] == title["title"]
]
for unique_list in sorted_result_lists:
matrices.append([])
for val in unique_list:
matrices[-1].append(val[-1]["data"])
return matrices, metadata



def printProgressBar(
iteration,
total,
Expand Down
Loading

0 comments on commit c1d945a

Please sign in to comment.