Merge pull request #23 from marbl/develop

v0.8.2 release
marbl · Apr 22, 2024 · c1d945a · c1d945a
2 parents c1388eb + 992572b
commit c1d945a
Show file tree

Hide file tree

Showing 12 changed files with 114 additions and 59 deletions.
diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
@@ -0,0 +1,19 @@
+name: black
+
+# Controls when the action will run.
+on:
+  push:
+    branches: [main, develop]
+  pull_request:
+    branches: [main, develop]
+
+  workflow_dispatch:
+
+jobs:
+  black:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - uses: psf/black@stable
+        with:
+          options: ". --check --verbose"
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,14 @@
+
+@article{Sweeten2024.04.15.589623,
+	abstract = {Motivation: A common method for analyzing genomic repeats is to produce a sequence similarity matrix visualized via a dot plot. Innovative approaches such as StainedGlass have improved upon this classic visualization by rendering dot plots as a heatmap of sequence identity, enabling researchers to better visualize multi-megabase tandem repeat arrays within centromeres and other heterochromatic regions of the genome. However, computing the similarity estimates for heatmaps requires high computational overhead and can suffer from decreasing accuracy. Results: In this work we introduce ModDotPlot, an interactive and alignment-free dot plot viewer. By approximating average nucleotide identity via a k-mer-based containment index, ModDotPlot produces accurate plots orders of magnitude faster than StainedGlass. We accomplish this through the use of a hierarchical modimizer scheme that can visualize the full 128 Mbp genome of Arabidopsis thaliana in under 5 minutes on a laptop. ModDotPlot is implemented in Python with a graphical user interface supporting real-time interactive navigation of entire chromosomes. Availability and Implementation: ModDotPlot is available at https://github.com/marbl/ModDotPlot. Competing Interest Statement: The authors have declared no competing interest.},
+	author = {Alexander P. Sweeten and Michael C. Schatz and Adam M. Phillippy},
+	doi = {10.1101/2024.04.15.589623},
+	elocation-id = {2024.04.15.589623},
+	eprint = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623.full.pdf},
+	journal = {bioRxiv},
+	publisher = {Cold Spring Harbor Laboratory},
+	title = {ModDotPlot - Rapid and interactive visualization of complex repeats},
+	url = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623},
+	year = {2024},
+	bdsk-url-1 = {https://www.biorxiv.org/content/early/2024/04/19/2024.04.15.589623},
+	bdsk-url-2 = {https://doi.org/10.1101/2024.04.15.589623}}
diff --git a/README.md b/README.md
@@ -53,7 +53,7 @@ Finally, confirm that the installation was installed correctly by running `moddo
  | |  | | (_) | (_| | | |__| | (_) | |_  | |    | | (_) | |_ 
  |_|  |_|\___/ \__,_| |_____/ \___/ \__| |_|    |_|\___/ \__|
 
-v0.8.1
+v0.8.2
 
 usage: moddotplot [-h] {interactive,static} ...
 
@@ -234,7 +234,7 @@ $ moddotplot interactive -f sequences/Chr1_cen.fa
  | |  | | (_) | (_| | | |__| | (_) | |_  | |    | | (_) | |_ 
  |_|  |_|\___/ \__,_| |_____/ \___/ \__| |_|    |_|\___/ \__|
 
-v0.8.1
+v0.8.2
 
 Running ModDotPlot in interactive mode
 
@@ -322,7 +322,7 @@ $ moddotplot static -c config/config.json
  | |  | | (_) | (_| | | |__| | (_) | |_  | |    | | (_) | |_ 
  |_|  |_|\___/ \__,_| |_____/ \___/ \__| |_|    |_|\___/ \__|
 
-v0.8.1
+v0.8.2
 
 Running ModDotPlot in static mode
 
@@ -387,4 +387,6 @@ For bug reports or general usage questions, please raise a GitHub issue, or emai
 
 ## Cite
 
-Publication in progress! (almost there :D)
+Alexander P. Sweeten, Michael C. Schatz, Adam M. Phillippy, ModDotPlot - Rapid and interactive visualization of complex repeats
+bioRxiv 2024.04.15.589623; doi: https://doi.org/10.1101/2024.04.15.589623
+
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "ModDotPlot"
-version = "0.8.1"
+version = "0.8.2"
 requires-python = ">= 3.7"
 dependencies = [
   "pysam",

diff --git a/sequences/chr1_cen.fa.fai b/sequences/chr1_cen.fa.fai
diff --git a/src/moddotplot/__main__.py b/src/moddotplot/__main__.py
@@ -6,6 +6,6 @@
 import setproctitle
 
 # Set the process title to a custom name
-setproctitle.setproctitle('ModDotPlot')
+setproctitle.setproctitle("ModDotPlot")
 
-sys.exit(main())
+sys.exit(main())
diff --git a/src/moddotplot/const.py b/src/moddotplot/const.py
@@ -1,4 +1,4 @@
-VERSION = "0.8.1"
+VERSION = "0.8.2"
 COLS = [
     "#query_name",
     "query_start",

diff --git a/src/moddotplot/estimate_identity.py b/src/moddotplot/estimate_identity.py
@@ -510,7 +510,6 @@ def findValueInRange(integer: int, range_dict: dict) -> int:
         if key[0] >= integer >= key[1]:
             return value
     return highest_value
-
 
 
 def setZoomLevels(axis_length, sparsity_layers):

diff --git a/src/moddotplot/interactive.py b/src/moddotplot/interactive.py
@@ -2,8 +2,6 @@
 from moddotplot.estimate_identity import (
     getInteractiveColor,
     getMatchingColors,
-    verifyModimizers,
-    setZoomLevels,
     makeDifferencesEqual,
     generateDictionaryFromList,
     findValueInRange,
@@ -76,30 +74,39 @@ def run_dash(matrices, metadata, axes, sparsity, identity, port_number, output_d
     titles = []
     for i in range(len(metadata)):
         titles.append(metadata[i]["title"])
+
     # Get zooming thresholds, adjust sparsity respectively.
     def halving_sequence(size, start):
         sequence = [start]
         for _ in range(1, size):
             start /= 2
             sequence.append(start)
         return sequence
-    #print(current_metadata)
-    mod_thresholds_list = halving_sequence(len(current_metadata["sparsities"]), current_metadata["x_size"])
-    #print(mod_thresholds_list)
-
-    #print(current_metadata["min_window_size"]* current_metadata["resolution"])
-    #print(current_metadata["max_window_size"])
-    numo = round(math.log2(current_metadata['max_window_size']/current_metadata['min_window_size']) + 1)
-    #print(numo)
+
+    # print(current_metadata)
+    mod_thresholds_list = halving_sequence(
+        len(current_metadata["sparsities"]), current_metadata["x_size"]
+    )
+    # print(mod_thresholds_list)
+
+    # print(current_metadata["min_window_size"]* current_metadata["resolution"])
+    # print(current_metadata["max_window_size"])
+    numo = round(
+        math.log2(
+            current_metadata["max_window_size"] / current_metadata["min_window_size"]
+        )
+        + 1
+    )
+    # print(numo)
     important = generateDictionaryFromList(mod_thresholds_list)
-    #print(f"this is imprtant: {important}")
+    # print(f"this is imprtant: {important}")
 
     main_level = image_pyramid[0]
     main_x_axis = axes[0][0]
     main_y_axis = axes[0][1]
     main_x_axis_np = np.array(main_x_axis)
 
-    #TODO: modify value here
+    # TODO: modify value here
     main_x_axis_np += 3000
 
     # Modify text so that hover shows interval format
@@ -261,9 +268,9 @@ def halving_sequence(size, start):
                                             {"label": f"{title}", "value": f"{title}"}
                                             for title in titles  # Iterate over each title
                                         ],
-                                        value=titles[0]
-                                        if titles
-                                        else None,  # Set default value based on the length of matrices
+                                        value=(
+                                            titles[0] if titles else None
+                                        ),  # Set default value based on the length of matrices
                                         clearable=False,  # Prevent dropdown from clearing values,
                                         style={"width": "300px"},
                                     ),

diff --git a/src/moddotplot/moddotplot.py b/src/moddotplot/moddotplot.py
@@ -502,7 +502,7 @@ def main():
         isValidFasta(i)
         headers = getInputHeaders(i)
         if len(headers) > 1:
-            print(f"File {i} contains multiple fasta entries: \n")
+            print(f"File {i} contains multiple fasta entries. \n")
             counter = 1
             for j in headers:
                 counter += 1
@@ -541,9 +541,7 @@ def main():
         max_window_size = math.ceil(hgi / args.resolution)
         # If only sequence is too small, throw an error.
         if max_window_size < 10:
-            print(
-                    f"Error: sequence too small for analysis.\n"
-                )
+            print(f"Error: sequence too small for analysis.\n")
             print(
                 f"ModDotPlot requires a minimum window size of 10. Sequences less than 10Kbp will not work with ModDotPlot under normal resolution. We recommend rerunning ModDotPlot with --r {math.ceil(hgi / 10)}.\n"
             )
@@ -772,10 +770,11 @@ def main():
                 pickle.dump(metadata, f)
             # Check if no plot arg is used
             if args.no_plot:
-                print(f"Saved matrices to {folder_path}. Thank you for using ModDotPlot!\n")
+                print(
+                    f"Saved matrices to {folder_path}. Thank you for using ModDotPlot!\n"
+                )
                 sys.exit(0)
 
-
         # Before running dash, change into intervals...
         axes = []
         for matrices_set, meta in zip(matrices, metadata):
@@ -799,8 +798,8 @@ def main():
     # -----------SETUP STATIC MODE-----------
     elif args.command == "static":
         # -----------SET SPARSITY VALUE-----------
+        # TODO: this is not sorting correctly
         sequences = list(zip(seq_list, k_list))
-        sequences.sort(key=lambda x: len(x[1]), reverse=True)
 
         # Create output directory, if doesn't exist:
         if (args.output_dir) and not os.path.exists(args.output_dir):
@@ -816,10 +815,12 @@ def main():
                     res = math.ceil(seq_length / args.window)
                 else:
                     win = math.ceil(seq_length / args.resolution)
+
                 if win < args.modimizer:
-                    raise ValueError(
+                    args.modimizer = win
+                    """raise ValueError(
                         "Window size must be greater than or equal to the modimizer sketch size"
-                    )
+                    )"""
 
                 seq_sparsity = round(win / args.modimizer)
                 if seq_sparsity <= args.modimizer:
@@ -845,7 +846,7 @@ def main():
                 # print(f"\tSparsity value s: {seq_sparsity}\n")
                 print(f"\tSequence length n: {len(k_list[i]) + args.kmer - 1}\n")
                 print(f"\tWindow size w: {win}\n")
-                print(f"\tModimizer sketch value: {expectation}\n")
+                print(f"\tModimizer sketch size: {expectation}\n")
                 print(f"\tPlot Resolution r: {res}\n")
                 self_mat = createSelfMatrix(
                     seq_length,
@@ -905,9 +906,7 @@ def main():
             else:
                 win = math.ceil(len(sequences[0][1]) / args.resolution)
             if win < args.modimizer:
-                raise ValueError(
-                    "Window size must be greater than or equal to the modimizer sketch size"
-                )
+                args.modimizer = win
 
             seq_sparsity = round(win / args.modimizer)
             if seq_sparsity <= args.modimizer:
@@ -935,7 +934,7 @@ def main():
                         f"\tSequence length {sequences[j][0]}: {smaller_length + args.kmer - 1}\n"
                     )
                     print(f"\tWindow size w: {win}\n")
-                    print(f"\tModimizer sketch value: {expectation}\n")
+                    print(f"\tModimizer sketch size: {expectation}\n")
                     print(f"\tPlot Resolution r: {res}\n")
 
                     pair_mat = createPairwiseMatrix(

diff --git a/src/moddotplot/parse_fasta.py b/src/moddotplot/parse_fasta.py
@@ -10,27 +10,37 @@
 
 tab_b = bytes.maketrans(b"ACTG", b"TGAC")
 
+
 def generateKmersFromFasta(seq: Sequence[str], k: int, quiet: bool) -> Iterable[int]:
     n = len(seq)
     if not quiet:
         progress_thresholds = round(n / 77)
-        printProgressBar(0, n - k + 1, prefix='Progress:', suffix='Complete', length=40)
+        printProgressBar(0, n - k + 1, prefix="Progress:", suffix="Complete", length=40)
 
     for i in range(n - k + 1):
         if not quiet:
             if i % progress_thresholds == 0:
-                printProgressBar(i, n - k + 1, prefix='Progress:', suffix='Complete', length=40)
+                printProgressBar(
+                    i, n - k + 1, prefix="Progress:", suffix="Complete", length=40
+                )
             if i == n - k:
-                printProgressBar(n - k + 1, n - k + 1, prefix='Progress:', suffix='Completed', length=40)
-
-        kmer = seq[i:i + k]
+                printProgressBar(
+                    n - k + 1,
+                    n - k + 1,
+                    prefix="Progress:",
+                    suffix="Completed",
+                    length=40,
+                )
+
+        kmer = seq[i : i + k]
         fh = mmh3.hash(kmer)
 
         # Calculate reverse complement hash directly without the need for translation
         rc = mmh3.hash(kmer[::-1].translate(tab_b))
-        
+
         yield fh if fh < rc else rc
 
+
 def isValidFasta(file_path):
     try:
         with open(file_path, "r") as file:
@@ -52,6 +62,7 @@ def isValidFasta(file_path):
         print(f"An error occurred: {str(e)}")
         sys.exit(6)
 
+
 def extractFiles(folder_path):
     # Check to see at least one compressed numpy matrix, and one metadata pickle are included
     metadata = []
@@ -61,15 +72,15 @@ def extractFiles(folder_path):
         file_path = os.path.join(folder_path, filename)  # Full path to the file
         if filename.endswith(".pkl"):
             with open(file_path, "rb") as f:
-                metadata = pickle.load(f) # Append loaded data to the metadata list
+                metadata = pickle.load(f)  # Replace metadata with the unpickled contents of this file
 
     for filename in os.listdir(folder_path):
         file_path = os.path.join(folder_path, filename)
         if filename.endswith(".npz"):
-            pattern = rf'_(\d+)\.npz'  # Using f-string to include the value of i in the regex pattern
+            pattern = rf"_(\d+)\.npz"  # Captures the digits between "_" and ".npz"; nothing is interpolated
             tmp2 = re.split(pattern, filename, maxsplit=1)
-            ff = np.load(file_path,allow_pickle=True)
-            tmp.append((tmp2[0],tmp2[1], ff))
+            ff = np.load(file_path, allow_pickle=True)
+            tmp.append((tmp2[0], tmp2[1], ff))
     sorted_list = sorted(tmp, key=lambda x: (x[0], x[1]))
 
     unique_lists = {}
@@ -84,15 +95,16 @@ def extractFiles(folder_path):
 
     # Convert dictionary values to lists
     result_lists = list(unique_lists.values())
-    sorted_result_lists = [lst for title in metadata for lst in result_lists if lst[0][0] == title['title']]
+    sorted_result_lists = [
+        lst for title in metadata for lst in result_lists if lst[0][0] == title["title"]
+    ]
     for unique_list in sorted_result_lists:
         matrices.append([])
         for val in unique_list:
             matrices[-1].append(val[-1]["data"])
     return matrices, metadata
 
 
-
 def printProgressBar(
     iteration,
     total,