Move read fasta into utils

pachterlab · May 20, 2024 · c376766 · c376766
1 parent 0d43dad
commit c376766
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 45 deletions.
diff --git a/gget/gget_blast.py b/gget/gget_blast.py
@@ -19,7 +19,7 @@
 from urllib.parse import urlencode
 
 # Custom functions
-from .utils import parse_blast_ref_page, wrap_cols_func
+from .utils import parse_blast_ref_page, wrap_cols_func, read_fasta
 
 # Constants
 from .constants import (
@@ -107,28 +107,8 @@ def blast(
                         seqs.append(line.strip())
 
         elif ".fa" in sequence:
-            # Read the FASTA
-            titles = []
-            seqs = []
-            with open(sequence) as fasta_file:
-                for i, line in enumerate(fasta_file):
-                    # Each second line will be a title line
-                    if i % 2 == 0:
-                        if line[0] != ">":
-                            raise ValueError(
-                                "Expected FASTA to start with a '>' character. "
-                            )
-                        else:
-                            # Append title line to titles list
-                            titles.append(line.strip())
-                    else:
-                        if line[0] == ">":
-                            raise ValueError(
-                                "FASTA contains two lines starting with '>' in a row -> missing sequence line. "
-                            )
-                        # Append sequences line to seqs list
-                        else:
-                            seqs.append(line.strip())
+            titles, seqs = read_fasta(sequence)
+
         else:
             raise ValueError(
                 "File format not recognized. gget BLAST currently only supports '.txt' or '.fa' files. "

diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py
@@ -1,9 +1,10 @@
 import os
 import pandas as pd
-from Bio import SeqIO
 import re
 import logging
 
+from .utils import read_fasta
+
 # Add and format time stamp in logging messages
 logging.basicConfig(
     format="%(asctime)s %(levelname)s %(message)s",
@@ -24,16 +25,9 @@
 mutation_pattern = r"c\.([0-9_\-\+\*]+)([a-zA-Z>]+)"  # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)'
 
 
-def raise_pytest_error():
-    if os.getenv("TEST_MODE"):
-        raise ValueError()
-
-
-def find_sequence(fasta_path, accession_number):
-    for record in SeqIO.parse(fasta_path, "fasta"):
-        if accession_number in record.description:
-            return record.seq
-    return "No sequence found"  # Return None if no match is found
+# def raise_pytest_error():
+#     if os.getenv("TEST_MODE"):
+#         raise ValueError()
 
 
 def extract_mutation_type(mutation):
@@ -78,14 +72,6 @@ def extract_mutation_type(mutation):
         return "unknown"
 
 
-def load_sequences(fasta_path):
-    sequences = {}
-    for record in SeqIO.parse(fasta_path, "fasta"):
-        key = record.description.split()[1]
-        sequences[key] = str(record.seq)
-    return sequences
-
-
 def create_mutant_sequence(row, mutation_function, kmer_flanking_length):
     global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations
 
@@ -139,7 +125,7 @@ def create_mutant_sequence(row, mutation_function, kmer_flanking_length):
         except:
             logging.debug(f"Error with {row['mut_ID']} - row['Mutation CDS']")
             unknown_mutations += 1
-            raise_pytest_error()
+            # raise_pytest_error()
             return ""
 
         mutant_sequence, adjusted_end_position = mutation_function(
@@ -160,7 +146,7 @@ def create_mutant_sequence(row, mutation_function, kmer_flanking_length):
     else:
         logging.debug(f"Error with {row['mut_ID']} - {row['Mutation CDS']}")
         unknown_mutations += 1
-        raise_pytest_error()
+        # raise_pytest_error()
         return ""
 
 
@@ -342,7 +328,7 @@ def mutate(
     )
 
     # Load input sequences and link sequences to their mutations using the sequence identifier
-    sequences = load_sequences(input_fasta)
+    _, sequences = read_fasta(input_fasta)
     mutation_df["full_sequence"] = mutation_df[seq_id_column].map(sequences)
 
     # Split data frame by mutation type

diff --git a/gget/utils.py b/gget/utils.py
@@ -22,12 +22,57 @@
 
 from .constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV, ENS_TO_PDB_API
 
+
 def flatten(xss):
     """
     Function to flatten a list of lists.
     """
     return [x for xs in xss for x in xs]
 
+
+def read_fasta(fasta):
+    """
+    Args:
+    - fasta     (str) Path to fasta file.
+
+    Returns titles and seqs from fasta file as two list objects.
+    """
+    titles = []
+    seqs = []
+    title_last = False
+    with open(fasta) as fasta_file:
+        for i, line in enumerate(fasta_file):
+            if i == 0 and line[0] != ">":
+                raise ValueError("Expected FASTA to start with a '>' character. ")
+            elif line[0] == ">":
+                if title_last:
+                    raise ValueError(
+                        "FASTA contains two lines starting with '>' in a row -> missing sequence line. "
+                    )
+
+                if new_seq:
+                    # Append last recorded sequence
+                    seqs.append(flatten(new_seq))
+
+                # Append title line to titles list
+                titles.append(line.strip())
+                title_last = True
+
+            else:
+                if title_last:
+                    # Start recording new sequence if title was last
+                    new_seq = []
+                    new_seq.append(line.strip())
+                else:
+                    new_seq.append(line.strip())
+                title_last = False
+
+        # Append last sequence
+        seqs.append(flatten(new_seq))
+
+    return titles, seqs
+
+
 def n_colors(nucleotide):
     """
     Returns a string format to print the nucleotide
@@ -658,6 +703,7 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None):
 
     return databases
 
+
 def find_nv_kingdom(species, release):
     kds = ["plants", "protists", "metazoa", "fungi"]
     for kingdom in kds:
@@ -681,6 +727,7 @@ def find_nv_kingdom(species, release):
         if species in sps[5:]:
             return kingdom
 
+
 def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
     """
     Function to find all available species for gget ref.