Skip to content

Commit

Permalink
Move read fasta into utils
Browse files Browse the repository at this point in the history
  • Loading branch information
lauraluebbert committed May 20, 2024
1 parent 0d43dad commit c376766
Show file tree
Hide file tree
Showing 3 changed files with 58 additions and 45 deletions.
26 changes: 3 additions & 23 deletions gget/gget_blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from urllib.parse import urlencode

# Custom functions
from .utils import parse_blast_ref_page, wrap_cols_func
from .utils import parse_blast_ref_page, wrap_cols_func, read_fasta

# Constants
from .constants import (
Expand Down Expand Up @@ -107,28 +107,8 @@ def blast(
seqs.append(line.strip())

elif ".fa" in sequence:
# Read the FASTA
titles = []
seqs = []
with open(sequence) as fasta_file:
for i, line in enumerate(fasta_file):
# Each second line will be a title line
if i % 2 == 0:
if line[0] != ">":
raise ValueError(
"Expected FASTA to start with a '>' character. "
)
else:
# Append title line to titles list
titles.append(line.strip())
else:
if line[0] == ">":
raise ValueError(
"FASTA contains two lines starting with '>' in a row -> missing sequence line. "
)
# Append sequences line to seqs list
else:
seqs.append(line.strip())
titles, seqs = read_fasta(sequence)

else:
raise ValueError(
"File format not recognized. gget BLAST currently only supports '.txt' or '.fa' files. "
Expand Down
30 changes: 8 additions & 22 deletions gget/gget_mutate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
import pandas as pd
from Bio import SeqIO
import re
import logging

from .utils import read_fasta

# Add and format time stamp in logging messages
logging.basicConfig(
format="%(asctime)s %(levelname)s %(message)s",
Expand All @@ -24,16 +25,9 @@
mutation_pattern = r"c\.([0-9_\-\+\*]+)([a-zA-Z>]+)" # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)'


def raise_pytest_error():
if os.getenv("TEST_MODE"):
raise ValueError()


def find_sequence(fasta_path, accession_number):
for record in SeqIO.parse(fasta_path, "fasta"):
if accession_number in record.description:
return record.seq
return "No sequence found" # Return None if no match is found
# def raise_pytest_error():
# if os.getenv("TEST_MODE"):
# raise ValueError()


def extract_mutation_type(mutation):
Expand Down Expand Up @@ -78,14 +72,6 @@ def extract_mutation_type(mutation):
return "unknown"


def load_sequences(fasta_path):
sequences = {}
for record in SeqIO.parse(fasta_path, "fasta"):
key = record.description.split()[1]
sequences[key] = str(record.seq)
return sequences


def create_mutant_sequence(row, mutation_function, kmer_flanking_length):
global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations

Expand Down Expand Up @@ -139,7 +125,7 @@ def create_mutant_sequence(row, mutation_function, kmer_flanking_length):
except:
logging.debug(f"Error with {row['mut_ID']} - row['Mutation CDS']")
unknown_mutations += 1
raise_pytest_error()
# raise_pytest_error()
return ""

mutant_sequence, adjusted_end_position = mutation_function(
Expand All @@ -160,7 +146,7 @@ def create_mutant_sequence(row, mutation_function, kmer_flanking_length):
else:
logging.debug(f"Error with {row['mut_ID']} - {row['Mutation CDS']}")
unknown_mutations += 1
raise_pytest_error()
# raise_pytest_error()
return ""


Expand Down Expand Up @@ -342,7 +328,7 @@ def mutate(
)

# Load input sequences and link sequences to their mutations using the sequence identifier
sequences = load_sequences(input_fasta)
_, sequences = read_fasta(input_fasta)
mutation_df["full_sequence"] = mutation_df[seq_id_column].map(sequences)

# Split data frame by mutation type
Expand Down
47 changes: 47 additions & 0 deletions gget/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,57 @@

from .constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV, ENS_TO_PDB_API


def flatten(xss):
"""
Function to flatten a list of lists.
"""
return [x for xs in xss for x in xs]


def read_fasta(fasta):
"""
Args:
- fasta (str) Path to fasta file.
Returns titles and seqs from fasta file as two list objects.
"""
titles = []
seqs = []
title_last = False
with open(fasta) as fasta_file:
for i, line in enumerate(fasta_file):
if i == 0 and line[0] != ">":
raise ValueError("Expected FASTA to start with a '>' character. ")
elif line[0] == ">":
if title_last:
raise ValueError(
"FASTA contains two lines starting with '>' in a row -> missing sequence line. "
)

if new_seq:
# Append last recorded sequence
seqs.append(flatten(new_seq))

# Append title line to titles list
titles.append(line.strip())
title_last = True

else:
if title_last:
# Start recording new sequence if title was last
new_seq = []
new_seq.append(line.strip())
else:
new_seq.append(line.strip())
title_last = False

# Append last sequence
seqs.append(flatten(new_seq))

return titles, seqs


def n_colors(nucleotide):
"""
Returns a string format to print the nucleotide
Expand Down Expand Up @@ -658,6 +703,7 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None):

return databases


def find_nv_kingdom(species, release):
kds = ["plants", "protists", "metazoa", "fungi"]
for kingdom in kds:
Expand All @@ -681,6 +727,7 @@ def find_nv_kingdom(species, release):
if species in sps[5:]:
return kingdom


def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
"""
Function to find all available species for gget ref.
Expand Down

0 comments on commit c376766

Please sign in to comment.