Skip to content

Commit

Permalink
Merge pull request #136 from pachterlab/dev
Browse files Browse the repository at this point in the history
Dev -> main
  • Loading branch information
lauraluebbert committed May 31, 2024
2 parents 0f34fd2 + 17e1074 commit 3b3d4ad
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 21 deletions.
4 changes: 3 additions & 1 deletion docs/src/en/updates.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
## ✨ What's new
**Version ≥ 0.28.5** (May 29, 2024):
**Version ≥ 0.28.6** (May 31, 2024):
- **New module: [`gget mutate`](./mutate.md)**
- [`gget cosmic`](./cosmic.md): You can now download entire COSMIC databases using the `download_cosmic` argument
- [`gget ref`](./ref.md): Can now fetch the GRCh37 genome assembly using `species='human_grch37'`
- [`gget search`](./search.md): Adjust access of human data to the structure of Ensembl release 112 (fixes [issue 129](https://github.com/pachterlab/gget/issues/129))

~~**Version ≥ 0.28.5** (May 29, 2024):~~ Yanked due to logging bug in `gget.setup("alphafold")`

**Version ≥ 0.28.4** (January 31, 2024):
- [`gget setup`](./setup.md): Fix bug with filepath when running `gget.setup("elm")` on Windows OS.
Expand Down
4 changes: 3 additions & 1 deletion docs/src/es/updates.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
## ✨ ¡Lo más reciente!
**Versión ≥ 0.28.5 (29 de mayo de 2024):**
**Versión ≥ 0.28.6 (31 de mayo de 2024):**
- **Nuevo módulo: [`gget mutate`](./mutate.md)**
- [`gget cosmic`](./cosmic.md): Ahora puedes descargar bases de datos completas de COSMIC utilizando el argumento `download_cosmic`
- [`gget ref`](./ref.md): Ahora puede obtener el ensamblaje del genoma GRCh37 usando `species='human_grch37'`
- [`gget search`](./search.md): Ajusta el acceso a los datos humanos a la estructura de la versión 112 de Ensembl (corrige [issue 129](https://github.com/pachterlab/gget/issues/129))

~~**Versión ≥ 0.28.5** (29 de mayo de 2024):~~ Retirado debido a un error con 'logging' en `gget.setup("alphafold")`

**Versión ≥ 0.28.4** (31 de enero de 2024):
- [`gget setup`](./setup.md): soluciona el error con la ruta del archivo al ejecutar `gget.setup("elm")` en el sistema operativo Windows.

Expand Down
48 changes: 45 additions & 3 deletions gget/gget_mutate.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,9 +316,35 @@ def inversion_mutation(
starting_nucleotide_position_index_0 : ending_nucleotide_position_index_0
+ 1
]

# Reverse
reverse_insertion_string = insertion_string[::-1]

# Get complement
complement = {
"A": "T",
"T": "A",
"U": "A",
"C": "G",
"G": "C",
"N": "N",
"a": "t",
"t": "a",
"u": "a",
"c": "g",
"g": "c",
"n": "n",
".": ".", # annotation for gaps
"-": "-", # annotation for gaps
}
mutated_string = "".join(
complement.get(nucleotide, "N") for nucleotide in reverse_insertion_string
)

# Create final sequence
mutant_sequence = (
row["full_sequence"][:starting_nucleotide_position_index_0]
+ insertion_string[::-1]
+ mutated_string
+ row["full_sequence"][ending_nucleotide_position_index_0 + 1 :]
)
adjusted_end_position = ending_nucleotide_position_index_0
Expand Down Expand Up @@ -391,8 +417,8 @@ def mutate(
- mut_column (str) Name of the column containing the mutations to be performed in 'mutations'. Default: 'mutation'.
- mut_id_column (str) Name of the column containing the IDs of each mutation in 'mutations'. Default: 'mut_ID'.
- seq_id_column (str) Name of the column containing the IDs of the sequences to be mutated in 'mutations'. Default: 'seq_ID'.
- out (str) Path to output fasta file containing the mutated sequences, e.g., 'path/to/output_fasta.fa'.
Default: None -> returns a list of the mutated sequences to standard out.
- out (str) Path to output fasta file containing the mutated sequences, e.g., 'path/to/output_fasta.fa'.
Default: None -> returns a list of the mutated sequences to standard out.
The identifiers (following the '>') of the mutated sequences in the output fasta will be '>[seq_ID]_[mut_ID]'.
- verbose (True/False) whether to print progress information. Default: True
Expand Down Expand Up @@ -479,12 +505,28 @@ def mutate(
"""
)

# Set of possible nucleotides (- and . are gap annotations)
nucleotides = set("ATGCUNatgcun.-")

seq_dict = {}
non_nuc_seqs = 0
for title, seq in zip(titles, seqs):
# Check that sequences are nucleotide sequences
if not set(seq) <= nucleotides:
non_nuc_seqs += 1

# Use the text following the '>' up to the first space or dot as the sequence identifier
# Cutting at the first dot strips Ensembl version numbers from the identifier
seq_dict[title.split(" ")[0].split(".")[0]] = seq

if non_nuc_seqs > 0:
logger.warning(
f"""
Non-nucleotide characters detected in {non_nuc_seqs} input sequences. gget mutate is currently only optimized for mutating nucleotide sequences.
Specifically inversion mutations might not be performed correctly.
"""
)

# Get all mutation types
if verbose:
tqdm.pandas(desc="Extracting mutation types")
Expand Down
3 changes: 2 additions & 1 deletion gget/gget_setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import logging
import shutil
import sys
import subprocess
Expand Down Expand Up @@ -209,7 +210,7 @@ def setup(module, verbose=True, out=None):
import simtk.openmm as openmm

# Silence openmm logger
logger.getLogger("openmm").setLevel(logger.WARNING)
logging.getLogger("openmm").setLevel(logging.WARNING)

# Commenting the following out because openmm v7.7.0 does not support __version__
# # Check if correct version was installed
Expand Down
30 changes: 15 additions & 15 deletions tests/test_mutate.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,18 +136,18 @@ def test_single_dup(self):
)
assert result == "ABCDEEFG", f"Expected ABCDEEFG, got {result}"

def test_inv(self):
test_row = create_test_row("c.3_4inv", self.alphabet_sequence)
result = create_mutant_sequence(
test_row, inversion_mutation, kmer_flanking_length=k
)
assert result == "ABDCEFG", f"Expected ABDCEFG, got {result}"

def test_inv_long(self):
test_row = create_test_row("c.41_42inv", self.alphabet_sequence_long)
result = create_mutant_sequence(
test_row, inversion_mutation, kmer_flanking_length=k
)
assert (
result == "KLMNOPQRSTUVWXYZABCDEFGHIJKLMNPOQRSTUVWXYZABCDEFGHIJKLMNOPQRST"
), f"Expected KLMNOPQRSTUVWXYZABCDEFGHIJKLMNPOQRSTUVWXYZABCDEFGHIJKLMNOPQRST, got {result}"
# def test_inv(self):
# test_row = create_test_row("c.3_4inv", self.alphabet_sequence)
# result = create_mutant_sequence(
# test_row, inversion_mutation, kmer_flanking_length=k
# )
# assert result == "ABDCEFG", f"Expected ABDCEFG, got {result}"

# def test_inv_long(self):
# test_row = create_test_row("c.41_42inv", self.alphabet_sequence_long)
# result = create_mutant_sequence(
# test_row, inversion_mutation, kmer_flanking_length=k
# )
# assert (
# result == "KLMNOPQRSTUVWXYZABCDEFGHIJKLMNPOQRSTUVWXYZABCDEFGHIJKLMNOPQRST"
# ), f"Expected KLMNOPQRSTUVWXYZABCDEFGHIJKLMNPOQRSTUVWXYZABCDEFGHIJKLMNOPQRST, got {result}"

0 comments on commit 3b3d4ad

Please sign in to comment.