Skip to content

Commit

Permalink
Merge pull request #130 from pachterlab/cosmic_dev
Browse files Browse the repository at this point in the history
Cosmic dev -> dev
  • Loading branch information
lauraluebbert committed May 26, 2024
2 parents d3e34dd + 78c86b7 commit 901b7ec
Show file tree
Hide file tree
Showing 32 changed files with 2,205 additions and 820 deletions.
2 changes: 2 additions & 0 deletions docs/src/en/cosmic.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ Search for genes, mutations, and other factors associated with cancer using the
Return format: JSON (command-line) or data frame/CSV (Python).
This module was written by [@AubakirovArman](https://github.com/AubakirovArman).

NOTE: License fees apply for the commercial use of COSMIC. You can read more about licensing COSMIC data [here](https://cancer.sanger.ac.uk/cosmic/license).

**Positional argument**
`searchterm`
Search term, which can be a mutation, gene name (or Ensembl ID), cancer type, tumor site, study ID, PubMed ID, or sample ID, as defined using the `entity` argument. Example: 'EGFR'
Expand Down
2 changes: 1 addition & 1 deletion docs/src/en/ref.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Return format: dictionary/JSON.
Species for which the FTPs will be fetched in the format genus_species, e.g. homo_sapiens.
Supports all available vertebrate and invertebrate (plants, fungi, protists, and invertebrate metazoa) genomes from Ensembl, except bacteria.
Note: Not required when using flags `--list_species` or `--list_iv_species`.
Supported shortcuts: 'human', 'mouse'
Supported shortcuts: 'human', 'mouse', 'human_grch37' (accesses the GRCh37 genome assembly)

**Optional arguments**
`-w` `--which`
Expand Down
2 changes: 2 additions & 0 deletions docs/src/es/cosmic.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ Busque genes, mutaciones, etc. asociados con cánceres utilizando la base de dat
Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
`gget cosmic` fue escrito por [@AubakirovArman](https://github.com/AubakirovArman).

Se aplican tarifas de licencia para el uso comercial de COSMIC. Puede leer más sobre la concesión de licencias de datos COSMIC [aquí](https://cancer.sanger.ac.uk/cosmic/license).

**Parámetro posicional**
`searchterm`
Término de búsqueda. Puede ser una mutación, un nombre de gen (o ID de Ensembl), tipo de cáncer, sitio del tumor, ID de estudio, ID de PubMed o ID de muestra, tal como se define con el argumento `entity`. Ejemplo: 'EGFR'
Expand Down
2 changes: 1 addition & 1 deletion docs/src/es/ref.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Regresa: Resultados en formato JSON.
`species`
La especie para la cual se buscarán los FTP en el formato género_especies, p. ej. homo_sapiens.
Nota: No se requiere cuando se llama a la bandera `--list_species`.
Accesos directos: 'human', 'mouse'
Accesos directos: 'human', 'mouse', 'human_grch37' (accede al ensamblaje del genoma GRCh37)

**Parámetros opcionales**
`-w` `--which`
Expand Down
6 changes: 1 addition & 5 deletions gget/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,9 @@
from .gget_elm import elm
from .gget_diamond import diamond
from .gget_cosmic import cosmic
from .gget_mutate import mutate

import logging
logging.basicConfig(
format="%(asctime)s %(levelname)s %(message)s",
level=logging.INFO,
datefmt="%c",
)
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)

Expand Down
23 changes: 7 additions & 16 deletions gget/compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,10 @@
import subprocess
import sys
import platform
import logging

# Add and format time stamp in logging messages
logging.basicConfig(
format="%(asctime)s %(levelname)s %(message)s",
level=logging.INFO,
datefmt="%c",
)
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)

# Constants
from .constants import MUSCLE_GITHUB_LINK
from .utils import set_up_logger
logger = set_up_logger()

# Get absolute package path
PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__))
Expand All @@ -35,7 +26,7 @@ def compile_muscle():
f"Muscle compiler currently only supports Linux and Darwin, not {platform.system()}.\n"
)

logging.info("Compiling MUSCLE binary from source... ")
logger.info("Compiling MUSCLE binary from source... ")

# Record current working directory
cwd = os.getcwd()
Expand Down Expand Up @@ -64,14 +55,14 @@ def compile_muscle():

# Run make command
if platform.system() == "Linux":
logging.warning(
logger.warning(
"Compiling MUSCLE requires that g++, make, sed and git are installed."
)
if platform.system() == "Darwin":
logging.warning(
logger.warning(
"Compiling MUSCLE requires that gcc v11, make, sed and git are installed."
)
logging.warning(
logger.warning(
"Please run 'brew install gcc' to install gcc v11 if the compile fails."
)

Expand All @@ -86,7 +77,7 @@ def compile_muscle():
if process_2.wait() != 0:
sys.exit(f"'{command2}' command returned with error {process_2.wait()}.")

logging.info("MUSCLE compiled.")
logger.info("MUSCLE compiled.")

# Change path back to cwd
os.chdir(cwd)
9 changes: 7 additions & 2 deletions gget/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Ensembl REST API server for gget seq and info
ENSEMBL_REST_API = "http://rest.ensembl.org/"
ENSEMBL_FTP_URL = "http://ftp.ensembl.org/pub/"
ENSEMBL_FTP_URL_GRCH37 = "http://ftp.ensembl.org/pub/grch37/"
# Non-vertebrate server
ENSEMBL_FTP_URL_NV = "http://ftp.ensemblgenomes.org/pub/"

Expand Down Expand Up @@ -38,8 +39,12 @@
EXPRESSION_URL = "https://maayanlab.cloud/archs4/search/loadExpressionTissue.php?"

# Download links for ELM database
ELM_INSTANCES_FASTA_DOWNLOAD = "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic="
ELM_INSTANCES_TSV_DOWNLOAD = "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic="
ELM_INSTANCES_FASTA_DOWNLOAD = (
"http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic="
)
ELM_INSTANCES_TSV_DOWNLOAD = (
"http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic="
)
ELM_CLASSES_TSV_DOWNLOAD = "http://elm.eu.org/elms/elms_index.tsv"
ELM_INTDOMAINS_TSV_DOWNLOAD = "http://elm.eu.org/interactiondomains.tsv"

Expand Down
45 changes: 19 additions & 26 deletions gget/gget_alphafold.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,8 @@
from ipywidgets import GridspecLayout
from ipywidgets import Output

import logging

logging.basicConfig(
format="%(asctime)s %(levelname)s %(message)s",
level=logging.INFO,
datefmt="%c",
)
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)
from .utils import set_up_logger
logger = set_up_logger()

TQDM_BAR_FORMAT = (
"{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
Expand Down Expand Up @@ -237,7 +230,7 @@ def alphafold(
"""

if platform.system() == "Windows":
logging.warning(
logger.warning(
"gget setup alphafold and gget alphafold are not supported on Windows OS."
)

Expand All @@ -261,7 +254,7 @@ def alphafold(
try:
import alphafold as AlphaFold
except ImportError:
logging.error(
logger.error(
"""
Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
Expand All @@ -275,7 +268,7 @@ def alphafold(
pdb_out, err = process.communicate()

if pdb_out.decode() == "":
logging.error(
logger.error(
"""
Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
Expand All @@ -285,7 +278,7 @@ def alphafold(

## Check if model parameters were downloaded
if not os.path.exists(os.path.join(PARAMS_DIR, "params/")):
logging.error(
logger.error(
"""
The AlphaFold model parameters are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
Expand All @@ -294,7 +287,7 @@ def alphafold(
return

if len(os.listdir(os.path.join(PARAMS_DIR, "params/"))) < 12:
logging.error(
logger.error(
"""
The AlphaFold model parameters are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
Expand Down Expand Up @@ -348,7 +341,7 @@ def alphafold(
)

## Move stereo_chemical_props.txt from gget bins to Alphafold package so it can be found
# logging.info("Locate files containing stereochemical properties.")
# logger.info("Locate files containing stereochemical properties.")
ALPHAFOLD_PATH = os.path.abspath(os.path.dirname(AlphaFold.__file__))
os.makedirs(os.path.join(ALPHAFOLD_PATH, "common/"), exist_ok=True)
shutil.copyfile(
Expand All @@ -358,7 +351,7 @@ def alphafold(

## Validate input sequence(s)
if verbose:
logging.info(f"Validating input sequence(s).")
logger.info(f"Validating input sequence(s).")

# Handle command line passing path to FASTA as a list
if isinstance(sequence, list) and len(sequence) == 1:
Expand Down Expand Up @@ -426,17 +419,17 @@ class ModelType(enum.Enum):
if len(seqs) == 1:
if multimer_for_monomer:
if verbose:
logging.info(
logger.info(
"Using the multimer model for a single chain, as requested."
)
model_type_to_use = ModelType.MULTIMER
else:
if verbose:
logging.info("Using the single-chain (monomer) model.")
logger.info("Using the single-chain (monomer) model.")
model_type_to_use = ModelType.MONOMER
else:
if verbose:
logging.info(f"Using the multimer model with {len(seqs)} sequences.")
logger.info(f"Using the multimer model with {len(seqs)} sequences.")
model_type_to_use = ModelType.MULTIMER

# Check whether total length exceeds limit
Expand All @@ -457,13 +450,13 @@ class ModelType(enum.Enum):
)

if total_sequence_length > MAX_VALIDATED_LENGTH:
logging.warning(
logger.warning(
f"The accuracy of this algorithm has not been fully validated above 3000 residues, and you may experience long running times or run out of memory. Total sequence length is {total_sequence_length} residues."
)

## Find the closest source
if verbose:
logging.info(f"Finding closest source for reference database.")
logger.info(f"Finding closest source for reference database.")

ex = futures.ThreadPoolExecutor(3)
fs = [ex.submit(fetch, source) for source in ["", "-europe", "-asia"]]
Expand Down Expand Up @@ -526,7 +519,7 @@ class ModelType(enum.Enum):
features_for_chain = {}
raw_msa_results_for_sequence = {}
for sequence_index, sequence in enumerate(sequences, start=1):
# logging.info(f"Getting MSA for sequence {sequence_index}.")
# logger.info(f"Getting MSA for sequence {sequence_index}.")

## Manage permissions to jackhmmer binary
command = f"chmod 755 {JACKHMMER_BINARY_PATH}"
Expand All @@ -537,7 +530,7 @@ class ModelType(enum.Enum):
if stderr:
# Log the standard error if it is not empty
sys.stderr.write(stderr)
logging.error("Giving chmod 755 permissions to jackhmmer binary failed.")
logger.error("Giving chmod 755 permissions to jackhmmer binary failed.")
return

# Save the target sequence in a fasta file
Expand Down Expand Up @@ -568,7 +561,7 @@ class ModelType(enum.Enum):
single_chain_msas.append(merged_msa)
msa_size = len(set(merged_msa.sequences))
if verbose:
logging.info(
logger.info(
f"{msa_size} unique sequences found in {db_name} for sequence {sequence_index}."
)
elif merged_msa.sequences and db_name == "uniprot":
Expand Down Expand Up @@ -721,7 +714,7 @@ class ModelType(enum.Enum):
prot=unrelaxed_proteins[best_model_name]
)
else:
logging.warning(
logger.warning(
"\nRunning model without relaxation stage. Use flag [--relax] ('relax=True') to include AMBER relaxation."
)
relaxed_pdb = protein.to_pdb(unrelaxed_proteins[best_model_name])
Expand Down Expand Up @@ -760,7 +753,7 @@ class ModelType(enum.Enum):
## Plotting
if plot:
if verbose:
logging.info("Plotting prediction results.")
logger.info("Plotting prediction results.")
import py3Dmol

# Construct multiclass b-factors to indicate confidence bands
Expand Down
23 changes: 8 additions & 15 deletions gget/gget_archs4.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,9 @@
import pandas as pd
import json as json_package
import io
import logging

# Add and format time stamp in logging messages
logging.basicConfig(
format="%(asctime)s %(levelname)s %(message)s",
level=logging.INFO,
datefmt="%c",
)
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)
from .utils import set_up_logger
logger = set_up_logger()

# Custom functions
from .gget_info import info
Expand Down Expand Up @@ -78,7 +71,7 @@ def archs4(

# Check if Ensembl ID was found
if isinstance(info_df, type(None)):
logging.error(
logger.error(
f"ID '{gene}' not found. Please double-check spelling/arguments and try again."
)
return
Expand All @@ -96,7 +89,7 @@ def archs4(

if which == "correlation":
if verbose:
logging.info(
logger.info(
f"Fetching the {gene_count} most correlated genes to {gene} from ARCHS4."
)

Expand All @@ -120,13 +113,13 @@ def archs4(
# Check if the request returned an error (e.g. gene not found)
if "error" in corr_data.keys():
if corr_data["error"] == f"{gene} not in colids":
logging.error(
logger.error(
f"Gene '{gene}' did not return any gene correlation results. \n"
"If the gene is an Ensembl ID, please set argument 'ensembl=True' (for terminal, add flag: [--ensembl])."
)
return
else:
logging.error(
logger.error(
f"Gene correlation request for search term '{gene}' returned error: {corr_data['error']}"
)
return
Expand Down Expand Up @@ -157,7 +150,7 @@ def archs4(

if which == "tissue":
if verbose:
logging.info(
logger.info(
f"Fetching the tissue expression atlas of {gene} from {species} ARCHS4 data."
)

Expand All @@ -182,7 +175,7 @@ def archs4(
tissue_exp_df = pd.read_csv(io.StringIO(r.content.decode("utf-8")))
# Check if any results were returned
if len(tissue_exp_df) < 2:
logging.error(
logger.error(
f"Gene '{gene}' did not return any tissue expression results. \n"
"If the gene is an Ensembl ID, please set argument 'ensembl=True' (for terminal, add flag: [--ensembl])."
)
Expand Down
Loading

0 comments on commit 901b7ec

Please sign in to comment.