Merge pull request #130 from pachterlab/cosmic_dev

Cosmic dev -> dev
pachterlab · May 26, 2024 · 901b7ec · 901b7ec
2 parents d3e34dd + 78c86b7
commit 901b7ec
Show file tree

Hide file tree

Showing 32 changed files with 2,205 additions and 820 deletions.
diff --git a/docs/src/en/cosmic.md b/docs/src/en/cosmic.md
@@ -4,6 +4,8 @@ Search for genes, mutations, and other factors associated with cancer using the
 Return format: JSON (command-line) or data frame/CSV (Python).  
 This module was written by [@AubakirovArman](https://github.com/AubakirovArman).
 
+NOTE: License fees apply for the commercial use of COSMIC. You can read more about licensing COSMIC data [here](https://cancer.sanger.ac.uk/cosmic/license).
+
 **Positional argument**  
 `searchterm`   
 Search term, which can be a mutation, gene name (or Ensembl ID), cancer type, tumor site, study ID, PubMed ID, or sample ID, as defined using the `entity` argument. Example: 'EGFR'  

diff --git a/docs/src/en/ref.md b/docs/src/en/ref.md
@@ -8,7 +8,7 @@ Return format: dictionary/JSON.
 Species for which the FTPs will be fetched in the format genus_species, e.g. homo_sapiens.  
 Supports all available vertebrate and invertebrate (plants, fungi, protists, and invertebrate metazoa) genomes from Ensembl, except bacteria.  
 Note: Not required when using flags `--list_species` or `--list_iv_species`.   
-Supported shortcuts: 'human', 'mouse'
+Supported shortcuts: 'human', 'mouse', 'human_grch37' (accesses the GRCh37 genome assembly)
 
 **Optional arguments**  
 `-w` `--which`  

diff --git a/docs/src/es/cosmic.md b/docs/src/es/cosmic.md
@@ -4,6 +4,8 @@ Busque genes, mutaciones, etc. asociados con cánceres utilizando la base de dat
 Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).  
 `gget cosmic` fue escrito por [@AubakirovArman](https://github.com/AubakirovArman).
 
+Se aplican tarifas de licencia para el uso comercial de COSMIC. Puede leer más sobre la concesión de licencias de datos COSMIC [aquí](https://cancer.sanger.ac.uk/cosmic/license).
+
 **Parámetro posicional**  
 `searchterm`   
 Término de búsqueda. Puede ser una mutación, un nombre de gen (o ID de Ensembl), tipo de cáncer, sitio del tumor, ID de estudio, ID de PubMed o ID de muestra, tal como se define con el argumento `entity`. Ejemplo: 'EGFR'  

diff --git a/docs/src/es/ref.md b/docs/src/es/ref.md
@@ -7,7 +7,7 @@ Regresa: Resultados en formato JSON.
 `species`  
 La especie para la cual se buscarán los FTP, en el formato género_especies, p. ej. homo_sapiens.  
 Nota: No se requiere cuando se llama a la bandera `--list_species`.    
-Accesos directos: 'human', 'mouse'  
+Accesos directos: 'human', 'mouse', 'human_grch37' (accede al ensamblaje del genoma GRCh37)  
 
 **Parámetros opcionales**  
 `-w` `--which`  

diff --git a/gget/__init__.py b/gget/__init__.py
@@ -15,13 +15,9 @@
 from .gget_elm import elm
 from .gget_diamond import diamond
 from .gget_cosmic import cosmic
+from .gget_mutate import mutate
 
 import logging
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s %(message)s",
-    level=logging.INFO,
-    datefmt="%c",
-)
 # Mute numexpr threads info
 logging.getLogger("numexpr").setLevel(logging.WARNING)
 

diff --git a/gget/compile.py b/gget/compile.py
@@ -2,19 +2,10 @@
 import subprocess
 import sys
 import platform
-import logging
 
-# Add and format time stamp in logging messages
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s %(message)s",
-    level=logging.INFO,
-    datefmt="%c",
-)
-# Mute numexpr threads info
-logging.getLogger("numexpr").setLevel(logging.WARNING)
-
-# Constants
 from .constants import MUSCLE_GITHUB_LINK
+from .utils import set_up_logger
+logger = set_up_logger()
 
 # Get absolute package path
 PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__))
@@ -35,7 +26,7 @@ def compile_muscle():
             f"Muscle compiler currently only supports Linux and Darwin, not {platform.system()}.\n"
         )
 
-    logging.info("Compiling MUSCLE binary from source... ")
+    logger.info("Compiling MUSCLE binary from source... ")
 
     # Record current working directory
     cwd = os.getcwd()
@@ -64,14 +55,14 @@ def compile_muscle():
 
     # Run make command
     if platform.system() == "Linux":
-        logging.warning(
+        logger.warning(
             "Compiling MUSCLE requires that g++, make, sed and git are installed."
         )
     if platform.system() == "Darwin":
-        logging.warning(
+        logger.warning(
             "Compiling MUSCLE requires that gcc v11, make, sed and git are installed."
         )
-        logging.warning(
+        logger.warning(
             "Please run 'brew install gcc' to install gcc v11 if the compile fails."
         )
 
@@ -86,7 +77,7 @@ def compile_muscle():
     if process_2.wait() != 0:
         sys.exit(f"'{command2}' command returned with error {process_2.wait()}.")
 
-    logging.info("MUSCLE compiled.")
+    logger.info("MUSCLE compiled.")
 
     # Change path back to cwd
     os.chdir(cwd)
diff --git a/gget/constants.py b/gget/constants.py
@@ -3,6 +3,7 @@
 # Ensembl REST API server for gget seq and info
 ENSEMBL_REST_API = "http://rest.ensembl.org/"
 ENSEMBL_FTP_URL = "http://ftp.ensembl.org/pub/"
+ENSEMBL_FTP_URL_GRCH37 = "http://ftp.ensembl.org/pub/grch37/"
 # Non-vertebrate server
 ENSEMBL_FTP_URL_NV = "http://ftp.ensemblgenomes.org/pub/"
 
@@ -38,8 +39,12 @@
 EXPRESSION_URL = "https://maayanlab.cloud/archs4/search/loadExpressionTissue.php?"
 
 # Download links for ELM database
-ELM_INSTANCES_FASTA_DOWNLOAD = "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic="
-ELM_INSTANCES_TSV_DOWNLOAD = "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic="
+ELM_INSTANCES_FASTA_DOWNLOAD = (
+    "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic="
+)
+ELM_INSTANCES_TSV_DOWNLOAD = (
+    "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic="
+)
 ELM_CLASSES_TSV_DOWNLOAD = "http://elm.eu.org/elms/elms_index.tsv"
 ELM_INTDOMAINS_TSV_DOWNLOAD = "http://elm.eu.org/interactiondomains.tsv"
 

diff --git a/gget/gget_alphafold.py b/gget/gget_alphafold.py
@@ -31,15 +31,8 @@
 from ipywidgets import GridspecLayout
 from ipywidgets import Output
 
-import logging
-
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s %(message)s",
-    level=logging.INFO,
-    datefmt="%c",
-)
-# Mute numexpr threads info
-logging.getLogger("numexpr").setLevel(logging.WARNING)
+from .utils import set_up_logger
+logger = set_up_logger()
 
 TQDM_BAR_FORMAT = (
     "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
@@ -237,7 +230,7 @@ def alphafold(
     """
 
     if platform.system() == "Windows":
-        logging.warning(
+        logger.warning(
             "gget setup alphafold and gget alphafold are not supported on Windows OS."
         )
 
@@ -261,7 +254,7 @@ def alphafold(
     try:
         import alphafold as AlphaFold
     except ImportError:
-        logging.error(
+        logger.error(
             """
             Some third-party dependencies are missing. Please run the following command: 
             >>> gget.setup('alphafold') or $ gget setup alphafold
@@ -275,7 +268,7 @@ def alphafold(
     pdb_out, err = process.communicate()
 
     if pdb_out.decode() == "":
-        logging.error(
+        logger.error(
             """
             Some third-party dependencies are missing. Please run the following command: 
             >>> gget.setup('alphafold') or $ gget setup alphafold
@@ -285,7 +278,7 @@ def alphafold(
 
     ## Check if model parameters were downloaded
     if not os.path.exists(os.path.join(PARAMS_DIR, "params/")):
-        logging.error(
+        logger.error(
             """
             The AlphaFold model parameters are missing. Please run the following command: 
             >>> gget.setup('alphafold') or $ gget setup alphafold
@@ -294,7 +287,7 @@ def alphafold(
         return
 
     if len(os.listdir(os.path.join(PARAMS_DIR, "params/"))) < 12:
-        logging.error(
+        logger.error(
             """
             The AlphaFold model parameters are missing. Please run the following command: 
             >>> gget.setup('alphafold') or $ gget setup alphafold
@@ -348,7 +341,7 @@ def alphafold(
                 )
 
     ## Move stereo_chemical_props.txt from gget bins to Alphafold package so it can be found
-    # logging.info("Locate files containing stereochemical properties.")
+    # logger.info("Locate files containing stereochemical properties.")
     ALPHAFOLD_PATH = os.path.abspath(os.path.dirname(AlphaFold.__file__))
     os.makedirs(os.path.join(ALPHAFOLD_PATH, "common/"), exist_ok=True)
     shutil.copyfile(
@@ -358,7 +351,7 @@ def alphafold(
 
     ## Validate input sequence(s)
     if verbose:
-        logging.info(f"Validating input sequence(s).")
+        logger.info(f"Validating input sequence(s).")
 
     # Handle command line passing path to FASTA as a list
     if isinstance(sequence, list) and len(sequence) == 1:
@@ -426,17 +419,17 @@ class ModelType(enum.Enum):
     if len(seqs) == 1:
         if multimer_for_monomer:
             if verbose:
-                logging.info(
+                logger.info(
                     "Using the multimer model for a single chain, as requested."
                 )
             model_type_to_use = ModelType.MULTIMER
         else:
             if verbose:
-                logging.info("Using the single-chain (monomer) model.")
+                logger.info("Using the single-chain (monomer) model.")
             model_type_to_use = ModelType.MONOMER
     else:
         if verbose:
-            logging.info(f"Using the multimer model with {len(seqs)} sequences.")
+            logger.info(f"Using the multimer model with {len(seqs)} sequences.")
         model_type_to_use = ModelType.MULTIMER
 
     # Check whether total length exceeds limit
@@ -457,13 +450,13 @@ class ModelType(enum.Enum):
             )
 
     if total_sequence_length > MAX_VALIDATED_LENGTH:
-        logging.warning(
+        logger.warning(
             f"The accuracy of this algorithm has not been fully validated above 3000 residues, and you may experience long running times or run out of memory. Total sequence length is {total_sequence_length} residues."
         )
 
     ## Find the closest source
     if verbose:
-        logging.info(f"Finding closest source for reference database.")
+        logger.info(f"Finding closest source for reference database.")
 
     ex = futures.ThreadPoolExecutor(3)
     fs = [ex.submit(fetch, source) for source in ["", "-europe", "-asia"]]
@@ -526,7 +519,7 @@ class ModelType(enum.Enum):
     features_for_chain = {}
     raw_msa_results_for_sequence = {}
     for sequence_index, sequence in enumerate(sequences, start=1):
-        # logging.info(f"Getting MSA for sequence {sequence_index}.")
+        # logger.info(f"Getting MSA for sequence {sequence_index}.")
 
         ## Manage permissions to jackhmmer binary
         command = f"chmod 755 {JACKHMMER_BINARY_PATH}"
@@ -537,7 +530,7 @@ class ModelType(enum.Enum):
             if stderr:
                 # Log the standard error if it is not empty
                 sys.stderr.write(stderr)
-            logging.error("Giving chmod 755 permissions to jackhmmer binary failed.")
+            logger.error("Giving chmod 755 permissions to jackhmmer binary failed.")
             return
 
         # Save the target sequence in a fasta file
@@ -568,7 +561,7 @@ class ModelType(enum.Enum):
                 single_chain_msas.append(merged_msa)
                 msa_size = len(set(merged_msa.sequences))
                 if verbose:
-                    logging.info(
+                    logger.info(
                         f"{msa_size} unique sequences found in {db_name} for sequence {sequence_index}."
                     )
             elif merged_msa.sequences and db_name == "uniprot":
@@ -721,7 +714,7 @@ class ModelType(enum.Enum):
                 prot=unrelaxed_proteins[best_model_name]
             )
         else:
-            logging.warning(
+            logger.warning(
                 "\nRunning model without relaxation stage. Use flag [--relax] ('relax=True') to include AMBER relaxation."
             )
             relaxed_pdb = protein.to_pdb(unrelaxed_proteins[best_model_name])
@@ -760,7 +753,7 @@ class ModelType(enum.Enum):
     ## Plotting
     if plot:
         if verbose:
-            logging.info("Plotting prediction results.")
+            logger.info("Plotting prediction results.")
         import py3Dmol
 
         # Construct multiclass b-factors to indicate confidence bands

diff --git a/gget/gget_archs4.py b/gget/gget_archs4.py
@@ -2,16 +2,9 @@
 import pandas as pd
 import json as json_package
 import io
-import logging
 
-# Add and format time stamp in logging messages
-logging.basicConfig(
-    format="%(asctime)s %(levelname)s %(message)s",
-    level=logging.INFO,
-    datefmt="%c",
-)
-# Mute numexpr threads info
-logging.getLogger("numexpr").setLevel(logging.WARNING)
+from .utils import set_up_logger
+logger = set_up_logger()
 
 # Custom functions
 from .gget_info import info
@@ -78,7 +71,7 @@ def archs4(
 
         # Check if Ensembl ID was found
         if isinstance(info_df, type(None)):
-            logging.error(
+            logger.error(
                 f"ID '{gene}' not found. Please double-check spelling/arguments and try again."
             )
             return
@@ -96,7 +89,7 @@ def archs4(
 
     if which == "correlation":
         if verbose:
-            logging.info(
+            logger.info(
                 f"Fetching the {gene_count} most correlated genes to {gene} from ARCHS4."
             )
 
@@ -120,13 +113,13 @@ def archs4(
         # Check if the request returned an error (e.g. gene not found)
         if "error" in corr_data.keys():
             if corr_data["error"] == f"{gene} not in colids":
-                logging.error(
+                logger.error(
                     f"Gene '{gene}' did not return any gene correlation results. \n"
                     "If the gene is an Ensembl ID, please set argument 'ensembl=True' (for terminal, add flag: [--ensembl])."
                 )
                 return
             else:
-                logging.error(
+                logger.error(
                     f"Gene correlation request for search term '{gene}' returned error: {corr_data['error']}"
                 )
                 return
@@ -157,7 +150,7 @@ def archs4(
 
     if which == "tissue":
         if verbose:
-            logging.info(
+            logger.info(
                 f"Fetching the tissue expression atlas of {gene} from {species} ARCHS4 data."
             )
 
@@ -182,7 +175,7 @@ def archs4(
         tissue_exp_df = pd.read_csv(io.StringIO(r.content.decode("utf-8")))
         # Check if any results were returned
         if len(tissue_exp_df) < 2:
-            logging.error(
+            logger.error(
                 f"Gene '{gene}' did not return any tissue expression results. \n"
                 "If the gene is an Ensembl ID, please set argument 'ensembl=True' (for terminal, add flag: [--ensembl])."
             )