Debugging

pachterlab · May 25, 2024 · 7de31ce · 7de31ce
1 parent 2d29c50
commit 7de31ce
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 16 deletions.
diff --git a/gget/gget_cosmic.py b/gget/gget_cosmic.py
@@ -16,17 +16,17 @@
 
 logger = set_up_logger()
 
+
 def is_valid_email(email):
     """
     Check if an e-mail address is valid.
     """
-    email_pattern = re.compile(
-        r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"
-    )
+    email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
 
     return re.match(email_pattern, email) is not None
 
-def download_reference(download_link, tar_folder_path, file_path, verbose) :
+
+def download_reference(download_link, tar_folder_path, file_path, verbose):
     email = input("Please enter your COSMIC email: ")
     if not is_valid_email(email):
         raise ValueError("The email address is not valid.")
@@ -130,7 +130,6 @@ def select_reference(
             f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh{grch_version}.tsv"
         )
 
-
     tar_folder_path = os.path.join(reference_dir, tarred_folder)
     file_path = os.path.join(tar_folder_path, contained_file)
 
@@ -155,7 +154,7 @@ def select_reference(
         else:
             proceed = (
                 input(
-                    "Downloading complete databases from COSMIC requires an account (https://cancer.sanger.ac.uk/cosmic/register; free for academic use, license for commercial use). Would you like to proceed? "
+                    "Downloading complete databases from COSMIC requires an account (https://cancer.sanger.ac.uk/cosmic/register; free for academic use, license for commercial use).\nWould you like to proceed? "
                 )
                 .strip()
                 .lower()
@@ -210,8 +209,8 @@ def cosmic(
     - gget_mutate     (True/False) whether to create a modified version of the database for use with gget mutate. Default: True
 
     General args:
-    - out             (str) Path to folder the database will be downloaded into. 
-                      Default: None 
+    - out             (str) Path to folder the database will be downloaded into.
+                      Default: None
                       -> When download_cosmic=False: Results will be returned to standard out
                       -> When download_cosmic=True: Database will be downloaded into current working directory
     - verbose         (True/False) whether to print progress information. Default: True
@@ -296,10 +295,13 @@ def cosmic(
                     }
                 )
 
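+            # Remove version numbers from Ensembl IDs. NOTE(review): .str.split(".")[0] selects the Series entry labelled 0 (one row's list of pieces), not the first piece of every row; the per-row form is .str.split(".").str[0] -- confirm and fix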
+            df["seq_ID"] = df["seq_ID"].str.split(".")[0]
+
             # Get mut_ID column (by combining GENOMIC_MUTATION_ID and MUTATION_URL/MUTATION_ID)
             df["GENOMIC_MUTATION_ID"] = df["GENOMIC_MUTATION_ID"].fillna("NA")
-            df['GENOMIC_MUTATION_ID'] = df['GENOMIC_MUTATION_ID'].astype(str)
-            df['MUTATION_ID'] = df['MUTATION_ID'].astype(str)
+            df["GENOMIC_MUTATION_ID"] = df["GENOMIC_MUTATION_ID"].astype(str)
+            df["MUTATION_ID"] = df["MUTATION_ID"].astype(str)
             df["mut_ID"] = df["GENOMIC_MUTATION_ID"] + "_" + df["MUTATION_ID"]
             df = df.drop(columns=["GENOMIC_MUTATION_ID", "MUTATION_ID"])
 

diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py
@@ -5,6 +5,7 @@
 tqdm.pandas()
 
 from .utils import read_fasta, set_up_logger
+
 logger = set_up_logger()
 
 # Define global variables to count occurrences of weird mutations
@@ -289,17 +290,17 @@ def mutate(
 
     Args:
     - sequences     (str) Path to the fasta file containing the sequences to be mutated, e.g., 'seqs.fa'.
-                    Sequence identifiers following the '>' character must correspond to the identifiers 
+                    Sequence identifiers following the '>' character must correspond to the identifiers
                     in the seq_ID column of 'mutations'.
-                    NOTE: Only string until first space or dot will be used as sequence identifier 
+                    NOTE: Only the string up to the first space or dot will be used as the sequence identifier
                     - Version numbers of Ensembl IDs will be ignored.
 
                     Example:
                     >seq1 (or ENSG00000106443)
                     ACTGCGATAGACT
                     >seq2
                     AGATCGCTAG
-                            
+
                     Alternatively: Input sequence(s) as a string or list, e.g. 'AGCTAGCT' or 'ACTGCTAGCT' 'AGCTAGCT'.
     - mutations     Path to csv or tsv file (str) (e.g., 'mutations.csv') or data frame (DataFrame object),
                     containing information about the mutations in the following format:
@@ -342,6 +343,9 @@ def mutate(
     ambiguous_position_mutations = 0
     cosmic_incorrect_wt_base = 0
 
+    print(sequences)
+    print(mutations)
+
     # Load input sequences and their identifiers from fasta file
     if "." in sequences:
         titles, seqs = read_fasta(sequences)
@@ -370,7 +374,6 @@ def mutate(
     # Read in 'mutations' if passed as filepath to comma-separated csv
     if isinstance(mutations, str) and ".csv" in mutations:
         mutations = pd.read_csv(mutations)
-        print(mutations)
 
     elif isinstance(mutations, str) and ".tsv" in mutations:
         mutations = pd.read_csv(mutations, sep="\t")
@@ -418,8 +421,6 @@ def mutate(
             - A list of mutations (the number of mutations must equal the number of input sequences) (e.g. ['c.2C>T', 'c.1A>C'])
             """
         )
-
-    print(mutations)
 
     seq_dict = {}
     for title, seq in zip(titles, seqs):