Skip to content

Commit

Permalink
Debugging
Browse files Browse the repository at this point in the history
  • Loading branch information
lauraluebbert committed May 25, 2024
1 parent 2d29c50 commit 7de31ce
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 16 deletions.
22 changes: 12 additions & 10 deletions gget/gget_cosmic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,17 @@

logger = set_up_logger()


def is_valid_email(email):
    """
    Check if an e-mail address is valid.

    Args:
    - email   (str) The e-mail address to check.

    Returns True if 'email' consists of a local part, an '@', and a domain
    containing at least one dot, using only the characters permitted by the
    pattern below. Returns False otherwise (including for non-string input).
    """
    # Non-string input (e.g. None) can never be a valid address;
    # report it as invalid rather than raising a TypeError from the regex engine.
    if not isinstance(email, str):
        return False

    email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")

    # fullmatch (instead of match) is used because '$' also matches just before
    # a trailing newline, which would let "user@example.com\n" pass validation.
    return email_pattern.fullmatch(email) is not None

def download_reference(download_link, tar_folder_path, file_path, verbose) :

def download_reference(download_link, tar_folder_path, file_path, verbose):
email = input("Please enter your COSMIC email: ")
if not is_valid_email(email):
raise ValueError("The email address is not valid.")
Expand Down Expand Up @@ -130,7 +130,6 @@ def select_reference(
f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh{grch_version}.tsv"
)


tar_folder_path = os.path.join(reference_dir, tarred_folder)
file_path = os.path.join(tar_folder_path, contained_file)

Expand All @@ -155,7 +154,7 @@ def select_reference(
else:
proceed = (
input(
"Downloading complete databases from COSMIC requires an account (https://cancer.sanger.ac.uk/cosmic/register; free for academic use, license for commercial use). Would you like to proceed? "
"Downloading complete databases from COSMIC requires an account (https://cancer.sanger.ac.uk/cosmic/register; free for academic use, license for commercial use).\nWould you like to proceed? "
)
.strip()
.lower()
Expand Down Expand Up @@ -210,8 +209,8 @@ def cosmic(
- gget_mutate (True/False) whether to create a modified version of the database for use with gget mutate. Default: True
General args:
- out (str) Path to folder the database will be downloaded into.
Default: None
- out (str) Path to folder the database will be downloaded into.
Default: None
-> When download_cosmic=False: Results will be returned to standard out
-> When download_cosmic=True: Database will be downloaded into current working directory
- verbose (True/False) whether to print progress information. Default: True
Expand Down Expand Up @@ -296,10 +295,13 @@ def cosmic(
}
)

# Remove version numbers from Ensembl IDs
df["seq_ID"] = df["seq_ID"].str.split(".")[0]

# Get mut_ID column (by combining GENOMIC_MUTATION_ID and MUTATION_URL/MUTATION_ID)
df["GENOMIC_MUTATION_ID"] = df["GENOMIC_MUTATION_ID"].fillna("NA")
df['GENOMIC_MUTATION_ID'] = df['GENOMIC_MUTATION_ID'].astype(str)
df['MUTATION_ID'] = df['MUTATION_ID'].astype(str)
df["GENOMIC_MUTATION_ID"] = df["GENOMIC_MUTATION_ID"].astype(str)
df["MUTATION_ID"] = df["MUTATION_ID"].astype(str)
df["mut_ID"] = df["GENOMIC_MUTATION_ID"] + "_" + df["MUTATION_ID"]
df = df.drop(columns=["GENOMIC_MUTATION_ID", "MUTATION_ID"])

Expand Down
13 changes: 7 additions & 6 deletions gget/gget_mutate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
tqdm.pandas()

from .utils import read_fasta, set_up_logger

logger = set_up_logger()

# Define global variables to count occurrences of weird mutations
Expand Down Expand Up @@ -289,17 +290,17 @@ def mutate(
Args:
- sequences (str) Path to the fasta file containing the sequences to be mutated, e.g., 'seqs.fa'.
Sequence identifiers following the '>' character must correspond to the identifiers
Sequence identifiers following the '>' character must correspond to the identifiers
in the seq_ID column of 'mutations'.
NOTE: Only string until first space or dot will be used as sequence identifier
NOTE: Only string until first space or dot will be used as sequence identifier
- Version numbers of Ensembl IDs will be ignored.
Example:
>seq1 (or ENSG00000106443)
ACTGCGATAGACT
>seq2
AGATCGCTAG
Alternatively: Input sequence(s) as a string or list, e.g. 'AGCTAGCT' or 'ACTGCTAGCT' 'AGCTAGCT'.
- mutations Path to csv or tsv file (str) (e.g., 'mutations.csv') or data frame (DataFrame object),
containing information about the mutations in the following format:
Expand Down Expand Up @@ -342,6 +343,9 @@ def mutate(
ambiguous_position_mutations = 0
cosmic_incorrect_wt_base = 0

print(sequences)
print(mutations)

# Load input sequences and their identifiers from fasta file
if "." in sequences:
titles, seqs = read_fasta(sequences)
Expand Down Expand Up @@ -370,7 +374,6 @@ def mutate(
# Read in 'mutations' if passed as filepath to comma-separated csv
if isinstance(mutations, str) and ".csv" in mutations:
mutations = pd.read_csv(mutations)
print(mutations)

elif isinstance(mutations, str) and ".tsv" in mutations:
mutations = pd.read_csv(mutations, sep="\t")
Expand Down Expand Up @@ -418,8 +421,6 @@ def mutate(
- A list of mutations (the number of mutations must equal the number of input sequences) (e.g. ['c.2C>T', 'c.1A>C'])
"""
)

print(mutations)

seq_dict = {}
for title, seq in zip(titles, seqs):
Expand Down

0 comments on commit 7de31ce

Please sign in to comment.