In [35]:
import pandas as pd

# Example original DataFrame
df_original = pd.DataFrame({
    'mutation': ['p.R175H', 'c.5266dupC', 'p.L858R', 'p.V600E'],
    'gene': ['TP53', 'BRCA1', 'EGFR', 'BRAF']
})

# Example df with mutation_cdna mapping
df = pd.DataFrame({
    'mutation': ['p.R175H', 'c.5266dupC', 'p.L858R'],
    'mutation_cdna': ['c.524G>A', 'c.5266dupC', 'c.2573T>G']
})

# Merge on 'mutation' column, keeping all rows from df_original
df_merged = df_original.merge(df[['mutation', 'mutation_cdna']], on='mutation', how='left')

# Fill missing mutation_cdna values with an empty string if needed
df_merged['mutation_cdna'] = df_merged['mutation_cdna'].fillna('')

print(df_merged)


     mutation   gene mutation_cdna
0     p.R175H   TP53      c.524G>A
1  c.5266dupC  BRCA1    c.5266dupC
2     p.L858R   EGFR     c.2573T>G
3     p.V600E   BRAF              


In [17]:
import time
start_time = time.time()
variants = ["1"] * 1_000_000_000
end_time = time.time()
print(f"Time taken: {end_time - start_time} seconds")

Time taken: 1.6109421253204346 seconds


In [27]:
import pandas as pd

# Create a toy DataFrame
variants = pd.DataFrame({
    'Gene': ['TP53', 'BRCA1', 'EGFR'],
    'Mutation': ['p.R175H', 'c.5266dupC', 'p.L858R'],
    'Frequency': [0.3, 0.15, 0.4]
})

# Rename the variable
mutations = variants
del variants  # Optional


    Gene    Mutation  Frequency
0   TP53     p.R175H       0.30
1  BRCA1  c.5266dupC       0.15
2   EGFR     p.L858R       0.40


In [29]:
mutations["test"] = mutations["Frequency"] * 2

In [31]:
mutations["Frequency"] = mutations["Frequency"].astype("float32")

In [1]:
import varseek as vk
print(help(vk.ref))

Help on function ref in module varseek.varseek_ref:

ref(sequences, mutations, filters=('dlist_substring:equal=none', 'pseudoaligned_to_human_reference_despite_not_truly_aligning:is_not_true', 'dlist:equal=none', 'number_of_kmers_with_overlap_to_other_mcrs_items_in_mcrs_reference:less_than=999999', 'number_of_mcrs_items_with_overlapping_kmers_in_mcrs_reference:less_than=999999', 'longest_homopolymer_length:bottom_percent=99.99', 'triplet_complexity:top_percent=99.9'), mode=None, dlist=False, config=None, out='.', index_out=None, t2g_out=None, download=False, dry_run=False, list_downloadable_references=False, minimum_info_columns=True, overwrite=False, threads=2, verbose=True, **kwargs)
        Create a reference index and t2g file for variant screening with varseek count. Wraps around varseek build, varseek info, varseek filter, and kb ref.
    
        # Required input argument:
        - sequences     (str) Path to the fasta file containing the sequences to have the mutations added, 

In [14]:
import subprocess
import sys

def check_and_install_seqtk():
    try:
        # Try running `seqtk` to check if it's installed
        result = subprocess.run(["seqtk"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        
        if result.returncode == 0:
            print("‚úÖ seqtk is installed and working.")
            return True
        else:
            print("‚ö†Ô∏è seqtk is not configured properly.")
    except FileNotFoundError:
        print("‚ùå seqtk is not installed.")

    # Attempt installation
    install_seqtk()
    return False

def install_seqtk():
    print("üîÑ Attempting to install seqtk...")

    if sys.platform == "darwin":  # macOS
        install_cmd = ["brew", "install", "seqtk"]
    elif shutil.which("mamba"):
        install_cmd = ["mamba", "install", "-y", "seqtk", "-c", "bioconda"]
    elif shutil.which("conda"):
        install_cmd = ["conda", "install", "-y", "seqtk", "-c", "bioconda"]
    elif shutil.which("apt"):
        install_cmd = ["sudo", "apt", "install", "-y", "seqtk"]
    elif shutil.which("yum"):
        install_cmd = ["sudo", "yum", "install", "-y", "seqtk"]
    else:
        print("‚ùå No supported package manager found. Install seqtk manually.")
        return

    try:
        subprocess.run(install_cmd, check=True)
        print("‚úÖ seqtk installed successfully!")
    except subprocess.CalledProcessError:
        print("‚ùå Installation failed. Install manually.")

# Run check
check_and_install_seqtk()


‚ö†Ô∏è seqtk is not configured properly.
üîÑ Attempting to install seqtk...


KeyboardInterrupt: 

==> Auto-updating Homebrew...
Adjust how often this is run with HOMEBREW_AUTO_UPDATE_SECS or disable with
HOMEBREW_NO_AUTO_UPDATE. Hide these hints with HOMEBREW_NO_ENV_HINTS (see `man brew`).
==> Downloading https://ghcr.io/v2/homebrew/portable-ruby/portable-ruby/blobs/sha256:4ffc8607e08e9bd536f1df71643b2ecb4cea1a15be9226f297008bc34d0bc8e2
######################################################################## 100.0%
==> Pouring portable-ruby-3.3.7.el_capitan.bottle.tar.gz
==> Auto-updated Homebrew!
==> Updated Homebrew from 4.4.6 (c81cd0b929) to 4.4.19 (4ee6e96bdf).
Updated 3 taps (brewsci/bio, homebrew/core and homebrew/cask).
==> New Formulae
acme.sh
acronym
aliae
alive2
ampl-asl
ansible@10
asciigen
atac
aws-c-auth
aws-c-cal
aws-c-common
aws-c-compression
aws-c-event-stream
aws-c-http
aws-c-io
aws-c-mqtt
aws-c-s3
aws-c-sdkutils
aws-checksums
aws-crt-cpp
azure-core-cpp
azure-storage-blobs-cpp
azure-storage-common-cpp
azurehound
babelfish
beanquery
bender
binocle
bold
brewsci/bio/bo

In [1]:
import inspect
import re
import varseek as vk

In [10]:
from varseek.utils import vcf_to_dataframe
vcf_file_path = "/Users/joeyrich/Downloads/sample.vcf"
vcf_df = vcf_to_dataframe(vcf_file_path, additional_columns = True, explode_alt = True)

[W::vcf_parse] Contig '19' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '20' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig 'X' is not defined in the header. (Quick workaround: index the file with tabix.)


In [11]:
import pandas as pd



In [12]:
vcf_df[["POS", "REF", "ALT", var_column]]

Unnamed: 0,POS,REF,ALT,mut_column
0,111,A,C,g.111A>C
1,112,A,G,g.112A>G
2,14370,G,A,g.14370G>A
3,17330,T,A,g.17330T>A
4,1110696,A,G,g.1110696A>G
5,1110696,A,T,g.1110696A>T
6,1230237,T,,g.UNKNOWN
7,1234567,G,GA,g.1234567_1234568insA
8,1234567,G,GAC,g.1234567_1234568insAC
9,1235237,T,,g.UNKNOWN


In [7]:
import inspect
from varseek.varseek_build import build  # Import the function

# Get function signature
signature = inspect.signature(build)

# Retrieve the default value for 'k'
k_default = signature.parameters["w"].default

print(f"Default value of k: {k_default}")


Default value of k: 54


In [19]:
import pandas as pd

# Create a toy DataFrame
data = {"semicolon_count": [0, 0, 1, 2, 3]}
mutation_metadata_df = pd.DataFrame(data)

# Compute the sum as per your requirement
total_sum = (mutation_metadata_df.loc[mutation_metadata_df["semicolon_count"] > 0, "semicolon_count"] + 1).sum()

print("Toy DataFrame:")
print(mutation_metadata_df)
print("\nTotal Sum:", total_sum)


Toy DataFrame:
   semicolon_count
0                0
1                0
2                1
3                2
4                3

Total Sum: 9


In [9]:
for key in signature:
    print(f"Parameter: {key}")
    print(f"Default: {signature.parameters[key].default}")
    print(f"Type: {signature.parameters[key].annotation}")
    print(f"Description: {signature.parameters[key].annotation.__doc__}")
    print()
    print()

TypeError: 'Signature' object is not iterable

In [None]:
explicit_parameters_vk_build = vk.utils.get_set_of_parameters_from_function_signature(vk.varseek_build.build)
allowable_kwargs_vk_build = vk.utils.get_set_of_allowable_kwargs(vk.varseek_build.build)
explicit_parameters_vk_build = explicit_parameters_vk_build.union(allowable_kwargs_vk_build)

In [5]:
if not isinstance(value, int) and not (isinstance(value, str) and value.isdigit()):
    print("Value is not an integer or a string that can be converted to an integer")

{1, 2, 3, 4, 5}

In [4]:
allowable_kwargs_vk_build

{'cosmic_email',
 'cosmic_grch',
 'cosmic_password',
 'cosmic_release',
 'insertion_size_limit',
 'merge_identical',
 'min_seq_len',
 'optimize_flanking_regions',
 'remove_seqs_with_wt_kmers',
 'use_IDs',
 'required_insertion_overlap_length',
 'save_files',
 'vcrs_strandedness'}

In [3]:
explicit_parameters_vk_build

{'dry_run',
 'filtering_report_text_out',
 'gtf',
 'gtf_transcript_id_column',
 'id_to_header_csv_out',
 'k',
 'max_ambiguous',
 'mcrs_fasta_out',
 'mcrs_t2g_out',
 'mut_column',
 'mut_id_column',
 'mutations',
 'mutations_updated_csv_out',
 'out',
 'overwrite',
 'reference_out_dir',
 'removed_variants_text_out',
 'return_mutation_output',
 'save_filtering_report_text',
 'save_mutations_updated_csv',
 'save_removed_variants_text',
 'save_wt_mcrs_fasta_and_t2g',
 'seq_id_column',
 'sequences',
 'store_full_sequences',
 'translate',
 'translate_end',
 'translate_start',
 'verbose',
 'w',
 'wt_mcrs_fasta_out',
 'wt_mcrs_t2g_out'}