# Imports and data loading

In [44]:
import sourmash
import screed
import os
import re
import pandas as pd

from tqdm import tqdm

from kmer_utils import get_encoded_kmer_hashvals

### Read in human-mouse homologs

In [2]:


SYMBOL_SEPARATOR = '---'

In [3]:
human_mouse_homologs = pd.read_csv('HOM_MouseHumanSequence.rpt', sep='\t')
print(human_mouse_homologs.shape)
human_mouse_homologs.head()

(43117, 13)


Unnamed: 0,DB Class Key,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HGNC ID,OMIM Gene ID,Genetic Location,"Genomic Coordinates (mouse: , human: )",Nucleotide RefSeq IDs,Protein RefSeq IDs,SWISS_PROT IDs
0,39806032,"mouse, laboratory",10090,Gdnf,14573,MGI:107430,,,Chr15 3.8 cM,Chr15:7840327-7867056(+),"NM_010275,NM_001301333,NM_001301357,NM_001301332","NP_001288262,NP_034405,NP_001288286,NP_001288261",P48540
1,39806032,human,9606,GDNF,2668,,HGNC:4232,OMIM:600837,Chr5 p13.2,Chr5:37812677-37840044(-),"NM_199231,NM_001278098,NM_001190469,NM_000514,...","NP_001177398,NP_000505,NP_001177397,XP_0168648...",P39905
2,39806033,"mouse, laboratory",10090,Npy4r,19065,MGI:105374,,,Chr14 20.8 cM,Chr14:33867603-33874376(-),NM_008919,NP_032945,Q61041
3,39806033,human,9606,NPY4R,5540,,HGNC:9329,OMIM:601790,Chr10 q11.22,Chr10:46458551-46470668(-),"NM_001278794,NM_005972","NP_005963,XP_011538238,XP_011538239,XP_0168718...",
4,39806034,"mouse, laboratory",10090,Evx2,14029,MGI:95462,,,Chr2 44.13 cM,Chr2:74483335-74489901(-),"XM_006498728,XM_006498729,NM_007967","XP_006498792,NP_031993,XP_006498791",P49749


## Set output directory

In [4]:
outdir = '/Users/olgabot/botryllus/adhoc-analysis/2022-apr--gather-botryllus-in-human-mouse-with-kmers/'
! mkdir $outdir

mkdir: /Users/olgabot/botryllus/adhoc-analysis/2022-apr--gather-botryllus-in-human-mouse-with-kmers/: File exists


In [5]:
def sanitize(x):
    """Clean a gene name so it is a nice filename"""
    return x.replace(' ', '_').replace('.', '-')

# Iterate over all botryllus proteins to do `gather` on human and mouse

In [12]:
botryllus_dir = '/Users/olgabot/botryllus/data/botryllus-proteins/'

In [13]:
human_gencode_dir = '/Users/olgabot/botryllus/data/gencode/v38/'
mouse_gencode_dir = '/Users/olgabot/botryllus/data/gencode/M28/'

In [14]:
ls $human_gencode_dir

GRCh38.primary_assembly.genome.fa
GRCh38.primary_assembly.genome.fa.fai
gencode.v38.basic.annotation.gff3
gencode.v38.basic.annotation.gtf.gz
gencode.v38.basic.annotation.protein.fa
gencode.v38.basic.annotation.protein.fa.hp.k24.scale5.sig
gencode.v38.basic.annotation.protein.fa.sig
gencode.v38.chr_patch_hapl_scaff.basic.annotation.gff3.gz
gencode.v38.pc_translations.fa.gz
gencode.v38.pc_translations.fa.gz.sig


In [15]:
ls $mouse_gencode_dir

GRCm39.primary_assembly.genome.fa
GRCm39.primary_assembly.genome.fa.fai
gencode.vM28.basic.annotation.gff3
gencode.vM28.basic.annotation.protein.fa
gencode.vM28.basic.annotation.protein.fa.hp.k24.scale10.sig
gencode.vM28.basic.annotation.protein.fa.hp.k24.scale5.sig


## set signature files

In [16]:
human_sigfile = os.path.join(
    human_gencode_dir, "gencode.v38.basic.annotation.protein.fa.hp.k24.scale5.sig"
)
mouse_sigfile = os.path.join(
    mouse_gencode_dir, "gencode.vM28.basic.annotation.protein.fa.hp.k24.scale5.sig"
)

In [11]:
botryllus_sigfile = os.path.join(botryllus_dir, 'Bs_proteins.fa.hp.k24.scale5.sig')

## Load human and mouse signatures as indecies/databases

In [29]:
human_db = sourmash.load_file_as_index(human_sigfile)

In [30]:
mouse_db = sourmash.load_file_as_index(mouse_sigfile)

In [31]:
dbs = {'human': human_db, 'mouse': mouse_db}

## Load botryllus protein signatures

In [32]:
botryllus_sigs = sourmash.load_file_as_signatures(botryllus_sigfile)

### Load BHF alone for testing

In [33]:
query_sig = next(botryllus_sigs)
query_sig

SourmashSignature('BHF', 3ac7314a)

## Load human, mouse, botryllus fastas to read in sequences to write matching k-mers

In [34]:
def fasta_to_dict(fasta_filename):
    sequences = {}
    with screed.open(fasta_filename) as records:
        for record in records:
            sequences[record['name']] = record['sequence']
            
    return sequences

In [85]:
botryllus_sequences = fasta_to_dict(os.path.join(botryllus_dir, 'Bs_proteins.fa.gz'))
n_botryllus_seqs = len(botryllus_sequences)
n_botryllus_seqs

72617

In [36]:
fastas = {
    "human": os.path.join(human_gencode_dir, "gencode.v38.basic.annotation.protein.fa"),
    "mouse": os.path.join(
        mouse_gencode_dir, "gencode.vM28.basic.annotation.protein.fa"
    ),
}
fastas

{'human': '/Users/olgabot/botryllus/data/gencode/v38/gencode.v38.basic.annotation.protein.fa',
 'mouse': '/Users/olgabot/botryllus/data/gencode/M28/gencode.vM28.basic.annotation.protein.fa'}

In [39]:
%time sequences = {k: fasta_to_dict(fasta) for k, fasta in fastas.items()}
for species, seqs in sequences.items():
    print(f"{species}, {len(seqs)}")

CPU times: user 782 ms, sys: 43 ms, total: 825 ms
Wall time: 826 ms
human, 61543
mouse, 46083


# Test iterating over human and mouse with BHF

In [75]:
threshold_bp = 10
scaled = 5
query_seq = botryllus_sequences[query_sig.name]

GENE_SYMBOL_PATTERN = re.compile('gene_name=([\w\d]+)')
class_col = 'DB Class Key'
homolog_group_col = 'homolog_group'
coord_col = 'Genomic Coordinates (mouse: , human: )'

query_kmer_hashvals = get_encoded_kmer_hashvals(query_seq, sig.name, sigobj=sig)

dfs = []

for species, db in dbs.items():
    species_seqs = sequences[species]
    counter = db.counter_gather(query_sig, threshold_bp=threshold_bp)

    noident_mh = sig.minhash.to_mutable()
    

    # subtract found hashes as we can.
    for i, found_sig in enumerate(counter.siglist):
        containment = found_sig.contained_by(query_sig)
        symbol = re.findall(GENE_SYMBOL_PATTERN, found_sig.name)[0]
        found_seq = species_seqs[found_sig.name]

        found_kmer_hashvals = get_encoded_kmer_hashvals(
            found_seq, found_sig.name, sigobj=query_sig
        )
        contained_kmer_hashvals = query_kmer_hashvals.merge(
            found_kmer_hashvals, suffixes=("_query", "_found"), on=("hashval", "kmer_hp")
        )
        contained_kmer_hashvals['species'] = species
        contained_kmer_hashvals['n_kmers'] = len(contained_kmer_hashvals)
        contained_kmer_hashvals['intersect_bp'] = scaled * contained_kmer_hashvals['n_kmers']
        contained_kmer_hashvals['containment'] = containment
        contained_kmer_hashvals['symbol'] = symbol
        contained_kmer_hashvals['found_i'] = i
        
        dfs.append(contained_kmer_hashvals)
        

query_kmer_matches = pd.concat(dfs, ignore_index=True)

query_kmer_matches[homolog_group_col] = None
# Annotate any genes that are human-mouse homologs
for symbol, df in query_kmer_matches.groupby('symbol'):
    found_homolog_subset = human_mouse_homologs.query('Symbol == @symbol')
    
    if found_homolog_subset.empty:
        # No matches found in mouse-human homologs, continue on to next one
        continue
    
    found_homolog_rows = human_mouse_homologs[class_col].isin(found_homolog_subset[class_col])
    found_homolog_groups = human_mouse_homologs.loc[found_homolog_rows]
    homolog_group_name = SYMBOL_SEPARATOR.join(sorted(found_homolog_groups['Symbol']))
    
    all_homologs_found = found_homolog_groups['Symbol'].isin(query_kmer_matches.symbol).all()
    
    # Assign values
    query_kmer_matches.loc[df.index, 'genomic_coord'] = found_homolog_subset[coord_col].values[0]
    query_kmer_matches.loc[df.index, homolog_group_col] = homolog_group_name
    query_kmer_matches.loc[df.index, 'all_homologs_found'] = all_homologs_found



csv = os.path.join(outdir, f'{sanitize(query_sig.name)}.csv')
query_kmer_matches.to_csv(csv, index=False)



#     # -- Don't need to save a prefetch csv, skip this -- #

# #     # optionally calculate and save prefetch csv
# #     if prefetch_csvout_fp:
# #         assert scaled
# #         # calculate intersection stats and info
# #         prefetch_result = calculate_prefetch_info(prefetch_query, found_sig, scaled, args.threshold_bp)
# #         # remove match and query signatures; write result to prefetch csv
# #         d = dict(prefetch_result._asdict())
# #         del d['match']
# #         del d['query']
# #         prefetch_csvout_w.writerow(d)

# counters.append(counter)

query_kmer_matches

Unnamed: 0,i_query,kmer_query,kmer_hp,hashval,name_query,i_found,kmer_found,name_found,species,n_kmers,intersect_bp,containment,symbol,found_i,homolog_group,genomic_coord,all_homologs_found
0,165,DDRFGERLIDRAQNKYAPLDEKQR,ppphhpphhpphppphhhhppppp,13592530723998561865,BHF,154,KSEFLSTAPRSLRKRLIVPRSHSD,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,human,6,30,0.015873,ORC2,0,ORC2---Orc2,Chr2:200908981-200963703(-),False
1,166,DRFGERLIDRAQNKYAPLDEKQRS,pphhpphhpphppphhhhpppppp,18007806196568601670,BHF,155,SEFLSTAPRSLRKRLIVPRSHSDS,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,human,6,30,0.015873,ORC2,0,ORC2---Orc2,Chr2:200908981-200963703(-),False
2,167,RFGERLIDRAQNKYAPLDEKQRSE,phhpphhpphppphhhhppppppp,4680963950811137194,BHF,156,EFLSTAPRSLRKRLIVPRSHSDSE,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,human,6,30,0.015873,ORC2,0,ORC2---Orc2,Chr2:200908981-200963703(-),False
3,168,FGERLIDRAQNKYAPLDEKQRSES,hhpphhpphppphhhhpppppppp,343616811934702161,BHF,157,FLSTAPRSLRKRLIVPRSHSDSES,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,human,6,30,0.015873,ORC2,0,ORC2---Orc2,Chr2:200908981-200963703(-),False
4,169,GERLIDRAQNKYAPLDEKQRSESH,hpphhpphppphhhhppppppppp,3434887395817678525,BHF,158,LSTAPRSLRKRLIVPRSHSDSESE,ENST00000234296.7 gene_id=ENSG00000115942.9;tr...,human,6,30,0.015873,ORC2,0,ORC2---Orc2,Chr2:200908981-200963703(-),False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146,74,TKKKAKKDKRKNKPPKKDSETSKP,pppphpppppppphhpppppppph,518345984864610128,BHF,654,HRRSVDRKRRDTSGLERSHKSSKG,ENSMUST00000021381.6 gene_id=ENSMUSG0000002099...,mouse,9,45,0.013889,Pnn,3,PNN---Pnn,Chr12:59113705-59120803(+),True
147,75,KKKAKKDKRKNKPPKKDSETSKPA,ppphpppppppphhpppppppphh,831034578771461622,BHF,655,RRSVDRKRRDTSGLERSHKSSKGG,ENSMUST00000021381.6 gene_id=ENSMUSG0000002099...,mouse,9,45,0.013889,Pnn,3,PNN---Pnn,Chr12:59113705-59120803(+),True
148,76,KKAKKDKRKNKPPKKDSETSKPAQ,pphpppppppphhpppppppphhp,12911956892028419384,BHF,656,RSVDRKRRDTSGLERSHKSSKGGS,ENSMUST00000021381.6 gene_id=ENSMUSG0000002099...,mouse,9,45,0.013889,Pnn,3,PNN---Pnn,Chr12:59113705-59120803(+),True
149,77,KAKKDKRKNKPPKKDSETSKPAQT,phpppppppphhpppppppphhpp,17684580819255158518,BHF,657,SVDRKRRDTSGLERSHKSSKGGSS,ENSMUST00000021381.6 gene_id=ENSMUSG0000002099...,mouse,9,45,0.013889,Pnn,3,PNN---Pnn,Chr12:59113705-59120803(+),True


# Move code into functions for a separate file to run on all botryllus sequences

In [92]:
%%file single_gather_multi_db.py

import re
import pandas as pd
import sourmash
import screed
import os
import re
import pandas as pd

from tqdm import tqdm

from kmer_utils import get_encoded_kmer_hashvals

GENE_SYMBOL_PATTERN = re.compile("gene_name=([\w\d]+)")


query_kmer_hashvals = get_encoded_kmer_hashvals(query_seq, sig.name, sigobj=sig)


def annotate_found_sig(
    found_sig, query_sig, found_seq, species, gene_symbol_pattern=GENE_SYMBOL_PATTERN
):
    """Get the containment and overlapping k-mers for each found signature"""
    containment = found_sig.contained_by(query_sig)
    symbol = re.findall(gene_symbol_pattern, found_sig.name)[0]

    found_kmer_hashvals = get_encoded_kmer_hashvals(
        found_seq, found_sig.name, sigobj=query_sig
    )
    contained_kmer_hashvals = query_kmer_hashvals.merge(
        found_kmer_hashvals, suffixes=("_query", "_found"), on=("hashval", "kmer_hp")
    )
    contained_kmer_hashvals["species"] = species
    contained_kmer_hashvals["n_kmers"] = len(contained_kmer_hashvals)
    contained_kmer_hashvals["intersect_bp"] = (
        scaled * contained_kmer_hashvals["n_kmers"]
    )
    contained_kmer_hashvals["containment"] = containment
    contained_kmer_hashvals["symbol"] = symbol
    contained_kmer_hashvals["found_i"] = i
    return contained_kmer_hashvals


def add_homology_information_to_matches(
    query_kmer_matches,
    human_mouse_homologs,
    class_col="DB Class Key",
    homolog_group_col="homolog_group",
    coord_col="Genomic Coordinates (mouse: , human: )",
    symbol_separator=SYMBOL_SEPARATOR,
):
    query_kmer_matches[homolog_group_col] = None

    # Annotate any genes that are human-mouse homologs
    for symbol, df in query_kmer_matches.groupby("symbol"):
        found_homolog_subset = human_mouse_homologs.query("Symbol == @symbol")

        if found_homolog_subset.empty:
            # No matches found in mouse-human homologs, continue on to next one
            continue

        found_homolog_rows = human_mouse_homologs[class_col].isin(
            found_homolog_subset[class_col]
        )
        found_homolog_groups = human_mouse_homologs.loc[found_homolog_rows]
        homolog_group_name = symbol_separator.join(
            sorted(found_homolog_groups["Symbol"])
        )

        all_homologs_found = (
            found_homolog_groups["Symbol"].isin(query_kmer_matches.symbol).all()
        )

        # Assign values
        query_kmer_matches.loc[df.index, "genomic_coord"] = found_homolog_subset[
            coord_col
        ].values[0]
        query_kmer_matches.loc[df.index, homolog_group_col] = homolog_group_name
        query_kmer_matches.loc[df.index, "all_homologs_found"] = all_homologs_found
    return query_kmer_matches


def single_gather_multi_db(query_sig, query_seq, dbs, sequences, threshold_bp=10):
    dfs = []
    for species, db in dbs.items():
        species_seqs = sequences[species]
        counter = db.counter_gather(query_sig, threshold_bp=threshold_bp)

        noident_mh = query_sig.minhash.to_mutable()

        # subtract found hashes as we can.
        for i, found_sig in enumerate(counter.siglist):
            found_seq = species_seqs[found_sig.name]
            df = annotate_found_sig(found_sig, query_sig, found_seq, species)
            dfs.append(contained_kmer_hashvals)

    query_kmer_matches = pd.concat(dfs, ignore_index=True)
    query_kmer_matches = add_homology_information_to_matches(
        query_kmer_matches, human_mouse_homologs
    )

    csv = os.path.join(outdir, f"{sanitize(query_sig.name)}.csv")
    query_kmer_matches.to_csv(csv, index=False)

Overwriting single_gather_multi_db.py


## Iterate over all botryllus signatures

In [93]:
from single_gather_multi_db import single_gather_multi_db

for query_sig in tqdm(botryllus_sigs, total=n_botryllus_seqs):
    query_seq = botryllus_sequences[query_sig.name]

    single_gather_multi_db(query_sig, query_seq, dbs, sequences)

NameError: name 'query_seq' is not defined

In [78]:
# query_kmer_matches.query('all_homologs_found == True')

In [74]:
found_homolog_groups['Symbol'].isin(query_kmer_matches.symbol).all()

False

In [69]:
found_homolog_subset

Unnamed: 0,DB Class Key,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HGNC ID,OMIM Gene ID,Genetic Location,"Genomic Coordinates (mouse: , human: )",Nucleotide RefSeq IDs,Protein RefSeq IDs,SWISS_PROT IDs


In [67]:
found_homolog_subset[coord_col].values[0]

'Chr6:87155551-87264172(+)'

In [56]:
found_homolog_subset = human_mouse_homologs.query('Symbol == "PNN"')
found_homolog_subset

Unnamed: 0,DB Class Key,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HGNC ID,OMIM Gene ID,Genetic Location,"Genomic Coordinates (mouse: , human: )",Nucleotide RefSeq IDs,Protein RefSeq IDs,SWISS_PROT IDs
7939,39809970,human,9606,PNN,5411,,HGNC:9162,OMIM:603154,Chr14 q21.1,Chr14:39175183-39183218(+),NM_002687,NP_002678,Q9H307


In [61]:
class_col = 'DB Class Key'
found_homolog_rows = human_mouse_homologs[class_col].isin(found_homolog_subset[class_col])
found_homolog_groups = human_mouse_homologs.loc[found_homolog_rows]
found_homolog_groups

Unnamed: 0,DB Class Key,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HGNC ID,OMIM Gene ID,Genetic Location,"Genomic Coordinates (mouse: , human: )",Nucleotide RefSeq IDs,Protein RefSeq IDs,SWISS_PROT IDs
7938,39809970,"mouse, laboratory",10090,Pnn,18949,MGI:1100514,,,Chr12 26 cM,Chr12:59113705-59120803(+),NM_008891,NP_032917,O35691
7939,39809970,human,9606,PNN,5411,,HGNC:9162,OMIM:603154,Chr14 q21.1,Chr14:39175183-39183218(+),NM_002687,NP_002678,Q9H307


In [63]:
homolog_group_name = SYMBOL_SEPARATOR.join(found_homolog_groups['Symbol'])
homolog_group_name

'Pnn---PNN'

In [62]:
coord_col = 'Genomic Coordinates (mouse: , human: )'

In [49]:
query_kmer_matches.pivot_table(index='symbol', columns='species', values='containment')

species,human,mouse
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
ACE2,0.013569,
BMP3,0.018519,
CACNA1G,0.004505,
CFAP43,0.006098,
Ccdc9,,0.017494
Gm18596,,0.040816
ORC2,0.015873,
PNN,0.013333,
Pnn,,0.013889
RNMT,0.034359,


In [50]:
query_kmer_matches.pivot_table(index='symbol', columns='species', values='n_kmers')

species,human,mouse
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
ACE2,6.0,
BMP3,4.0,
CACNA1G,2.0,
CFAP43,3.0,
Ccdc9,,4.0
Gm18596,,6.0
ORC2,6.0,
PNN,9.0,
Pnn,,9.0
RNMT,8.0,


In [None]:
found_seq = species_seqs[found_sig.name]

In [138]:
found_sig.contained_by(sig)


0.013157894736842105

In [90]:
query_seq = botryllus_sequences[sig.name]

In [93]:
query_kmer_hashvals = get_encoded_kmer_hashvals(query_seq, sig.name, sigobj=sig)
query_kmer_hashvals

Unnamed: 0,i,kmer,kmer_hp,hashval,name
0,0,MVHDTEQLLAQGHHEEETECGKYG,hhppppphhhphpppppppphphh,17134648382419275520,BHF
1,1,VHDTEQLLAQGHHEEETECGKYGK,hppppphhhphpppppppphphhp,1036020595944595459,BHF
2,2,HDTEQLLAQGHHEEETECGKYGKL,ppppphhhphpppppppphphhph,9354784992242920062,BHF
3,3,DTEQLLAQGHHEEETECGKYGKLP,pppphhhphpppppppphphhphh,14467040310155683947,BHF
4,4,TEQLLAQGHHEEETECGKYGKLPE,ppphhhphpppppppphphhphhp,2993707203445337902,BHF
...,...,...,...,...,...
224,224,SLQAGARSQTAFLNPQGAVSAALV,phphhhpppphhhphphhhphhhh,14182015708224275480,BHF
225,225,LQAGARSQTAFLNPQGAVSAALVQ,hphhhpppphhhphphhhphhhhp,17311038750468094191,BHF
226,226,QAGARSQTAFLNPQGAVSAALVQN,phhhpppphhhphphhhphhhhpp,2785373275731037188,BHF
227,227,AGARSQTAFLNPQGAVSAALVQNR,hhhpppphhhphphhhphhhhppp,16834236568811559007,BHF


In [95]:
found_kmer_hashvals = get_encoded_kmer_hashvals(found_seq, found_sig.name, sigobj=sig)
found_kmer_hashvals

Unnamed: 0,i,kmer,kmer_hp,hashval,name
0,0,MSSSSWLLLSLVAVTAAQSTIEEQ,hpppphhhhphhhhphhppphppp,4529367029267399939,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
1,1,SSSSWLLLSLVAVTAAQSTIEEQA,pppphhhhphhhhphhppphppph,8133022457627493460,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
2,2,SSSWLLLSLVAVTAAQSTIEEQAK,ppphhhhphhhhphhppphppphp,302439427842201040,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
3,3,SSWLLLSLVAVTAAQSTIEEQAKT,pphhhhphhhhphhppphppphpp,13296791544504571300,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
4,4,SWLLLSLVAVTAAQSTIEEQAKTF,phhhhphhhhphhppphppphpph,15826488194025053905,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
...,...,...,...,...,...
777,777,ENPYASIDISKGENNPGFQNTDDV,pphhhphphpphppphhhppppph,6156471769310373174,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
778,778,NPYASIDISKGENNPGFQNTDDVQ,phhhphphpphppphhhppppphp,2644353523020109479,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
779,779,PYASIDISKGENNPGFQNTDDVQT,hhhphphpphppphhhppppphpp,5578042643637926218,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
780,780,YASIDISKGENNPGFQNTDDVQTS,hhphphpphppphhhppppphppp,13938802521831342304,ENST00000252519.8 gene_id=ENSG00000130234.13;t...


In [97]:
contained_kmer_hashvals = query_kmer_hashvals.merge(found_kmer_hashvals, suffixes=('_query', '_found'), on=('hashval', 'kmer_hp'))
contained_kmer_hashvals

Unnamed: 0,i_query,kmer_query,kmer_hp,hashval,name_query,i_found,kmer_found,name_found
0,96,KPAQTTISRLPSNRNNNNANSFAT,phhppphpphhppppppphpphhp,2022470327997897771,BHF,97,QALQQNGSSVLSEDKSKRLNTILN,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
1,97,PAQTTISRLPSNRNNNNANSFATT,hhppphpphhppppppphpphhpp,6455666354323166068,BHF,98,ALQQNGSSVLSEDKSKRLNTILNT,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
2,98,AQTTISRLPSNRNNNNANSFATTY,hppphpphhppppppphpphhpph,16339746702010901143,BHF,99,LQQNGSSVLSEDKSKRLNTILNTM,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
3,99,QTTISRLPSNRNNNNANSFATTYE,ppphpphhppppppphpphhpphp,3122210595655102375,BHF,100,QQNGSSVLSEDKSKRLNTILNTMS,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
4,100,TTISRLPSNRNNNNANSFATTYEK,pphpphhppppppphpphhpphpp,4418245652551675301,BHF,101,QNGSSVLSEDKSKRLNTILNTMST,ENST00000252519.8 gene_id=ENSG00000130234.13;t...
5,101,TISRLPSNRNNNNANSFATTYEKF,phpphhppppppphpphhpphpph,6259314928490704384,BHF,102,NGSSVLSEDKSKRLNTILNTMSTI,ENST00000252519.8 gene_id=ENSG00000130234.13;t...


In [107]:
# counter.siglist

In [108]:
# Regex: https://regex101.com/r/Kd1r1U/1
[re.findall('transcript_name=[\w\d]+-\d+', x.name)[0].split('=')[-1] for x in counter.siglist]

['ORC2-201',
 'BMP3-201',
 'ZNF292-202',
 'ZNF292-201',
 'CFAP43-202',
 'RSF1-201',
 'RSF1-204',
 'PNN-201',
 'CACNA1G-209',
 'RNMT-203',
 'RNMT-202',
 'RNMT-209',
 'RNMT-206',
 'RNMT-201',
 'ACE2-209',
 'ACE2-210',
 'ACE2-206',
 'ACE2-211',
 'ACE2-202',
 'ACE2-207',
 'ACE2-201']

In [109]:
query = sig

In [110]:
ignore_abundance = False

In [111]:
## ok! now do gather -

found = []
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not ignore_abundance
orig_query_mh = query.minhash
gather_iter = GatherDatabases(query, counters,
                              threshold_bp=threshold_bp,
                              ignore_abundance=ignore_abundance,
                              noident_mh=noident_mh)

for result, weighted_missed in gather_iter:
    if not len(found):                # first result? print header.
        if is_abundance:
            print_results("")
            print_results("overlap     p_query p_match avg_abund")
            print_results("---------   ------- ------- ---------")
        else:
            print_results("")
            print_results("overlap     p_query p_match")
            print_results("---------   ------- -------")


    # print interim result & save in `found` list for later use
    pct_query = '{:.1f}%'.format(result.f_unique_weighted*100)
    pct_genome = '{:.1f}%'.format(result.f_match*100)
    name = result.match._display_name(40)

In [98]:
gather_iter.scaled

5

In [99]:
num_results = 100

In [102]:
found

[]

In [101]:
# report on thresholding -
if gather_iter.query:
    # if still a query, then we failed the threshold.
    notify(f'found less than {format_bp(threshold_bp)} in common. => exiting')

# basic reporting:
print_results(f'\nfound {len(found)} matches total;')
if num_results and len(found) == num_results:
    print_results(f'(truncated gather because --num-results={num_results})')

p_covered = (1 - weighted_missed) * 100
if is_abundance:
    print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query')
else:
    print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)')
print_results('')
if gather_iter.scaled != query.minhash.scaled:
    print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')


found 0 matches total;
the recovered matches hit 0.0% of the abundance-weighted query



[Kfound less than 10 bp  in common. => exiting


[]

In [39]:

# for sig in botryllus_sigs:

#     break

In [None]:
db.

In [43]:
db.gather??

[0;31mSignature:[0m [0mdb[0m[0;34m.[0m[0mgather[0m[0;34m([0m[0mquery[0m[0;34m,[0m [0mthreshold_bp[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mSource:[0m   
    [0;32mdef[0m [0mgather[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mquery[0m[0;34m,[0m [0mthreshold_bp[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m        [0;34m"Return the match with the best Jaccard containment in the Index."[0m[0;34m[0m
[0;34m[0m[0;34m[0m
[0;34m[0m        [0mresults[0m [0;34m=[0m [0;34m[[0m[0;34m][0m[0;34m[0m
[0;34m[0m        [0;32mfor[0m [0mresult[0m [0;32min[0m [0mself[0m[0;34m.[0m[0mprefetch[0m[0;34m([0m[0mquery[0m[0;34m,[0m [0mthreshold_bp[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m            [0mresults[0m[0;34m.[0m[0mappend[0m[0;34m([0m[0mresul

In [41]:
sig

SourmashSignature('g1.t1 frame:1', 22719a45)

In [None]:
sig.abu