In [3]:
%load_ext autoreload
%autoreload 2

import glob
import os

from IPython.display import display

import pandas as pd
import screed
import sourmash

# Root folder of the Botryllus protein data; the sourmash result folders below live under it.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
base_folder = "/Users/olgabot/botryllus/data/botryllus-proteins"

# Folders whose CSVs are loaded later with parse_sourmash.read_csvs.
gather_folder = os.path.join(base_folder, "sourmash_gather_scale10")
search_folder = os.path.join(base_folder, "sourmash_search_scale10")

## `parse_sourmash.py`

In [89]:
%%file parse_sourmash.py
import glob
import os

import pandas as pd


def read_csvs(folder, csv_glob='*.csv'):
    """Read every CSV in ``folder`` matching ``csv_glob`` into one DataFrame.

    Parameters
    ----------
    folder : str
        Directory to search.
    csv_glob : str
        Glob pattern, joined onto ``folder``, selecting the files to read.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files stacked in sorted-path order, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    pattern = os.path.join(folder, csv_glob)
    # glob returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the result reproducible across runs/machines.
    csvs = sorted(glob.glob(pattern))
    if not csvs:
        # pd.concat([]) would otherwise fail with "No objects to concatenate",
        # which says nothing about which folder/pattern was empty.
        raise ValueError(f"No files matching {pattern!r} were found")
    return pd.concat([pd.read_csv(csv) for csv in csvs], ignore_index=True)




def extract_pfam_metadata(df):
    """Parse the ``name`` column into Pfam metadata columns.

    Each name is expected to look like
    ``"<entry> <accession> <pfam_id_full>;<pfam_name>;"``.

    Returns a new DataFrame (same index as ``df``) with the columns
    ``pfam_name``, ``0`` and ``1`` (the first two whitespace-separated tokens),
    ``pfam_id_full`` (third token) and ``pfam_id`` (``pfam_id_full`` up to the
    first ".").
    """
    # "tok0 tok1 tok2;family;" -> column 0: "tok0 tok1 tok2", column 1: "family"
    semicolon_parts = df.name.str.rstrip(";").str.split(";", expand=True)
    # Whitespace split of the leading part gives integer-named columns 0, 1, 2.
    token_parts = semicolon_parts[0].str.split(expand=True)

    family = semicolon_parts[1].rename("pfam_name")
    combined = pd.concat([family, token_parts], axis=1).rename(
        columns={2: "pfam_id_full"}
    )
    # Drop the version suffix: "PF04147.15" -> "PF04147".
    combined["pfam_id"] = combined.pfam_id_full.str.split(".").str[0]
    return combined


def reorder_cols(df, first_cols=("similarity", "query_name")):
    """Return ``df`` with ``first_cols`` moved to the front.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are reordered; it is not modified.
    first_cols : iterable of column labels
        Columns to place first, in the given order. Any iterable (list, tuple,
        Index, ...) is accepted.

    Returns
    -------
    pandas.DataFrame
        ``first_cols`` followed by all remaining columns in the order returned
        by ``Index.difference`` (sorted when the labels are sortable).

    Raises
    ------
    KeyError
        If a label in ``first_cols`` is not a column of ``df``.
    """
    # Normalise to a list so tuples/Index objects concatenate correctly below
    # (tuple + list raises TypeError) and so the default can be immutable.
    leading = list(first_cols)
    trailing = df.columns.difference(leading).tolist()
    return df[leading + trailing]


def add_pfam_metadata(df, first_cols=["pfam_name", "similarity", "query_name"]):
    """Append the parsed Pfam metadata columns to ``df`` and reorder the result.

    The metadata comes from ``extract_pfam_metadata(df)`` and is joined
    column-wise; ``first_cols`` are then moved to the front by ``reorder_cols``.
    Returns a new DataFrame.
    """
    annotated = pd.concat([df, extract_pfam_metadata(df)], axis=1)
    return reorder_cols(annotated, first_cols)

def extract_gencode_gene_symbol(df, symbol_regex=r"gene_symbol:([\w\d\-]+)"):
    """Add a ``symbol`` column holding the gene symbol parsed from ``name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a string ``name`` column. It is modified in place.
    symbol_regex : str
        Regular expression with one capture group that matches the symbol.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``symbol`` set to the captured text
        (missing where the pattern does not match).
    """
    # The default pattern is a raw string: "\w", "\d" and "\-" are invalid
    # escape sequences in a normal string literal and trigger warnings on
    # recent Python versions. The pattern text itself is unchanged.
    # expand=False makes the single capture group come back as a Series.
    df["symbol"] = df["name"].str.extract(symbol_regex, expand=False)
    return df

def read_gencode_folder(folder, csv_glob, first_cols, symbol_regex):
    """Load the CSVs in ``folder``, add the ``symbol`` column, and reorder columns.

    Chains ``read_csvs`` -> ``extract_gencode_gene_symbol`` -> ``reorder_cols``
    and returns the resulting DataFrame.
    """
    with_symbols = extract_gencode_gene_symbol(
        read_csvs(folder, csv_glob), symbol_regex
    )
    return reorder_cols(with_symbols, first_cols)

Overwriting parse_sourmash.py


## `kmer_utils.py`

In [53]:
%%file kmer_utils.py

import sourmash
from orpheum.sequence_encodings import encode_peptide


import pandas as pd

def get_encoded_kmer_hashvals(sequence, name, encoding="hp", K=24, sigobj=None):
    """Tabulate every length-``K`` window of ``sequence`` with its encoding and hash.

    Parameters
    ----------
    sequence : str
        Sequence to slide the window over.
    name : str
        Label written to the ``name`` column of every row.
    encoding : str
        Encoding name passed to ``encode_peptide``; also used in the name of
        the encoded column (``kmer_<encoding>``).
    K : int
        Window (k-mer) length.
    sigobj : object
        Signature whose ``minhash.seq_to_hashes`` hashes each k-mer. Required
        whenever the sequence is long enough to contain at least one k-mer.

    Returns
    -------
    pandas.DataFrame
        One row per window with columns ``i`` (0-based start), ``kmer``,
        ``kmer_<encoding>``, ``hashval`` and ``name``. Empty (but with these
        columns) when ``len(sequence) < K``.

    Raises
    ------
    ValueError
        If there are k-mers to hash but ``sigobj`` is None.
    """
    n_kmers = max(len(sequence) - K + 1, 0)
    if n_kmers and sigobj is None:
        # Without this guard the default sigobj=None fails on the first
        # window with an opaque "NoneType has no attribute 'minhash'".
        raise ValueError(
            "sigobj is required to hash k-mers: pass a signature object "
            "exposing `.minhash.seq_to_hashes`"
        )

    lines = []
    for i in range(n_kmers):
        kmer = sequence[i : i + K]
        kmer_encoded = encode_peptide(kmer, encoding)
        hashval = sigobj.minhash.seq_to_hashes(kmer, is_protein=True)

        if len(hashval) == 1:
            # Unwrap the single hash so the column holds scalars.
            hashval = hashval[0]
        else:
            # Any other count (including zero) is reported and the raw
            # result is stored unchanged.
            print(f"More than one hashval found for {kmer}")
        lines.append([i, kmer, kmer_encoded, hashval])

    kmer_to_hashes = pd.DataFrame(
        lines, columns=["i", "kmer", f"kmer_{encoding}", "hashval"]
    )
    kmer_to_hashes["name"] = name
    return kmer_to_hashes

def get_matching_kmer_subsequence(sequence, kmers, gene_symbol):
    """Print and return the stretch of ``sequence`` covered by the given k-mers.

    Each k-mer is located with ``str.find`` (first occurrence). The reported
    span runs from the left-most k-mer start to the right-most k-mer end.

    Parameters
    ----------
    sequence : str
        Sequence to search.
    kmers : iterable of str
        K-mers to locate; null entries and k-mers absent from ``sequence``
        are ignored.
    gene_symbol : str
        Label printed in the ``>`` header line.

    Returns
    -------
    str
        ``sequence[start:end]`` for the covered span, or ``""`` when none of
        the k-mers occurs in ``sequence``.
    """
    spans = []
    for kmer in kmers:
        # Null placeholders (e.g. NaN cells from a pivoted table) cannot be
        # passed to str.find.
        if pd.isnull(kmer):
            continue
        start = sequence.find(kmer)
        # str.find returns -1 when the k-mer is absent; letting -1 through
        # would drag the span start to a negative (wrap-around) index.
        if start < 0:
            continue
        spans.append((start, start + len(kmer)))

    print(f">{gene_symbol}")
    if not spans:
        print("No matching k-mers found")
        return ""

    i_min = min(start for start, _ in spans)
    # Use each k-mer's own end rather than assuming the last-iterated k-mer's
    # length applies to the right-most match.
    j = max(end for _, end in spans)

    print(f"before match: {sequence[:i_min]}\n")
    print(f"matching: {sequence[i_min:j]}\n")
    print(f"after match: {sequence[j:]}")
    print(f"Match range (1-based): {i_min+1}-{j}")
    return sequence[i_min:j]


def subset_gene_kmers(merged_kmers, col, gene_symbol, other='human'):
    """Pivot the k-mer pairs of one gene into a species-by-encoding table.

    Parameters
    ----------
    merged_kmers : pandas.DataFrame
        Merged k-mer table with columns ``kmer__<other>``, ``kmer__botryllus``
        and ``kmer_hp``, plus the column named by ``col``.
    col : str
        Column compared against ``gene_symbol`` to select rows.
    gene_symbol : str
        Value of ``col`` to keep.
    other : str
        Suffix of the second species' k-mer column; also its row label.

    Returns
    -------
    pandas.DataFrame
        Index ``species`` (``"botryllus"`` and ``other``), columns ``kmer_hp``,
        values the k-mer sequences. Empty when no row matches ``gene_symbol``.

    Raises
    ------
    ValueError
        From ``DataFrame.pivot`` if one species has several distinct k-mers
        for the same ``kmer_hp`` value.
    """
    gene_subset = merged_kmers.query(f"{col} == @gene_symbol")
    if gene_subset.empty:
        # Nothing to pivot; return an empty table with the usual axis names
        # instead of failing inside the reshaping steps.
        return pd.DataFrame(
            index=pd.Index([], name="species"),
            columns=pd.Index([], name="kmer_hp"),
        )

    kmer_other = f"kmer__{other}"
    # Long ("tidy") form: one row per (species, kmer_hp, kmer_seq), built
    # directly per species rather than inferred row by row from null cells.
    per_species = [
        gene_subset[[kmer_col, "kmer_hp"]]
        .rename(columns={kmer_col: "kmer_seq"})
        .assign(species=species)
        for kmer_col, species in (
            (kmer_other, other),
            ("kmer__botryllus", "botryllus"),
        )
    ]
    # The same k-mer can occur on several merged rows; keep each once so the
    # pivot sees a single value per (species, kmer_hp) cell.
    tidy = pd.concat(per_species, ignore_index=True).drop_duplicates()
    return tidy.pivot(columns="kmer_hp", index="species", values="kmer_seq")

Overwriting kmer_utils.py


## Import `parse_sourmash` and `kmer_utils`

In [21]:
import parse_sourmash
import kmer_utils


## Read gather output

In [8]:
# Stack every CSV found in gather_folder into one table (read_csvs defaults to '*.csv').
gather = parse_sourmash.read_csvs(gather_folder)
print(gather.shape)
gather.head()

(14, 19)


Unnamed: 0,intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_filename,query_name,query_md5,query_bp
0,30,0.166667,0.031915,0.166667,0.166667,1.0,1.0,0.0,A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.1...,../pfam-split-seqkit-1e6/Pfam-A.prefetch.bhf_i...,5273d9b9abca82cb97efe29ee8d89149,0.031915,30,0,130,bhf_isoforms.fa,BHF_isoform1,f1f15760,180
1,30,0.166667,0.130435,0.166667,0.166667,1.0,1.0,0.0,A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936....,../pfam-split-seqkit-1e6/Pfam-A.prefetch.bhf_i...,68fe401bebf694bd45c6491327b7bd76,0.130435,30,1,100,bhf_isoforms.fa,BHF_isoform1,f1f15760,180
2,30,0.166667,0.107143,0.166667,0.166667,1.0,1.0,0.0,A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.2...,../pfam-split-seqkit-1e6/Pfam-A.prefetch.bhf_i...,031d2daae71f955d36562f75f30d8bea,0.107143,30,2,70,bhf_isoforms.fa,BHF_isoform1,f1f15760,180
3,20,0.111111,0.5,0.111111,0.111111,1.0,1.0,0.0,A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22...,../pfam-split-seqkit-1e6/Pfam-A.prefetch.bhf_i...,6f01987979bbf51f859d8816240127e9,0.5,20,3,50,bhf_isoforms.fa,BHF_isoform1,f1f15760,180
4,20,0.111111,0.181818,0.111111,0.111111,1.0,1.0,0.0,B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protami...,../pfam-split-seqkit-1e6/Pfam-A.prefetch.bhf_i...,35467a71a8e1a92a676557cad5575d43,0.181818,20,4,30,bhf_isoforms.fa,BHF_isoform1,f1f15760,180


In [9]:
# Add the Pfam columns parsed from "name" and put the listed columns first.
gather_metadata = parse_sourmash.add_pfam_metadata(gather, first_cols=["pfam_name", "intersect_bp", "query_name"])
gather_metadata.head()

Unnamed: 0,pfam_name,intersect_bp,query_name,0,1,average_abund,f_match,f_match_orig,f_orig_query,f_unique_to_query,...,median_abund,name,pfam_id,pfam_id_full,query_bp,query_filename,query_md5,remaining_bp,std_abund,unique_intersect_bp
0,Nop14,30,BHF_isoform1,A0A6Q2YJA0_ESOLU/18-823,A0A6Q2YJA0.1,1.0,0.031915,0.031915,0.166667,0.166667,...,1.0,A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.1...,PF04147,PF04147.15,180,bhf_isoforms.fa,f1f15760,130,0.0,30
1,Terpene_synth_C,30,BHF_isoform1,A0A022QCL9_ERYGU/105-368,A0A022QCL9.1,1.0,0.130435,0.130435,0.166667,0.166667,...,1.0,A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936....,PF03936,PF03936.19,180,bhf_isoforms.fa,f1f15760,100,0.0,30
2,ZnuA,30,BHF_isoform1,A0A098M3F2_9BACL/57-334,A0A098M3F2.1,1.0,0.107143,0.107143,0.166667,0.166667,...,1.0,A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.2...,PF01297,PF01297.20,180,bhf_isoforms.fa,f1f15760,70,0.0,30
3,Chitin_bind_1,20,BHF_isoform1,A0A1Y2CQP8_9FUNG/32-76,A0A1Y2CQP8.1,1.0,0.5,0.5,0.111111,0.111111,...,1.0,A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22...,PF00187,PF00187.22,180,bhf_isoforms.fa,f1f15760,50,0.0,20
4,Protamine_like,20,BHF_isoform1,B4HZG1_DROSE/1-124,B4HZG1.1,1.0,0.181818,0.181818,0.111111,0.111111,...,1.0,B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protami...,PF06382,PF06382.14,180,bhf_isoforms.fa,f1f15760,30,0.0,20


In [44]:
# Display the annotated gather table rather than only its head.
gather_metadata

Unnamed: 0,pfam_name,intersect_bp,query_name,0,1,average_abund,f_match,f_match_orig,f_orig_query,f_unique_to_query,...,median_abund,name,pfam_id,pfam_id_full,query_bp,query_filename,query_md5,remaining_bp,std_abund,unique_intersect_bp
0,Nop14,30,BHF_isoform1,A0A6Q2YJA0_ESOLU/18-823,A0A6Q2YJA0.1,1.0,0.031915,0.031915,0.166667,0.166667,...,1.0,A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.1...,PF04147,PF04147.15,180,bhf_isoforms.fa,f1f15760,130,0.0,30
1,Terpene_synth_C,30,BHF_isoform1,A0A022QCL9_ERYGU/105-368,A0A022QCL9.1,1.0,0.130435,0.130435,0.166667,0.166667,...,1.0,A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936....,PF03936,PF03936.19,180,bhf_isoforms.fa,f1f15760,100,0.0,30
2,ZnuA,30,BHF_isoform1,A0A098M3F2_9BACL/57-334,A0A098M3F2.1,1.0,0.107143,0.107143,0.166667,0.166667,...,1.0,A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.2...,PF01297,PF01297.20,180,bhf_isoforms.fa,f1f15760,70,0.0,30
3,Chitin_bind_1,20,BHF_isoform1,A0A1Y2CQP8_9FUNG/32-76,A0A1Y2CQP8.1,1.0,0.5,0.5,0.111111,0.111111,...,1.0,A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22...,PF00187,PF00187.22,180,bhf_isoforms.fa,f1f15760,50,0.0,20
4,Protamine_like,20,BHF_isoform1,B4HZG1_DROSE/1-124,B4HZG1.1,1.0,0.181818,0.181818,0.111111,0.111111,...,1.0,B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protami...,PF06382,PF06382.14,180,bhf_isoforms.fa,f1f15760,30,0.0,20
5,2-Hacid_dh,10,BHF_isoform1,A0A3P3D7T7_9RHOB/8-324,A0A3P3D7T7.1,1.0,0.030303,0.030303,0.055556,0.055556,...,1.0,A0A3P3D7T7_9RHOB/8-324 A0A3P3D7T7.1 PF00389.33...,PF00389,PF00389.33,180,bhf_isoforms.fa,f1f15760,20,0.0,10
6,4HB_MCP_1,10,BHF_isoform1,R5Z925_9FIRM/1-181,R5Z925.1,1.0,0.058824,0.058824,0.055556,0.055556,...,1.0,R5Z925_9FIRM/1-181 R5Z925.1 PF12729.10;4HB_MCP_1;,PF12729,PF12729.10,180,bhf_isoforms.fa,f1f15760,10,0.0,10
7,7TM_GPCR_Srh,10,BHF_isoform1,E3LLJ4_CAERE/4-302,E3LLJ4.1,1.0,0.043478,0.043478,0.055556,0.055556,...,1.0,E3LLJ4_CAERE/4-302 E3LLJ4.1 PF10318.12;7TM_GPC...,PF10318,PF10318.12,180,bhf_isoforms.fa,f1f15760,0,0.0,10
8,Nop14,30,BHF_isoform2,A0A6Q2YJA0_ESOLU/18-823,A0A6Q2YJA0.1,1.0,0.031915,0.031915,0.1875,0.1875,...,1.0,A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.1...,PF04147,PF04147.15,160,bhf_isoforms.fa,865a91c0,110,0.0,30
9,Terpene_synth_C,30,BHF_isoform2,A0A022QCL9_ERYGU/105-368,A0A022QCL9.1,1.0,0.130435,0.130435,0.1875,0.1875,...,1.0,A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936....,PF03936,PF03936.19,160,bhf_isoforms.fa,865a91c0,80,0.0,30


### Add metadata

## Read search output

In [10]:
# Stack every CSV found in search_folder into one table (read_csvs defaults to '*.csv').
search = parse_sourmash.read_csvs(search_folder)
print(search.shape)
search.head()

(4728, 7)


Unnamed: 0,similarity,name,filename,md5,query_filename,query_name,query_md5
0,0.230053,A0A5N5NYA3_PANHP/704-740 A0A5N5NYA3.1 PF00057....,,8236a3e260acf15aa3cc673821003cb7,bhf_isoforms.fa,BHF_isoform2,865a91c0
1,0.186429,A0A6J2X2G5_SITOR/131-211 A0A6J2X2G5.1 PF18391....,,cb641ff362c34d379be11a7302674fc4,bhf_isoforms.fa,BHF_isoform2,865a91c0
2,0.186429,H0Y978_HUMAN/30-85 H0Y978.1 PF00400.35;WD40;,,41dd6480cafa2c71272010815ad0ec96,bhf_isoforms.fa,BHF_isoform2,865a91c0
3,0.186429,A0A673I3J8_9TELE/6-59 A0A673I3J8.1 PF00105.21;...,,15cbdeede334ae99e2e3443133898932,bhf_isoforms.fa,BHF_isoform2,865a91c0
4,0.160861,A0A6G0XDQ6_9STRA/342-370 A0A6G0XDQ6.1 PF00734....,,1a401620b607e1f0e1c3ae66fbe5e5c3,bhf_isoforms.fa,BHF_isoform2,865a91c0


### Add metadata

In [11]:


# Add the Pfam columns parsed from "name"; the default first_cols put
# pfam_name, similarity and query_name at the front.
search_metadata = parse_sourmash.add_pfam_metadata(search)
search_metadata.head()

Unnamed: 0,pfam_name,similarity,query_name,0,1,filename,md5,name,pfam_id,pfam_id_full,query_filename,query_md5
0,Ldl_recept_a,0.230053,BHF_isoform2,A0A5N5NYA3_PANHP/704-740,A0A5N5NYA3.1,,8236a3e260acf15aa3cc673821003cb7,A0A5N5NYA3_PANHP/704-740 A0A5N5NYA3.1 PF00057....,PF00057,PF00057.21,bhf_isoforms.fa,865a91c0
1,CHIP_TPR_N,0.186429,BHF_isoform2,A0A6J2X2G5_SITOR/131-211,A0A6J2X2G5.1,,cb641ff362c34d379be11a7302674fc4,A0A6J2X2G5_SITOR/131-211 A0A6J2X2G5.1 PF18391....,PF18391,PF18391.4,bhf_isoforms.fa,865a91c0
2,WD40,0.186429,BHF_isoform2,H0Y978_HUMAN/30-85,H0Y978.1,,41dd6480cafa2c71272010815ad0ec96,H0Y978_HUMAN/30-85 H0Y978.1 PF00400.35;WD40;,PF00400,PF00400.35,bhf_isoforms.fa,865a91c0
3,zf-C4,0.186429,BHF_isoform2,A0A673I3J8_9TELE/6-59,A0A673I3J8.1,,15cbdeede334ae99e2e3443133898932,A0A673I3J8_9TELE/6-59 A0A673I3J8.1 PF00105.21;...,PF00105,PF00105.21,bhf_isoforms.fa,865a91c0
4,CBM_1,0.160861,BHF_isoform2,A0A6G0XDQ6_9STRA/342-370,A0A6G0XDQ6.1,,1a401620b607e1f0e1c3ae66fbe5e5c3,A0A6G0XDQ6_9STRA/342-370 A0A6G0XDQ6.1 PF00734....,PF00734,PF00734.21,bhf_isoforms.fa,865a91c0


# Get K-mers

## BHF

### File paths

In [13]:

# BHF isoform inputs: the signature file (loaded with sourmash below) and the FASTA
# (read with screed below).
# NOTE(review): absolute, machine-specific paths — adjust before running on another machine.
bhf_isoform_sigfile = '/Users/olgabot/botryllus/data/botryllus-proteins/bhf_isoforms.fa.sig'
bhf_isoform_fasta = '/Users/olgabot/botryllus/data/botryllus-proteins/bhf_isoforms.fa'


### Read in BHF isoform fasta

In [18]:
# Map each FASTA record name to its sequence string.
bhf_isoform_seqs = {record.name: record.sequence for record in screed.open(bhf_isoform_fasta)}
bhf_isoform_seqs

{'BHF_isoform1': 'MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKRIRTVMKATWQSLQAGARSQTAFLNPQGAVSAALVQNRR',
 'BHF_isoform2': 'MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKVRSNGV'}

### Read in the Botryllus signature file to get a MinHash object for hashing k-mers

In [16]:

# Map each signature's name to the signature object loaded from the sigfile.
bhf_isoform_sigs = {sig.name: sig for sig in sourmash.load_file_as_signatures(bhf_isoform_sigfile)}
bhf_isoform_sigs

{'BHF_isoform1': SourmashSignature('BHF_isoform1', f1f15760),
 'BHF_isoform2': SourmashSignature('BHF_isoform2', 865a91c0)}

### Get BHF K-mers

In [36]:
# Build one k-mer table per BHF isoform, hashing each with the signature of the same
# name, then stack the tables into bhf_kmers.
# NOTE: `dfs`, `df` and `sigobj` remain defined in the notebook namespace after this loop.
dfs = []
for isoform, seq in bhf_isoform_seqs.items():
    sigobj = bhf_isoform_sigs[isoform]
    df = kmer_utils.get_encoded_kmer_hashvals(seq, isoform, sigobj=sigobj)
    dfs.append(df)
bhf_kmers = pd.concat(dfs, ignore_index=True)
bhf_kmers.head()

Unnamed: 0,i,kmer,kmer_hp,hashval,name
0,0,MVHDTEQLLAQGHHEEETECGKYG,hhppppphhhphpppppppphphh,17134648382419275520,BHF_isoform1
1,1,VHDTEQLLAQGHHEEETECGKYGK,hppppphhhphpppppppphphhp,1036020595944595459,BHF_isoform1
2,2,HDTEQLLAQGHHEEETECGKYGKL,ppppphhhphpppppppphphhph,9354784992242920062,BHF_isoform1
3,3,DTEQLLAQGHHEEETECGKYGKLP,pppphhhphpppppppphphhphh,14467040310155683947,BHF_isoform1
4,4,TEQLLAQGHHEEETECGKYGKLPK,ppphhhphpppppppphphhphhp,2993707203445337902,BHF_isoform1


## Pfam

In [31]:
# Pfam inputs: the gzipped FASTA (read with screed below) and the matching signature file.
# NOTE(review): absolute, machine-specific paths — adjust before running on another machine.
pfam_fasta = '/Users/olgabot/botryllus/data/pfam/Pfam-A.prefetch.bhf_isoforms.threshold0.fasta.gz'
pfam_sigfile = '/Users/olgabot/botryllus/data/pfam/scaled10/Pfam-A.prefetch.bhf_isoforms.threshold0.sig.gz'

In [32]:
# Map each Pfam FASTA record name to its sequence string.
pfam_seqs = {record.name: record.sequence for record in screed.open(pfam_fasta)}

### Get Pfam kmers

In [34]:
# Hash the Pfam k-mers with a BHF isoform signature. The signature is selected
# explicitly (the one belonging to the last BHF isoform) so that this cell does not
# depend on a `sigobj` variable left over from a previously executed cell.
sigobj = bhf_isoform_sigs[list(bhf_isoform_seqs)[-1]]

dfs = []
for pfam_record_name, seq in pfam_seqs.items():
    df = kmer_utils.get_encoded_kmer_hashvals(seq, pfam_record_name, sigobj=sigobj)
    dfs.append(df)
pfam_kmers = pd.concat(dfs, ignore_index=True)
pfam_kmers.head()

Unnamed: 0,i,kmer,kmer_hp,hashval,name
0,0,LLTLNAISVKGLDRLPRDSYEIAS,hhphphhphphhpphhppphphhp,12419877632473437588,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...
1,1,LTLNAISVKGLDRLPRDSYEIASE,hphphhphphhpphhppphphhpp,5304522194303411330,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...
2,2,TLNAISVKGLDRLPRDSYEIASEF,phphhphphhpphhppphphhpph,1326463173270241912,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...
3,3,LNAISVKGLDRLPRDSYEIASEFS,hphhphphhpphhppphphhpphp,5050722563078200312,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...
4,4,NAISVKGLDRLPRDSYEIASEFSA,phhphphhpphhppphphhpphph,18176231415180590896,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...


In [39]:
# (rows, columns) of the Pfam k-mer table: one row per k-mer window.
pfam_kmers.shape

(600426, 5)

In [38]:
# Add the Pfam columns parsed from "name" to every k-mer row.
# NOTE(review): "metdata" looks like a typo for "metadata"; the name is used again by
# the merge cell, so any rename has to cover both places.
pfam_kmers_metdata = parse_sourmash.add_pfam_metadata(pfam_kmers, first_cols=['pfam_name', 'kmer', 'kmer_hp'])
pfam_kmers_metdata.head()

Unnamed: 0,pfam_name,kmer,kmer_hp,0,1,hashval,i,name,pfam_id,pfam_id_full
0,2-Hacid_dh,LLTLNAISVKGLDRLPRDSYEIAS,hhphphhphphhpphhppphphhp,F3L153_9GAMM/5-309,F3L153.1,12419877632473437588,0,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33
1,2-Hacid_dh,LTLNAISVKGLDRLPRDSYEIASE,hphphhphphhpphhppphphhpp,F3L153_9GAMM/5-309,F3L153.1,5304522194303411330,1,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33
2,2-Hacid_dh,TLNAISVKGLDRLPRDSYEIASEF,phphhphphhpphhppphphhpph,F3L153_9GAMM/5-309,F3L153.1,1326463173270241912,2,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33
3,2-Hacid_dh,LNAISVKGLDRLPRDSYEIASEFS,hphhphphhpphhppphphhpphp,F3L153_9GAMM/5-309,F3L153.1,5050722563078200312,3,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33
4,2-Hacid_dh,NAISVKGLDRLPRDSYEIASEFSA,phhphphhpphhppphphhpphph,F3L153_9GAMM/5-309,F3L153.1,18176231415180590896,4,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33


## Merge BHF and pfam kmers

In [48]:
# Keep only k-mers present in both tables: inner join on the hash value and the
# encoded k-mer. Columns that exist on both sides (i, kmer, name) get the
# "__pfam" / "__botryllus" suffixes.
bhf_pfam_kmers = pfam_kmers_metdata.merge(
    bhf_kmers,
    how="inner",
    on=["hashval", "kmer_hp"],
    suffixes=("__pfam", "__botryllus"),
)
print(bhf_pfam_kmers.shape)
bhf_pfam_kmers.head()

(14829, 13)


Unnamed: 0,pfam_name,kmer__pfam,kmer_hp,0,1,hashval,i__pfam,name__pfam,pfam_id,pfam_id_full,i__botryllus,kmer__botryllus,name__botryllus
0,2-Hacid_dh,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/5-309,F3L153.1,6861319682135420868,99,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform1
1,2-Hacid_dh,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/5-309,F3L153.1,6861319682135420868,99,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform2
2,2-Hacid_dh_C,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/98-277,F3L153.1,6861319682135420868,6,F3L153_9GAMM/98-277 F3L153.1 PF02826.22;2-Haci...,PF02826,PF02826.22,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform1
3,2-Hacid_dh_C,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/98-277,F3L153.1,6861319682135420868,6,F3L153_9GAMM/98-277 F3L153.1 PF02826.22;2-Haci...,PF02826,PF02826.22,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform2
4,AAA_10,INPLSFIDSINETNRKLGISKFVD,hphhphhpphpppppphhhpphhp,F4L4C0_HALH1/172-500,F4L4C0.1,6861319682135420868,80,F4L4C0_HALH1/172-500 F4L4C0.1 PF12846.10;AAA_10;,PF12846,PF12846.10,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform1


In [59]:
# Split the shared k-mers by the BHF isoform they came from and report each subset's shape.
bhf_pfam_kmers_isoform1 = bhf_pfam_kmers.query('name__botryllus == "BHF_isoform1"')
print(bhf_pfam_kmers_isoform1.shape)
bhf_pfam_kmers_isoform2 = bhf_pfam_kmers.query('name__botryllus == "BHF_isoform2"')
print(bhf_pfam_kmers_isoform2.shape)

(8194, 13)
(6635, 13)


In [62]:
# Restrict the isoform 1 k-mers to rows whose Pfam family name is Nop14.
bhf_pfam_kmers_isoform1_nop14 = bhf_pfam_kmers_isoform1.query('pfam_name == "Nop14"')
bhf_pfam_kmers_isoform1_nop14.shape

(34, 13)

In [63]:
# Number of shared k-mer rows per Pfam sequence.
bhf_pfam_kmers_isoform1_nop14.groupby('name__pfam').size()

name__pfam
A0A087HGT1_ARAAL/28-893 A0A087HGT1.1 PF04147.15;Nop14;     5
A0A3P8ZQ40_ESOLU/19-861 A0A3P8ZQ40.1 PF04147.15;Nop14;    12
A0A6J1KKH7_CUCMA/35-930 A0A6J1KKH7.1 PF04147.15;Nop14;     3
A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;    12
M1VWC2_CLAP2/49-896 M1VWC2.1 PF04147.15;Nop14;             2
dtype: int64

In [67]:
# For each Nop14 Pfam sequence, show its k-mers next to the Botryllus k-mers that share
# the same encoding (rows: species, columns: kmer_hp).
for name, df in bhf_pfam_kmers_isoform1_nop14.groupby('name__pfam'):
    # Every row in df is already Nop14, so this filter keeps the whole group.
    pivoted = kmer_utils.subset_gene_kmers(
        df, col="pfam_name", gene_symbol="Nop14", other="pfam"
    )
    print(f'--- {name} ---')
    display(pivoted)

--- A0A087HGT1_ARAAL/28-893 A0A087HGT1.1 PF04147.15;Nop14; ---


kmer_hp,pppphpppppppphhpppppppph,ppppphpppppppphhpppppppp,pppppphpppppppphhppppppp,ppppppphpppppppphhpppppp,pppppppphpppppppphhppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
botryllus,TKKKAKKDKRKNKPPKKDSETSKP,KTKKKAKKDKRKNKPPKKDSETSK,NKTKKKAKKDKRKNKPPKKDSETS,TNKTKKKAKKDKRKNKPPKKDSET,STNKTKKKAKKDKRKNKPPKKDSE
pfam,DEDDGSSEDSESEGGEDDDDDDDG,SDEDDGSSEDSESEGGEDDDDDDD,ESDEDDGSSEDSESEGGEDDDDDD,SESDEDDGSSEDSESEGGEDDDDD,NSESDEDDGSSEDSESEGGEDDDD


--- A0A3P8ZQ40_ESOLU/19-861 A0A3P8ZQ40.1 PF04147.15;Nop14; ---


kmer_hp,hphpppppppphpppppppphhpp,hpppppppphpppppppphhpppp,phphpppppppphpppppppphhp,phpppppppphpppppppphhppp,pphphpppppppphpppppppphh,ppphphpppppppphpppppppph,ppphpppppppphhpppppppphh,pppphpppppppphhpppppppph,ppppphpppppppphhpppppppp,pppppphpppppppphhppppppp,ppppppphpppppppphhpppppp,pppppppphpppppppphhppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
botryllus,IDASTNKTKKKAKKDKRKNKPPKK,ASTNKTKKKAKKDKRKNKPPKKDS,HIDASTNKTKKKAKKDKRKNKPPK,DASTNKTKKKAKKDKRKNKPPKKD,EHIDASTNKTKKKAKKDKRKNKPP,QEHIDASTNKTKKKAKKDKRKNKP,KKKAKKDKRKNKPPKKDSETSKPA,TKKKAKKDKRKNKPPKKDSETSKP,KTKKKAKKDKRKNKPPKKDSETSK,NKTKKKAKKDKRKNKPPKKDSETS,TNKTKKKAKKDKRKNKPPKKDSET,STNKTKKKAKKDKRKNKPPKKDSE
pfam,GEGSEEDSHSDVNSEEESEEGGEE,GSEEDSHSDVNSEEESEEGGEEEE,DGEGSEEDSHSDVNSEEESEEGGE,EGSEEDSHSDVNSEEESEEGGEEE,EDGEGSEEDSHSDVNSEEESEEGG,EEDGEGSEEDSHSDVNSEEESEEG,HSDVNSEEESEEGGEEEEEDEQPA,SHSDVNSEEESEEGGEEEEEDEQP,DSHSDVNSEEESEEGGEEEEEDEQ,EDSHSDVNSEEESEEGGEEEEEDE,EEDSHSDVNSEEESEEGGEEEEED,SEEDSHSDVNSEEESEEGGEEEEE


--- A0A6J1KKH7_CUCMA/35-930 A0A6J1KKH7.1 PF04147.15;Nop14; ---


kmer_hp,pphphpppppppphpppppppphh,ppphphpppppppphpppppppph,pppphphpppppppphpppppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
botryllus,EHIDASTNKTKKKAKKDKRKNKPP,QEHIDASTNKTKKKAKKDKRKNKP,SQEHIDASTNKTKKKAKKDKRKNK
pfam,KDADGTESEDDDSAEDTDSSDDVG,RKDADGTESEDDDSAEDTDSSDDV,ERKDADGTESEDDDSAEDTDSSDD


--- A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14; ---


kmer_hp,hphpppppppphpppppppphhpp,hpppppppphpppppppphhpppp,phphpppppppphpppppppphhp,phpppppppphpppppppphhppp,pphphpppppppphpppppppphh,ppphphpppppppphpppppppph,ppphpppppppphhpppppppphh,pppphpppppppphhpppppppph,ppppphpppppppphhpppppppp,pppppphpppppppphhppppppp,ppppppphpppppppphhpppppp,pppppppphpppppppphhppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
botryllus,IDASTNKTKKKAKKDKRKNKPPKK,ASTNKTKKKAKKDKRKNKPPKKDS,HIDASTNKTKKKAKKDKRKNKPPK,DASTNKTKKKAKKDKRKNKPPKKD,EHIDASTNKTKKKAKKDKRKNKPP,QEHIDASTNKTKKKAKKDKRKNKP,KKKAKKDKRKNKPPKKDSETSKPA,TKKKAKKDKRKNKPPKKDSETSKP,KTKKKAKKDKRKNKPPKKDSETSK,NKTKKKAKKDKRKNKPPKKDSETS,TNKTKKKAKKDKRKNKPPKKDSET,STNKTKKKAKKDKRKNKPPKKDSE
pfam,GEGSEEDSHSDVNSEEESEEGGEE,GSEEDSHSDVNSEEESEEGGEEEE,DGEGSEEDSHSDVNSEEESEEGGE,EGSEEDSHSDVNSEEESEEGGEEE,EDGEGSEEDSHSDVNSEEESEEGG,EEDGEGSEEDSHSDVNSEEESEEG,HSDVNSEEESEEGGEEEEEDEQPA,SHSDVNSEEESEEGGEEEEEDEQP,DSHSDVNSEEESEEGGEEEEEDEQ,EDSHSDVNSEEESEEGGEEEEEDE,EEDSHSDVNSEEESEEGGEEEEED,SEEDSHSDVNSEEESEEGGEEEEE


--- M1VWC2_CLAP2/49-896 M1VWC2.1 PF04147.15;Nop14; ---


kmer_hp,ppphphpppppppphpppppppph,pppphphpppppppphpppppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1
botryllus,QEHIDASTNKTKKKAKKDKRKNKP,SQEHIDASTNKTKKKAKKDKRKNK
pfam,SDEGSYDDDSDNSQAEEDEEDDEF,ESDEGSYDDDSDNSQAEEDEEDDE


## Iterate over gather hits:


In [83]:
# For every gather hit of every BHF isoform: show the shared k-mers side by side, then
# print the stretch of the Pfam sequence and of the BHF isoform that those k-mers cover.
for isoform, df in gather_metadata.groupby("query_name"):
    print(f"\n--- {isoform} ---")
    # Use only k-mers that came from this isoform. The merged table holds both
    # isoforms, and a k-mer unique to the other isoform cannot be located in this
    # isoform's sequence below.
    isoform_kmers = bhf_pfam_kmers[bhf_pfam_kmers["name__botryllus"] == isoform]
    for _, row in df.iterrows():
        pfam_full = row["name"]
        intersect_bp = row['intersect_bp']
        pivoted = kmer_utils.subset_gene_kmers(
            isoform_kmers, col="name__pfam", gene_symbol=pfam_full, other="pfam"
        )
        print(f"\n\n--- {pfam_full}, intersect_bp: {intersect_bp} ---")
        display(pivoted)

        # Span covered by the matching k-mers on the Pfam sequence ...
        kmer_utils.get_matching_kmer_subsequence(
            pfam_seqs[pfam_full], pivoted.loc["pfam"], pfam_full
        )
        print("-")
        # ... and on the BHF isoform sequence.
        kmer_utils.get_matching_kmer_subsequence(
            bhf_isoform_seqs[isoform], pivoted.loc["botryllus"], "BHF"
        )


--- BHF_isoform1 ---


--- A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;, intersect_bp: 30 ---


kmer_hp,hphpppppppphpppppppphhpp,hpppppppphpppppppphhpppp,phphpppppppphpppppppphhp,phpppppppphpppppppphhppp,pphphpppppppphpppppppphh,ppphphpppppppphpppppppph,ppphpppppppphhpppppppphh,pppphpppppppphhpppppppph,ppppphpppppppphhpppppppp,pppppphpppppppphhppppppp,ppppppphpppppppphhpppppp,pppppppphpppppppphhppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
botryllus,IDASTNKTKKKAKKDKRKNKPPKK,ASTNKTKKKAKKDKRKNKPPKKDS,HIDASTNKTKKKAKKDKRKNKPPK,DASTNKTKKKAKKDKRKNKPPKKD,EHIDASTNKTKKKAKKDKRKNKPP,QEHIDASTNKTKKKAKKDKRKNKP,KKKAKKDKRKNKPPKKDSETSKPA,TKKKAKKDKRKNKPPKKDSETSKP,KTKKKAKKDKRKNKPPKKDSETSK,NKTKKKAKKDKRKNKPPKKDSETS,TNKTKKKAKKDKRKNKPPKKDSET,STNKTKKKAKKDKRKNKPPKKDSE
pfam,GEGSEEDSHSDVNSEEESEEGGEE,GSEEDSHSDVNSEEESEEGGEEEE,DGEGSEEDSHSDVNSEEESEEGGE,EGSEEDSHSDVNSEEESEEGGEEE,EDGEGSEEDSHSDVNSEEESEEGG,EEDGEGSEEDSHSDVNSEEESEEG,HSDVNSEEESEEGGEEEEEDEQPA,SHSDVNSEEESEEGGEEEEEDEQP,DSHSDVNSEEESEEGGEEEEEDEQ,EDSHSDVNSEEESEEGGEEEEEDE,EEDSHSDVNSEEESEEGGEEEEED,SEEDSHSDVNSEEESEEGGEEEEE


>A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;
before match: KTSKEIKNNPFEVKINRKKFDILGRKSKHDVGLPGVSRSKAINKRKETLLKEYKTKDKSNKFIDKRFGEYDTKMDPEEKILQRFSMERQRTQDKKNMYNLNEEEELTHYGQSLAEMEKLTDMVDSDDDADERGLLSAEMTASHFGGGGLLMKKTPGEQDEGGKQRAKSRQELIEELIIKSKQEKRERQTQKEESQVLTEKLDQDWKSIQGLLAHKNAPKADRAEDEDKPKLDEYDMMVRELGFEMKAAPSEKLKTPEEVAREEQERLQQLEADRLRRMMGDIVEDNTKAPTHMSADDINDGFVLDGDDRQTLAYQVRDNINRLFTEGEKIEGEEGEKEESGEEEESDEEESEQEG

matching: EEDGEGSEEDSHSDVNSEEESEEGGEEEEEDEQPA

after match: AVPQLSEKVQAEAAKIELPYTFTVPECYRDLKSLLQGHPADHQCIILARTQQCNHPSLGIGNKLKIQVCYLIDIVCIDLLQLSNIFILYDLCQLFPETASKGLQTTLGDDAHSMEEVLEVKGRAAFPKLDMLIYLKITALLFPTSDFRHPVTTPALLYISQALTKCPVTSLKDLTSGLVLCCLALEYVSFSKRFVPELINFLLGALHLAVTDKTTGYSVVPPFRRSGKASDLLVLSSPESSQTWCKKTLPLSATQTLTLTSDLDSHMKLSILATCLDLLKRCFSLYRELPSCLHIFQPIATLLSKHLPIKTYPAALQVSGPALVSRKKKPIPLKLFTPRIVQVLDYGKKRGNTKEEREKERLKHKYKKEFKGALREIRKDSRFLGREKLNEVMARDAERKRKVKELYGSLATQEGE
Match range (1-based): 356-390
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILT

kmer_hp,hhphhpphpppppphhhpphhphh,hhpphpppppphhhpphhphhphp,hphhphhpphpppppphhhpphhp,hphhpphpppppphhhpphhphhp,hpphpppppphhhpphhphhphpp,hpppppppphphhphhpphppppp,phhphhpphpppppphhhpphhph,phhpphpppppphhhpphhphhph,phphhphhpphpppppphhhpphh,phpppppphhhpphhphhphpppp,phpppppppphphhphhpphpppp,pphphhphhpphpppppphhhpph,pphpppppphhhpphhphhphppp,ppphphhphhpphpppppphhhpp,pppphphhphhpphpppppphhhp,ppppphphhphhpphpppppphhh,pppppphphhphhpphpppppphh,ppppppphphhphhpphpppppph,pppppppphphhphhpphpppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
botryllus,YGKLPKKGSECKKHGILCRILTAL,LPKKGSECKKHGILCRILTALHLK,GKYGKLPKKGSECKKHGILCRILT,GKLPKKGSECKKHGILCRILTALH,PKKGSECKKHGILCRILTALHLKK,GHHEEETECGKYGKLPKKGSECKK,KYGKLPKKGSECKKHGILCRILTA,KLPKKGSECKKHGILCRILTALHL,CGKYGKLPKKGSECKKHGILCRIL,KGSECKKHGILCRILTALHLKKRR,QGHHEEETECGKYGKLPKKGSECK,ECGKYGKLPKKGSECKKHGILCRI,KKGSECKKHGILCRILTALHLKKR,TECGKYGKLPKKGSECKKHGILCR,ETECGKYGKLPKKGSECKKHGILC,EETECGKYGKLPKKGSECKKHGIL,EEETECGKYGKLPKKGSECKKHGI,HEEETECGKYGKLPKKGSECKKHG,HHEEETECGKYGKLPKKGSECKKH
pfam,AVSILNEARDKDEEMVVSEIGKLV,ILNEARDKDEEMVVSEIGKLVEYN,LNAVSILNEARDKDEEMVVSEIGK,VSILNEARDKDEEMVVSEIGKLVE,LNEARDKDEEMVVSEIGKLVEYNR,FQKEKEEKKLNAVSILNEARDKDE,NAVSILNEARDKDEEMVVSEIGKL,SILNEARDKDEEMVVSEIGKLVEY,KLNAVSILNEARDKDEEMVVSEIG,EARDKDEEMVVSEIGKLVEYNRRK,SFQKEKEEKKLNAVSILNEARDKD,KKLNAVSILNEARDKDEEMVVSEI,NEARDKDEEMVVSEIGKLVEYNRR,EKKLNAVSILNEARDKDEEMVVSE,EEKKLNAVSILNEARDKDEEMVVS,KEEKKLNAVSILNEARDKDEEMVV,EKEEKKLNAVSILNEARDKDEEMV,KEKEEKKLNAVSILNEARDKDEEM,QKEKEEKKLNAVSILNEARDKDEE


>A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936.19;Terpene_synth_C;
before match: LVFARQDFNHCQALHQKEFQQLERWYKDCGLDRLNYGRNVLHVAHFIASTVIGDPQLVDARMSYAKHVVLVTRIDDFFDHHGSREESHKILELVKEWKEKPAVDYGSQEVEILFSAVYRTVNELAEKASVEQGRCVKHHLINLWVQILTSFVRELDTWCDDAAMTLDEYLSFAWVSIGCRICILTSIHFMGIRLSDDMILGEECTNLCIHVSTVNRLLNDLQ

matching: SFQKEKEEKKLNAVSILNEARDKDEEMVVSEIGKLVEYNRRK

after match: 
Match range (1-based): 223-264
-
>BHF
before match: MVHDTEQLLA

matching: QGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRR

after match: TKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKRIRTVMKATWQSLQAGARSQTAFLNPQGAVSAALVQNRR
Match range (1-based): 11-52


--- A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.20;ZnuA;, intersect_bp: 30 ---


kmer_hp,hhphhphppppppppppphhpppp,hhphppppppppppphhpppppph,hphhphppppppppppphhppppp,hphppppppppppphhpppppphp,hpphhphhphppppppppppphhp,hppppppppppphhpppppphphp,phhphhphppppppppppphhppp,phhphppppppppppphhpppppp,phppppppppppphhpppppphph,pphhphhphppppppppppphhpp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
botryllus,ILTALHLKKRRTKHDHQKLLSESQ,ALHLKKRRTKHDHQKLLSESQEHI,LTALHLKKRRTKHDHQKLLSESQE,LHLKKRRTKHDHQKLLSESQEHID,LCRILTALHLKKRRTKHDHQKLLS,LKKRRTKHDHQKLLSESQEHIDAS,RILTALHLKKRRTKHDHQKLLSES,TALHLKKRRTKHDHQKLLSESQEH,HLKKRRTKHDHQKLLSESQEHIDA,CRILTALHLKKRRTKHDHQKLLSE
pfam,GIEMMEGSEEEHEEEDDHAVEEEH,MMEGSEEEHEEEDDHAVEEEHDHG,IEMMEGSEEEHEEEDDHAVEEEHD,MEGSEEEHEEEDDHAVEEEHDHGE,ASKGIEMMEGSEEEHEEEDDHAVE,GSEEEHEEEDDHAVEEEHDHGELD,KGIEMMEGSEEEHEEEDDHAVEEE,EMMEGSEEEHEEEDDHAVEEEHDH,EGSEEEHEEEDDHAVEEEHDHGEL,SKGIEMMEGSEEEHEEEDDHAVEE


>A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.20;ZnuA;
before match: VKTSFYPIYEFTRNVAGDLADVENLVPAGVEPHDWEPTPQDMTGITDADVLIYNGAGMEGWIEQVLDSAGDHLIAVE

matching: ASKGIEMMEGSEEEHEEEDDHAVEEEHDHGELD

after match: PHVWLSPRLAIQEVRNIEVALAKAAPQHADAFKANADAYVTKLEILDQDFRDELKDTLRKDFITQHAAFGYLAKEYGLTQVPIAGLSPEQEPSAAQMAEVVKFAKDHNVKTIFFETLVSSKVADTIAAEIGAKSAVLNPVEGLTEEDISNNLDYVVVMKQNLESLKTA
Match range (1-based): 78-110
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGI

matching: LCRILTALHLKKRRTKHDHQKLLSESQEHIDAS

after match: TNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKRIRTVMKATWQSLQAGARSQTAFLNPQGAVSAALVQNRR
Match range (1-based): 39-71


--- A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22;Chitin_bind_1;, intersect_bp: 20 ---


kmer_hp,hppppphhhphpppppppphphhp,pphhhphpppppppphphhphhpp,ppphhhphpppppppphphhphhp,pppphhhphpppppppphphhphh,ppppphhhphpppppppphphhph
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
botryllus,VHDTEQLLAQGHHEEETECGKYGK,EQLLAQGHHEEETECGKYGKLPKK,TEQLLAQGHHEEETECGKYGKLPK,DTEQLLAQGHHEEETECGKYGKLP,HDTEQLLAQGHHEEETECGKYGKL
pfam,GQCCSKYGYCGTSSDHCSKYCIPE,SKYGYCGTSSDHCSKYCIPEYGDC,CSKYGYCGTSSDHCSKYCIPEYGD,CCSKYGYCGTSSDHCSKYCIPEYG,QCCSKYGYCGTSSDHCSKYCIPEY


>A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22;Chitin_bind_1;
before match: SKDGRCGPNFGICPS

matching: GQCCSKYGYCGTSSDHCSKYCIPEYGDC

after match: SN
Match range (1-based): 16-43
-
>BHF
before match: M

matching: VHDTEQLLAQGHHEEETECGKYGKLPKK

after match: GSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKRIRTVMKATWQSLQAGARSQTAFLNPQGAVSAALVQNRR
Match range (1-based): 2-29


--- B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protamine_like;, intersect_bp: 20 ---


kmer_hp,hhpphhpphppphhhhpppppppp,hhpphppphhhhppppppppphhh,hpphhpphppphhhhppppppppp,hpphppphhhhppppppppphhhp,phhpphppphhhhppppppppphh,pphhpphppphhhhppppppppph
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
botryllus,FGERLIDRAQNKYAPLDEKQRSES,LIDRAQNKYAPLDEKQRSESHGAG,GERLIDRAQNKYAPLDEKQRSESH,IDRAQNKYAPLDEKQRSESHGAGE,RLIDRAQNKYAPLDEKQRSESHGA,ERLIDRAQNKYAPLDEKQRSESHG
pfam,PAQKVACDLKSDAAGGQQRSCQRQ,VACDLKSDAAGGQQRSCQRQSPYA,AQKVACDLKSDAAGGQQRSCQRQS,ACDLKSDAAGGQQRSCQRQSPYAR,KVACDLKSDAAGGQQRSCQRQSPY,QKVACDLKSDAAGGQQRSCQRQSP


>B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protamine_like;
before match: MGQKRHRTYCPPTYKRQKVARITNNGYLNFMTEYKKRFYGLSPQDMVHYAAKQWTQLSSAEKEAFKSKKPPTVVFKG

matching: PAQKVACDLKSDAAGGQQRSCQRQSPYAR

after match: SRESERRLSRSKTSCKSA
Match range (1-based): 78-106
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDR

matching: FGERLIDRAQNKYAPLDEKQRSESHGAGE

after match: YLKHQWKGQGAKKARKRIRTVMKATWQSLQAGARSQTAFLNPQGAVSAALVQNRR
Match range (1-based): 169-197


--- A0A3P3D7T7_9RHOB/8-324 A0A3P3D7T7.1 PF00389.33;2-Hacid_dh;, intersect_bp: 10 ---


kmer_hp,hppphpphhphphpphphhhpppp,phpphhphphpphphhhpppphhh,pphpphhphphpphphhhpppphh,ppphpphhphphpphphhhpppph
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
botryllus,ARKRIRTVMKATWQSLQAGARSQT,RIRTVMKATWQSLQAGARSQTAFL,KRIRTVMKATWQSLQAGARSQTAF,RKRIRTVMKATWQSLQAGARSQTA
pfam,VETRMSELFNVRLREGDAAMSREE,RMSELFNVRLREGDAAMSREELAA,TRMSELFNVRLREGDAAMSREELA,ETRMSELFNVRLREGDAAMSREEL


>A0A3P3D7T7_9RHOB/8-324 A0A3P3D7T7.1 PF00389.33;2-Hacid_dh;
before match: VVVTRRLPEA

matching: VETRMSELFNVRLREGDAAMSREELAA

after match: ALREADVLVPCVADRIDAALLAHAGPQLRLIANYGAGVDHIDVATARQRGIVVSHTPGVLSEDTADMTLALILGVLRRLPEGLQDMASGNWQGWSPMAHLGRRISGLRLGILGMGQVGLAVARRARAFGMQIHYHNRRRLRPEVEGAVEATYWESLDQMLARMDVISVNCPHTPSTFHLLNARRLKLLKPSAVIVNTSRGEVMDENALLRGLKAGEIAGAGLDVYEHGHEITPGLRDLSQVVLLPHMGSATLEGRVEMGEKVILNIKTFADGHRPPDRVI
Match range (1-based): 11-37
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKK

matching: ARKRIRTVMKATWQSLQAGARSQTAFL

after match: NPQGAVSAALVQNRR
Match range (1-based): 211-237


--- R5Z925_9FIRM/1-181 R5Z925.1 PF12729.10;4HB_MCP_1;, intersect_bp: 10 ---


kmer_hp,hppphphphhpphppphpphhphp
species,Unnamed: 1_level_1
botryllus,LKHQWKGQGAKKARKRIRTVMKAT
pfam,IKTKAEAKIYDDVNSKLKDYWKIE


>R5Z925_9FIRM/1-181 R5Z925.1 PF12729.10;4HB_MCP_1;
before match: MMNIQTRLKKGFRFTTILTALAGGLAIIVLGIMSTQYSDALKYYGFSQGDIGKAMVAFTETRSCTRGLIGYKDLAVLCTLSDDHDTKKESFEKYWSEVGDT

matching: IKTKAEAKIYDDVNSKLKDYWKIE

after match: KEIKKLGLNIADPGAQKKAERRASAELAPAYNEIYKGMVSLMDKKVTEGDALKARL
Match range (1-based): 102-125
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEY

matching: LKHQWKGQGAKKARKRIRTVMKAT

after match: WQSLQAGARSQTAFLNPQGAVSAALVQNRR
Match range (1-based): 199-222


--- E3LLJ4_CAERE/4-302 E3LLJ4.1 PF10318.12;7TM_GPCR_Srh;, intersect_bp: 10 ---


kmer_hp,hhpphpphpppphpphphhhhphp,hpphpphpppphpphphhhhphph,pphpphpppphpphphhhhphphh
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
botryllus,FATTYEKFDNDSLCSVDLIPVDIE,ATTYEKFDNDSLCSVDLIPVDIEF,TTYEKFDNDSLCSVDLIPVDIEFW
pfam,IFSTVSKLHNQTANNLSILVLSLN,FSTVSKLHNQTANNLSILVLSLNG,STVSKLHNQTANNLSILVLSLNGI


>E3LLJ4_CAERE/4-302 E3LLJ4.1 PF10318.12;7TM_GPCR_Srh;
before match: ETPNFVSNSLHLISIITTPIHLIGFYCILFKTPESMKSVKWGMFHVHFWCTLMDWSLTVITIPYLLSPVAAGVPLGFANVLGISTDFQCYFALTSVAAQGMAFVLIFENRYFLIFARNTSWRYIRVVFIIINYCAVFCVFIPLLTMIPEQTEARKAVLKILPDLPEALDVKLIFVLSTDISYILISAVFMESFLSTEAAIFVVLLWTNFKLTRSAQHSLKTMKLQKKFLLAMYIQAAVMFFNLVIPVSYF

matching: IFSTVSKLHNQTANNLSILVLSLNGI

after match: ASTTIMLWVHKPFREACFDLFRC
Match range (1-based): 251-276
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANS

matching: FATTYEKFDNDSLCSVDLIPVDIEFW

after match: DMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKRIRTVMKATWQSLQAGARSQTAFLNPQGAVSAALVQNRR
Match range (1-based): 118-143

--- BHF_isoform2 ---


--- A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;, intersect_bp: 30 ---


kmer_hp,hphpppppppphpppppppphhpp,hpppppppphpppppppphhpppp,phphpppppppphpppppppphhp,phpppppppphpppppppphhppp,pphphpppppppphpppppppphh,ppphphpppppppphpppppppph,ppphpppppppphhpppppppphh,pppphpppppppphhpppppppph,ppppphpppppppphhpppppppp,pppppphpppppppphhppppppp,ppppppphpppppppphhpppppp,pppppppphpppppppphhppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
botryllus,IDASTNKTKKKAKKDKRKNKPPKK,ASTNKTKKKAKKDKRKNKPPKKDS,HIDASTNKTKKKAKKDKRKNKPPK,DASTNKTKKKAKKDKRKNKPPKKD,EHIDASTNKTKKKAKKDKRKNKPP,QEHIDASTNKTKKKAKKDKRKNKP,KKKAKKDKRKNKPPKKDSETSKPA,TKKKAKKDKRKNKPPKKDSETSKP,KTKKKAKKDKRKNKPPKKDSETSK,NKTKKKAKKDKRKNKPPKKDSETS,TNKTKKKAKKDKRKNKPPKKDSET,STNKTKKKAKKDKRKNKPPKKDSE
pfam,GEGSEEDSHSDVNSEEESEEGGEE,GSEEDSHSDVNSEEESEEGGEEEE,DGEGSEEDSHSDVNSEEESEEGGE,EGSEEDSHSDVNSEEESEEGGEEE,EDGEGSEEDSHSDVNSEEESEEGG,EEDGEGSEEDSHSDVNSEEESEEG,HSDVNSEEESEEGGEEEEEDEQPA,SHSDVNSEEESEEGGEEEEEDEQP,DSHSDVNSEEESEEGGEEEEEDEQ,EDSHSDVNSEEESEEGGEEEEEDE,EEDSHSDVNSEEESEEGGEEEEED,SEEDSHSDVNSEEESEEGGEEEEE


>A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;
before match: KTSKEIKNNPFEVKINRKKFDILGRKSKHDVGLPGVSRSKAINKRKETLLKEYKTKDKSNKFIDKRFGEYDTKMDPEEKILQRFSMERQRTQDKKNMYNLNEEEELTHYGQSLAEMEKLTDMVDSDDDADERGLLSAEMTASHFGGGGLLMKKTPGEQDEGGKQRAKSRQELIEELIIKSKQEKRERQTQKEESQVLTEKLDQDWKSIQGLLAHKNAPKADRAEDEDKPKLDEYDMMVRELGFEMKAAPSEKLKTPEEVAREEQERLQQLEADRLRRMMGDIVEDNTKAPTHMSADDINDGFVLDGDDRQTLAYQVRDNINRLFTEGEKIEGEEGEKEESGEEEESDEEESEQEG

matching: EEDGEGSEEDSHSDVNSEEESEEGGEEEEEDEQPA

after match: AVPQLSEKVQAEAAKIELPYTFTVPECYRDLKSLLQGHPADHQCIILARTQQCNHPSLGIGNKLKIQVCYLIDIVCIDLLQLSNIFILYDLCQLFPETASKGLQTTLGDDAHSMEEVLEVKGRAAFPKLDMLIYLKITALLFPTSDFRHPVTTPALLYISQALTKCPVTSLKDLTSGLVLCCLALEYVSFSKRFVPELINFLLGALHLAVTDKTTGYSVVPPFRRSGKASDLLVLSSPESSQTWCKKTLPLSATQTLTLTSDLDSHMKLSILATCLDLLKRCFSLYRELPSCLHIFQPIATLLSKHLPIKTYPAALQVSGPALVSRKKKPIPLKLFTPRIVQVLDYGKKRGNTKEEREKERLKHKYKKEFKGALREIRKDSRFLGREKLNEVMARDAERKRKVKELYGSLATQEGE
Match range (1-based): 356-390
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILT

kmer_hp,hhphhpphpppppphhhpphhphh,hhpphpppppphhhpphhphhphp,hphhphhpphpppppphhhpphhp,hphhpphpppppphhhpphhphhp,hpphpppppphhhpphhphhphpp,hpppppppphphhphhpphppppp,phhphhpphpppppphhhpphhph,phhpphpppppphhhpphhphhph,phphhphhpphpppppphhhpphh,phpppppphhhpphhphhphpppp,phpppppppphphhphhpphpppp,pphphhphhpphpppppphhhpph,pphpppppphhhpphhphhphppp,ppphphhphhpphpppppphhhpp,pppphphhphhpphpppppphhhp,ppppphphhphhpphpppppphhh,pppppphphhphhpphpppppphh,ppppppphphhphhpphpppppph,pppppppphphhphhpphpppppp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
botryllus,YGKLPKKGSECKKHGILCRILTAL,LPKKGSECKKHGILCRILTALHLK,GKYGKLPKKGSECKKHGILCRILT,GKLPKKGSECKKHGILCRILTALH,PKKGSECKKHGILCRILTALHLKK,GHHEEETECGKYGKLPKKGSECKK,KYGKLPKKGSECKKHGILCRILTA,KLPKKGSECKKHGILCRILTALHL,CGKYGKLPKKGSECKKHGILCRIL,KGSECKKHGILCRILTALHLKKRR,QGHHEEETECGKYGKLPKKGSECK,ECGKYGKLPKKGSECKKHGILCRI,KKGSECKKHGILCRILTALHLKKR,TECGKYGKLPKKGSECKKHGILCR,ETECGKYGKLPKKGSECKKHGILC,EETECGKYGKLPKKGSECKKHGIL,EEETECGKYGKLPKKGSECKKHGI,HEEETECGKYGKLPKKGSECKKHG,HHEEETECGKYGKLPKKGSECKKH
pfam,AVSILNEARDKDEEMVVSEIGKLV,ILNEARDKDEEMVVSEIGKLVEYN,LNAVSILNEARDKDEEMVVSEIGK,VSILNEARDKDEEMVVSEIGKLVE,LNEARDKDEEMVVSEIGKLVEYNR,FQKEKEEKKLNAVSILNEARDKDE,NAVSILNEARDKDEEMVVSEIGKL,SILNEARDKDEEMVVSEIGKLVEY,KLNAVSILNEARDKDEEMVVSEIG,EARDKDEEMVVSEIGKLVEYNRRK,SFQKEKEEKKLNAVSILNEARDKD,KKLNAVSILNEARDKDEEMVVSEI,NEARDKDEEMVVSEIGKLVEYNRR,EKKLNAVSILNEARDKDEEMVVSE,EEKKLNAVSILNEARDKDEEMVVS,KEEKKLNAVSILNEARDKDEEMVV,EKEEKKLNAVSILNEARDKDEEMV,KEKEEKKLNAVSILNEARDKDEEM,QKEKEEKKLNAVSILNEARDKDEE


>A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936.19;Terpene_synth_C;
before match: LVFARQDFNHCQALHQKEFQQLERWYKDCGLDRLNYGRNVLHVAHFIASTVIGDPQLVDARMSYAKHVVLVTRIDDFFDHHGSREESHKILELVKEWKEKPAVDYGSQEVEILFSAVYRTVNELAEKASVEQGRCVKHHLINLWVQILTSFVRELDTWCDDAAMTLDEYLSFAWVSIGCRICILTSIHFMGIRLSDDMILGEECTNLCIHVSTVNRLLNDLQ

matching: SFQKEKEEKKLNAVSILNEARDKDEEMVVSEIGKLVEYNRRK

after match: 
Match range (1-based): 223-264
-
>BHF
before match: MVHDTEQLLA

matching: QGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRR

after match: TKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKVRSNGV
Match range (1-based): 11-52


--- A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.20;ZnuA;, intersect_bp: 30 ---


kmer_hp,hhphhphppppppppppphhpppp,hhphppppppppppphhpppppph,hphhphppppppppppphhppppp,hphppppppppppphhpppppphp,hpphhphhphppppppppppphhp,hppppppppppphhpppppphphp,phhphhphppppppppppphhppp,phhphppppppppppphhpppppp,phppppppppppphhpppppphph,pphhphhphppppppppppphhpp
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
botryllus,ILTALHLKKRRTKHDHQKLLSESQ,ALHLKKRRTKHDHQKLLSESQEHI,LTALHLKKRRTKHDHQKLLSESQE,LHLKKRRTKHDHQKLLSESQEHID,LCRILTALHLKKRRTKHDHQKLLS,LKKRRTKHDHQKLLSESQEHIDAS,RILTALHLKKRRTKHDHQKLLSES,TALHLKKRRTKHDHQKLLSESQEH,HLKKRRTKHDHQKLLSESQEHIDA,CRILTALHLKKRRTKHDHQKLLSE
pfam,GIEMMEGSEEEHEEEDDHAVEEEH,MMEGSEEEHEEEDDHAVEEEHDHG,IEMMEGSEEEHEEEDDHAVEEEHD,MEGSEEEHEEEDDHAVEEEHDHGE,ASKGIEMMEGSEEEHEEEDDHAVE,GSEEEHEEEDDHAVEEEHDHGELD,KGIEMMEGSEEEHEEEDDHAVEEE,EMMEGSEEEHEEEDDHAVEEEHDH,EGSEEEHEEEDDHAVEEEHDHGEL,SKGIEMMEGSEEEHEEEDDHAVEE


>A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.20;ZnuA;
before match: VKTSFYPIYEFTRNVAGDLADVENLVPAGVEPHDWEPTPQDMTGITDADVLIYNGAGMEGWIEQVLDSAGDHLIAVE

matching: ASKGIEMMEGSEEEHEEEDDHAVEEEHDHGELD

after match: PHVWLSPRLAIQEVRNIEVALAKAAPQHADAFKANADAYVTKLEILDQDFRDELKDTLRKDFITQHAAFGYLAKEYGLTQVPIAGLSPEQEPSAAQMAEVVKFAKDHNVKTIFFETLVSSKVADTIAAEIGAKSAVLNPVEGLTEEDISNNLDYVVVMKQNLESLKTA
Match range (1-based): 78-110
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGI

matching: LCRILTALHLKKRRTKHDHQKLLSESQEHIDAS

after match: TNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKVRSNGV
Match range (1-based): 39-71


--- A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22;Chitin_bind_1;, intersect_bp: 20 ---


kmer_hp,hppppphhhphpppppppphphhp,pphhhphpppppppphphhphhpp,ppphhhphpppppppphphhphhp,pppphhhphpppppppphphhphh,ppppphhhphpppppppphphhph
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
botryllus,VHDTEQLLAQGHHEEETECGKYGK,EQLLAQGHHEEETECGKYGKLPKK,TEQLLAQGHHEEETECGKYGKLPK,DTEQLLAQGHHEEETECGKYGKLP,HDTEQLLAQGHHEEETECGKYGKL
pfam,GQCCSKYGYCGTSSDHCSKYCIPE,SKYGYCGTSSDHCSKYCIPEYGDC,CSKYGYCGTSSDHCSKYCIPEYGD,CCSKYGYCGTSSDHCSKYCIPEYG,QCCSKYGYCGTSSDHCSKYCIPEY


>A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22;Chitin_bind_1;
before match: SKDGRCGPNFGICPS

matching: GQCCSKYGYCGTSSDHCSKYCIPEYGDC

after match: SN
Match range (1-based): 16-43
-
>BHF
before match: M

matching: VHDTEQLLAQGHHEEETECGKYGKLPKK

after match: GSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKVRSNGV
Match range (1-based): 2-29


--- B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protamine_like;, intersect_bp: 20 ---


kmer_hp,hhpphhpphppphhhhpppppppp,hhpphppphhhhppppppppphhh,hpphhpphppphhhhppppppppp,hpphppphhhhppppppppphhhp,phhpphppphhhhppppppppphh,pphhpphppphhhhppppppppph
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
botryllus,FGERLIDRAQNKYAPLDEKQRSES,LIDRAQNKYAPLDEKQRSESHGAG,GERLIDRAQNKYAPLDEKQRSESH,IDRAQNKYAPLDEKQRSESHGAGE,RLIDRAQNKYAPLDEKQRSESHGA,ERLIDRAQNKYAPLDEKQRSESHG
pfam,PAQKVACDLKSDAAGGQQRSCQRQ,VACDLKSDAAGGQQRSCQRQSPYA,AQKVACDLKSDAAGGQQRSCQRQS,ACDLKSDAAGGQQRSCQRQSPYAR,KVACDLKSDAAGGQQRSCQRQSPY,QKVACDLKSDAAGGQQRSCQRQSP


>B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protamine_like;
before match: MGQKRHRTYCPPTYKRQKVARITNNGYLNFMTEYKKRFYGLSPQDMVHYAAKQWTQLSSAEKEAFKSKKPPTVVFKG

matching: PAQKVACDLKSDAAGGQQRSCQRQSPYAR

after match: SRESERRLSRSKTSCKSA
Match range (1-based): 78-106
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANSFATTYEKFDNDSLCSVDLIPVDIEFWDMENEPVDQLPHEIXESVHMYGDDR

matching: FGERLIDRAQNKYAPLDEKQRSESHGAGE

after match: YLKHQWKGQGAKKARKVRSNGV
Match range (1-based): 169-197


--- E3LLJ4_CAERE/4-302 E3LLJ4.1 PF10318.12;7TM_GPCR_Srh;, intersect_bp: 10 ---


kmer_hp,hhpphpphpppphpphphhhhphp,hpphpphpppphpphphhhhphph,pphpphpppphpphphhhhphphh
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
botryllus,FATTYEKFDNDSLCSVDLIPVDIE,ATTYEKFDNDSLCSVDLIPVDIEF,TTYEKFDNDSLCSVDLIPVDIEFW
pfam,IFSTVSKLHNQTANNLSILVLSLN,FSTVSKLHNQTANNLSILVLSLNG,STVSKLHNQTANNLSILVLSLNGI


>E3LLJ4_CAERE/4-302 E3LLJ4.1 PF10318.12;7TM_GPCR_Srh;
before match: ETPNFVSNSLHLISIITTPIHLIGFYCILFKTPESMKSVKWGMFHVHFWCTLMDWSLTVITIPYLLSPVAAGVPLGFANVLGISTDFQCYFALTSVAAQGMAFVLIFENRYFLIFARNTSWRYIRVVFIIINYCAVFCVFIPLLTMIPEQTEARKAVLKILPDLPEALDVKLIFVLSTDISYILISAVFMESFLSTEAAIFVVLLWTNFKLTRSAQHSLKTMKLQKKFLLAMYIQAAVMFFNLVIPVSYF

matching: IFSTVSKLHNQTANNLSILVLSLNGI

after match: ASTTIMLWVHKPFREACFDLFRC
Match range (1-based): 251-276
-
>BHF
before match: MVHDTEQLLAQGHHEEETECGKYGKLPKKGSECKKHGILCRILTALHLKKRRTKHDHQKLLSESQEHIDASTNKTKKKAKKDKRKNKPPKKDSETSKPAQTTISRLPSNRNNNNANS

matching: FATTYEKFDNDSLCSVDLIPVDIEFW

after match: DMENEPVDQLPHEIXESVHMYGDDRFGERLIDRAQNKYAPLDEKQRSESHGAGEYLKHQWKGQGAKKARKVRSNGV
Match range (1-based): 118-143


In [82]:
# Show the complete `name` string of every row in `df` as a NumPy array.
# `.to_numpy()` is the pandas-recommended accessor; unlike `.values`, it always
# returns an ndarray regardless of the column's dtype.
df["name"].to_numpy()

array(['A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;',
       'A0A022QCL9_ERYGU/105-368 A0A022QCL9.1 PF03936.19;Terpene_synth_C;',
       'A0A098M3F2_9BACL/57-334 A0A098M3F2.1 PF01297.20;ZnuA;',
       'A0A1Y2CQP8_9FUNG/32-76 A0A1Y2CQP8.1 PF00187.22;Chitin_bind_1;',
       'B4HZG1_DROSE/1-124 B4HZG1.1 PF06382.14;Protamine_like;',
       'E3LLJ4_CAERE/4-302 E3LLJ4.1 PF10318.12;7TM_GPCR_Srh;'],
      dtype=object)

In [70]:
# Post-mortem for the ValueError ("Wrong number of items passed ...") raised at
# `tidy["kmer_seq"] = tidy.apply(...)` inside kmer_utils.subset_gene_kmers:
# in the session below, `tidy` is an empty DataFrame at that point.
# NOTE(review): the failing loop passed `gene_symbol=row.name`; for a row coming
# from `df.iterrows()` that attribute is presumably the row's index label, not
# the "name" column (`row['name']` returned the full Pfam name string), which
# would explain the empty subset -- confirm and pass `row['name']` instead.
# TODO: drop this interactive debugging cell once the issue is resolved.
%debug

> [0;32m/Users/olgabot/opt/miniconda3/envs/botryllus/lib/python3.10/site-packages/pandas/core/internals/blocks.py[0m(1979)[0;36mcheck_ndim[0;34m()[0m
[0;32m   1977 [0;31m            )
[0m[0;32m   1978 [0;31m        [0;32mif[0m [0mlen[0m[0;34m([0m[0mplacement[0m[0;34m)[0m [0;34m!=[0m [0mlen[0m[0;34m([0m[0mvalues[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1979 [0;31m            raise ValueError(
[0m[0;32m   1980 [0;31m                [0;34mf"Wrong number of items passed {len(values)}, "[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1981 [0;31m                [0;34mf"placement implies {len(placement)}"[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/Users/olgabot/opt/miniconda3/envs/botryllus/lib/python3.10/site-packages/pandas/core/internals/blocks.py[0m(1937)[0;36mnew_block[0;34m()[0m
[0;32m   1935 [0;31m[0;34m[0m[0m
[0m[0;32m   1936 [0;31m    [0mvalues[0m[0;34m,[0m [0m_[0m [0;34m=[0m [0mextract_pandas_array[0m[0;34m([0m[0mvalues[0m[0;34m,[0m [0;32mNone[0m[0;34m,[0m [0mndim[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1937 [0;31m    [0mcheck_ndim[0m[0;34m([0m[0mvalues[0m[0;34m,[0m [0mplacement[0m[0;34m,[0m [0mndim[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1938 [0;31m[0;34m[0m[0m
[0m[0;32m   1939 [0;31m    [0;32mif[0m [0mklass[0m [0;32mis[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/Users/olgabot/opt/miniconda3/envs/botryllus/lib/python3.10/site-packages/pandas/core/internals/managers.py[0m(1162)[0;36minsert[0;34m()[0m
[0;32m   1160 [0;31m            [0mvalue[0m [0;34m=[0m [0mensure_block_shape[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0mndim[0m[0;34m=[0m[0mself[0m[0;34m.[0m[0mndim[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1161 [0;31m[0;34m[0m[0m
[0m[0;32m-> 1162 [0;31m        [0mblock[0m [0;34m=[0m [0mnew_block[0m[0;34m([0m[0mvalues[0m[0;34m=[0m[0mvalue[0m[0;34m,[0m [0mndim[0m[0;34m=[0m[0mself[0m[0;34m.[0m[0mndim[0m[0;34m,[0m [0mplacement[0m[0;34m=[0m[0mslice[0m[0;34m([0m[0mloc[0m[0;34m,[0m [0mloc[0m [0;34m+[0m [0;36m1[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1163 [0;31m[0;34m[0m[0m
[0m[0;32m   1164 [0;31m        [0;32mfor[0m [0mblkno[0m[0;34m,[0m [0mcount[0m [0;32min[0m [0m_fast_count_smallints[0m[0;34m([0m[0mself[0m[

ipdb>  u


> [0;32m/Users/olgabot/opt/miniconda3/envs/botryllus/lib/python3.10/site-packages/pandas/core/frame.py[0m(3754)[0;36m_set_item_mgr[0;34m()[0m
[0;32m   3752 [0;31m        [0;32mexcept[0m [0mKeyError[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3753 [0;31m            [0;31m# This item wasn't present, just insert at end[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 3754 [0;31m            [0mself[0m[0;34m.[0m[0m_mgr[0m[0;34m.[0m[0minsert[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mself[0m[0;34m.[0m[0m_info_axis[0m[0;34m)[0m[0;34m,[0m [0mkey[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3755 [0;31m        [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3756 [0;31m            [0mself[0m[0;34m.[0m[0m_iset_item_mgr[0m[0;34m([0m[0mloc[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  dir()


['key', 'self', 'value']


ipdb>  u


> [0;32m/Users/olgabot/opt/miniconda3/envs/botryllus/lib/python3.10/site-packages/pandas/core/frame.py[0m(3742)[0;36m_set_item_frame_value[0;34m()[0m
[0;32m   3740 [0;31m        [0;31m# now align rows[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3741 [0;31m        [0marraylike[0m [0;34m=[0m [0m_reindex_for_setitem[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0mself[0m[0;34m.[0m[0mindex[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 3742 [0;31m        [0mself[0m[0;34m.[0m[0m_set_item_mgr[0m[0;34m([0m[0mkey[0m[0;34m,[0m [0marraylike[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3743 [0;31m[0;34m[0m[0m
[0m[0;32m   3744 [0;31m    [0;32mdef[0m [0m_iset_item_mgr[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mloc[0m[0;34m:[0m [0mint[0m [0;34m|[0m [0mslice[0m [0;34m|[0m [0mnp[0m[0;34m.[0m[0mndarray[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  u


> [0;32m/Users/olgabot/opt/miniconda3/envs/botryllus/lib/python3.10/site-packages/pandas/core/frame.py[0m(3602)[0;36m__setitem__[0;34m()[0m
[0;32m   3600 [0;31m            [0mself[0m[0;34m.[0m[0m_setitem_array[0m[0;34m([0m[0mkey[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3601 [0;31m        [0;32melif[0m [0misinstance[0m[0;34m([0m[0mvalue[0m[0;34m,[0m [0mDataFrame[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 3602 [0;31m            [0mself[0m[0;34m.[0m[0m_set_item_frame_value[0m[0;34m([0m[0mkey[0m[0;34m,[0m [0mvalue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   3603 [0;31m        elif (
[0m[0;32m   3604 [0;31m            [0mis_list_like[0m[0;34m([0m[0mvalue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  dir()


['indexer', 'key', 'self', 'value']


ipdb>  a


self = Empty DataFrame
Columns: [kmer__pfam, kmer_hp, kmer__botryllus, species]
Index: []
key = 'kmer_seq'
value = Empty DataFrame
Columns: [kmer__pfam, kmer_hp, kmer__botryllus, species]
Index: []


ipdb>  u


> [0;32m/Users/olgabot/code/botryllus-mhc/notebooks/kmer_utils.py[0m(69)[0;36msubset_gene_kmers[0;34m()[0m
[0;32m     67 [0;31m        [0;32mlambda[0m [0mx[0m[0;34m:[0m [0;34m"botryllus"[0m [0;32mif[0m [0mpd[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0mx[0m[0;34m)[0m [0;32melse[0m [0mother[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     68 [0;31m    )
[0m[0;32m---> 69 [0;31m    tidy["kmer_seq"] = tidy.apply(
[0m[0;32m     70 [0;31m        [0;32mlambda[0m [0mx[0m[0;34m:[0m [0mx[0m[0;34m[[0m[0mkmer_other[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     71 [0;31m        [0;32mif[0m [0mpd[0m[0;34m.[0m[0misnull[0m[0;34m([0m[0mx[0m[0;34m[[0m[0;34m"kmer__botryllus"[0m[0;34m][0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  dir()


['col', 'gene_subset', 'gene_symbol', 'kmer_other', 'merged_kmers', 'other', 'tidy']


ipdb>  tidy.head()


Empty DataFrame
Columns: [kmer__pfam, kmer_hp, kmer__botryllus, species]
Index: []


ipdb>  a


merged_kmers =           pfam_name                kmer__pfam                   kmer_hp  \
0        2-Hacid_dh  VEGVQWVNTLTDHNDKAALNTLLE  hphhphhpphpppppphhhpphhp   
1        2-Hacid_dh  VEGVQWVNTLTDHNDKAALNTLLE  hphhphhpphpppppphhhpphhp   
2      2-Hacid_dh_C  VEGVQWVNTLTDHNDKAALNTLLE  hphhphhpphpppppphhhpphhp   
3      2-Hacid_dh_C  VEGVQWVNTLTDHNDKAALNTLLE  hphhphhpphpppppphhhpphhp   
4            AAA_10  INPLSFIDSINETNRKLGISKFVD  hphhphhpphpppppphhhpphhp   
...             ...                       ...                       ...   
14824         PQQ_2  EQQNNQTAPTQTDQCIEPDNHSNS  ppppppphhpppppphphpppppp   
14825         PQQ_2  ERENNQTVPTHSDKCVEPDNHSNS  ppppppphhpppppphphpppppp   
14826         PQQ_2  ERENNQTVPTHSDKCVEPDNHSNS  ppppppphhpppppphphpppppp   
14827           UCH  STENGVCTHKSDKSLLTDHLTEIF  pppphhpppppppphhppphpphh   
14828           UCH  STENGVCTHKSDKSLLTDHLTEIF  pppphhpppppppphhppphpphh   

                                0             1               hashval  \
0          

ipdb>  u


> [0;32m/var/folders/7z/r1593ybs1sj2ks5zzl9vy8840000gn/T/ipykernel_58253/836512331.py[0m(4)[0;36m<module>[0;34m()[0m
[0;32m      2 [0;31m    [0mprint[0m[0;34m([0m[0;34mf'--- {isoform} ---'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      3 [0;31m    [0;32mfor[0m [0mi[0m[0;34m,[0m [0mrow[0m [0;32min[0m [0mdf[0m[0;34m.[0m[0miterrows[0m[0;34m([0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m----> 4 [0;31m        pivoted = kmer_utils.subset_gene_kmers(
[0m[0;32m      5 [0;31m            [0mbhf_pfam_kmers[0m[0;34m,[0m [0mcol[0m[0;34m=[0m[0;34m"name__pfam"[0m[0;34m,[0m [0mgene_symbol[0m[0;34m=[0m[0mrow[0m[0;34m.[0m[0mname[0m[0;34m,[0m [0mother[0m[0;34m=[0m[0;34m"pfam"[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      6 [0;31m        )
[0m


ipdb>  row['name']


'A0A6Q2YJA0_ESOLU/18-823 A0A6Q2YJA0.1 PF04147.15;Nop14;'


ipdb>  q


In [69]:
# Preview the first five rows of the merged Pfam/botryllus k-mer table.
bhf_pfam_kmers.head(n=5)

Unnamed: 0,pfam_name,kmer__pfam,kmer_hp,0,1,hashval,i__pfam,name__pfam,pfam_id,pfam_id_full,i__botryllus,kmer__botryllus,name__botryllus
0,2-Hacid_dh,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/5-309,F3L153.1,6861319682135420868,99,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform1
1,2-Hacid_dh,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/5-309,F3L153.1,6861319682135420868,99,F3L153_9GAMM/5-309 F3L153.1 PF00389.33;2-Hacid...,PF00389,PF00389.33,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform2
2,2-Hacid_dh_C,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/98-277,F3L153.1,6861319682135420868,6,F3L153_9GAMM/98-277 F3L153.1 PF02826.22;2-Haci...,PF02826,PF02826.22,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform1
3,2-Hacid_dh_C,VEGVQWVNTLTDHNDKAALNTLLE,hphhphhpphpppppphhhpphhp,F3L153_9GAMM/98-277,F3L153.1,6861319682135420868,6,F3L153_9GAMM/98-277 F3L153.1 PF02826.22;2-Haci...,PF02826,PF02826.22,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform2
4,AAA_10,INPLSFIDSINETNRKLGISKFVD,hphhphhpphpppppphhhpphhp,F4L4C0_HALH1/172-500,F4L4C0.1,6861319682135420868,80,F4L4C0_HALH1/172-500 F4L4C0.1 PF12846.10;AAA_10;,PF12846,PF12846.10,20,GKYGKLPKKGSECKKHGILCRILT,BHF_isoform1
