In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import gzip
import json

import pandas as pd
from tqdm import tqdm

In [3]:
with gzip.open(
    "/Users/olgabot/botryllus-data/2024-04-16__uniprotkb_ZNF292_OR_RNMT_OR_BMP3_OR_CFA__botryllus_bhf_matches.json.gz"
) as f:
    uniprot_data = json.load(f)

In [4]:
entry = uniprot_data["results"][0]

In [5]:
entry.keys()

dict_keys(['entryType', 'primaryAccession', 'secondaryAccessions', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'comments', 'features', 'keywords', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'])

In [6]:
# def add_subsequence_to_feature(feature, full_sequence):
#     start = feature["location"]["start"]["value"] - 1
#     end = feature["location"]["end"]["value"]
#     feature_seq = full_sequence[start:end]
#     length = len(feature_seq)
#     feature["sequence"] = {"value": feature_seq, "length": length}
#     return feature


# def parse_features(entry):
#     full_sequence = entry["sequence"]["value"]

#     sequence_features = []

#     for feature in entry["features"]:
#         feature = add_subsequence_to_feature(feature, full_sequence)
#         sequence_features.append(feature)
#     return sequence_features


# parse_features(entry)

In [7]:
def get_uniprot_gene_names(entry):
    if "genes" in entry:
        genes = ";".join(x["geneName"]["value"] for x in entry["genes"])
    else:
        genes = ""
    return genes

## Make Sourmash signatures

In [8]:
import csv
import os

import sig2kmer

import sourmash
from sourmash.logging import set_quiet

set_quiet(True)


def create_signature_from_sequence(
    sequence: str,
    protein_ksizes=[27],
    seed=sourmash.DEFAULT_SEED,
    protein=False,
    dayhoff=False,
    hp=True,
    dna=False,
    num_hashes=0,
    track_abundance=True,
    scaled=1,
):
    """
    protein_ksizes : list of ints
        This is the length of the k-mer in the amino acid sequence
    """

    # Multiply by three to get DNA-based ksizes
    # This is necessary because all the ksizes in sourmash are relative to DNA
    dna_ksizes = [k * 3 for k in protein_ksizes]

    params = sourmash.command_compute.ComputeParameters(
        ksizes=dna_ksizes,
        seed=seed,
        protein=protein,
        dayhoff=dayhoff,
        hp=hp,
        dna=dna,
        num_hashes=num_hashes,
        track_abundance=track_abundance,
        scaled=scaled,
    )

    sig = sourmash.signature.SourmashSignature.from_params(params)

    sig.add_protein(sequence)
    return sig


def write_hashval_kmers(
    sequence: str,
    sig: sourmash.SourmashSignature,
    ksize: int,
    moltype: str,
    filename: str,
    entry: dict,
):
    """
    entry : dict
        One result from the Uniprot REST API response
    """
    columns = [
        "hashval",
        "kmer_in_sequence",
        "kmer_in_alphabet",
        "start",
        "accession",
        "uniprotkb_id",
        "organism_scientific_name",
        "organism_common_name",
        "organism_taxonid",
        "gene_name",
    ]

    # response

    with gzip.open(filename, "wt") as f:
        writer = csv.writer(f)
        writer.writerow(columns)
        genes = get_uniprot_gene_names(entry)

        for kmer_in_seq, kmer_encoded, hashval, i in sig2kmer.get_kmers_for_hashvals(
            sequence, sig.minhash.hashes.keys(), ksize, moltype, input_is_protein=True
        ):
            writer.writerow(
                [
                    str(hashval),
                    kmer_in_seq,
                    kmer_encoded,
                    i,
                    entry["primaryAccession"],
                    entry["uniProtkbId"],
                    entry["organism"]["scientificName"],
                    entry["organism"]["commonName"],
                    entry["organism"]["taxonId"],
                    genes,
                ]
            )

## Iterate over entries and create kmer signatures

In [9]:
outdir = "/Users/olgabot/protein-job-search/2024-04-16--botryllus-known-matches"
!mkdir -p $outdir

ksize = 24
moltype = "hp"

for entry in uniprot_data["results"]:
    sequence = entry["sequence"]["value"]

    sig = create_signature_from_sequence(sequence, protein_ksizes=[ksize], scaled=1)
    accession = entry["primaryAccession"]
    uniprotkbid = entry["uniProtkbId"]
    prefix = f"{accession}__{uniprotkbid}"

    filename = os.path.join(outdir, f"{prefix}.csv.gz")
    write_hashval_kmers(
        sequence, sig, ksize=ksize, moltype=moltype, filename=filename, entry=entry
    )
    sourmash.command_compute.save_siglist(
        [sig], os.path.join(outdir, f"{prefix}.sig.gz")
    )

In [10]:
# ! wc -l $outdir/*.csv.gz

## Import botryllus sequence

In [11]:
botryllus_dir = "/Users/olgabot/botryllus/data/botryllus-proteins/"

botryllus_sigfile = os.path.join(botryllus_dir, "Bs_proteins.fa.hp.k24.scale5.sig")

### Read botryllus kmer hashes

In [12]:
def read_hash_csv(hash_csv):
    df = pd.read_csv(hash_csv)

    # Force hashval to be strings to avoid overflow/underflow errors
    df["hashval"] = df["hashval"].astype(str)
    # df = df.set_index(["hashval"])
    return df

In [29]:
hash_csv = os.path.join(
    botryllus_dir, "Bs_proteins.fa.hp.k24.scale5.aggregated.kmers.csv"
)
botryllus_kmer_hashes = read_hash_csv(hash_csv)
# botryllus_kmer_hashes.index = botryllus_kmer_hashes.index.astype(str)
botryllus_kmer_hashes

Unnamed: 0,kmer_in_sequence,kmer_in_alphabet,hashval,read_name,filename
0,VHDTEQLLAQGHHEEETECGKYGK,hppppphhhphpppppppphphhp,1036020595944595459,BHF,Bs_proteins.fa.gz
1,TEQLLAQGHHEEETECGKYGKLPE,ppphhhphpppppppphphhphhp,2993707203445337902,BHF,Bs_proteins.fa.gz
2,EQLLAQGHHEEETECGKYGKLPEK,pphhhphpppppppphphhphhpp,973051056292048589,BHF,Bs_proteins.fa.gz
3,ETECGKYGKLPEKGSECKKHGIFC,pppphphhphhpphpppppphhhp,385192726330471768,BHF,Bs_proteins.fa.gz
4,KYGKLPEKGSECKKHGIFCRILTA,phhphhpphpppppphhhpphhph,1194029314525711201,BHF,Bs_proteins.fa.gz
...,...,...,...,...,...
303186,LEREIEAQQLVMLEAEIDCLQRRF,hppphphpphhhhphphpphppph,1833930335722081195,g72096.t1 frame:1,Bs_proteins.fa.gz
303187,EREIEAQQLVMLEAEIDCLQRRFE,ppphphpphhhhphphpphppphp,616964295969867237,g72096.t1 frame:1,Bs_proteins.fa.gz
303188,MLEAEIDCLQRRFEDMESTWISRI,hhphphpphppphpphppphhpph,1869927996039995699,g72096.t1 frame:1,Bs_proteins.fa.gz
303189,EIDCLQRRFEDMESTWISRIDGTS,phpphppphpphppphhpphphpp,995001100309400823,g72096.t1 frame:1,Bs_proteins.fa.gz


In [30]:
bhf_hashes = botryllus_kmer_hashes.query('read_name == "BHF"')
bhf_hashes.head()

Unnamed: 0,kmer_in_sequence,kmer_in_alphabet,hashval,read_name,filename
0,VHDTEQLLAQGHHEEETECGKYGK,hppppphhhphpppppppphphhp,1036020595944595459,BHF,Bs_proteins.fa.gz
1,TEQLLAQGHHEEETECGKYGKLPE,ppphhhphpppppppphphhphhp,2993707203445337902,BHF,Bs_proteins.fa.gz
2,EQLLAQGHHEEETECGKYGKLPEK,pphhhphpppppppphphhphhpp,973051056292048589,BHF,Bs_proteins.fa.gz
3,ETECGKYGKLPEKGSECKKHGIFC,pppphphhphhpphpppppphhhp,385192726330471768,BHF,Bs_proteins.fa.gz
4,KYGKLPEKGSECKKHGIFCRILTA,phhphhpphpppppphhhpphhph,1194029314525711201,BHF,Bs_proteins.fa.gz


In [31]:
entry_to_gene_name = pd.Series(
    {x["primaryAccession"]: get_uniprot_gene_names(x) for x in uniprot_data["results"]}
)
entry_to_gene_name

O00327      BMAL1
O00422      SAP18
O14745     NHERF1
O15393    TMPRSS2
O43148       RNMT
           ...   
Q5TA04     CFAP43
Q5TA05     CFAP43
Q6P495     ZNF292
X5D9G8    CACNA1G
X5D9M1    CACNA1G
Length: 191, dtype: object

In [32]:
znf292_entries = entry_to_gene_name[entry_to_gene_name == "ZNF292"]
znf292_entries

O60281        ZNF292
J3KNV1        ZNF292
A0A8V8TP18    ZNF292
A0A8V8TPC9    ZNF292
A0A8V8TPI9    ZNF292
A0A8V8TQW3    ZNF292
E5RJG2        ZNF292
H0YAU0        ZNF292
H0YB79        ZNF292
Q6P495        ZNF292
dtype: object

In [33]:
znf292_csv = "/Users/olgabot/protein-job-search/2024-04-16--botryllus-known-matches/O60281__ZN292_HUMAN.csv.gz"

In [34]:
znf292_hashes = read_hash_csv(znf292_csv)
znf292_hashes

Unnamed: 0,hashval,kmer_in_sequence,kmer_in_alphabet,start,accession,uniprotkb_id,organism_scientific_name,organism_common_name,organism_taxonid,gene_name
0,13638207749623269045,hhppphpppphpphphhphhphpp,MADEEAEQERLSCGEGGCVAELQR,0,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
1,8404708278310155067,hppphpppphpphphhphhphpph,ADEEAEQERLSCGEGGCVAELQRL,1,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
2,8048618853620713690,ppphpppphpphphhphhphpphh,DEEAEQERLSCGEGGCVAELQRLG,2,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
3,17517879030661681686,pphpppphpphphhphhphpphhp,EEAEQERLSCGEGGCVAELQRLGE,3,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
4,17298076713827473283,phpppphpphphhphhphpphhpp,EAEQERLSCGEGGCVAELQRLGER,4,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
...,...,...,...,...,...,...,...,...,...,...
2695,5603276598621902447,phphpppphphphhpphphhphph,KLEVHSNDPDMSVMKDISIGKATG,2695,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
2696,2918542526153815211,hphpppphphphhpphphhphphp,LEVHSNDPDMSVMKDISIGKATGR,2696,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
2697,17814690177903370705,phpppphphphhpphphhphphph,EVHSNDPDMSVMKDISIGKATGRG,2697,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
2698,14562833835520679758,hpppphphphhpphphhphphphp,VHSNDPDMSVMKDISIGKATGRGQ,2698,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292


In [35]:
znf292_hashes_in_bhf = znf292_hashes.loc[znf292_hashes.hashval.isin(bhf_hashes.hashval)]
znf292_hashes_in_bhf

Unnamed: 0,hashval,kmer_in_sequence,kmer_in_alphabet,start,accession,uniprotkb_id,organism_scientific_name,organism_common_name,organism_taxonid,gene_name
606,1452226584929511062,pphhphhphppppppppppphhpp,RRLGRPPKITTTNENQKTNTVAKQ,606,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292
609,791636867468234754,hphhphppppppppppphhppppp,GRPPKITTTNENQKTNTVAKQEQR,609,O60281,ZN292_HUMAN,Homo sapiens,Human,9606,ZNF292


In [36]:
entry.keys()

dict_keys(['entryType', 'primaryAccession', 'uniProtkbId', 'entryAudit', 'annotationScore', 'organism', 'proteinExistence', 'proteinDescription', 'genes', 'features', 'references', 'uniProtKBCrossReferences', 'sequence', 'extraAttributes'])

In [None]:
entry["primaryAccession"]

In [None]:
entry["uniProtkbId"]