In [1]:
%load_ext autoreload
%autoreload 2

# References

Where to get domain information:
- [UniProtKB column names for programmatic access](https://www.uniprot.org/help/return_fields)

General UniProt API:
- [Uniprot API: Programmatic access - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)
- [Programmatic access - Retrieving individual entries](https://www.uniprot.org/help/api_retrieve_entries)
- **[REST API - Access the UniProt website programmatically](https://www.uniprot.org/help/api)**
- **[REST API - Retrieve entries](https://www.uniprot.org/help/api_retrieve_entries)**
- **[REST API - ID Mapping](https://www.uniprot.org/help/id_mapping)**
- **[REST API - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)**
- **[REST API - Downloading](https://www.uniprot.org/help/api_downloading)**

# Read in UniProt API request data 

In [2]:
import json

import requests

# UniProtKB search endpoint used for every query in this notebook
api_url = "https://rest.uniprot.org/uniprotkb/search"

# Accession of the single entry used as a worked example (human SNAP25)
snap25_accession = "P60880"

# Query parameters: the search term plus the return fields to include
data = dict(
    query=snap25_accession,
    fields=[
        # Entry identity
        "id",
        "gene_names",
        "organism_name",
        # Family / domain annotations
        "ft_coiled",
        "ft_compbias",
        "cc_domain",
        "ft_domain",
        "ft_motif",
        "protein_families",
        "ft_region",
        "ft_repeat",
        "ft_zn_fing",
        # Sequence and sequence-level annotations
        "sequence",
        "cc_sequence_caution",
        "ft_conflict",
        "ft_unsure",
        "sequence_version",
        "ft_var_seq",
    ],
)


# Thanks ChatGPT :)
def fetch_data(url, params=None, timeout=30):
    """
    Fetch the ``"results"`` list from a REST API endpoint that returns JSON.

    Failures are reported with ``print`` and signalled by a ``None`` return
    value; no exception is propagated to the caller.

    :param url: URL of the REST API endpoint.
    :param params: Dictionary of query parameters, defaults to None.
    :param timeout: Seconds to wait for the server, passed to
        ``requests.get``; defaults to 30. Pass None to wait indefinitely.
    :return: Value stored under ``"results"`` in the parsed JSON response,
        or None if the request or the parsing failed.
    """
    try:
        # Without a timeout a stalled connection would block the cell forever
        # and the Timeout handler below could never fire.
        response = requests.get(url, params=params, timeout=timeout)
        # Turn 4xx/5xx status codes into HTTPError
        response.raise_for_status()

        # Parse the JSON response
        return response.json()["results"]
    except json.JSONDecodeError as json_err:
        # Checked before RequestException: recent versions of requests raise
        # a JSON error that subclasses both, which would otherwise be
        # reported with the generic message.
        print(f"Error decoding JSON: {json_err}")
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Error connecting to the server: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error: {timeout_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred: {req_err}")
    except Exception as e:
        # Includes a missing "results" key in an otherwise valid response
        print(f"An unexpected error occurred: {e}")
    return None


# fetch_data returns None on failure and may return an empty list when the
# query matches nothing; fail with a clear message instead of an opaque
# TypeError/IndexError from indexing the result.
results = fetch_data(api_url, data)
if not results:
    raise RuntimeError(f"No UniProt results returned for query {data['query']!r}")

# The accession query is expected to match a single entry; keep the first hit
response = results[0]
response

{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'primaryAccession': 'P60880',
 'uniProtkbId': 'SNP25_HUMAN',
 'entryAudit': {'sequenceVersion': 1},
 'organism': {'scientificName': 'Homo sapiens',
  'commonName': 'Human',
  'taxonId': 9606,
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'genes': [{'geneName': {'value': 'SNAP25'}, 'synonyms': [{'value': 'SNAP'}]}],
 'comments': [{'texts': [{'evidences': [{'evidenceCode': 'ECO:0000305'}],
     'value': 'Belongs to the SNAP-25 family'}],
   'commentType': 'SIMILARITY'}],
 'features': [{'type': 'Domain',
   'location': {'start': {'value': 19, 'modifier': 'EXACT'},
    'end': {'value': 81, 'modifier': 'EXACT'}},
   'description': 't-SNARE coiled-coil homology 1',
   'evidences': [{'evidenceCode': 'ECO:0000255',
     'source': 'PROSITE-ProRule',
     

## Get the sequences (and k-mers) for each domain

In [3]:
full_sequence = response["sequence"]["value"]

sequence_features = []

for feature in response["features"]:
    # Feature coordinates are 1-based and inclusive (e.g. 19..81 has length
    # 63), so shift only the start to get a 0-based, end-exclusive slice.
    start = feature["location"]["start"]["value"] - 1
    end = feature["location"]["end"]["value"]
    feature_seq = full_sequence[start:end]
    length = len(feature_seq)
    # Build a new dict instead of adding a key to the feature stored inside
    # `response`, so the API response displayed earlier is left untouched.
    sequence_features.append(
        {**feature, "sequence": {"value": feature_seq, "length": length}}
    )
sequence_features

[{'type': 'Domain',
  'location': {'start': {'value': 19, 'modifier': 'EXACT'},
   'end': {'value': 81, 'modifier': 'EXACT'}},
  'description': 't-SNARE coiled-coil homology 1',
  'evidences': [{'evidenceCode': 'ECO:0000255',
    'source': 'PROSITE-ProRule',
    'id': 'PRU00202'}],
  'sequence': {'value': 'DQLADESLESTRRMLQLVEESKDAGIRTLVMLDEQGEQLERIEEGMDQINKDMKEAEKNLTDL',
   'length': 63}},
 {'type': 'Domain',
  'location': {'start': {'value': 140, 'modifier': 'EXACT'},
   'end': {'value': 202, 'modifier': 'EXACT'}},
  'description': 't-SNARE coiled-coil homology 2',
  'evidences': [{'evidenceCode': 'ECO:0000255',
    'source': 'PROSITE-ProRule',
    'id': 'PRU00202'}],
  'sequence': {'value': 'DARENEMDENLEQVSGIIGNLRHMALDMGNEIDTQNRQIDRIMEKADSNKTRIDEANQRATKM',
   'length': 63}},
 {'type': 'Region',
  'location': {'start': {'value': 1, 'modifier': 'EXACT'},
   'end': {'value': 75, 'modifier': 'EXACT'}},
  'description': 'Interaction with CENPF',
  'evidences': [{'evidenceCode': 'ECO:00002

## Make a sourmash signature and get underlying kmers

In [4]:
import sig2kmer

# from sig2kmer import get_kmers_for_hashvals

In [5]:
import sourmash

In [8]:
def create_signature_from_sequence(
    sequence: str,
    protein_ksizes=(27,),
    seed=sourmash.DEFAULT_SEED,
    protein=False,
    dayhoff=False,
    hp=True,
    dna=False,
    num_hashes=0,
    track_abundance=True,
    scaled=1,
):
    """
    Build a sourmash signature from one amino-acid sequence.

    All arguments other than ``sequence`` and ``protein_ksizes`` are passed
    unchanged to ``sourmash.command_compute.ComputeParameters``.

    Parameters
    ----------
    sequence : str
        Amino-acid sequence, added to the signature with ``add_protein``.
    protein_ksizes : iterable of ints
        This is the length of the k-mer in the amino acid sequence
    seed, protein, dayhoff, hp, dna, num_hashes, track_abundance, scaled
        Forwarded to ``ComputeParameters``. With the defaults used here the
        resulting sketch reports moltype ``'hp'``.

    Returns
    -------
    sourmash.signature.SourmashSignature
        Signature created from the parameters, with ``sequence`` added.
    """

    # Multiply by three to get DNA-based ksizes
    # This is necessary because all the ksizes in sourmash are relative to DNA
    dna_ksizes = [k * 3 for k in protein_ksizes]

    params = sourmash.command_compute.ComputeParameters(
        ksizes=dna_ksizes,
        seed=seed,
        protein=protein,
        dayhoff=dayhoff,
        hp=hp,
        dna=dna,
        num_hashes=num_hashes,
        track_abundance=track_abundance,
        scaled=scaled,
    )

    sig = sourmash.signature.SourmashSignature.from_params(params)

    sig.add_protein(sequence)
    return sig


# Full amino-acid sequence of the entry held in `response`
sequence = response["sequence"]["value"]

# Sketch it with the function's defaults (hp=True, protein_ksizes of 27)
sig = create_signature_from_sequence(sequence)
# NOTE: only the last expression of a cell is displayed, so this bare name
# produces no output here.
sig

# Cell output: the molecule type reported by the sketch
sig.minhash.moltype

'hp'

In [9]:
# k-mer size reported by the sketch; matches the protein k-mer length (27),
# not the 3x DNA-based value passed to ComputeParameters.
sig.minhash.ksize

27

In [10]:
# Number of residues in the sequence that was sketched
len(sequence)

206

In [11]:
# Number of hashes stored in the sketch. 180 == 206 - 27 + 1, i.e. one hash
# per 27-residue window of the 206-residue sequence.
len(sig.minhash.hashes)

180

In [12]:
# sig.minhash.hashes

In [13]:
# Membership check for one specific hash value; it is not in this signature.
# NOTE(review): where this hash value came from is not shown in the notebook.
18435018991608296529 in sig.minhash.hashes.keys()

False

In [14]:
# Hash an hp-encoded 27-mer passed as a str
sig2kmer.hash_murmur("hhpphphppphpphppphpphhppphp")

7372733179435548602

In [15]:
# Same 27-mer passed as bytes; the result equals the str version's hash
sig2kmer.hash_murmur(b"hhpphphppphpphppphpphhppphp")

7372733179435548602

In [16]:
# Sanity check: the hash computed by sig2kmer for this encoded k-mer is one
# of the hashes stored in the sourmash signature.
sig2kmer.hash_murmur(b"hhpphphppphpphppphpphhppphp") in sig.minhash.hashes.keys()

True

In [17]:
import csv
import gzip

Taxonomy/species data information

In [18]:
def write_hashval_kmers(
    sequence: str,
    sig: "sourmash.SourmashSignature",
    ksize: int,
    moltype: str,
    filename: str,
    entry: dict,
):
    """
    Write one gzip-compressed CSV row per k-mer returned for the signature's
    hashes, annotated with metadata of the UniProt entry.

    Parameters
    ----------
    sequence : str
        Sequence passed to ``sig2kmer.get_kmers_for_hashvals`` (called with
        ``input_is_protein=True``).
    sig : sourmash.SourmashSignature
        Signature whose ``minhash.hashes`` keys are the hash values to look up.
    ksize : int
        k-mer size forwarded to ``sig2kmer.get_kmers_for_hashvals``.
    moltype : str
        Molecule type forwarded to ``sig2kmer.get_kmers_for_hashvals``.
    filename : str
        Path of the gzip-compressed CSV file to create (overwritten if present).
    entry : dict
        One result from the Uniprot REST API response. ``primaryAccession``,
        ``uniProtkbId`` and ``organism`` are required; ``genes``, a gene's
        ``geneName`` and the organism's ``commonName`` are optional and
        written as empty values when absent.
    """
    columns = [
        "hashval",
        "kmer_in_sequence",
        "kmer_in_alphabet",
        "start",
        "accession",
        "uniprotkb_id",
        "organism_scientific_name",
        "organism_common_name",
        "organism_taxonid",
        "gene_name",
    ]

    # The entry-level columns are identical for every row, so build them once.
    organism = entry["organism"]
    # Not every entry has a "genes" list, and a gene record may lack a primary
    # "geneName"; skip those instead of raising KeyError.
    gene_names = ";".join(
        gene["geneName"]["value"]
        for gene in entry.get("genes", [])
        if "geneName" in gene
    )
    entry_columns = [
        entry["primaryAccession"],
        entry["uniProtkbId"],
        organism["scientificName"],
        organism.get("commonName", ""),
        organism["taxonId"],
        gene_names,
    ]

    # newline="" is what the csv module requires of the file object so that it
    # controls line endings itself.
    with gzip.open(filename, "wt", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(columns)

        for kmer_in_seq, kmer_encoded, hashval, i in sig2kmer.get_kmers_for_hashvals(
            sequence, sig.minhash.hashes.keys(), ksize, moltype, input_is_protein=True
        ):
            # `i` fills the "start" column; presumably the k-mer's offset in
            # the sequence (confirm against sig2kmer).
            writer.writerow(
                [str(hashval), kmer_in_seq, kmer_encoded, i, *entry_columns]
            )


# Take ksize and moltype from the signature itself so the k-mer lookup cannot
# drift from the parameters the sketch was built with (27 and "hp" here).
write_hashval_kmers(
    sequence,
    sig,
    ksize=sig.minhash.ksize,
    moltype=sig.minhash.moltype,
    filename="snap25.csv.gz",
    entry=response,
)

In [19]:
%timeit sig2kmer.degenerate_protein_chatgpt("LIVE", 'hp')

3.05 µs ± 19.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [20]:
%timeit sig2kmer.degenerate_protein("LIVE", 'hp')

3.4 µs ± 228 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [22]:
# Spot-check sourmash's amino-acid -> hp mapping through its private low-level
# binding: alanine (b"A") maps to b"h".
sourmash._lowlevel.lib.sourmash_aa_to_hp(b"A")

b'h'

# Query for human reviewed sequences

In [23]:
# Search term: organism id 9606 restricted to reviewed entries
human_reviewed_query = "(organism_id:9606) AND (reviewed:true)"

# Same return fields as the single-accession query, with the new search term.
# NOTE: this rebinds the module-level name `data`.
data = dict(
    query=human_reviewed_query,
    fields=[
        # Entry identity
        "id",
        "gene_names",
        "organism_name",
        # Family / domain annotations
        "ft_coiled",
        "ft_compbias",
        "cc_domain",
        "ft_domain",
        "ft_motif",
        "protein_families",
        "ft_region",
        "ft_repeat",
        "ft_zn_fing",
        # Sequence and sequence-level annotations
        "sequence",
        "cc_sequence_caution",
        "ft_conflict",
        "ft_unsure",
        "sequence_version",
        "ft_var_seq",
    ],
)

In [24]:
sourmash.command_compute.save_siglist?

[0;31mSignature:[0m [0msourmash[0m[0;34m.[0m[0mcommand_compute[0m[0;34m.[0m[0msave_siglist[0m[0;34m([0m[0msiglist[0m[0;34m,[0m [0msigfile_name[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Save multiple signatures to a filename.
[0;31mFile:[0m      ~/anaconda3/envs/sourmash-v4.8.6/lib/python3.12/site-packages/sourmash/command_compute.py
[0;31mType:[0m      function

In [25]:
sourmash.command_compute.save_sigs_to_location?

[0;31mSignature:[0m [0msourmash[0m[0;34m.[0m[0mcommand_compute[0m[0;34m.[0m[0msave_sigs_to_location[0m[0;34m([0m[0msiglist[0m[0;34m,[0m [0msave_sig[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Save multiple signatures to an already-open location.
[0;31mFile:[0m      ~/anaconda3/envs/sourmash-v4.8.6/lib/python3.12/site-packages/sourmash/command_compute.py
[0;31mType:[0m      function

In [26]:
sourmash.command_sketch?

[0;31mType:[0m        module
[0;31mString form:[0m <module 'sourmash.command_sketch' from '/Users/olgabot/anaconda3/envs/sourmash-v4.8.6/lib/python3.12/site-packages/sourmash/command_sketch.py'>
[0;31mFile:[0m        ~/anaconda3/envs/sourmash-v4.8.6/lib/python3.12/site-packages/sourmash/command_sketch.py
[0;31mDocstring:[0m   Functions implementing the 'sketch' subcommands and related functions.

In [27]:
# Save the single signature in `sig` to snap25.sig.gz in the working directory
sourmash.command_compute.save_siglist([sig], "snap25.sig.gz")

[Ksaved 1 signature(s) to 'snap25.sig.gz'


In [28]:
import os

# Directory for the per-entry outputs. The default is the original location;
# set UNIPROT_QUERY_OUTDIR to run the notebook on another machine.
outdir = os.environ.get(
    "UNIPROT_QUERY_OUTDIR",
    "/Users/olgabot/protein-job-search/2024-03-15--uniprot-query",
)
# Pure-Python equivalent of `mkdir -p`: portable, and safe for paths that
# contain spaces or shell metacharacters.
os.makedirs(outdir, exist_ok=True)

In [31]:
import os

from sourmash.logging import set_quiet

In [33]:
# Presumably silences sourmash's status messages (such as the "saved 1
# signature(s)" line printed by save_siglist) before the batch loop; confirm
# against sourmash.logging.
set_quiet(True)

In [34]:
# NOTE: `response` is rebound here from a single entry to the list of entries.
response = fetch_data(api_url, data)
if response is None:
    # fetch_data has already printed the reason for the failure
    raise RuntimeError(f"UniProt query failed for {data['query']!r}")

# NOTE(review): fetch_data performs a single request, so if the search
# endpoint paginates its results this loop only sees the first page. Confirm
# against the UniProt API documentation.
for entry in response:
    sequence = entry["sequence"]["value"]

    sig = create_signature_from_sequence(sequence)
    accession = entry["primaryAccession"]
    uniprotkbid = entry["uniProtkbId"]
    prefix = f"{accession}__{uniprotkbid}"

    # Per-entry k-mer table; ksize and moltype are read from the signature so
    # they always match the parameters it was sketched with.
    filename = os.path.join(outdir, f"{prefix}.csv.gz")
    write_hashval_kmers(
        sequence,
        sig,
        ksize=sig.minhash.ksize,
        moltype=sig.minhash.moltype,
        filename=filename,
        entry=entry,
    )
    # Per-entry signature file next to the k-mer table
    sourmash.command_compute.save_siglist(
        [sig], os.path.join(outdir, f"{prefix}.sig.gz")
    )