In [2]:
# Re-import modules before each cell executes so edits to local .py files
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# References

Where to get domain information:
- [UniProtKB column names for programmatic access](https://www.uniprot.org/help/return_fields)

General UniProt API:
- [UniProt API: Programmatic access - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)
- [Programmatic access - Retrieving individual entries](https://www.uniprot.org/help/api_retrieve_entries)
- **[REST API - Access the UniProt website programmatically](https://www.uniprot.org/help/api)**
- **[REST API - Retrieve entries](https://www.uniprot.org/help/api_retrieve_entries)**
- **[REST API - ID Mapping](https://www.uniprot.org/help/id_mapping)**
- **[REST API - Retrieving entries via queries](https://www.uniprot.org/help/api_queries)**
- **[REST API - Downloading](https://www.uniprot.org/help/api_downloading)**

# Read in UniProt API request data 

In [3]:
import json

import requests

# UniProtKB search endpoint.
api_url = "https://rest.uniprot.org/uniprotkb/search"

snap25_accession = "P60880"

# Return fields to request for the entry, in the order they are sent.
uniprot_fields = [
    # Identity
    "id",
    "gene_names",
    "organism_name",
    # Family / domain annotations
    "ft_coiled",
    "ft_compbias",
    "cc_domain",
    "ft_domain",
    "ft_motif",
    "protein_families",
    "ft_region",
    "ft_repeat",
    "ft_zn_fing",
    # Sequence and sequence-level annotations
    "sequence",
    "cc_sequence_caution",
    "ft_conflict",
    "ft_unsure",
    "sequence_version",
    "ft_var_seq",
]

data = {
    # Restrict the search to the accession field: a bare "P60880" is a
    # free-text query that can also match entries which merely mention this
    # accession, so taking the first hit would not be guaranteed to be SNAP25.
    "query": f"accession:{snap25_accession}",
    # UniProt documents `fields` as a single comma-separated value; a Python
    # list would be sent by requests as repeated `fields=` parameters instead.
    "fields": ",".join(uniprot_fields),
}

# Thanks ChatGPT :)
def fetch_data(url, params=None, timeout=30):
    """
    Fetch the ``results`` entry of a JSON response from a REST API endpoint.

    Failures are reported with ``print`` and signalled by returning ``None``
    instead of raising, so callers must check the result before indexing it.

    :param url: URL of the REST API endpoint.
    :param params: Dictionary of query parameters, defaults to None.
    :param timeout: Seconds to wait for the server before giving up,
        defaults to 30. Without it ``requests`` waits indefinitely.
    :return: Value stored under ``'results'`` in the parsed JSON response,
        or None if the request, the decoding or the lookup failed.
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
        # Turn 4xx/5xx status codes into HTTPError
        response.raise_for_status()
        return response.json()['results']
    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except requests.exceptions.Timeout as timeout_err:
        # Checked before ConnectionError: a connect timeout derives from both
        # classes and would otherwise be reported as a connection failure.
        print(f'Timeout error: {timeout_err}')
    except requests.exceptions.ConnectionError as conn_err:
        print(f'Error connecting to the server: {conn_err}')
    except json.JSONDecodeError as json_err:
        # Checked before RequestException: the decode error raised by recent
        # requests versions derives from RequestException as well, which made
        # this handler unreachable when it was listed afterwards.
        print(f'Error decoding JSON: {json_err}')
    except requests.exceptions.RequestException as req_err:
        print(f'An error occurred: {req_err}')
    except KeyError as key_err:
        # Valid JSON, but not the expected {"results": [...]} envelope
        print(f'Response JSON has no {key_err} key')
    except Exception as e:
        print(f'An unexpected error occurred: {e}')
    return None

# fetch_data returns None on any failure and a search can match nothing, so
# check before indexing instead of failing with a TypeError/IndexError on [0].
search_results = fetch_data(api_url, data)
if not search_results:
    raise RuntimeError(
        f"UniProt search returned no entries for query {data['query']!r}"
    )

# `response` is the first matching entry as a parsed-JSON dict.
response = search_results[0]
response

{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'primaryAccession': 'P60880',
 'uniProtkbId': 'SNP25_HUMAN',
 'entryAudit': {'sequenceVersion': 1},
 'organism': {'scientificName': 'Homo sapiens',
  'commonName': 'Human',
  'taxonId': 9606,
  'lineage': ['Eukaryota',
   'Metazoa',
   'Chordata',
   'Craniata',
   'Vertebrata',
   'Euteleostomi',
   'Mammalia',
   'Eutheria',
   'Euarchontoglires',
   'Primates',
   'Haplorrhini',
   'Catarrhini',
   'Hominidae',
   'Homo']},
 'genes': [{'geneName': {'value': 'SNAP25'}, 'synonyms': [{'value': 'SNAP'}]}],
 'comments': [{'texts': [{'evidences': [{'evidenceCode': 'ECO:0000305'}],
     'value': 'Belongs to the SNAP-25 family'}],
   'commentType': 'SIMILARITY'}],
 'features': [{'type': 'Domain',
   'location': {'start': {'value': 19, 'modifier': 'EXACT'},
    'end': {'value': 81, 'modifier': 'EXACT'}},
   'description': 't-SNARE coiled-coil homology 1',
   'evidences': [{'evidenceCode': 'ECO:0000255',
     'source': 'PROSITE-ProRule',
     

## Get the sequences (and k-mers) for each domain

In [4]:
# Full amino-acid sequence of the entry fetched above.
full_sequence = response['sequence']['value']

# Attach to every feature the stretch of sequence it covers. The feature dicts
# are annotated in place, so `sequence_features` holds the same objects as
# `response['features']`.
sequence_features = []
for feature in response['features']:
    location = feature['location']
    # Feature coordinates are treated as 1-based and end-inclusive:
    # shift the start down by one and use the end as the exclusive bound.
    start, end = location['start']['value'] - 1, location['end']['value']
    feature_seq = full_sequence[start:end]
    length = len(feature_seq)
    feature['sequence'] = dict(value=feature_seq, length=length)
    sequence_features.append(feature)

sequence_features

[{'type': 'Domain',
  'location': {'start': {'value': 19, 'modifier': 'EXACT'},
   'end': {'value': 81, 'modifier': 'EXACT'}},
  'description': 't-SNARE coiled-coil homology 1',
  'evidences': [{'evidenceCode': 'ECO:0000255',
    'source': 'PROSITE-ProRule',
    'id': 'PRU00202'}],
  'sequence': {'value': 'DQLADESLESTRRMLQLVEESKDAGIRTLVMLDEQGEQLERIEEGMDQINKDMKEAEKNLTDL',
   'length': 63}},
 {'type': 'Domain',
  'location': {'start': {'value': 140, 'modifier': 'EXACT'},
   'end': {'value': 202, 'modifier': 'EXACT'}},
  'description': 't-SNARE coiled-coil homology 2',
  'evidences': [{'evidenceCode': 'ECO:0000255',
    'source': 'PROSITE-ProRule',
    'id': 'PRU00202'}],
  'sequence': {'value': 'DARENEMDENLEQVSGIIGNLRHMALDMGNEIDTQNRQIDRIMEKADSNKTRIDEANQRATKM',
   'length': 63}},
 {'type': 'Region',
  'location': {'start': {'value': 1, 'modifier': 'EXACT'},
   'end': {'value': 75, 'modifier': 'EXACT'}},
  'description': 'Interaction with CENPF',
  'evidences': [{'evidenceCode': 'ECO:00002

## Make a sourmash signature and get underlying k-mers

In [28]:
# Bind the module name too: the timing cells below call functions through
# `sig2kmer.<name>`, which a `from ... import` alone does not make available.
import sig2kmer
from sig2kmer import get_kmers_for_hashvals

In [35]:
import sourmash

[0;31mType:[0m        module
[0;31mString form:[0m <module 'sourmash.command_sketch' from '/Users/olgabot/anaconda3/envs/sourmash-v4.8.6/lib/python3.12/site-packages/sourmash/command_sketch.py'>
[0;31mFile:[0m        ~/anaconda3/envs/sourmash-v4.8.6/lib/python3.12/site-packages/sourmash/command_sketch.py
[0;31mDocstring:[0m   Functions implementing the 'sketch' subcommands and related functions.

42

In [41]:
# Sketching parameters: protein input with the `hp` encoding switched on,
# three k-mer sizes, and the library's default hash seed.
params = sourmash.command_compute.ComputeParameters(
    ksizes=[24, 27, 30],
    seed=sourmash.DEFAULT_SEED,
    # Molecule types / encodings
    dna=False,
    protein=True,
    dayhoff=False,
    hp=True,
    # Sketch size (presumably scaled=1 with num_hashes=0 keeps every hash — confirm)
    num_hashes=0,
    scaled=1,
    track_abundance=False,
)

# Construct a signature from those parameters and display it.
sourmash.signature.SourmashSignature.from_params(params)

SourmashSignature('', 1ff1de77)

In [32]:
# Benchmark the `degenerate_protein_chatgpt` variant on the 4-letter input "LIVE" with the 'hp' argument.
%timeit sig2kmer.degenerate_protein_chatgpt("LIVE", 'hp')

1.95 µs ± 148 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [34]:
# Benchmark `degenerate_protein` on the same "LIVE" / 'hp' input for comparison.
%timeit sig2kmer.degenerate_protein("LIVE", 'hp')

2.12 µs ± 259 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [None]:
# NOTE(review): unexecuted scratch cell; `byte_encoded` is not assigned in any
# cell visible in this part of the notebook — confirm where it is defined.
byte_encoded

In [20]:
# NOTE(review): scratch inspection; `letter` is not assigned in any visible cell.
# Presumably one element of `byte_encoded` (indexing bytes yields an int,
# and 76 == ord('L')) — confirm.
letter

76

In [19]:
# Probe the low-level amino-acid -> hp converter with a single-byte input
# (the recorded output maps b'A' to b'h').
# NOTE(review): `_lowlevel` is underscore-prefixed, so presumably a private
# sourmash API — confirm before depending on it.
sourmash._lowlevel.lib.sourmash_aa_to_hp(b'A')

b'h'

In [18]:
# NOTE(review): scratch inspection; `byte_encoded` (b'LIVE' in the recorded
# output) is not assigned in any visible cell — confirm where it is defined.
byte_encoded

b'LIVE'

TypeError: initializer for ctype 'char' must be a bytes of length 1, not bytes

# Query for human reviewed sequences

In [None]:
# Search query selecting reviewed entries for organism 9606
# (the taxon id reported as Homo sapiens in the SNAP25 record above).
human_reviewed_query = "(organism_id:9606) AND (reviewed:true)"