In [13]:
import time
from urllib.parse import parse_qs, urlsplit

import requests

def query_uniprot(
    query="organism_id:9606 AND reviewed:true",
    fields="id,accession,cc_caution,go",
    size=25,
    pause=1,
    timeout=60,
):
    """Download every UniProtKB entry matching ``query``, following pagination.

    The search endpoint returns one page at a time and advertises the next
    page through a ``Link: <url>; rel="next"`` response header. This function
    keeps requesting pages until no such link is present and concatenates the
    ``results`` arrays of all pages.

    Args:
        query: UniProt query string. Defaults to the query originally
            hard-coded here (``organism_id:9606 AND reviewed:true``).
        fields: Comma-separated list of return fields.
        size: Number of results per page.
        pause: Seconds to sleep between two page requests, to avoid
            overwhelming the API. ``0`` disables the pause.
        timeout: Per-request timeout in seconds passed to ``requests.get``;
            without one a stalled connection would block forever.

    Returns:
        A list with the ``results`` items of every page fetched. If a request
        returns a non-200 status, the error is printed and the entries
        collected so far are returned (best effort), so the list may be
        incomplete.
    """
    base_url = "https://rest.uniprot.org/uniprotkb/search"
    params = {
        "query": query,
        "fields": fields,
        "size": size,
    }

    results = []
    cursor = None

    while True:
        print(f"CURSOR={cursor}")
        if cursor:
            params["cursor"] = cursor

        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Error: {response.status_code}, {response.text}")
            break

        data = response.json()
        results.extend(data.get("results", []))
        print(len(results))

        # `response.links` is the Link header already parsed by requests and
        # keyed by `rel`, so the position of `cursor=` inside the URL (first,
        # middle or last parameter) no longer matters.
        next_url = response.links.get("next", {}).get("url")
        if not next_url:
            break  # No more pages

        cursor_values = parse_qs(urlsplit(next_url).query).get("cursor")
        if not cursor_values:
            # A next link without a cursor cannot be followed through `params`;
            # stop instead of requesting the same page again.
            break
        cursor = cursor_values[0]

        if pause > 0:
            time.sleep(pause)

    return results


In [17]:
import pandas as pd

In [14]:
# Run the download; query_uniprot prints the current cursor and the running
# number of collected entries for each page.
results = query_uniprot()

CURSOR=None
25
CURSOR=1mkycb2xwxbouulovxb0x6345m8cpfhc7y9d
50
CURSOR=1q25awc2b43xtmojtguxhqekqxui2l1ahn3n
75
CURSOR=1sez60yweyzjbrqq21ik8123loefpvfkgpkp
100
CURSOR=bc4hnkgsgmiobbs91qamwmuz99lpjjr8stpbu
125
CURSOR=bc6uhflfaqdjwtxb7yvajd5ms4c9h73lasyuv
150
CURSOR=bcaboe6of4kc5spe2weufxpy8nxu1yfq692y8
175
CURSOR=bccoi9bb98f7raug94zi2o0lrioe4e3gcusfs
200
CURSOR=bkf6tu31j3a6ulbepmojyhb4suuj51vy5jzua
225
CURSOR=bkhjnp7od752g3ggvv97l7lsbpl32p6cneqls
250
CURSOR=bkl0unsxhlbup28jqssrhs63s96nr8zfqbga8
275
CURSOR=bkndoixkbp6qakdlx1df4igrb3x7ow9tz1m44
300
CURSOR=bspw03palk1pduukdj2h0bracg3cmg1cj6l2o
325
CURSOR=bss8tytxfnwkzczmjrn4n21xvatwn7imsqd6d
350
CURSOR=bsvq0xf6k23d8brpep6ojmm9bufh7yurp1e9k
375
CURSOR=bsy2usjte5y8ttwrkxrc6cwwup615m74ew8j4
400
CURSOR=c10l6dbjo0t7x4dq1fge267fw1rphd1ol4wmt
425
CURSOR=c12y08g6i4o3imis7o11owi3ewi9ewc1bwarc
450
CURSOR=c16f771fmiuvrlav2lkllh2evg3tznu2zfzn8
475
CURSOR=c18s1262gmprd3fx8u5987d2eaudxb8f117hv
500
CURSOR=c9bacmxsqhkqgdwvpbub40nlfn0iv9uooqf7d
525
CURSOR=c9d

In [15]:
# Inspect the first record to see the structure of a single search result.
results[0]

{'entryType': 'UniProtKB reviewed (Swiss-Prot)',
 'primaryAccession': 'A0A0C5B5G6',
 'uniProtkbId': 'MOTSC_HUMAN',
 'comments': [{'texts': [{'evidences': [{'evidenceCode': 'ECO:0000305',
       'source': 'PubMed',
       'id': '25738459'}],
     'value': 'This peptide has been shown to be biologically active but is the product of a mitochondrial gene. Usage of the mitochondrial genetic code yields tandem start and stop codons so translation must occur in the cytoplasm. The mechanisms allowing the production and secretion of the peptide remain unclear'}],
   'commentType': 'CAUTION'}],
 'uniProtKBCrossReferences': [{'database': 'GO',
   'id': 'GO:0005615',
   'properties': [{'key': 'GoTerm', 'value': 'C:extracellular space'},
    {'key': 'GoEvidenceType', 'value': 'IDA:UniProtKB'}],
   'evidences': [{'evidenceCode': 'ECO:0000314',
     'source': 'PubMed',
     'id': '25738459'}]},
  {'database': 'GO',
   'id': 'GO:0005739',
   'properties': [{'key': 'GoTerm', 'value': 'C:mitochondrion'}

In [16]:
# Total number of entries collected by query_uniprot().
len(results)

20421

In [25]:
# Flatten the nested records into one row per comment text:
# entry -> "comments" -> "texts" -> "value". Entries without comments
# contribute no rows. Using a comprehension keeps the loop variables out of
# the kernel namespace (the previous loop rebound the builtin `id`).
rows = [
    {"id": entry["uniProtkbId"], "text": text["value"]}
    for entry in results
    for comment in entry.get("comments", [])
    for text in comment.get("texts", [])
]

# Naming the columns explicitly keeps the "id"/"text" header even when no
# comment text was found at all.
summary_df = pd.DataFrame(rows, columns=["id", "text"])

In [26]:
# Display the comments table (one row per entry id / comment text).
summary_df

Unnamed: 0,id,text
0,MOTSC_HUMAN,This peptide has been shown to be biologically...
1,POTB3_HUMAN,Maps to a duplicated region on chromosome 15; ...
2,MYO1C_HUMAN,Represents an unconventional myosin. This prot...
3,IMA4_HUMAN,Was termed importin alpha-4
4,S22A1_HUMAN,Cellular localization of OCT1 in the intestine...
...,...,...
2385,CO005_HUMAN,Product of a dubious CDS prediction. May be a ...
2386,YI012_HUMAN,Could be the product of a pseudogene
2387,YN010_HUMAN,Product of a dubious CDS prediction
2388,YO007_HUMAN,Product of a dubious CDS prediction. May be a ...


In [29]:
# Create the output directory via the shell; -p makes this a no-op if it already exists.
!mkdir -p workdir

In [30]:
# Write the comments table as a tab-separated file, leaving out the row index.
summary_df.to_csv(
    "workdir/uniprot_comments.tsv",
    sep="\t",
    index=False,
)