# Download UniProt reference proteomes

In [22]:
import requests

import pandas as pd

from io import StringIO
from pathlib import Path
from tqdm import tqdm

In [1]:
# Organism identifiers of the species of interest. Where this list comes from
# is not recorded in this notebook; the IDs are used below as `organism:`
# terms in the UniProt proteomes query.
apid_species = {
    632, 1392, 3702, 6239, 7227, 7955, 8355, 9031,
    9913, 9986, 10090, 10116, 10299, 10377, 11706, 36329,
    37296, 39947, 83333, 85962, 192222, 224308, 237561, 243276,
    272634, 284812, 333760, 559292, 868565, 1111708, 2697049,
}
len(apid_species)

31

Query [UniProt proteomes](https://www.uniprot.org/proteomes/) for reference proteomes using our species identifiers

In [7]:
# Proteomes query restricted to reference proteomes, tab-separated output:
# one `organism:<id>` term per species, joined with OR.
url = ''.join([
    'https://www.uniprot.org/proteomes/?fil=reference:yes&format=tab&query=',
    '+OR+'.join(map('organism:{}'.format, apid_species)),
])

In [18]:
# Fetch the proteome listing. Fail loudly on an HTTP error instead of parsing
# an error page as a table, and do not hang forever on a stalled connection.
r = requests.get(url, timeout=60)
r.raise_for_status()
tab = pd.read_csv(StringIO(r.text), sep='\t')
# Every proteome should be listed exactly once. (Comparing the length of a
# column with the length of its own frame, as before, can never fail.)
assert tab['Proteome ID'].is_unique, 'duplicate proteome IDs in UniProt response'
tab

Unnamed: 0,Proteome ID,Organism,Organism ID,Protein count,BUSCO,CPD,Genome representation (RefSeq)
0,UP000001570,Bacillus subtilis (strain 168) (Strain: 168),224308,4260,"C:99.3%[S:99.1%,D:0.2%],F:0.2%,M:0.4%,n:450",Standard,full
1,UP000001940,Caenorhabditis elegans (Strain: Bristol N2),6239,26548,"C:100%[S:74.7%,D:25.2%],F:0%,M:0%,n:3131",Close to standard (low value),full
2,UP000000589,Mus musculus (Mouse) (Strain: C57BL/6J),10090,55315,"C:99.7%[S:50.8%,D:48.9%],F:0.1%,M:0.3%,n:13798",Close to standard (high value),full
3,UP000009294,Human herpesvirus 1 (strain 17) (HHV-1) (Human...,10299,73,,Close to standard (high value),full
4,UP000000808,Mycoplasma pneumoniae (strain ATCC 29342 / M12...,272634,686,"C:96.6%[S:96.6%,D:0%],F:1.1%,M:2.3%,n:174",Standard,full
5,UP000186698,Xenopus laevis (African clawed frog) (Strain: J),8355,44571,"C:94.7%[S:46%,D:48.8%],F:1.1%,M:4.2%,n:5310",Unknown,full
6,UP000000811,Treponema pallidum (strain Nichols) (Strain: N...,243276,1027,"C:91%[S:91%,D:0%],F:1.2%,M:7.8%,n:345",Standard,full
7,UP000001450,Plasmodium falciparum (isolate 3D7) (Strain: I...,36329,5376,"C:99.1%[S:98%,D:1.1%],F:0%,M:0.9%,n:3642",Standard,full
8,UP000002311,Saccharomyces cerevisiae (strain ATCC 204508 /...,559292,6062,"C:99.6%[S:97.4%,D:2.2%],F:0.1%,M:0.3%,n:2137",Outlier (high value),full
9,UP000059680,Oryza sativa subsp. japonica (Rice) (Strain: c...,39947,48900,"C:84.4%[S:78.1%,D:6.3%],F:5%,M:10.6%,n:4896",Outlier (high value),full


In [15]:
# Species from our list for which the query returned no reference proteome
apid_species.difference(tab['Organism ID'])

{37296}

A manual lookup for the missing organism 37296 turned up [UP000097197](https://www.uniprot.org/proteomes/UP000097197). The human reference proteome [UP000005640](https://www.uniprot.org/proteomes/UP000005640) is, of course, included as well.

In [40]:
# Downloads go into a `proteomes` directory that is a sibling of the current
# working directory; create it (including missing parents) if necessary.
proteome_dir = Path.cwd().parent.joinpath('proteomes')
proteome_dir.mkdir(parents=True, exist_ok=True)
# Prefix of the FASTA download URL; a proteome ID gets appended to it.
proteome_url = 'https://www.uniprot.org/uniprot/?format=fasta&query=proteome:'

Downloading takes less than 10 minutes, and the files add up to only about 320 MB overall.

In [38]:
# Download every proteome listed in `tab` as FASTA, one file per organism ID.
# NOTE(review): only the rows of `tab` are fetched here; the manually looked-up
# proteomes mentioned above are not part of the query result -- confirm that
# they are downloaded separately.
with tqdm(tab.iterrows(), total=len(tab)) as pbar:
    for _, proteome in pbar:
        # show the organism name without its parenthesised annotations
        pbar.set_postfix(batch=proteome['Organism'].split('(')[0].strip())
        fasta_path = proteome_dir / f'{proteome["Organism ID"]}.fasta'
        # the context manager releases the streamed connection when done
        with requests.get(proteome_url + proteome['Proteome ID'],
                          stream=True, timeout=60) as response:
            # check the status before creating the file, so that an HTTP
            # error page is never saved as a FASTA file
            response.raise_for_status()
            with fasta_path.open('wb') as fd:
                for chunk in response.iter_content(chunk_size=1024 * 128):
                    fd.write(chunk)

100%|██████████| 30/30 [07:57<00:00, 15.92s/it, batch=Human papillomavirus type 16]                         


Then copy `apid.fasta` from `6082/apid_sequences` alongside the downloaded proteomes, and off we go:

`nrapid.py`

```python
#!/usr/bin/env python3
"""Run `run_uniqueprot2D` on every downloaded proteome against `apid.fasta`.

Input files are all `*.fasta` files below this script's directory whose stem
is purely numeric. For each of them the result is written to
`<stem>_nrapid.fasta` in this script's directory.
"""

import sys

from pathlib import Path
from tqdm import tqdm

wd = Path(__file__).resolve().parent
# Make the package root (two levels above this directory) importable so that
# `data.utils.general` can be resolved. `sys.path` holds strings, so the
# membership test must use str(package_root); testing the Path object itself
# is always true and would append a duplicate entry.
package_root = wd.parents[1]
if str(package_root) not in sys.path:
    sys.path.append(str(package_root))

from data.utils.general import run_uniqueprot2D

# The numeric-stem filter skips `apid.fasta` and the `*_nrapid.fasta` outputs.
# Sorting gives a reproducible processing order across runs.
proteomes = sorted(f for f in wd.rglob('*.fasta') if f.stem.isnumeric())
assert len(proteomes) == 32, f'wrong number of FASTA files found: {len(proteomes)}, should be 32'

# Settings handed through unchanged to `run_uniqueprot2D` as `hval_config`.
hval_config = dict(shortAlignmentLen=50,
                   longSeqLen=180,
                   reduceRandomAlign=False,
                   clusterThreshold=20)

with tqdm(proteomes) as pbar:
    for p in pbar:
        # show which proteome is currently being processed
        pbar.set_postfix(batch=p.stem)
        run_uniqueprot2D(input_file=p, database_file=wd / 'apid.fasta',
                         output_file=wd / f'{p.stem}_nrapid.fasta',
                         hval_config=hval_config)
```

`nrapid.yml`

```yaml
# LSF job description that runs nrapid.py in the proteomes directory.
# For more options, check out:
# https://www.ibm.com/support/knowledgecenter/en/SSWRJV_10.1.0/lsf_command_ref/bsub.yaml.1.html
io:
    outputOverwriteFile: /mnt/project/kaindl/ppi/data/proteomes/stdout.log
    errorOverwriteFile:  /mnt/project/kaindl/ppi/data/proteomes/stderr.log
    cwd: /mnt/project/kaindl/ppi/data/proteomes
limit:
    # NOTE(review): in LSF, coreLimit appears to correspond to `bsub -C`
    # (core-file size limit), not to a number of CPU cores -- confirm that
    # this is what is intended.
    coreLimit: 40
    # in hh:mm
    runtimeLimit: 16:00
    # Limit the execution to 100GB of CPU RAM
    # (the trailing `!` presumably marks this as a hard limit -- confirm
    # against the bsub documentation)
    memLimit: 100GB!
resource:
    # GPU options
    # shared job up to 43GB of GPU RAM
    # IMPORTANT: limits are not strictly enforced
    # make sure you allocate as much as you will maximally need!
    # Failing to do so may result in your or someone else's job failing.

    # gpu: num=1/task:mode=shared:gmem=4G:j_exclusive=no:gpack=yes

    # If job>43GB, ask for exclusive GPU use
    # this MUST be limited to 2 exclusive use jobs per user!
    #gpu: num=1:mode=exclusive_process:gmem=40G:j_exclusive=yes

    machines: lsf-server-2

properties:
    queueName: mid-end-normal
    jobName: kaindl_proteomes_clust
command: python /mnt/project/kaindl/ppi/data/proteomes/nrapid.py
```