In [19]:
import datacache
import gzip

def _parse_fasta_header(header):
    """Split one FASTA header line into ``(identifier, metadata)``.

    The identifier is the first whitespace-delimited token after the leading
    ``>``. The remaining tokens are folded into a dict: free text before the
    first ``key=value`` token is stored under ``"description"``; every
    ``key=value`` token starts a new field, and following tokens without an
    ``=`` are appended (space-separated) to that field's value.
    Fields whose value is empty are omitted.

    Raises:
        IndexError: if the header contains no identifier (e.g. a bare ``>``).
    """
    parts = header[1:].split()
    name = parts[0]
    entry = {}
    key = "description"
    tokens = []
    for part in parts[1:]:
        if "=" in part:
            if tokens:
                entry[key] = " ".join(tokens)
            # maxsplit=1 so values that themselves contain "=" do not raise.
            key, first = part.split("=", 1)
            tokens = [first] if first else []
        else:
            tokens.append(part)
    # Flush the trailing field; without this the last key=value pair
    # (or a description-only header) would be silently dropped.
    if tokens:
        entry[key] = " ".join(tokens)
    return name, entry


def read_protein_sequence_from_path(path, max_count=None):
    """Read protein sequences and header metadata from a FASTA file.

    Args:
        path: Location of the FASTA file. If the path ends with ``"gz"`` the
            file is opened with ``gzip``; otherwise it is opened as plain text.
        max_count: Optional upper bound on the number of records returned.
            Parsing stops as soon as this many records have been accepted.

    Returns:
        A ``(proteins, metadata)`` tuple of dicts keyed by sequence identifier.
        ``proteins`` maps identifier -> sequence string; ``metadata`` maps
        identifier -> dict of header fields (see ``_parse_fasta_header``).
        Records whose sequence contains ``"*"`` are skipped, as are records
        with no sequence lines.

    Raises:
        AssertionError: if the same identifier appears on two accepted records.
        IndexError: if a header line contains no identifier.
    """
    proteins = {}
    metadata = {}
    buffer = []
    last_name = None
    last_metadata_entry = None

    def limit_reached():
        # Identifiers are unique, so len(proteins) is the accepted-record count.
        return max_count is not None and len(proteins) >= max_count

    def add_entry():
        """Commit the buffered record (if acceptable) and reset the buffer."""
        if buffer and last_name and not limit_reached():
            seq = "".join(buffer)
            if "*" not in seq:
                assert last_name not in proteins
                assert last_name not in metadata
                proteins[last_name] = seq
                metadata[last_name] = last_metadata_entry
        # Always reset, so stray lines before the first header (or a rejected
        # record) can never leak into the next record's sequence.
        buffer.clear()

    opener = gzip.open if str(path).endswith("gz") else open
    with opener(path, "rt", newline="\n") as f:
        for raw_line in f:
            if raw_line.startswith(">"):
                add_entry()
                if limit_reached():
                    break
                last_name, last_metadata_entry = _parse_fasta_header(raw_line)
                continue
            # Strip only the line terminator: handles CRLF files and a final
            # line that has no trailing newline without losing a residue.
            line = raw_line.rstrip("\r\n")
            if line:
                buffer.append(line)
    add_entry()
    return proteins, metadata

def read_protein_sequence_from_url(url, max_count=None):
    """Fetch a FASTA file by URL and parse it.

    The URL is resolved to a local path via ``cache.fetch(url)`` and that path
    is handed to ``read_protein_sequence_from_path``.

    NOTE(review): ``cache`` is not defined in the visible part of this
    notebook (only ``import datacache`` is) -- presumably a module-level
    cache object created elsewhere; confirm, otherwise this raises NameError.

    Args:
        url: Address of the FASTA file to fetch.
        max_count: Forwarded unchanged to ``read_protein_sequence_from_path``.

    Returns:
        Whatever ``read_protein_sequence_from_path`` returns for the fetched
        file: a ``(proteins, metadata)`` tuple.
    """
    return read_protein_sequence_from_path(cache.fetch(url), max_count=max_count)

In [20]:
# Parse the whole Swiss-Prot FASTA file (no max_count limit) into two dicts
# keyed by sequence identifier: `prot` (sequences) and `metadata` (header fields).
# NOTE(review): no visible cell downloads uniprot_sprot.fasta -- confirm where it comes from.
prot, metadata = read_protein_sequence_from_path("../data/uniprot_sprot.fasta")

In [22]:
# Download the NCBI taxonomy dump (new_taxdump format) into the current directory.
!wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz

--2023-06-09 17:45:21--  https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.tar.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.13, 165.112.9.228
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 130061535 (124M) [application/x-gzip]
Saving to: ‘new_taxdump.tar.gz’


2023-06-09 17:45:30 (14.5 MB/s) - ‘new_taxdump.tar.gz’ saved [130061535/130061535]



In [23]:
# Decompress in place; leaves new_taxdump.tar in the current directory.
!gunzip new_taxdump.tar.gz


In [24]:
ls

Inspect Uniprot data.ipynb    new_taxdump.tar
Process reference data.ipynb


In [58]:
# Unpack the *.dmp taxonomy tables into the current directory.
!tar xvf new_taxdump.tar

x citations.dmp
x delnodes.dmp
x division.dmp
x excludedfromtype.dmp
x fullnamelineage.dmp
x gencode.dmp
x host.dmp
x images.dmp
x merged.dmp
x names.dmp
x nodes.dmp
x rankedlineage.dmp
x taxidlineage.dmp
x typematerial.dmp
x typeoftype.dmp


In [59]:
# Move the extracted taxonomy tables next to the other reference data.
!mv *.dmp ../data

In [53]:
from collections import Counter
# NOTE(review): `mapping` is never populated in this cell; kept in case other cells rely on it.
mapping = {}
# Tally, over every row of rankedlineage.dmp, the second-to-last non-empty
# "|"-separated field.
counter = Counter()
# Stream the file line by line inside a context manager so the handle is
# closed and the whole dump is never held in memory at once.
with open("../data/rankedlineage.dmp") as lineage_file:
    for line in lineage_file:
        # Empty columns are dropped, so field positions are relative to the
        # non-empty columns of each row, not to fixed column numbers.
        parts = [part.strip() for part in line.split("|") if len(part.strip()) > 0]
        # Blank rows and rows with a single field have no second-to-last field.
        if len(parts) < 2:
            continue
        counter[parts[-2]] += 1

In [54]:
# Show every tallied value with its row count, most frequent first.
counter.most_common()

[('Metazoa', 1186593),
 ('Viridiplantae', 256096),
 ('Pseudomonadota', 222396),
 ('Fungi', 191090),
 ('Orthornavirae', 183607),
 ('Bacillota', 89517),
 ('Actinomycetota', 83206),
 ('Cyanobacteriota', 29484),
 ('Heunggongvirae', 27869),
 ('Bacteroidota', 24396),
 ('Rhodophyta', 9646),
 ('Apicomplexa', 9249),
 ('Euryarchaeota', 8198),
 ('Shotokuvirae', 6790),
 ('Dinophyceae', 5580),
 ('Ciliophora', 5124),
 ('Bacillariophyta', 4859),
 ('Pararnavirae', 4071),
 ('Mycoplasmatota', 3954),
 ('Spirochaetota', 3653),
 ('Oomycota', 3548),
 ('Campylobacterota', 3486),
 ('Thermodesulfobacteriota', 2943),
 ('Bamfordvirae', 2709),
 ('Phaeophyceae', 2637),
 ('Euglenozoa', 2289),
 ('Foraminifera', 1883),
 ('Discosea', 1833),
 ('Verrucomicrobiota', 1823),
 ('Chloroflexota', 1613),
 ('Bigyra', 1419),
 ('Planctomycetota', 1407),
 ('Acidobacteriota', 1328),
 ('Evosea', 1303),
 ('Deltaproteobacteria', 1288),
 ('Nitrososphaerota', 1270),
 ('Thermoproteota', 1259),
 ('Sangervirae', 1237),
 ('Cercozoa', 1140),

In [64]:
# Print each taxonomy table's path followed by its first 10 lines to inspect the column layout.
!for f in ../data/*.dmp; do echo $f; head -n 10 $f; done

../data/citations.dmp
5	|	The domestic cat: perspective on the nature and diversity of cats.	|	0	|	8603894	|	 	|		|	9685 	|
7	|	Equine herpesvirus	|	0	|	819656	|	 	|		|		|
8	|	Yabuuchi E et al. (1990)	|	0	|	2111872	|		|	Yabuuchi, E., Yano, I., Oyaizu, H., Hashimoto, Y., Ezaki, T., and Yamamoto, H. \"Proposals of Sphingomonas paucimobilis gen. nov. and comb. nov., Sphingomonas parapaucimobilis sp. nov., Sphingomonas yanoikuyae sp. nov., Sphingomonas adhaesiva sp. nov., Sphingomonas capsulata comb. nov., and two genospecies of the genus Sphingomonas.\" Microbiol. Immunol. (1990) 34:99-119.	|	13687 13688 13689 13690 28212 28213 	|
9	|	Dennis PJ et al. (1993)	|	0	|	8494743	|		|	Dennis, P.J., Brenner, D.J., Thacker, W.L., Wait, R., Vesey, G., Steigerwalt, A.G., and Benson, R.F. \"Five new Legionella species isolated from water.\" Int. J. Syst. Bacteriol. (1993) 43:329-337.	|	45065 45068 45070 45072 45076 	|
53	|	ATCC 39723	|	0	|	0	|	http://www.atcc.org/cgi-bin/SFgate?language=english&v

In [72]:
# WARNING: destructive -- wipes ../data, including the *.dmp tables moved there earlier,
# then recreates it empty.
# The trailing `cd` runs inside the `!` subshell and does not change the notebook's
# working directory (later `ls` output still shows the notebooks folder).
# NOTE(review): the output recorded under this cell is a wget log, so it was produced
# by a different command than the one shown here.
!rm -rf ../data; mkdir -p ../data; cd ../data

--2023-06-12 15:03:48--  https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/
Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195
Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6527 (6.4K) [text/html]
Saving to: ‘ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/index.html’


2023-06-12 15:03:48 (34.0 MB/s) - ‘ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/index.html’ saved [6527/6527]

Loading robots.txt; please ignore errors.
--2023-06-12 15:03:48--  https://ftp.uniprot.org/robots.txt
Reusing existing connection to ftp.uniprot.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 26 [text/plain]
Saving to: ‘ftp.uniprot.org/robots.txt’


2023-06-12 15:03:48 (24.8 MB/s) - ‘ftp.uniprot.org/robots.txt’ saved [26/26]

FINISHED --2023-06-12 15:03:48--
Total w

In [81]:
# Remove the directory-index page left behind by the earlier wget crawl of taxonomic_divisions/.
!rm ../data/ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/index.html

In [83]:
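# Swiss-Prot files available in UniProt's taxonomic_divisions directory (reference listing only; nothing is downloaded by this cell)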
"""
      uniprot_sprot_archaea.dat.gz                          2023-05-03 10:00   14M  
      uniprot_sprot_archaea.xml.gz                          2023-05-03 10:00   17M  
      uniprot_sprot_bacteria.dat.gz                         2023-05-03 10:00  211M  
      uniprot_sprot_bacteria.xml.gz                         2023-05-03 10:00  293M  
      uniprot_sprot_fungi.dat.gz                            2023-05-03 10:00   54M  
      uniprot_sprot_fungi.xml.gz                            2023-05-03 10:00   74M  
      uniprot_sprot_human.dat.gz                            2023-05-03 10:00  103M  
      uniprot_sprot_human.xml.gz                            2023-05-03 10:00  149M  
      uniprot_sprot_invertebrates.dat.gz                    2023-05-03 10:00   37M  
      uniprot_sprot_invertebrates.xml.gz                    2023-05-03 10:00   51M  
      uniprot_sprot_mammals.dat.gz                          2023-05-03 10:00   21M  
      uniprot_sprot_mammals.xml.gz                          2023-05-03 10:00   31M  
      uniprot_sprot_plants.dat.gz                           2023-05-03 10:00   54M  
      uniprot_sprot_plants.xml.gz                           2023-05-03 10:00   80M  
      uniprot_sprot_rodents.dat.gz                          2023-05-03 10:00   65M  
      uniprot_sprot_rodents.xml.gz                          2023-05-03 10:00  103M  
      uniprot_sprot_vertebrates.dat.gz                      2023-05-03 10:00   19M  
      uniprot_sprot_vertebrates.xml.gz                      2023-05-03 10:00   25M  
      uniprot_sprot_viruses.dat.gz                          2023-05-03 10:00   17M  
      uniprot_sprot_viruses.xml.gz                          2023-05-03 10:00   23M  
"""


'\n      uniprot_sprot_archaea.dat.gz                          2023-05-03 10:00   14M  \n      uniprot_sprot_archaea.xml.gz                          2023-05-03 10:00   17M  \n      uniprot_sprot_bacteria.dat.gz                         2023-05-03 10:00  211M  \n      uniprot_sprot_bacteria.xml.gz                         2023-05-03 10:00  293M  \n      uniprot_sprot_fungi.dat.gz                            2023-05-03 10:00   54M  \n      uniprot_sprot_fungi.xml.gz                            2023-05-03 10:00   74M  \n      uniprot_sprot_human.dat.gz                            2023-05-03 10:00  103M  \n      uniprot_sprot_human.xml.gz                            2023-05-03 10:00  149M  \n      uniprot_sprot_invertebrates.dat.gz                    2023-05-03 10:00   37M  \n      uniprot_sprot_invertebrates.xml.gz                    2023-05-03 10:00   51M  \n      uniprot_sprot_mammals.dat.gz                          2023-05-03 10:00   21M  \n      uniprot_sprot_mammals.xml.gz                 

In [86]:
# Download the Swiss-Prot flat files (.dat.gz) for all ten taxonomic divisions into the
# current directory; the shell's brace expansion turns this into one URL per division.
# NOTE(review): IPython also tries to interpolate `{...}` as a Python expression and
# appears to leave it untouched only when that evaluation fails -- confirm none of these
# division names exist as Python variables.
!wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_{archaea,bacteria,fungi,human,invertebrates,mammals,plants,rodents,vertebrates,viruses}.dat.gz

--2023-06-12 15:07:48--  https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_archaea.dat.gz
Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195
Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15097140 (14M) [application/x-gzip]
Saving to: ‘uniprot_sprot_archaea.dat.gz’


2023-06-12 15:07:54 (2.46 MB/s) - ‘uniprot_sprot_archaea.dat.gz’ saved [15097140/15097140]

--2023-06-12 15:07:54--  https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_sprot_bacteria.dat.gz
Reusing existing connection to ftp.uniprot.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 221276608 (211M) [application/x-gzip]
Saving to: ‘uniprot_sprot_bacteria.dat.gz’


2023-06-12 15:09:57 (1.71 MB/s) - ‘uniprot_sprot_bacteria.dat.gz’ saved [221276608/221276608]

--2023-06-12 15:09:57--  https://ftp.unip

In [87]:
ls

Inspect Uniprot data.ipynb          uniprot_sprot_invertebrates.dat.gz
Process reference data.ipynb        uniprot_sprot_mammals.dat.gz
new_taxdump.tar                     uniprot_sprot_plants.dat.gz
uniprot_sprot_archaea.dat.gz        uniprot_sprot_rodents.dat.gz
uniprot_sprot_bacteria.dat.gz       uniprot_sprot_vertebrates.dat.gz
uniprot_sprot_fungi.dat.gz          uniprot_sprot_viruses.dat.gz
uniprot_sprot_human.dat.gz


In [88]:
mv *.dat.gz ../data

In [89]:
ls

Inspect Uniprot data.ipynb    new_taxdump.tar
Process reference data.ipynb


In [90]:
rm new_taxdump.tar

In [91]:
cd ../data

/Users/iskander/code/weirdo/data


In [92]:
ls

[1m[36mftp.uniprot.org[m[m/                    uniprot_sprot_mammals.dat.gz
uniprot_sprot_archaea.dat.gz        uniprot_sprot_plants.dat.gz
uniprot_sprot_bacteria.dat.gz       uniprot_sprot_rodents.dat.gz
uniprot_sprot_fungi.dat.gz          uniprot_sprot_vertebrates.dat.gz
uniprot_sprot_human.dat.gz          uniprot_sprot_viruses.dat.gz
uniprot_sprot_invertebrates.dat.gz


In [93]:
rm -rf ftp.uniprot.org/

ID   1433B_HUMAN             Reviewed;         246 AA.
AC   P31946; A8K9K2; E1P616;
DT   01-JUL-1993, integrated into UniProtKB/Swiss-Prot.
DT   23-JAN-2007, sequence version 3.
DT   03-MAY-2023, entry version 242.
DE   RecName: Full=14-3-3 protein beta/alpha;
DE   AltName: Full=Protein 1054;
DE   AltName: Full=Protein kinase C inhibitor protein 1;
DE            Short=KCIP-1;
DE   Contains:
gzcat: error writing to output: Broken pipe
gzcat: uniprot_sprot_human.dat.gz: uncompress failed
