-
Notifications
You must be signed in to change notification settings - Fork 0
/
ncbi_fetch_protein.py
executable file
·70 lines (49 loc) · 1.71 KB
/
ncbi_fetch_protein.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''Protein fetcher tool for Zim.
Fetches amino acid sequences using an NCBI identifier. The sequence is written
to the notebook page together with the corresponding URL for quick access.
Dependencies:
- Python 2.7
- Biopython 1.56
Install:
Tools > Custom Tools > +
Name: NCBI Protein
Description: Fetches aminoacid sequence of a gene from NCBI
command: ~/.local/share/zim/plugins/zim-biotools/custom_tools/ncbi_fetch_protein.py your@email.com %t %s
Icon: no default
UNCHECKED Command does not modify data
UNCHECKED Output should replace current selection
CHECKED Show in the toolbar
Usage:
Paste the protein ID into the page where you want to have the sequence.
Select the ID and run the custom tool with Tools > "NCBI Protein".
'''
import sys
from Bio import Entrez, SeqIO
# Text appended to the notebook page: an "@<organism>" tag, the NCBI URL, an
# "@gene" tag and the FASTA record wrapped in ''' markers.
ENTRY_TEMPLATE = """@{species}\n\n{url}\n\n@gene\n'''\n{sequence}\n'''\n"""

# Quick-access URL for the fetched record; %s is the parsed identifier.
URL_TEMPLATE = 'http://www.ncbi.nlm.nih.gov/protein/%s?report=fasta'

# Tag used when the FASTA description carries no bracketed organism name.
UNKNOWN_SPECIES = 'unknown_organism'


def gene_id_from_record_id(record_id):
    """Return the identifier used to build the NCBI URL.

    Pipe-delimited ids such as ``gi|4504349|ref|NP_000509.1|`` yield their
    second field. Ids without a usable second field (for example a plain
    ``NP_000509.1``) are returned unchanged instead of raising IndexError.
    """
    fields = record_id.split('|')
    if len(fields) > 1 and fields[1]:
        return fields[1]
    return record_id


def species_from_description(description):
    """Return the organism name from a FASTA description, spaces as '_'.

    The name is read from the LAST ``[...]`` group, because a protein name
    may itself start with a bracketed segment while the organism is
    typically the trailing one. Returns UNKNOWN_SPECIES when there is no
    bracketed group instead of raising IndexError.
    """
    _, bracket, tail = description.rpartition('[')
    name = tail.partition(']')[0].strip() if bracket else ''
    if not name:
        return UNKNOWN_SPECIES
    return name.replace(' ', '_')


def build_entry(record_id, description, fasta_text):
    """Return the text block to append to the notebook page.

    record_id and description come from the parsed FASTA record; fasta_text
    is the record rendered as FASTA.
    """
    url = URL_TEMPLATE % gene_id_from_record_id(record_id)
    return ENTRY_TEMPLATE.format(
        url=url,
        species=species_from_description(description),
        sequence=fasta_text)


def main(argv):
    """Fetch a protein from NCBI and append it to a notebook page.

    argv is the full command line: program name, user email, sequence id
    and the path of the page file to append to. Returns the process exit
    status: 0 on success, 2 when arguments are missing.
    """
    if len(argv) < 4:
        prog = argv[0] if argv else 'ncbi_fetch_protein.py'
        sys.stderr.write('Usage: %s EMAIL SEQUENCE_ID PAGE_FILE\n' % prog)
        return 2

    user_email, seq_id, page = argv[1:4]
    # The id comes from a text selection, so drop stray whitespace/newlines.
    seq_id = seq_id.strip()

    # Set email address required for Entrez.
    Entrez.email = user_email

    # Fetch data via Entrez using the sequence id as plain-text FASTA.
    handle = Entrez.efetch(db='protein', id=seq_id, rettype='fasta',
                           retmode='text')
    try:
        # Create record from FASTA.
        record = SeqIO.read(handle, 'fasta')
    finally:
        # Always release the network handle, even if parsing fails.
        handle.close()

    # Build the whole entry before touching the page so that a parsing
    # error cannot leave the page opened or partially written.
    entry = build_entry(record.id, record.description, record.format('fasta'))

    # Append to the notebook page; 'with' closes the file on any outcome.
    with open(page, 'a') as f:
        f.write(entry)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))