forked from biopython/biopython
/
ACT_example.py
117 lines (107 loc) · 4.11 KB
/
ACT_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import sys
import os
import time
from reportlab.lib import colors
from reportlab.lib.units import cm
from Bio.Blast.Applications import NcbitblastxCommandline
from Bio.Graphics.GenomeDiagram import Diagram, CrossLink
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
#Modify this line to point at the Artemis/ACT example data which is online at:
# https://github.com/sanger-pathogens/Artemis/tree/master/etc
input_folder = "/Applications/Artemis/Artemis.app/Contents/artemis/etc"
name = "af063097_v_b132222"
file_a = "af063097.embl"
file_b = "b132222.embl"
format_a = "embl"
format_b = "embl"
file_a_vs_b = "af063097_v_b132222.crunch"
for f in [file_a, file_b, file_a_vs_b]:
if not os.path.isfile(os.path.join(input_folder, f)):
print "Missing input file %s.fna" % f
sys.exit(1)
#Only doing a_vs_b here, could also have b_vs_c and c_vs_d etc
genomes = [
(os.path.join(input_folder, file_a), format_a),
(os.path.join(input_folder, file_b), format_b),
]
comparisons = [os.path.join(input_folder, file_a_vs_b)]
#Create diagram with tracks, each with a feature set
assert len(genomes) >= 2 and len(genomes) == len(comparisons)+1
gd_diagram = Diagram(name, track_size=0.35, circular=False)
tracks = dict()
feature_sets = dict()
records = dict()
for f, format in genomes:
records[f] = SeqIO.read(f, format)
tracks[f] = gd_diagram.new_track(1, name=f, start=0, end=len(records[f]),
scale_smalltick_interval=1000,
scale_largetick_interval=10000,
greytrack=True, greytrack_labels=0)
feature_sets[f] = tracks[f].new_set()
print "Drawing matches..."
for i, crunch_file in enumerate(comparisons):
q = genomes[i+1][0] #query file
s = genomes[i][0] #subject file
q_set = feature_sets[q]
s_set = feature_sets[s]
handle = open(crunch_file)
for line in handle:
if line[0]=="#":
continue
parts = line.rstrip("\n").split(None,7)
#0 = score
#1 = id
#2 = S1
#3 = E1
#4 = seq1
#5 = S2
#6 = E2
#7 = seq2
try:
q_start, q_end = int(parts[2]), int(parts[3])
s_start, s_end = int(parts[5]), int(parts[6])
except IndexError:
sys.stderr.write(repr(line) + "\n")
sys.stderr.write(repr(parts) + "\n")
raise
flip = False
if q_start > q_end:
flip = not flip
q_start, q_end = q_end, q_start
if s_start > s_end:
flip = not flip
s_start, s_end = s_end, s_start
if flip:
c = colors.Color(0, 0, 1, alpha=0.25)
b = False
else:
c = colors.Color(1, 0, 0, alpha=0.25)
b = False
q_feature = q_set.add_feature(SeqFeature(FeatureLocation(q_start-1, q_end)),
color=c, border=b)
s_feature = s_set.add_feature(SeqFeature(FeatureLocation(s_start-1, s_end)),
color=c, border=b)
gd_diagram.cross_track_links.append(CrossLink(q_feature, s_feature, c, b))
#NOTE: We are using the same colour for all the matches,
#with transparency. This means overlayed matches will appear darker.
#It also means the drawing order not very important.
#Note ACT puts long hits at the back, and colours by hit score
handle.close()
print "Drawing CDS features..."
for f, format in genomes:
record = records[f]
feature_set = feature_sets[f]
#Mark the CDS features
for cds in record.features:
if cds.type != "CDS":
continue
feature_set.add_feature(cds, sigil="ARROW",
color=colors.lightblue,
border=colors.blue)
gd_diagram.draw(format="linear", fragments=3,
orientation="landscape", pagesize=(20*cm,10*cm))
gd_diagram.write(name + ".pdf", "PDF")
gd_diagram.draw(format="circular",
orientation="landscape", pagesize=(20*cm,20*cm))
gd_diagram.write(name + "_c.pdf", "PDF")