forked from dence/trichy_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
annotate_refseqs.py
156 lines (134 loc) · 5.88 KB
/
annotate_refseqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/home/dence/applications/anaconda/bin/python
#Daniel Ence
#06/27/2015
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import sys
import os
import argparse
import re
import subprocess
parser = argparse.ArgumentParser(description="annotate contigs with reference sequences")
parser.add_argument("RefSeq",type=str, help="fasta file of the reference sequences")
parser.add_argument("output",type=str, help="directory to be made and will contain the maker output directories")
parser.add_argument("directories",nargs='+',type=str, help="list of directories containing velvet contigs")
#take a RefSeq and a bunch of assembly directories
args=parser.parse_args()
print "making directory for output " + args.output
if not os.path.exists(args.output):
os.mkdir(args.output)
#print args.directories
#figure out which files go with which refseqs
refseq_fasta = SeqIO.parse(args.RefSeq,"fasta")
filesPerSeqID = {}
samples = []
for refseq in refseq_fasta:
print "Gathering fasta files of the velvet-assembled contigs"
new_file = args.output + "/" + refseq.id + ".velvet_contigs.fa"
new_filehandle = open(new_file, 'w')
filesPerSeqID[refseq.id] = new_file
curr_ID_pattern = re.compile(refseq.id)
#get the velvet contig for reach sample
for directory in args.directories:
curr_fasta = directory + "/" + refseq.id + ".classified_reads.fastq_velvet/contigs.fa"
curr_sample = re.split("/",directory)[1]
samples.append(curr_sample)
if(os.path.exists(curr_fasta)):
curr_fasta_file = SeqIO.parse(curr_fasta,"fasta")
counter = 0
for contig in curr_fasta_file:
curr_id = curr_sample + "_" + str(counter) + "." + refseq.id
counter = counter + 1
tmp_record = SeqRecord(contig.seq,id=curr_id,name=curr_id,description="")
SeqIO.write(tmp_record,new_filehandle,"fasta")
else:
print "This file was not found:\t" + curr_fasta + " in this directory:\t" + directory
continue
new_filehandle.close()
print "Finished gathering fasta files of velvet-assembled contigs"
#move into the output directory
subprocess.call("cd " + args.output,shell=True)
#Run maker for the files that we just made
subprocess.call("maker -CTL",shell=True)
#fix the control files for the current refseq
opts_handle = open("maker_opts.ctl",'r')
tmp_opts = "tmp_opts"
tmp_opts_handle = open("tmp_opts",'w')
for line in opts_handle:
if re.match("est=",line):
tmp_opts_handle.write(line.replace("est= ","est=" + args.RefSeq + " "))
elif re.match("single_exon=",line):
tmp_opts_handle.write(line.replace("single_exon=0","single_exon=1"))
elif re.match("est2genome=",line):
tmp_opts_handle.write(line.replace("est2genome=0","est2genome=1"))
elif re.match("model_org=",line):
tmp_opts_handle.write(line.replace("model_org=all","model_org="))
elif re.match("always_complete=",line):
tmp_opts_handle.write(line.replace("always_complete=0","always_complete=1"))
else:
tmp_opts_handle.write(line)
opts_handle.close()
tmp_opts_handle.close()
subprocess.call("maker -g " + new_file + " " + tmp_opts + " maker_bopts.ctl maker_exe.ctl",shell=True)
print "Finished running maker on velvet contigs"
refseq_fasta.close()
refseq_fasta = SeqIO.parse(args.RefSeq,"fasta")
print "Aligning est2genome hits with clustalw"
for refseq in refseq_fasta:
print "\tAligning for this ref seq:\t"
curr_datastore = refseq.id + ".velvet_contigs.maker.output/" + refseq.id \
+ ".velvet_contigs_datastore"
curr_datastore_log = refseq.id + ".velvet_contigs.maker.output/" + refseq.id \
+ ".velvet_contigs_master_datastore_index.log"
est2genome_fasta = args.output + "/" + refseq.id + ".est2genome.fasta"
est2genome_handle = open(est2genome_fasta,'w')
SeqIO.write(refseq,est2genome_handle,"fasta")
log_handle = open(curr_datastore_log,'r')
for line in log_handle:
seqname = line.split("\t")[0]
if(re.search(refseq.id,seqname) and re.search("FINISHED",line)):
print "\n\n" + refseq.id
print len(refseq.seq)
seq_dir = line.split("\t")[1]
#print seq_dir.split("/")
tmp_refseq = seq_dir.split("/")[3].replace(".","%2E")#hardcoded in this position
gff_file = refseq.id + ".velvet_contigs.maker.output/" + seq_dir + "/" + tmp_refseq + ".gff"
gff_handle = open(gff_file,'r')
for gff_line in gff_handle:
if(re.search("est2genome",gff_line) and \
re.search("\texpressed_sequence_match\t",gff_line)):
curr_start = int(gff_line.split("\t")[3])
curr_stop = int(gff_line.split("\t")[4])
curr_strand = gff_line.split("\t")[6]
should_be_length = curr_stop - curr_start
print gff_line
print should_be_length
tmp_handle = open(filesPerSeqID[refseq.id],'r')
tmp_fasta = SeqIO.to_dict(SeqIO.parse(tmp_handle,"fasta"))
tmp_handle.close()
if seq_dir.split("/")[3] in tmp_fasta:
curr_record = tmp_fasta[seq_dir.split("/")[3]]
else:
continue
new_seq = curr_record.seq[curr_start - 1:curr_stop]
if(curr_strand == "-"):
new_seq = curr_record.seq[curr_start - 1:curr_stop].reverse_complement()
print len(new_seq)
new_record = SeqRecord(new_seq,id=seqname,name=seqname,description="")
SeqIO.write(new_record,est2genome_handle,"fasta")
gff_handle.close()
est2genome_handle.close()
curr_seqID_align_out = args.output + "/" + refseq.id + ".pptx"
clustalw2_command = "clustalw2 -INFILE=" + est2genome_fasta + " -align -type=DNA -outfile=" + curr_seqID_align_out \
+ " -output=phylip-relaxed"
print "aligning this file:\t" + est2genome_fasta
print clustalw2_command
subprocess.call(clustalw2_command,shell=True)
refseq_fasta.close()
#make a maker directory for curr refseq
#get the velvet_contig for curr refseq from each assembly directory
#put all the contigs into a fasta file in the maker directory
#make control files with the refseq file as EST evidence and est2genome turned on
#run maker with the contigs as the genome and the refeseq files as the EST evidence
#get the sequences with their names