/
annotate_with_genic_elements.pm
193 lines (152 loc) · 6.75 KB
/
annotate_with_genic_elements.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
=head1 NAME
CLIPSeqTools::PreprocessApp::annotate_with_genic_elements - Annotate alignments in a database table with genic information.
=head1 SYNOPSIS
clipseqtools-preprocess annotate_with_genic_elements [options/parameters]
=head1 DESCRIPTION
Annotate alignments in a database table with genic information
Adds columns named "transcript", "exon", "coding_transcript", "utr5", "cds", "utr3".
Column values will be NOT NULL if an alignment is contained in the corresponding region and NULL otherwise.
=head1 OPTIONS
Input options for library.
--driver <Str> driver for database connection (eg. mysql,
SQLite).
--database <Str> database name or path to database file for file
based databases (eg. SQLite).
--table <Str> database table.
--host <Str> hostname for database connection.
--user <Str> username for database connection.
--password <Str> password for database connection.
--records_class <Str> type of records stored in database.
--filter <Filter> filter library. May be used multiple times.
Syntax: column_name="pattern"
e.g. keep reads with deletions AND not repeat
masked AND longer than 31
--filter deletion="def"
--filter rmsk="undef" .
--filter query_length=">31".
Operators: >, >=, <, <=, =, !=, def, undef
Other input
--gtf <Str> GTF file with genes/transcripts.
Database options.
--drop drop columns if they already exist (not
supported in SQlite).
Other options.
-v --verbose print progress lines and extra information.
-h -? --usage --help print help message
=cut
package CLIPSeqTools::PreprocessApp::annotate_with_genic_elements;
# Make it an app command
use MooseX::App::Command;
extends 'CLIPSeqTools::PreprocessApp';
#######################################################################
####################### Load External modules #####################
#######################################################################
use Modern::Perl;
use autodie;
use namespace::autoclean;
use Try::Tiny;
#######################################################################
####################### Command line options ######################
#######################################################################
option 'drop' => (
is => 'rw',
isa => 'Bool',
documentation => 'drop columns if they already exist (not supported in SQlite).',
);
#######################################################################
########################## Consume Roles ##########################
#######################################################################
with
"CLIPSeqTools::Role::Option::Library" => {
-alias => { validate_args => '_validate_args_for_library' },
-excludes => 'validate_args',
},
"CLIPSeqTools::Role::Option::Transcripts" => {
-alias => { validate_args => '_validate_args_for_transcripts' },
-excludes => 'validate_args',
};
#######################################################################
######################## Interface Methods ########################
#######################################################################
sub validate_args {
my ($self) = @_;
$self->_validate_args_for_library;
$self->_validate_args_for_transcripts;
}
sub run {
my ($self) = @_;
warn "Starting: annotate_with_genic_elements\n";
warn "Validating arguments\n" if $self->verbose;
$self->validate_args();
warn "Creating transcript collection\n" if $self->verbose;
my $transcript_collection = $self->transcript_collection;
warn "Creating reads collection\n" if $self->verbose;
my $reads_collection = $self->reads_collection;
my $reads_rs = $reads_collection->resultset;
my $table = $self->table;
if ($self->drop) {
warn "Droping columns transcript, exon, coding_transcript, utr5, cds, utr3\n" if $self->verbose;
try {
$reads_collection->schema->storage->dbh_do( sub {
my ($storage, $dbh, @cols) = @_;
$dbh->do( "ALTER TABLE $table DROP COLUMN transcript" );
$dbh->do( "ALTER TABLE $table DROP COLUMN coding_transcript" );
$dbh->do( "ALTER TABLE $table DROP COLUMN noncoding_transcript" );
$dbh->do( "ALTER TABLE $table DROP COLUMN exon" );
$dbh->do( "ALTER TABLE $table DROP COLUMN utr5" );
$dbh->do( "ALTER TABLE $table DROP COLUMN cds" );
$dbh->do( "ALTER TABLE $table DROP COLUMN utr3" );
});
};
}
try {
warn "Creating columns transcript, exon, coding_transcript, utr5, cds, utr3\n" if $self->verbose;
$reads_collection->schema->storage->dbh_do( sub {
my ($storage, $dbh, @cols) = @_;
$dbh->do( "ALTER TABLE $table ADD COLUMN transcript INT(1)" );
$dbh->do( "ALTER TABLE $table ADD COLUMN coding_transcript INT(1)" );
$dbh->do( "ALTER TABLE $table ADD COLUMN noncoding_transcript INT(1)" );
$dbh->do( "ALTER TABLE $table ADD COLUMN exon INT(1)" );
$dbh->do( "ALTER TABLE $table ADD COLUMN utr5 INT(1)" );
$dbh->do( "ALTER TABLE $table ADD COLUMN cds INT(1)" );
$dbh->do( "ALTER TABLE $table ADD COLUMN utr3 INT(1)" );
});
} catch {
warn "Warning: Column creation failed. Maybe some columns already exist.\n" if $self->verbose;
warn "Caught error: $_\n" if $self->verbose > 1;
};
warn "Looping on transcripts to annotate records.\nThis might take a long time. Relax...\n" if $self->verbose;
$reads_collection->schema->txn_do(sub {
$transcript_collection->foreach_record_do( sub {
my ($transcript) = @_;
my $transcript_reads_rs = $reads_rs->search({
strand => $transcript->strand,
rname => $transcript->rname,
start => { '-between' => [$transcript->start, $transcript->stop] },
stop => { '-between' => [$transcript->start, $transcript->stop] },
});
$transcript_reads_rs->update({transcript => 1});
foreach my $exon (@{$transcript->exons}) {
my $exon_reads_rs = $transcript_reads_rs->search([
start => { '-between' => [$exon->start, $exon->stop] },
stop => { '-between' => [$exon->start, $exon->stop] },
]);
$exon_reads_rs->update({exon => 1});
}
if ($transcript->is_coding) {
foreach my $part_type ('utr5', 'cds', 'utr3') {
my $part = $transcript->$part_type() or next;
my $part_reads_rs = $transcript_reads_rs->search([
start => { '-between' => [$part->start, $part->stop] },
stop => { '-between' => [$part->start, $part->stop] },
]);
$part_reads_rs->update({$part_type => 1});
}
$transcript_reads_rs->update({coding_transcript => 1});
} else {
$transcript_reads_rs->update({noncoding_transcript => 1});
}
});
});
}
1;