Terry's updates to microbedb involving log4perl and other updates

mlangill · May 10, 2012 · 0d4e824 · 0d4e824
1 parent a36e0d1
commit 0d4e824
Show file tree

Hide file tree

Showing 18 changed files with 248 additions and 37 deletions.
diff --git a/GenomeProject.pm b/GenomeProject.pm
@@ -32,9 +32,12 @@ use base ("MicrobeDB::MicrobeDB");
 use Carp;
 
 require MicrobeDB::Replicon;
-
 require MicrobeDB::Search;
 
+use Log::Log4perl qw(get_logger :nowarn);
+my $logger = Log::Log4perl->get_logger();
+
+
 my @FIELDS;
 my @_db_fields;
 my %_field_hash;
@@ -154,6 +157,14 @@ sub new {
 					$arg{$attr}->[$i] = new MicrobeDB::Replicon( %{ $arg{$attr}->[$i] } );
 				}
 			}
+		} elsif ($attr eq 'genome_size') {
+			# We have to test genome_size and genome_gc to see if they're
+			# null; otherwise it tries to retrieve the replicons to
+			# calculate them and, if gpv_id isn't set yet, bad things
+			# happen (infinite loop).
+			next unless$arg{$attr};			
+		} elsif ($attr eq 'genome_gc') {
+			next unless$arg{$attr};			
 		}
 
 		#do the same for references
@@ -185,7 +196,7 @@ sub add_replicon {
 	} elsif ( ref($rep) eq 'HASH' ) {
 		$rep_obj = new MicrobeDB::Replicon(%$rep);
 	} else {
-		croak "Only a MicrobeDB::Replicon object or hash can be used to add a Replicon";
+		$logger->logcroak("Only a MicrobeDB::Replicon object or hash can be used to add a Replicon");
 	}
 	push( @{ $self->{replicons} }, $rep_obj );
 }
@@ -212,6 +223,12 @@ sub next_replicon {
 #retrieves all replicons for this genome project
 sub _retrieve_replicons{
     my ($self) =@_;
+
+    # In case a genome didn't load properly and we don't
+    # get back a proper object, we DON'T want to search
+    # with no gpv_id, ugh, not good.
+    return () unless( $self->{gpv_id});
+
     my $rep = new MicrobeDB::Replicon(gpv_id => $self->gpv_id());
     my $so = new MicrobeDB::Search();
     my @reps = $so->object_search($rep);

diff --git a/Iterator.pm b/Iterator.pm
@@ -0,0 +1,83 @@
+# Copyright (C) Matthew R. Laird
+# Author lairdm@sfu.ca
+
+# This file is part of MicrobeDB
+
+#MicrobeDB is free software: you can redistribute it and/or modify
+#it under the terms of the GNU General Public License as published by
+#the Free Software Foundation, either version 3 of the License, or
+#(at your option) any later version.
+
+#MicrobeDB is distributed in the hope that it will be useful,
+#but WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#GNU General Public License for more details.
+
+#You should have received a copy of the GNU General Public License
+#along with MicrobeDB.  If not, see <http://www.gnu.org/licenses/>.
+
+package MicrobeDB::Iterator;
+
+# This class contains an iterator object for use with 
+# MicrobeDB::Search.  Some search results are too large
+# to return as a single object so the iterator class
+# gives access to the results row-by-row.
+
+#inherit common methods and fields from the MicrobeDB class
+use base ("MicrobeDB::MicrobeDB");
+
+use strict;
+use warnings;
+use Carp;
+
+my @FIELDS;
+
+BEGIN {
+@FIELDS = qw(
+  ret_obj
+  dbh_obj
+);
+}
+use fields @FIELDS;
+
+# Constructor: builds a fields-based object and passes every key/value
+# pair of the argument list to the method named after the key.
+sub new {
+    my ($class, %arg) = @_;
+
+    # fields::new blesses the new hash into $class
+    my $self = fields::new($class);
+
+    # call $self->KEY(VALUE) for each supplied argument
+    while ( my ($name, $value) = each %arg ) {
+	$self->$name($value);
+    }
+    return $self;
+}
+
+# Return the next record from the database as an object, or undef when
+# the handle stored in dbh_obj has no more rows to hand out.
+# dbh_obj is presumably a DBI statement handle; ret_obj is whatever
+# class (or object) the row hash should be passed to via ->new().
+sub nextRecord {
+    my ($self) = @_;
+
+    # strict 'refs' is switched off for the rest of this sub
+    no strict "refs";
+
+    # fetchrow_hashref gives a false value once the rows run out
+    my $row = $self->{dbh_obj}->fetchrow_hashref;
+    unless ($row) {
+	return undef;
+    }
+
+    # hand the column/value pairs of the row to the return type
+    my $record = $self->{ret_obj}->new(%$row);
+    return $record;
+}
+
+# Row count as reported by the rows() method of the dbh_obj handle.
+sub rows { my ($self) = @_; return $self->{dbh_obj}->rows; }
+
+# A module file has to finish with a true value for use/require.
+1;
diff --git a/MicrobeDB.pm b/MicrobeDB.pm
@@ -34,13 +34,15 @@ BEGIN{
 }
 use fields @FIELDS;
 
-#PATH Settings
+use Log::Log4perl qw(get_logger :nowarn);
+my $logger = Log::Log4perl->get_logger();
+
 
 #MicrobeDB MySQL settings
-my $db = 'microbedb';
-my $db_config = "$ENV{HOME}/.my.cnf";
-die "MySQL config file:$db_config can not be found!" unless -e $db_config;
-my $dsn = "DBI:mysql:database=$db;mysql_read_default_file=$db_config";
+my $db_config = "$ENV{HOME}/.my.cnf";  # per-user MySQL client config file
+$logger->logdie("MySQL config file: $db_config can not be found!") unless -e $db_config;
+my $database=$ENV{"MicrobeDB"}||"microbedb";  # if unable to access .bashrc, use microbedb
+my $dsn = "DBI:mysql:database=$database;mysql_read_default_file=$db_config";
 
 #note that these fields are taken from the config file "my.cnf"
 my ($user,$pass) = ("","");
@@ -121,13 +123,13 @@ sub _db_connect {
 		eval {
 			#Try to connect to microbeDB
 			$dbh = DBI->connect( $dsn, $user, $pass, { RaiseError => 1 } )
-			  || die $DBI::errstr;
+			  || $logger->logdie($DBI::errstr);
 		};
 		#if there is an error or we the handle is empty then try again
 		if($@ || !defined($dbh)){
 	
-		croak("Failed to connect to microbeDB! $max_tries tries have failed! \n$@") if $try == $max_tries;
-		warn "Failed to connect to microbeDB! Trying again in 5 seconds. This is attempt $try of $max_tries. \n$@";
+		$logger->logcroak("Failed to connect to microbeDB! $max_tries tries have failed! $@") if $try == $max_tries;
+		$logger->logwarn("Failed to connect to microbeDB! Trying again in 5 seconds. This is attempt $try of $max_tries. $@");
 		
 		#increase wait time by 5 seconds on each failure
 		sleep(5*$try);
@@ -136,7 +138,7 @@ sub _db_connect {
 		}
 	}
 	unless(defined($dbh)){
-	    die "Can't connect to db:$!";
+	    $logger->logdie("Can't connect to db: $!");
 	}
 	
 	# Save the dhb for later

diff --git a/Parse.pm b/Parse.pm
@@ -71,6 +71,13 @@ sub parse_genome{
 
     my $gpo=$self->gpo();
     $gpo->gpv_directory($dir);
+    $logger->debug("Parsing directory: $dir");
+
+    # Parse directory name for gp_id
+    if($dir =~ /.*_uid(\d+)\/?$/) {
+	$gpo->gp_id($1);
+	$logger->debug("Found gp_id: $1");
+    }
 
     my @files = glob($dir.'*');
 
@@ -154,7 +161,8 @@ sub parse_gbk {
 	    $rep->rep_seq($seq->seq());
 	    $rep->file_name($file_name);
 	    $rep->file_types($file_types);
-	    $rep->rep_accnum($seq->accession_number());
+	    my $rep_accnum = $seq->accession_number().'.'.$seq->version();
+	    $rep->rep_accnum($rep_accnum);
 
 	    my $rep_ginum = $seq->primary_id();
 	    unless($rep_ginum =~ /\D/){
@@ -322,10 +330,11 @@ sub parse_ncbicompgenomefile {
 sub parse_ncbiorginfofile {
     my($self,$org_info_file)=@_;
     my $gpo=$self->gpo();
-    my $taxon_id=$gpo->taxon_id();
+#    my $taxon_id=$gpo->taxon_id();
+    my $gp_id = $gpo->gp_id();
 
-    unless($taxon_id){
-	$logger->warn("No taxon_id so can't look up organism information in $org_info_file");
+    unless($gp_id){
+	$logger->warn("No gp_id so can't look up organism information in $org_info_file");
 	return;
     }
 
@@ -343,7 +352,8 @@ sub parse_ncbiorginfofile {
 	    }
 	} elsif (/^\d+\s+\w+/) {
 	    my @entries = split(/\t/);
-	    if ( $entries[2] == $taxon_id ) {
+#	    if ( $entries[2] == $taxon_id ) {
+	    if ( $entries[0] == $gp_id ) {
 		my $i=0;
 		foreach (@entries) {
 		    $info_org_parse{ $headings[$i] } = $_;
@@ -356,7 +366,7 @@ sub parse_ncbiorginfofile {
     }
     if($found_orginfo){
 	#map the old code parse hash to the gpo
-	$gpo->gp_id($info_org_parse{'RefSeq project ID'}) if exists($info_org_parse{'RefSeq project ID'});
+#	$gpo->gp_id($info_org_parse{'RefSeq project ID'}) if exists($info_org_parse{'RefSeq project ID'});
 	$gpo->gram_stain($info_org_parse{'Gram Stain'}) if exists($info_org_parse{'Gram Stain'});
 	$gpo->disease($info_org_parse{'Disease'}) if exists($info_org_parse{'Disease'});
 	$gpo->pathogenic_in($info_org_parse{'Pathogenic in'}) if exists($info_org_parse{'Pathogenic in'});
@@ -391,7 +401,7 @@ sub parse_ncbiorginfofile {
 	}
 
     }else{
-	$logger->warn("Couldn't find taxon id: $taxon_id within the org_info_file: $org_info_file . Many fields in GenomeProject will not be filled for this organism");
+	$logger->warn("Couldn't find gp id: $gp_id within the org_info_file: $org_info_file . Many fields in GenomeProject will not be filled for this organism");
     }
 
 }

diff --git a/Replicon.pm b/Replicon.pm
@@ -30,8 +30,12 @@ use Carp;
 
 
 use MicrobeDB::Gene;
+use MicrobeDB::GenomeProject;
 require MicrobeDB::Search;
 
+use Log::Log4perl qw(get_logger :nowarn);
+my $logger = Log::Log4perl->get_logger();
+
 my @FIELDS;
 my @replicon;
 my @version;
@@ -63,8 +67,10 @@ BEGIN {
   rna_num
   file_types
   rep_seq
+  distance_calculated
 );
 
+
 @version = qw(
   version_id
   dl_directory
@@ -100,6 +106,24 @@ my @_other = qw(
 
 use fields  @FIELDS;
 
+# Allowed #key# substitutions for FASTA header templates; each value is
+# a Perl snippet that write_fasta evals with $gene and $self in scope
+my %header_lookup = (
+    'gene_id'    => '$gene->gene_id',
+    'ref'        => '$gene->protein_accnum',
+    'gi'         => '$gene->pid',
+    'rpv_id'     => '$gene->rpv_id',
+    'gpv_id'     => '$gene->gpv_id',
+    'start'      => '$gene->gene_start',
+    'end'        => '$gene->gene_end',
+    'length'     => '$gene->gene_length',
+    'locus_tag'  => '$gene->locus_tag',
+    'desc'       => '$gene->gene_product',
+    'rep_desc'   => '$self->definition',
+    'rep_accnum' => '$self->rep_accnum',
+    'rep_type'   => '$self->rep_type', 
+);
+
 sub new {
 	my ( $class, %arg ) = @_;
 
@@ -130,6 +154,7 @@ sub new {
 		#set the attribute in the object
 		$self->$attr( $arg{$attr} );
 	}
+
 	return $self;
 }
 
@@ -138,21 +163,32 @@ sub new {
 #undef is returned if that file is not available for that replicon (based on the file_types field)
 sub get_filename {
 	my ($self,$file_suffix)=@_;
-	
+
 	#check to see if the file type is available for this replicon
 	unless($self->file_types =~ /( |^)\.$file_suffix( |$)/){
 		return undef;	
 	}
 	my $search_obj = new MicrobeDB::Search(return_obj => 'MicrobeDB::GenomeProject');
+	if (!defined($search_obj)) {
+		$logger->error("Genome Project is missing!?");
+		return undef;	# no search object: report the file as unavailable
+	}
+
 	my ($gpo) = $search_obj->object_search($self);
+	if (!defined($gpo)) {
+		$logger->error($self->rep_accnum, " is missing in GenomeProject");
+		return undef;	# avoid calling gpv_directory() on an undefined value
+	}
+
 	my $file_name = $gpo->gpv_directory() . $self->file_name() . ".$file_suffix";
-	
+
 	#small hack since symbolic links will not work when called by php from web browser
 	unless($file_name =~ /home.westgrid/){
 	    if($file_name =~ /home\/shared/){
 	        $file_name =~ s/home/home.westgrid/;
 	    }
 	}
+
 	return $file_name;
 }
 
@@ -165,7 +201,7 @@ sub add_gene {
 	} elsif ( ref($gene) eq 'HASH' ) {
 		$gene_obj = new MicrobeDB::Gene(%$gene);
 	} else {
-		croak "Only a Gene object or hash can be used to add a Gene";
+		$logger->logcroak("Only a Gene object or hash can be used to add a Gene");
 	}
 	push( @{ $self->{genes} }, $gene_obj );
 }
@@ -245,6 +281,48 @@ sub _retrieve_rep_seq{
     return $replicon->rep_seq();
 }
 
+# Write this replicon's gene sequences to a FASTA file.  Named arguments:
+#   filename  - output path (default: $self->file_name); '.faa'/'.ffn' is
+#               appended unless the name already ends in a 3-char extension
+#   append    - true to append to the file instead of overwriting it
+#   seqtype   - 'protein' (default) writes protein_seq, else gene_seq
+#   headerfmt - header template; #key# is replaced via %header_lookup
+# Croaks (through the logger) if the file cannot be opened or closed.
+sub write_fasta {
+    my ($self, %args) = @_;
+
+    my $outfile   = $args{'filename'}  || $self->file_name;
+    my $append    = $args{'append'}    || 0;
+    my $seqtype   = $args{'seqtype'}   || 'protein';
+    my $headerfmt = $args{'headerfmt'} || 'gi|#gi#|ref|#ref#| #desc# [#rep_desc#]';
+    my $protein   = ($seqtype eq 'protein');
+
+    # If the user already gave the filename an extension don't tack one on
+    $outfile .= ($protein ? '.faa' : '.ffn') unless($outfile =~ /\.\S{3}$/);
+
+    # Three-argument open: the file name is never parsed for a mode
+    open(my $out, ($append ? '>>' : '>'), $outfile) or
+	$logger->logcroak("Error opening fasta file $outfile: $!\n");
+
+    foreach my $gene (@{$self->genes()}) {
+	unless(ref($gene) eq 'MicrobeDB::Gene') {
+	    $logger->logcroak("Only a Gene object can be returned here, this is a " . ref($gene));
+	}
+
+	# Genes without a sequence of the requested type are skipped
+	my $seq = $protein ? $gene->protein_seq : $gene->gene_seq;
+	next unless $seq;
+
+	# Each #key# expands to the code string in %header_lookup, which the
+	# second /e evaluates with $gene and $self in scope
+	(my $header = $headerfmt) =~ s/#(\w+)#/$header_lookup{$1}/gee;
+	print $out ">$header\n";
+	print $out join("\n", grep { $_ } split(/(.{1,70})/, $seq)), "\n";
+    }
+
+    close($out) or $logger->logcroak("Error closing fasta file $outfile: $!");
+}
+
 sub table_names {    
 	my ( $self, $field_name ) = @_;