diff --git a/docs/algorithm.rst b/docs/algorithm.rst new file mode 100644 index 0000000..2721354 --- /dev/null +++ b/docs/algorithm.rst @@ -0,0 +1,70 @@ +.. highlight:: shell + +====================== +Annotation Algorithm +====================== + +.. note:: There are several places where hard-coded logic was added to make the algorithm work with certain sequences. For instance, + + +#. **Check if locus is provided** (:ref:`blast`) + + * yes, then continue to step 2 + * no, then blast to get locus + +#. **Check if any exact matches exist** (:ref:`ref`) + + * yes, then return annotation associated with the exact match and go to step 7 + * no, then go on to step 3 + +#. **Blast sequence and get list of alleles** (:ref:`blast`) + + * If all of the returned sequences are partial then the last sequence will be replaced with a fully characterized sequence. + +#. **Iterate through the list and try to annotate with reference sequences** (:ref:`seq_search`) + + * Break reference up into features + * Search for each feature in the provided sequence + * If all features are mapped, then go to step 7, else.. + * Try and assemble the remaining features based on what has already been mapped. Since we know the coordinates of the mapped features and the remaining unmapped sequence, we can determine if the unmapped sequences fall between two mapped features or at the ends/beginning after/before mapped sequences. + * If all features could be mapped, then go to step 7 else go back to step 4A using any partial annotations for each reference sequence. If no annotation could be created after searching all of the reference sequences, then move on to step 5. + +#. **Loop through each reference sequence and do targeted alignments** (:ref:`align`) + + * Break up each reference sequence into features and create feature combos that will be used for doing alignments. Order the feature combos by the ones that make the most sense first. + * Do targeted alignments on all of the remaining blocks of sequences that have not yet been mapped. + + * If a high enough proportion of the unmapped sequence maps and the deletion/insertion rate is low enough, then extract the unmapped sequence from the alignment and map it. + * If all features are mapped then go to step 7, else.. + * run step 4 with the updated partial annotation to see if the annotation can now be assembled. Go to step 7 if all features are mapped else.. + * Loop through all feature combinations for all reference sequences. This slows down the annotation if it's very novel. For instance, if it's a new feature sequence and that specific feature has only been reported in IMGT/HLA a few time for a given locus. The acceptance rate for the alignments is decreased slightly after each loop. For class I that decrease stops after the second reference sequence, but for class II it will keep going lower. + * Rerun targeted alignment but with exons only combinations. + +#. **Do a full sequence alignment and use any partial annotation** (:ref:`align`) + + * If this fails and the rerun flag is set to ``True``, then rerun the whole annotation process starting from step 1. This time, skip the first reference allele that was used for doing the annotation and increase the number of reference alleles used by 1. + +#. **Generate GFE notation** (:ref:`gfe`) + + * Once a complete annotation is generated the GFE notation will be made + * If the sequence only contains A,T,C or G, then a GFE notation can be created + + + + + + + + + + + + + + + + + + + + diff --git a/docs/blast.rst b/docs/blast.rst new file mode 100644 index 0000000..b81cb6a --- /dev/null +++ b/docs/blast.rst @@ -0,0 +1,17 @@ +.. highlight:: shell + +====================== +Creating blastn files +====================== + +.. note:: Make sure blastn is properly installed before running! + +1) Download the allele list and the ``_gen`` and ``_nuc`` fasta files for each locus + +2) Create the blast files by running the **ngs-imgt-db** perl script + +.. code-block:: console + + $ ngs-imgt-db -i /path/to/imgt/files -o /output/dir + +3) Add the new blast files to the seqann/data/blast directory and check them in. \ No newline at end of file diff --git a/docs/db.rst b/docs/db.rst index 68cec2a..8bf7461 100644 --- a/docs/db.rst +++ b/docs/db.rst @@ -1,5 +1,7 @@ .. highlight:: shell +.. _bio: + ====================== BioSQL Database ====================== diff --git a/docs/index.rst b/docs/index.rst index 0e95b7b..6725186 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,15 +6,22 @@ Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All R readme installation + algorithm db + blast .. toctree:: :maxdepth: 2 :caption: Developer Documentation + debug + testing seqann + models + seqann.feature_client + seqann.feature_client.apis + seqann.feature_client.models contributing - debug history authors diff --git a/docs/models.rst b/docs/models.rst new file mode 100644 index 0000000..137006f --- /dev/null +++ b/docs/models.rst @@ -0,0 +1,36 @@ +seqann models +============== + +.. toctree:: + + models + +.. _ann: + +Annotation +---------- + +.. automodule:: seqann.models.annotation + :members: + :undoc-members: + :show-inheritance: + +.. _ref: + +Reference Data +--------------- +.. automodule:: seqann.models.reference_data + :members: + :undoc-members: + :show-inheritance: + +.. _bl: + +Blast +--------------- +.. automodule:: seqann.models.blast + :members: + :undoc-members: + :show-inheritance: + + diff --git a/docs/modules.rst b/docs/modules.rst new file mode 100644 index 0000000..a8f11da --- /dev/null +++ b/docs/modules.rst @@ -0,0 +1,7 @@ +seqann +====== + +.. toctree:: + :maxdepth: 4 + + seqann diff --git a/docs/seqann.feature_client.apis.rst b/docs/seqann.feature_client.apis.rst new file mode 100644 index 0000000..f23bca3 --- /dev/null +++ b/docs/seqann.feature_client.apis.rst @@ -0,0 +1,22 @@ +seqann.feature\_client.apis package +=================================== + +Submodules +---------- + +seqann.feature\_client.apis.features\_api module +------------------------------------------------ + +.. automodule:: seqann.feature_client.apis.features_api + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: seqann.feature_client.apis + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/seqann.feature_client.models.rst b/docs/seqann.feature_client.models.rst new file mode 100644 index 0000000..2f6076a --- /dev/null +++ b/docs/seqann.feature_client.models.rst @@ -0,0 +1,38 @@ +seqann.feature\_client.models package +===================================== + +Submodules +---------- + +seqann.feature\_client.models.feature module +-------------------------------------------- + +.. automodule:: seqann.feature_client.models.feature + :members: + :undoc-members: + :show-inheritance: + +seqann.feature\_client.models.feature\_request module +----------------------------------------------------- + +.. automodule:: seqann.feature_client.models.feature_request + :members: + :undoc-members: + :show-inheritance: + +seqann.feature\_client.models.sequence module +--------------------------------------------- + +.. automodule:: seqann.feature_client.models.sequence + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: seqann.feature_client.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/seqann.feature_client.rst b/docs/seqann.feature_client.rst new file mode 100644 index 0000000..3bc21d7 --- /dev/null +++ b/docs/seqann.feature_client.rst @@ -0,0 +1,46 @@ +seqann.feature\_client package +============================== + +Subpackages +----------- + +.. toctree:: + + seqann.feature_client.apis + seqann.feature_client.models + +Submodules +---------- + +seqann.feature\_client.api\_client module +----------------------------------------- + +.. automodule:: seqann.feature_client.api_client + :members: + :undoc-members: + :show-inheritance: + +seqann.feature\_client.configuration module +------------------------------------------- + +.. automodule:: seqann.feature_client.configuration + :members: + :undoc-members: + :show-inheritance: + +seqann.feature\_client.rest module +---------------------------------- + +.. automodule:: seqann.feature_client.rest + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: seqann.feature_client + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/seqann.rst b/docs/seqann.rst index 4b78d3f..1423b85 100644 --- a/docs/seqann.rst +++ b/docs/seqann.rst @@ -1,19 +1,57 @@ -SeqAnn Module +seqann package ============== -BioSeqAnn ----------- +.. toctree:: + + seqann + + +seqann.sequence_annotation +--------------------------- .. automodule:: seqann.sequence_annotation :members: :undoc-members: :show-inheritance: +.. _seq_search: + +seqann.seq_search +--------------------------- + +.. automodule:: seqann.seq_search + :members: + :undoc-members: + :show-inheritance: + +.. _gfe: -Models ---------------- +seqann.gfe +--------------------------- -.. automodule:: seqann.models.annotation +.. automodule:: seqann.gfe :members: :undoc-members: :show-inheritance: + +.. _blast: + +seqann.blast_cmd +--------------------------- + +.. automodule:: seqann.blast_cmd + :members: + :undoc-members: + :show-inheritance: + +.. _align: + +seqann.align +--------------------------- + +.. automodule:: seqann.align + :members: + :undoc-members: + :show-inheritance: + + diff --git a/docs/testing.rst b/docs/testing.rst new file mode 100644 index 0000000..6fe5503 --- /dev/null +++ b/docs/testing.rst @@ -0,0 +1,38 @@ +.. highlight:: shell + +====================== +Testing +====================== + +.. warning:: Before running tests clustalo, blastn and all the required python packages must be installed! + +To run all test simply execute the following command in the top directory of the SeqAnn repository. + +.. code-block:: console + + $ python -m unittest tests + + +Different tests +--------------------- + +.. note:: If you don't have a imgt_biosql db running then not all of the test will run! + +* test_seqann +* test_align +* test_blast +* test_feature +* test_gfe +* test_refdata +* test_seqsearch +* test_util + +Running specific tests +----------------------- + +You can test a specific test by providing the full test path on the command line. + +.. code-block:: console + + $ python -m unittest tests.test_seqann.TestBioSeqAnn.test_004_ambig + diff --git a/scripts/ngs-imgt-db b/scripts/ngs-imgt-db new file mode 100755 index 0000000..83dfd1d --- /dev/null +++ b/scripts/ngs-imgt-db @@ -0,0 +1,294 @@ +#!/usr/bin/env perl +=head1 NAME + + ngs-imgt-db + +=head1 SYNOPSIS + +./ngs-imgt-db + +=head1 AUTHOR Mike Halagan + + Bioinformatics Scientist + 3001 Broadway Stree NE + Minneapolis, MN 55413 + ext. 8225 + +=head1 DESCRIPTION + + This tool builds the blastn database from the IMGT/HLA sequence files. This + is used in the second validation workflow. + +=head1 EXAMPLES + + ./ngs-imgt-db -o /output/dir -c + + ./ngs-imgt-db -l A,B,C,DRB1,DQB,DRB3,DRB4,DRB5 -i /path/to/imgt/files -o /output/dir -t nucl + +=head1 CAVEATS + + - Must have makeblastdb installed + +=head1 LICENSE + + pipeline Consensus assembly and allele interpretation pipeline. + Copyright (c) 2015 National Marrow Donor Program (NMDP) + + This library is free software; you can redistribute it and/or modify it + under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 3 of the License, or (at + your option) any later version. + + This library is distributed in the hope that it will be useful, but WITHOUT + ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or + FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this library; if not, write to the Free Software Foundation, + Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. + + > http://www.gnu.org/licenses/lgpl.html + +=head1 VERSIONS + + 1.0.0 Initial version of the tool + +=head1 TODO + + +=head1 SUBROUTINES + +=cut +use strict; +use warnings; +use Data::Dumper; +use Getopt::Long; +use vars qw($USAGE $VERSION $machine $start_time $user $working); +BEGIN{ + + $VERSION = '1.0.0'; + $start_time = time; + $working = `pwd`;chomp($working); + $user = `whoami`;chomp($user); + $machine = `uname`;chomp($machine); + + $USAGE = + qq{./ngs-imgt-db [--dbversion] [--genefamily] [--directory] [--loci] [--type] [--clone] [--keep] [--verbose] [--help] + -s/--dbversion Database version of the databse (default = ** ALL VERSIONS **) + -g/--genefamily Gene family of the databse (default = HLA) + -l/--loci List of loci to include in the database (default = A,B,C,DRB1,DQB,DRB3,DRB4,DRB5) + -i/--directory Directory of the IMGT/HLA files (default = working directory) + -t/--type Type of sequence database (default = nucl) + -k/--keep Flag for keeping the IMGT/HLA fasta files + -c/--clone Flag for cloning the IMGT/HLA database from github + -r/--remove Flag for removing the IMGT/HLA files after the datbases have been built + -o/--output Output directory + -f/--forced Force overwrite without prompting (default = 0) + -v/--verbose (default = 0) + -h/--help Run perldocs + }; + +} + +my %h_allele_list; # Hash of alleles that exist in each db release + +our ( $help, $s_dbversion, $s_genefamily, $s_loci, $s_db_type, $s_output_dir, $s_directory, $b_clone_imgt, $b_keep_fasta, $b_forced, $b_remove_imgt, $b_verbose) = + (undef, undef, "HLA", "A,B,C,DRB1,DQB1,DRB3,DRB4,DRB5,DQA1,DPB1,DPA1", "nucl", $working, $working, undef, undef, undef, undef, 0); + +&GetOptions("output|o=s" => \$s_output_dir, + "dbversion|s=s" => \$s_dbversion, + "loci|l=s" => \$s_loci, + "genefamily|g=s" => \$s_genefamily, + "directory|i=s" => \$s_directory, + "type|t=s" => \$s_db_type, + "forced|f" => \$b_forced, + "remove|r" => \$b_remove_imgt, + "keep|k" => \$b_keep_fasta, + "clone|c" => \$b_clone_imgt, + "verbose|v=n" => \$b_verbose, + "help|h" => \$help + ); + +# print $s_directory,"\n"; +# exit; + +if(defined $help){ # If help flag provided then provide the perl doc and exit + exec('perldoc',$0); + die; +} + +# print $s_directory,"\n"; +# exit; +################################################################################### +my $makeblastdb = `which makeblastdb`;chomp($makeblastdb); +if(!defined $makeblastdb + || $makeblastdb !~/\S/){ # Die if makeblastdb is not installed on your computer + die($USAGE,'makeblastdb must be installed on your machine!'); +} + +#$s_dbversion =~ s/\.//g if(defined $s_dbversion && $s_dbversion =~ /\./); +# if(defined $s_dbversion +# && !validImgtDb($s_dbversion)){ # Die if the provided imgt db is invalid +# die($USAGE,'Please provide a valid imgt database.'); +# } + +if(!-d $s_output_dir){ # Die if a no valid output directory is provided + die($USAGE,'Please provide a valid ouput directory.'); +} + +#$s_directory = $s_directory !~ /IMGT/ ? $s_directory."/IMGTHLA" : $s_directory; +#print `git clone https://github.com/ANHIG/IMGTHLA` if defined $b_clone_imgt; +#print STDERR $s_directory,"\n"; + +if(!-d $s_directory){ # Die if a no valid input directory is provided + die($USAGE,'Please provide a valid input directory.'); +} +################################################################################### + +#my %h_loci = map{ $_ => 1 } split(/,/,$s_loci); +my @a_allele_list_files = glob "$s_directory/Allelelist.*"; # Getting the allele lists for each db version +my @a_fasta_gen = glob "$s_directory/*_gen.fasta";# Getting the full gene sequences +my @a_fasta_nuc = glob "$s_directory/*_nuc.fasta";# Getting the coding region sequences + +&loadAlleleLists(\@a_allele_list_files); # Load the allele lists for each db version + +print "DB lists: \n"; +print join("\n",keys %h_allele_list),"\n"; + +# Looping through each database +foreach my $s_db (keys %h_allele_list){ + print STDERR "ALLELELIST $s_db\n"; + # If a database version is provided then only make that database + #next if (defined $s_dbversion && $s_dbversion =~ /\S/ && $s_db ne $s_dbversion); + + my %h_fasta_seqs; + foreach my $s_fasta_file (@a_fasta_gen){ + print STDERR "FASTA FILE $s_fasta_file\n"; + my @a_fasta_path = split(/\//,$s_fasta_file); + my($s_loc,@x) = split(/_/,$a_fasta_path[$#a_fasta_path]); + + #next unless defined $h_loci{$s_loc}; + + my $allele; + my $skip = 1; + # Loading the sequence data from the fasta files + open(my $fh_loc_fasta,"<",$s_fasta_file) or die "CANT OPEN FILE $! $0"; + while(<$fh_loc_fasta>){ + chomp; + if($_ =~ /^>/){ + my($s_id,$s_allele,$len,$bp) = split(/\s/,$_); + # Skip any allele that doesn't exist in the db version + #$skip = defined $h_allele_list{$s_db}{$s_allele} ? 1 : 0; + print STDERR $skip,"\t",$s_id,"\t",$s_allele,"\n"; + $skip = 1; + $allele = $skip ? $s_allele : ""; + next; + }elsif($skip == 1){ + push(@{$h_fasta_seqs{"HLA-" . $allele}},$_); + } + } + close $fh_loc_fasta; + } + + my %h_no_genomic; + foreach my $s_allele (keys %{$h_allele_list{$s_db}}){ + next unless !defined $h_fasta_seqs{$s_allele}; + $h_no_genomic{$s_allele}++; + } + + # Loop through the fasta files that don't have genomic level data + foreach my $s_fasta_file (@a_fasta_nuc){ + + print STDERR "NUCLEOTIDE $s_fasta_file\n"; + my @a_fasta_path = split(/\//,$s_fasta_file); + my($s_loc,@x) = split(/_/,$a_fasta_path[$#a_fasta_path]); + + #next unless defined $h_loci{$s_loc}; + + my $allele; + my $skip = 1; + # Loading the sequence data from the fasta files + open(my $fh_loc_fasta,"<",$s_fasta_file) or die "CANT OPEN FILE $! $0"; + while(<$fh_loc_fasta>){ + chomp; + print "LINE: ",$_,"\n"; + if($_ =~ /^>/){ + my($s_id,$s_allele,$len,$bp) = split(/\s/,$_); + + # Skip any allele that doesn't exist in the db version + #$skip = defined $h_allele_list{$s_db}{$s_allele} ? 1 : 0; + print STDERR $skip,"\t",$s_id,"\t",$s_allele,"\n"; + $skip = 1; + $allele = $skip ? $s_allele : ""; + next; + }elsif($skip == 1){ + print "PUSHING 2\n"; + push(@{$h_fasta_seqs{$allele."_nuc"}},$_); + } + } + close $fh_loc_fasta; + } + + + my $db_file = $s_output_dir."/".$s_db.".nsq";# Database file that will be created + my $s_db_fasta_file = $s_output_dir."/".$s_db.".fa"; # Die if the files already exist and the forced flag wasn't given + die('Database files already exist!') if(-e $db_file && !defined $b_forced);# Die if the db file already exists + + open(my $fa,">",$s_db_fasta_file) or die "CANT OPEN FILE $! $0"; + foreach my $s_allele (sort {$a cmp $b} keys %h_fasta_seqs){ + print $fa ">".$s_allele."\n"; + print $fa join("\n",@{$h_fasta_seqs{$s_allele}}),"\n"; # Print out the fasta sequence + if(!defined $h_fasta_seqs{$s_allele}){ + print STDERR "NOT FOUND: ",$s_allele,"\n"; + } + } + close $fa; + + print `makeblastdb -in $s_db_fasta_file -out $s_db -dbtype nucl`; # Building the db + print `mv $s_db.* $s_output_dir`; # Moving the db files + print "IMGT FASTA: ",$s_db_fasta_file,"\n"; + #print `rm $s_db_fasta_file` if !defined $b_keep_fasta; # Removing the db fasta file + +} + +print `rm -rf $s_directory` if defined $b_remove_imgt; # Removing the IMGT/HLA files if the flag is provided + +################################################################################################################ +=head3 loadAlleleLists + + Title: loadAlleleLists + Usage: loadAlleleLists($s_emails) + Function: Getting the allele list for each imgt db version + Returns: na + Args: array of the allele list files + +=cut +sub loadAlleleLists{ + + my $ra_allele_list_files = shift; + + foreach my $s_allele_list_file (@$ra_allele_list_files){ + + my @a_file_parts = split(/\//,$s_allele_list_file); + my $s_file_name = $a_file_parts[$#a_file_parts]; + my ($Allelelist,$db,$txt) = split(/\./,$s_file_name); + next if length($db) != 4; + #$db = "KIR"; + print "Loading db ".$db."\n"; + open(my $fh_allele_list,"<",$s_allele_list_file) or die "CANT OPEN FILE $! $0"; + while(<$fh_allele_list>){ + chomp; + my $spl = $_ =~ /,/ ? ',' : ' '; + my($id,$allele) = split $spl, $_; + $h_allele_list{$db}{$allele}++; + + } + close $fh_allele_list; + + } + +} + + + diff --git a/seqann/__init__.py b/seqann/__init__.py index e13ece7..45c77f0 100644 --- a/seqann/__init__.py +++ b/seqann/__init__.py @@ -27,4 +27,4 @@ __author__ = """Mike Halagan""" __email__ = 'mhalagan@nmdp.org' -__version__ = '0.0.45' +__version__ = '1.0.0' diff --git a/seqann/align.py b/seqann/align.py index 387110c..2c2a9ee 100644 --- a/seqann/align.py +++ b/seqann/align.py @@ -24,10 +24,7 @@ from __future__ import absolute_import -import os import re -import sys - from Bio.Alphabet import SingleLetterAlphabet from Bio.SeqRecord import SeqRecord from subprocess import Popen @@ -48,9 +45,7 @@ from seqann.util import get_seqfeat from seqann.seq_search import getblocks from seqann.models.annotation import Annotation -from seqann.util import is_classII from seqann.util import get_structures -from Bio.SeqUtils import nt_search import logging @@ -60,12 +55,27 @@ def align_seqs(found_seqs, sequence, locus, start_pos, missing, annotated, cutoff=0.90, verbose=False, verbosity=0): """ - Aligns sequences with clustalw - - :param locus: string containing HLA locus. - :param sequence: string containing sequence data. - - :return: GFEobject. + align_seqs - Aligns sequences with clustalo + + :param found_seqs: List of the reference sequences + :type found_seqs: ``List`` + :param sequence: The input consensus sequence. + :type sequence: SeqRecord + :param locus: The gene locus associated with the sequence. + :type locus: ``str`` + :param annotated: dictonary of the annotated features + :type annotated: ``dict`` + :param start_pos: Where the reference sequence starts + :type start_pos: ``int`` + :param missing: List of the unmapped features + :type missing: ``List`` + :param cutoff: The alignment cutoff + :type cutoff: ``float`` + :param verbose: Flag for running in verbose mode. + :type verbose: ``bool`` + :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode. + :type verbosity: ``int`` + :rtype: :ref:`ann` """ logger = logging.getLogger("Logger." + __name__) seqs = [found_seqs, sequence] @@ -78,12 +88,13 @@ def align_seqs(found_seqs, sequence, locus, start_pos, missing, seqs.append(found_seqs) seqs.append(sequence) - # print(found_seqs.id) - # for f in found_seqs.features: - # print(f) - # print("--**--") align = [] + + # piping to clustalo failed + # when sequences were over ~7k bp if len(sequence) > 7000: + + # Writing sequences out to fasta files.. if verbose: logger.info("Sequence too large to use pipe") randid = randomid() @@ -100,10 +111,9 @@ def align_seqs(found_seqs, sequence, locus, start_pos, missing, align.append(str(aln.seq)) # Delete files - # if not "_".join(found_seqs.id.split("_")[0:2]) == "exon_2": - # cleanup(randid) cleanup(randid) else: + # Running clustalo by piping in sequences indata = flatten([[">" + str(s.id), str(s.seq)] for s in seqs]) child = Popen(['clustalo', @@ -179,16 +189,29 @@ def align_seqs(found_seqs, sequence, locus, start_pos, missing, def find_features(feats, sequ, annotated, start_pos, cutoff): + """ + find_features - Finds the reference sequence features in the alignments and records the positions + + :param feats: Dictonary of sequence features + :type feats: ``dict`` + :param sequ: The sequence alignment for the input sequence + :type sequ: ``List`` + :param annotated: dictonary of the annotated features + :type annotated: ``dict`` + :param start_pos: Where the reference sequence starts + :type start_pos: ``int`` + :param missing: List of the unmapped features + :type missing: ``List`` + :param cutoff: The alignment cutoff + :type cutoff: ``float`` + :param verbose: Flag for running in verbose mode. + :type verbose: ``bool`` + :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode. + :type verbosity: ``int`` + :rtype: ``List`` + """ feats_a = list(feats.keys()) - #print(found_seqs.id) - # print("start = ",str(start_pos)) - # print("## Before ##") - # for f in feats: - # print(f) - # print(feats[f]) - # print("##") - j = 0 s = 0 en = 0 @@ -209,7 +232,7 @@ def find_features(feats, sequ, annotated, start_pos, cutoff): start_val = feats[feats_a[j]].location.start #if feats_a[j] == "five_prime_UTR": # start_val = 0 - + if((annotated == 0 and start_pos == 0 and cutoff < 0.9) or (annotated == 0 and start_pos == 0 @@ -217,21 +240,16 @@ def find_features(feats, sequ, annotated, start_pos, cutoff): or (start_pos == 0 and len(feats) == 1 and cutoff < .9)): start_val = 0 - print("HERE !!!!!") + else: if feats_a[j] == 'five_prime_UTR': start_val = 0 - #print("HERE 12") - #print("Before 1-1",feats_a[j],str(feats[feats_a[j]].location.start),str(feats[feats_a[j]].location.end),feats[feats_a[j]].type) feats[feats_a[j]] = SeqFeature(FeatureLocation(ExactPosition(start_val), ExactPosition(int(feats[feats_a[j]].location.end + 1)), strand=1), type=feats[feats_a[j]].type) - #print("After 1-1",feats_a[j],str(start_val),str(feats[feats_a[j]].location.end),feats[feats_a[j]].type) - + if j != len(feats_a): for l in range(j+1, len(feats_a)): - #print("Before 1-2",feats_a[l],str(feats[feats_a[l]].location.start),str(feats[feats_a[l]].location.end),feats[feats_a[l]].type) feats[feats_a[l]] = SeqFeature(FeatureLocation(ExactPosition(feats[feats_a[l]].location.start+1), ExactPosition(int(feats[feats_a[l]].location.end + 1)), strand=1), type=feats[feats_a[l]].type) - #print("After 1-2",feats_a[l],str(feats[feats_a[l]].location.start),str(feats[feats_a[l]].location.end),feats[feats_a[l]].type) else: if s == 1: st = feats[feats_a[j]].location.start + start @@ -250,26 +268,34 @@ def find_features(feats, sequ, annotated, start_pos, cutoff): if feats_a[j] == 'five_prime_UTR': start_val = 0 - #print("Before 2-1",feats_a[j],str(feats[feats_a[j]].location.start),str(feats[feats_a[j]].location.end),feats[feats_a[j]].type) feats[feats_a[j]] = SeqFeature(FeatureLocation(ExactPosition(start_val), ExactPosition(end), strand=1), type=feats[feats_a[j]].type) - #print("After 2-1",feats_a[j],str(start_val),str(feats[feats_a[j]].location.end),feats[feats_a[j]].type) if j != len(feats_a): - for l in range(j+1, len(feats_a)): - #print("Before 2-2",feats_a[l],str(feats[feats_a[l]].location.start),str(feats[feats_a[l]].location.end),feats[feats_a[l]].type) + for l in range(j+1, len(feats_a)): feats[feats_a[l]] = SeqFeature(FeatureLocation(ExactPosition(feats[feats_a[l]].location.start+st), ExactPosition(int(feats[feats_a[l]].location.end + st)), strand=1), type=feats[feats_a[l]].type) - #print("After 2-2",feats_a[l],str(feats[feats_a[l]].location.start),str(feats[feats_a[l]].location.end),feats[feats_a[l]].type) + s = 0 return feats def resolve_feats(feat_list, seqin, seqref, start, locus, missing, verbose=False, verbosity=0): """ - Resolves features from alignments - - :param feat_list: string containing HLA locus. - :param seqin: string containing sequence data. - - :return: Annotation. + resolve_feats - Resolves features from alignments + + :param feat_list: List of the found features + :type feat_list: ``List`` + :param seqin: The input sequence + :type seqin: ``str`` + :param locus: The input locus + :type locus: ``str`` + :param start: Where the sequence start in the alignment + :type start: ``int`` + :param missing: List of the unmapped features + :type missing: ``List`` + :param verbose: Flag for running in verbose mode. + :type verbose: ``bool`` + :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode. + :type verbosity: ``int`` + :rtype: :ref:`ann` """ structures = get_structures() logger = logging.getLogger("Logger." + __name__) @@ -297,9 +323,7 @@ def resolve_feats(feat_list, seqin, seqref, start, locus, missing, verbose=False diff_f = True for feat in feature_list: - #logger.error("TRYING TO RESOLVE ***>>>> " +feat) if feat in missing: - #logger.error("FEAT IN MISSING >> " + feat) f = features[feat] seqrec = f.extract(seq) seq_covered -= len(seqrec.seq) @@ -323,16 +347,6 @@ def resolve_feats(feat_list, seqin, seqref, start, locus, missing, verbose=False ExactPosition(ep), strand=1), type=f.type) - # if feat == "exon_7": - # #if diff > 0: - #print(feat, str(diff), str(start), str(featn.location.start),str(featn.location.end)) - #print(feat, str(diff), str(start), str(f.location.start),str(f.location.end)) - #print(feat,str(seqrec.seq)) - #seq_search = nt_search(str(seq.seq), str(seqrec.seq)) - # print("SEQ SEARCH IN ALIGN") - # print(seq_search) - # print("***") - # print("") features.update({feat: featn}) full_annotation.update({feat: seqrec}) @@ -341,29 +355,19 @@ def resolve_feats(feat_list, seqin, seqref, start, locus, missing, verbose=False del coordinates[i] mapping[i] = feat else: - f = features[feat] seqrec = f.extract(seq) seq_covered -= len(seqrec.seq) if re.search("-", str(seqrec.seq)): - #print("DIFF21", str(diff), feat) l1 = len(seqrec.seq) newseq = re.sub(r'-', '', str(seqrec.seq)) seqrec.seq = Seq(newseq, IUPAC.unambiguous_dna) tmdiff = l1 - len(newseq) diff += tmdiff - #print("DIFF22", str(diff), feat) - # print("--") blocks = getblocks(coordinates) rmapping = {k+start: mapping[k] for k in mapping.keys()} - # Print out what blocks haven't been annotated - # if verbose and verbosity > 2: - # logger.info("Blocks not mapped:") - # for b in blocks: - # logger.info(",".join([str(i) for i in b])) - # Print out what features are missing if verbose and verbosity > 0 and len(full_annotation.keys()) > 1: logger.info("Features resolved:") @@ -386,13 +390,27 @@ def resolve_feats(feat_list, seqin, seqref, start, locus, missing, verbose=False seq=seq) +# ** HARD CODED LOGIC ** # def count_diffs(align, feats, inseq, locus, cutoff, verbose=False, verbosity=0): """ - Aligns sequences with clustalw - - :param locus: string containing HLA locus. - :param sequence: string containing sequence data. + count_diffs - Counts the number of mismatches, gaps, and insertions and then determines if those are within an acceptable range. + + :param align: The alignment + :type align: ``List`` + :param feats: Dictonary of the features + :type feats: ``dict`` + :param locus: The gene locus associated with the sequence. + :type locus: ``str`` + :param inseq: The input sequence + :type inseq: ``str`` + :param cutoff: The alignment cutoff + :type cutoff: ``float`` + :param verbose: Flag for running in verbose mode. + :type verbose: ``bool`` + :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode. + :type verbosity: ``int`` + :rtype: ``List`` """ nfeats = len(feats.keys()) @@ -404,6 +422,7 @@ def count_diffs(align, feats, inseq, locus, cutoff, lastb = '' l = len(align[0]) if len(align[0]) > len(align[1]) else len(align[1]) + # Counting gaps, mismatches and insertions for i in range(0, l): if align[0][i] == "-" or align[1][i] == "-": if align[0][i] == "-": @@ -443,21 +462,19 @@ def count_diffs(align, feats, inseq, locus, cutoff, logger.info('{:<22}{:<6d}{:<1.2f}'.format("Number of matches: ", match, mper2)) indel = iper + delper + # ** HARD CODED LOGIC ** # if len(inseq) > 6000 and mmper < .10 and mper2 > .80: if verbose: logger.info("Alignment coverage high enough to complete annotation 11") return insr, dels else: - # TODO: - # These numbers need to be fine t + # TODO: These numbers need to be fine tuned indel_mm = indel + mper2 if (indel > 0.5 or mmper > 0.05) and mper2 < cutoff and indel_mm != 1: if verbose: logger.info("Alignment coverage NOT high enough to return annotation") return Annotation(complete_annotation=False) else: - #print("RETURNING HERE 1") - #print(indel,mmper,gper,mper2,cutoff) if verbose: logger.info("Alignment coverage high enough to complete annotation") return insr, dels diff --git a/seqann/blast_cmd.py b/seqann/blast_cmd.py index 88cb3d4..e68ceac 100644 --- a/seqann/blast_cmd.py +++ b/seqann/blast_cmd.py @@ -33,72 +33,38 @@ import logging import re +# TODO: Move to util.py has_hla = lambda x: True if re.search("HLA", x) else False -def get_locus(sequences, kir=False, verbose=False, refdata=None, evalue=10): - """ - Gets the locus of the sequence by running blastn - - :param sequences: sequenences to blast - :param kir: bool whether the sequences are KIR or not - - :return: str. - """ - if not refdata: - refdata = ReferenceData() - - file_id = str(randomid()) - input_fasta = file_id + ".fasta" - output_xml = file_id + ".xml" - SeqIO.write(sequences, input_fasta, "fasta") - blastn_cline = NcbiblastnCommandline(query=input_fasta, - db=refdata.blastdb, - evalue=evalue, - outfmt=5, - reward=1, - penalty=-3, - gapopen=5, - gapextend=2, - dust='yes', - out=output_xml) - - stdout, stderr = blastn_cline() - - blast_qresult = SearchIO.read(output_xml, 'blast-xml') - - # Delete files - cleanup(file_id) - - if len(blast_qresult.hits) == 0: - return '' - - loci = [] - for i in range(0, 3): - if kir: - loci.append(blast_qresult[i].id.split("*")[0]) - else: - loci.append(blast_qresult[i].id.split("*")[0]) - - locus = set(loci) - if len(locus) == 1: - if has_hla(loci[0]) or kir: - return loci[0] - else: - return "HLA-" + loci[0] - else: - return '' - - def blastn(sequences, locus, nseqs, kir=False, verbose=False, refdata=None, evalue=10): """ - Gets the locus of the sequence by running blastn - - :param sequences: sequenences to blast - :param kir: bool whether the sequences are KIR or not + Gets the a list of alleles that are the most similar to the input sequence + + :param sequences: The input sequence record. + :type sequences: SeqRecord + :param locus: The gene locus associated with the sequence. + :type locus: ``str`` + :param nseqs: The incomplete annotation from a previous iteration. + :type nseqs: ``int`` + :param evalue: The evalue to use (default = 10) + :type evalue: ``int`` + :param kir: Run with KIR or not + :type kir: ``bool`` + :param verbose: Run in versboe + :type verbose: ``bool`` + :param refdata: An object with reference data + :type refdata: :ref:`ref` + :rtype: :ref:`bl` + + Example usage: + + >>> from Bio.Seq import Seq + >>> from seqann.blast_cmd import blastn + >>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC') + >>> blast = blastn(sequence, locus, nseqs) - :return: Blast. """ logger = logging.getLogger("Logger." + __name__) @@ -217,3 +183,64 @@ def blastn(sequences, locus, nseqs, kir=False, blast_o = Blast(match_seqs=final_seqs, alleles=alleles) return blast_o + +def get_locus(sequences, kir=False, verbose=False, refdata=None, evalue=10): + """ + Gets the locus of the sequence by running blastn + + :param sequences: sequenences to blast + :param kir: bool whether the sequences are KIR or not + :rtype: ``str`` + + Example usage: + + >>> from Bio.Seq import Seq + >>> from seqann.blast_cmd import get_locus + >>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC') + >>> locus = get_locus(sequence) + + """ + if not refdata: + refdata = ReferenceData() + + file_id = str(randomid()) + input_fasta = file_id + ".fasta" + output_xml = file_id + ".xml" + SeqIO.write(sequences, input_fasta, "fasta") + blastn_cline = NcbiblastnCommandline(query=input_fasta, + db=refdata.blastdb, + evalue=evalue, + outfmt=5, + reward=1, + penalty=-3, + gapopen=5, + gapextend=2, + dust='yes', + out=output_xml) + + stdout, stderr = blastn_cline() + + blast_qresult = SearchIO.read(output_xml, 'blast-xml') + + # Delete files + cleanup(file_id) + + if len(blast_qresult.hits) == 0: + return '' + + loci = [] + for i in range(0, 3): + if kir: + loci.append(blast_qresult[i].id.split("*")[0]) + else: + loci.append(blast_qresult[i].id.split("*")[0]) + + locus = set(loci) + if len(locus) == 1: + if has_hla(loci[0]) or kir: + return loci[0] + else: + return "HLA-" + loci[0] + else: + return '' + diff --git a/seqann/gfe.py b/seqann/gfe.py index fbf80cc..68ca23f 100644 --- a/seqann/gfe.py +++ b/seqann/gfe.py @@ -41,6 +41,8 @@ class GFE(object): ''' + This class is used for converting annotations into GFE notations. + Example: >>> from Bio import SeqIO @@ -55,7 +57,7 @@ class GFE(object): >>> seqann = BioSeqAnn(server=server) >>> seq_rec = list(SeqIO.parse(seq_file, 'fasta'))[0] >>> annotation = seqann.annotate(seq_rec, "HLA-DQB1") - >>> gfe = gfe.get_gfe(annotation, "HLA-DQB1") + >>> features, gfe = gfe.get_gfe(annotation, "HLA-DQB1") >>> print(gfe) HLA-DQB1w0-4-0-141-0-12-0-4-0-0-0-0-0 @@ -97,12 +99,7 @@ def __init__(self, url="http://feature.nmdp-bioinformatics.org", def load_features(self): """ - creates GFE from HLA sequence and locus - - :param locus: string containing HLA locus. - :param sequence: string containing sequence data. - - :return: GFEobject. + Loads all the known features from the feature service """ # Loading all loci that # are in self.loci variable defined @@ -123,12 +120,11 @@ def load_features(self): def locus_features(self, locus): """ - creates GFE from HLA sequence and locus + Returns all features associated with a locus :param locus: string containing HLA locus. - :param sequence: string containing sequence data. - - :return: GFEobject. + :type locus: ``str`` + :rtype: ``dict`` """ features = self.api.list_features(locus=locus) feat_dict = {":".join([a.locus, str(a.rank), a.term, a.sequence]): a.accession for a in features} @@ -136,12 +132,17 @@ def locus_features(self, locus): def get_gfe(self, annotation, locus): """ - creates GFE from HLA sequence and locus + creates GFE from a sequence annotation - :param locus: string containing HLA locus. - :param sequence: string containing sequence data. + :param locus: The gene locus + :type locus: ``str`` + :param annotation: An sequence annotation object + :type annotation: ``List`` + :rtype: ``List`` + + Returns: + The GFE notation and the associated features in an array - :return: GFEobject. """ features = [] accessions = {} diff --git a/seqann/models/blast.py b/seqann/models/blast.py index 9638e23..b7fa57e 100644 --- a/seqann/models/blast.py +++ b/seqann/models/blast.py @@ -28,6 +28,8 @@ from ..util import deserialize_model +# NOTE: This really doesn't need to be a class.. + class Blast(Model): ''' classdocs @@ -35,12 +37,10 @@ class Blast(Model): def __init__(self, failed: bool=None, match_seqs: List[DBSeqRecord]=None, alleles: List[str]=None): """ - Blast - TODO: ADD evalues :param match_seqs: The match_seqs of this Blast. - :type match_seqs: List[DBSeqRecord] + :type match_seqs: List[``SeqRecord``] :param alleles: The alleles of this Blast. - :type alleles: List[str] + :type alleles: List[``str``] """ self.data_types = { 'match_seqs': List[DBSeqRecord], diff --git a/seqann/models/reference_data.py b/seqann/models/reference_data.py index 4355f62..6d66ffb 100644 --- a/seqann/models/reference_data.py +++ b/seqann/models/reference_data.py @@ -69,7 +69,7 @@ def download_dat(url, dat): urllib.request.urlretrieve(url, dat) -# TODO: Use pandas to get latest version +# TODO: Use the AWS Lamba API for getting latest IMGT/DB class ReferenceData(Model): ''' @@ -613,10 +613,19 @@ def server_avail(self) -> bool: def search_refdata(self, seq, locus): """ - Gets the Annotation of this ReferenceData. "select ent.name from bioentry limit 10" - :return: The Annotation of this ReferenceData. - :rtype: Annotation - mysql --user=root bioseqdb -e "select * from biodatabase" + This checks to see if a sequence already exists in the reference data. If it does, then it'll return the known annotation. + + :return: The Annotation of associated with the input sequence + :rtype: :ref:`ann` + + Example: + + >>> from Bio.Seq import Seq + >>> from seqann.models.reference_data import ReferenceData + >>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC') + >>> refdata = ReferenceData() + >>> matched_annotation = refdata.search_refdata(sequence, locus) + """ # TODO: ONLY MAKE ONE CONNECTION # TODO: add try statement @@ -721,15 +730,8 @@ def seqannotation(self, seqrecord, allele, loc): :return: The Annotation from the found sequence :rtype: Annotation """ - #seqrecord = self.seqrecord(allele, loc) complete_annotation = get_features(seqrecord) - # print("&&&&&&") - # for f in seqrecord.features: - # #print(f) - # size = f.location.end - f.location.start - # print(f.type, str(f.location.start), str(f.location.end), str(size)) - # print("&&&&&&") annotation = Annotation(annotation=complete_annotation, method='match', complete_annotation=True) diff --git a/seqann/seq_search.py b/seqann/seq_search.py index ebc25dc..69748bb 100644 --- a/seqann/seq_search.py +++ b/seqann/seq_search.py @@ -36,7 +36,6 @@ from Bio.SeqFeature import FeatureLocation from Bio.SeqFeature import ExactPosition -from seqann.models.reference_data import ReferenceData from seqann.models.annotation import Annotation from seqann.models.base_model_ import Model from seqann.util import deserialize_model @@ -47,48 +46,25 @@ from seqann.util import is_classII -def loctype(s1, e1, s2, e2): - if s1 < s2 and e1 < e2: - return True - else: - return False +# NOTE: This doesn't need to be a class anymore and could be reduced to functions +class SeqSearch(Model): + ''' + This is a class for annotating a BioPython sequence without using alignment -# TODO: Add documentation -def getblocks(coords): - block = [] - blocks = [] - sorted_i = sorted(coords.keys()) - if len(sorted_i) == 1: - return [sorted_i] - for i in range(0, len(sorted_i)-1): - j = i+1 - if i == 0: - block.append(sorted_i[i]) - if(j <= len(sorted_i)-1): - if(sorted_i[i] == sorted_i[j]-1): - block.append(sorted_i[j]) - else: - blocks.append(block) - block = [] - block.append(sorted_i[j]) - else: - block.append(sorted_i[j]) - if len(block) > 1: - blocks.append(block) - return blocks + :param verbose: Flag for running in verbose mode. + :type verbose: ``bool`` + :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode. + :type verbosity: ``int`` + + Example usage: + + >>> from seqann.seq_search import SeqSearch + >>> seqsrch = SeqSearch() -class SeqSearch(Model): - ''' - classdocs ''' def __init__(self, verbose: bool=False, verbosity: int=0): - """ - ReferenceData - :param refdata: The reference data of this SeqSearch. - :type refdata: ReferenceData - """ self.data_types = { 'verbose': bool, 'verbosity': int @@ -116,7 +92,30 @@ def from_dict(cls, dikt) -> 'SeqSearch': return deserialize_model(dikt, cls) def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): + """ + search_seqs - method for annotating a BioPython sequence without alignment + + :param seqrec: The reference sequence + :type seqrec: SeqRecord + :param locus: The gene locus associated with the sequence. + :type locus: str + :param in_seq: The input sequence + :type in_seq: SeqRecord + :param run: The number of runs that have been done + :type run: int + :param partial_ann: A partial annotation from a previous step + :type partial_ann: :ref:`ann` + :rtype: :ref:`ann` + + Example usage: + + >>> from Bio.Seq import Seq + >>> from seqann.seq_search import SeqSearch + >>> inseq = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC') + >>> sqsrch = SeqSearch() + >>> ann = sqsrch.search_seqs(refseqs, inseq) + """ # Extract out the sequences and feature names # from the reference sequences @@ -202,17 +201,6 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): # is not found, then record that feature as missing. seq_search = nt_search(str(in_seq.seq), str(feats[feat_name])) - #skip = False - # if(feat_name == "exon_5" and locus == "HLA-DQB1" - # and len(in_seq.seq) < 7000 and not "intron_5" in found_feats - # and not "intron_4" in found_feats): - # skip = True - - # if(feat_name == "intron_4" and locus == "HLA-DQB1" - # and len(in_seq.seq) > 7000): - # skip = True - #print("SKIPPING INTRON 4 seq_search") - if len(seq_search) == 2: if self.verbose and self.verbosity > 0: @@ -224,7 +212,7 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): if feat_name == 'three_prime_UTR' \ and len(str(in_seq.seq)) > end: end = len(str(in_seq.seq)) - #print("HERE 1") + # If the feature is found and it's a five_prime_UTR then # the start should always be 0, so insertions at the # beinging of the sequence will be found. @@ -296,6 +284,11 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): seq_search = new_seq if(partial_ann and feat_name == "exon_8" and run > 0): missing_feats = sorted(list(partial_ann.missing.keys())) + + # * HARD CODED LOGIC * # + # > exon8 in class I maps to multiple spots in a sequence, + # often in the 3' UTR. These features need to be mapped + # last to make sure it's not mapping exon8 incorrectly. if(missing_feats == ['exon_8', 'three_prime_UTR'] and len(seq_search) <= 3): if self.verbose and self.verbosity > 0: @@ -352,13 +345,13 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): blocks = getblocks(coordinates) exact_matches = list(found_feats.keys()) - ## TODO: Add case for + # * HARD CODED LOGIC * # + # > + # # HLA-DRB1 exon3 exact match - with intron1 and 3 missing if('exon_3' in exact_matches and run == 99 and locus == 'HLA-DRB1' and 'exon_2' in feat_missing and (len(blocks) == 1 or len(blocks) == 2)): - #self.logger.info("++++++++++++++++++++++++++++++++ EXON 3 EXACT MATCH ++++++++++++++++++++++++++++++++") - for b in blocks: x = b[len(b)-1] if x == max(list(mapping.keys())): @@ -392,8 +385,16 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): method=method, mapping=mapping, exact_match=exact_matches) + # If it's a class II sequence and # exon_2 is an exact match + # * HARD CODED LOGIC * # + # > It's common for exon2 to be fully sequenced + # but intron_2 and intron_1 to be partially sequenced, + # which can make it hard to annotate those to features. + # If there are two missing blocks that is small enough + # and they are before and after exon2, then it's very + # very likely to be intron_2 and intron_1. if 'exon_2' in exact_matches and len(blocks) == 2 \ and is_classII(locus) and seq_covered < 300: @@ -445,8 +446,6 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): mapping=mapping, exact_match=exact_matches) - # TODO: pass seq_covered and mapping, so the - # final annotation contains the updated values annotated_feats, mb, mapping = self._resolve_unmapped(blocks, feat_missing, ambig_map, @@ -456,6 +455,7 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): seq_covered ) + # * HARD CODED LOGIC * # if(not mb and blocks and len(feat_missing.keys()) == 0 and len(ambig_map.keys()) == 0): mb = blocks @@ -488,15 +488,6 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): if self.verbose and self.verbosity > 1: self.logger.info("* Annotation not complete *") - #print(coordinates) - #print(mapping) - # Print out what blocks haven't been annotated - # if self.verbose and self.verbosity > 2: - # self.logger.info("Number of blocks not annotated = " + str(len(mb))) - # self.logger.info("Blocks not annotated:") - # for b in mb: - # self.logger.info(",".join([str(i) for i in b])) - # Print out what features were missing by the ref if self.verbose and self.verbosity > 2: self.logger.info("Refseq was missing these features = " + ",".join(list(refmissing))) @@ -584,6 +575,9 @@ def search_seqs(self, seqrec, in_seq, locus, run=0, partial_ann=None): return annotation + # TODO: This should be cleaned up and reduced.. + # There is some repeating logic and overlap in + # this section. def _resolve_unmapped(self, blocks, feat_missing, ambig_map, mapping, found_feats, loc, covered, rerun=False): @@ -706,8 +700,6 @@ def _resolve_unmapped(self, blocks, feat_missing, ambig_map, and start_i >= 0 \ and feat_num-add_num > 0: - # print(str(end_i),str(max(mapping.keys())),str(min(mapping.keys()))) - # print(mapping) expected_p = struct_order[loc][feat_num-add_num] expected_n = struct_order[loc][feat_num+add_num] previous_feat = mapping[start_i] @@ -800,7 +792,7 @@ def verbose(self) -> bool: Gets the verbose of this SeqSearch. :return: The verbose of this SeqSearch. - :rtype: ReferenceData + :rtype: bool """ return self._verbose @@ -810,7 +802,7 @@ def verbose(self, verbose: bool): Sets the verbose of this SeqSearch. :param verbose: The verbose of this SeqSearch. - :type verbose: ReferenceData + :type verbose: bool """ self._verbose = verbose @@ -820,7 +812,7 @@ def verbosity(self) -> int: Gets the verbosity of this SeqSearch. :return: The verbosity of this SeqSearch. - :rtype: ReferenceData + :rtype: int """ return self._verbosity @@ -830,6 +822,39 @@ def verbosity(self, verbosity: int): Sets the verbosity of this SeqSearch. :param verbosity: The verbosity of this SeqSearch. - :type verbosity: ReferenceData + :type verbosity: int """ self._verbosity = verbosity + + +# TODO: Move to util.py +def loctype(s1, e1, s2, e2): + if s1 < s2 and e1 < e2: + return True + else: + return False + + +# TODO: Move to util.py +def getblocks(coords): + block = [] + blocks = [] + sorted_i = sorted(coords.keys()) + if len(sorted_i) == 1: + return [sorted_i] + for i in range(0, len(sorted_i)-1): + j = i+1 + if i == 0: + block.append(sorted_i[i]) + if(j <= len(sorted_i)-1): + if(sorted_i[i] == sorted_i[j]-1): + block.append(sorted_i[j]) + else: + blocks.append(block) + block = [] + block.append(sorted_i[j]) + else: + block.append(sorted_i[j]) + if len(block) > 1: + blocks.append(block) + return blocks diff --git a/seqann/sequence_annotation.py b/seqann/sequence_annotation.py index 8f1f6d1..ac9c773 100644 --- a/seqann/sequence_annotation.py +++ b/seqann/sequence_annotation.py @@ -59,62 +59,41 @@ import re -# TODO: Add documentation -def getblocks(coords): - block = [] - blocks = [] - sorted_i = sorted(coords.keys()) - if len(sorted_i) == 1: - return [sorted_i] - for i in range(0, len(sorted_i)-1): - j = i+1 - if i == 0: - block.append(sorted_i[i]) - if(j <= len(sorted_i)-1): - if(sorted_i[i] == sorted_i[j]-1): - block.append(sorted_i[j]) - else: - blocks.append(block) - block = [] - block.append(sorted_i[j]) - else: - block.append(sorted_i[j]) - if len(block) > 1: - blocks.append(block) - return blocks - - class BioSeqAnn(Model): ''' :: from seqann import BioSeqAnn - seqann = BioSeqAnn() + seqann1 = BioSeqAnn() + seqann2 = BioSeqAnn(dbversion="3300", verbose=True, verbosity=3) + seqann3 = BioSeqAnn(debug={"align":4}, safe) :param server: A BioSQL database to use for retriving the sequence features. Using a BioSQL DB will speed up the annotations dramatically. - :type server: BioSeqDatabase + :type server: :ref:`bio` :param dbversion: The IPD-IMGT/HLA or KIR database release. - :type dbversion: str + :type dbversion: ``str`` :param datfile: The IPD-IMGT/HLA or KIR dat file to use in place of the server parameter. - :type datfile: str + :type datfile: ``str`` :param pid: A process label that can be provided to help track the logging output. - :type pid: str + :type pid: ``str`` :param load_features: Flag for downloading all gene features and accessions from the feature service. - :type load_features: bool + :type load_features: ``bool`` :param store_features: Flag for caching all features and their corresponding accessions. - :type store_features: bool + :type store_features: ``bool`` :param cached_features: Dictionary containing all the features from the feature service. - :type cached_features: Dict + :type cached_features: ``dict`` :param kir: Flag for indicating the input sequences are from the KIR gene system. - :type kir: bool + :type kir: ``bool`` :param align: Flag for producing the alignments along with the annotations. - :type align: bool + :type align: ``bool`` :param verbose: Flag for running in verbose mode. - :type verbose: bool + :type verbose: ``bool`` :param verbosity: Numerical value to indicate how verbose the output will be in verbose mode. - :type verbosity: int + :type verbosity: ``int`` :param debug: Dictionary containing names of steps that you want to debug. - :type debug: Dict + :type debug: ``dict`` + :param safemode: Flag for running the annotations in safemode. No alignments will be done if no feature matches were made. This can prevent the alignment step for running for too long on bad sequences. + :type safemode: ``bool`` ''' def __init__(self, server: BioSeqDatabase=None, dbversion: str='3310', datfile: str='', verbose: bool=False, verbosity: int=0, @@ -218,14 +197,14 @@ def annotate(self, sequence: Seq=None, locus: str=None, :param sequence: The input consensus sequence. :type sequence: Seq :param locus: The gene locus associated with the sequence. - :type locus: str + :type locus: ``str`` :param nseqs: The number of blast sequences to use. - :type nseqs: int + :type nseqs: ``int`` :param alignseqs: The number of sequences to use for targeted alignments. - :type alignseqs: int + :type alignseqs: ``int`` :param skip: A list of alleles to skip for using as a reference. This is used for validation and testing. - :type skip: List - :rtype: Annotation + :type skip: ``List`` + :rtype: :ref:`ann` Returns: The annotate function return an ``Annotation`` object that @@ -250,7 +229,6 @@ def annotate(self, sequence: Seq=None, locus: str=None, Example usage: >>> from Bio.Seq import Seq - >>> from Bio.SeqRecord import SeqRecord >>> from seqann import BioSeqAnn >>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC') >>> seqann = BioSeqAnn() @@ -446,6 +424,10 @@ def annotate(self, sequence: Seq=None, locus: str=None, if alignseqs > len(found): alignseqs = len(found)-1 + # * HARD CODED LOGIC * # + # > After testing with multiple thresholds + # this value seemed to work best. + # # Aligned % cutoff align_cutoff = .90 if((not hasattr(partial_ann, 'features') or @@ -482,8 +464,13 @@ def annotate(self, sequence: Seq=None, locus: str=None, annotation=partial_ann, run=i, cutoff=align_cutoff) - align_cutoff -= .01 + # * HARD CODED LOGIC * # + # > If sequences are very novel, then the alignment + # cutoff may be to stringent. Incrementally decreasing + # the cutoff improves the likelihood of these sequences + # being annotated. + align_cutoff -= .01 if aligned_ann and aligned_ann.complete_annotation: if self.align: if self.verbose: @@ -552,8 +539,8 @@ def annotate(self, sequence: Seq=None, locus: str=None, # Don't run full # annotation if flag is passed - # if not full: - # return + if not full: + return if self.verbose: self.logger.info(self.logname + " running full alignment") @@ -570,6 +557,7 @@ def annotate(self, sequence: Seq=None, locus: str=None, + " Finished ref_align annotation using full " + leastmissing_feat.name) + # Check to see if an annotation was returned if(not isinstance(full_align, Annotation) or isinstance(full_align, str)): if(not rerun or len(sequence) > 4000): @@ -586,9 +574,11 @@ def annotate(self, sequence: Seq=None, locus: str=None, skip=[found[0].name], rerun=False) + # Check if the annotation is complete if not full_align.complete_annotation and self.verbose: self.logger.info(self.logname + " Incomplete annotation!") + # Add the alignment to the annotation if self.align and full_align.complete_annotation: if self.verbose: self.logger.info(self.logname + " Adding alignment") @@ -639,12 +629,12 @@ def ref_align(self, found_seqs, sequence: Seq=None, :param sequence: The input sequence record. :type sequence: Seq :param locus: The gene locus associated with the sequence. - :type locus: str + :type locus: ``str`` :param annotation: The incomplete annotation from a previous iteration. - :type annotation: Annotation + :type annotation: :ref:`ann` :param partial_ann: The partial annotation after looping through all of the blast sequences. - :type partial_ann: Annotation - :rtype: Annotation + :type partial_ann: :ref:`ann` + :rtype: :ref:`ann` """ if annotation and isinstance(annotation, Annotation): @@ -688,10 +678,8 @@ def ref_align(self, found_seqs, sequence: Seq=None, ExactPosition(b[len(b)-1]), strand=1), type="unmapped") - #print("1 - STARTING VALUE = " + str(start)) - #print("1 - END VALUE = " + str(b[len(b)-1])) + feat = seq_feat.extract(annotation.seq) - #print(feat) combosrecs, exons, fullrec = self._refseqs(locus, start, annotation, @@ -1026,8 +1014,6 @@ def ref_align(self, found_seqs, sequence: Seq=None, # print("HERE order!!!!") # annotation.missing = missing_f - #print("BEFORE SEQ SEARCH BLOCKS") - #print(annotation.blocks) # Rerunning seqsearch with # new annotation from alignment tmpann = self.seqsearch.search_seqs(found_seqs, @@ -1306,12 +1292,9 @@ def _refseqs(self, locus, start_pos, annotation, feat, block): end_pos = start_pos + len(feat.seq) missing_feats = annotation.missing mapping = annotation.mapping - #print(start_pos, end_pos) - #print(mapping) # TODO: Do not use combos that # don't have missing features - # if start_pos == 0: prv_order = -1 else: @@ -1524,30 +1507,27 @@ def _make_seqfeat(self, start, sequence, featname): return seq_feat, start -class SeqAnnException(Exception): - - def __init__(self, status=None, reason=None, http_resp=None): - if http_resp: - self.status = http_resp.status - self.reason = http_resp.reason - self.body = http_resp.data - self.headers = http_resp.getheaders() +# TODO: Add to util +def getblocks(coords): + block = [] + blocks = [] + sorted_i = sorted(coords.keys()) + if len(sorted_i) == 1: + return [sorted_i] + for i in range(0, len(sorted_i)-1): + j = i+1 + if i == 0: + block.append(sorted_i[i]) + if(j <= len(sorted_i)-1): + if(sorted_i[i] == sorted_i[j]-1): + block.append(sorted_i[j]) + else: + blocks.append(block) + block = [] + block.append(sorted_i[j]) else: - self.status = status - self.reason = reason - self.body = None - self.headers = None - - def __str__(self): - """ - Custom error messages for exception - """ - error_message = "({0})\n"\ - "Reason: {1}\n".format(self.status, self.reason) - if self.headers: - error_message += "HTTP response headers: {0}\n".format(self.headers) - - if self.body: - error_message += "HTTP response body: {0}\n".format(self.body) + block.append(sorted_i[j]) + if len(block) > 1: + blocks.append(block) + return blocks - return error_message diff --git a/tests/test_seqann.py b/tests/test_seqann.py index ef3b08d..68e5d1d 100644 --- a/tests/test_seqann.py +++ b/tests/test_seqann.py @@ -776,80 +776,90 @@ def test_018_nogfe(self): # TODO: Break alleles into separate tests # based on what they are testing - @ignore_warnings - @unittest.skipUnless(conn(), "TestBioSeqAnn 015 Requires MySQL connection") - def test_019_skipserv(self): - # import logging - # logging.basicConfig(format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s', - # datefmt='%m/%d/%Y %I:%M:%S %p', - # level=logging.INFO) - server = BioSeqDatabase.open_database(driver="pymysql", - user=biosqluser, - passwd=biosqlpass, - host=biosqlhost, - db=biosqldb, - port=biosqlport) - - seqann1 = BioSeqAnn(verbose=False, verbosity=0) - seqann = BioSeqAnn(server=server, verbose=False, verbosity=0) - refdata = seqann1.refdata - - # removed 'HLA-DRB1*04:04:01' because it's - # too large to test with travis - test_list = ['HLA-C*07:241', 'HLA-A*01:07', 'HLA-A*01:01:59', - 'HLA-A*01:09:01:01', 'HLA-A*02:545', 'HLA-B*40:02:05', - 'HLA-A*29:13', 'HLA-A*24:03:02', 'HLA-A*02:544', - 'HLA-DQA1*04:01:01:01', 'HLA-A*01:217', 'HLA-A*01:22N', - 'HLA-B*51:42', 'HLA-C*03:04:05', 'HLA-A*01:01:01:04', - 'HLA-A*01:09:01:01', 'HLA-B*82:01', 'HLA-A*03:04:01', - 'HLA-C*07:06:01:01', 'HLA-A*03:51', 'HLA-A*29:109', - 'HLA-A*02:01:130', 'HLA-B*07:271', "HLA-DRB1*13:247", - "HLA-DRB4*01:03:05", "HLA-DRB4*01:03:06", - "HLA-DQA1*05:01:02", "HLA-DRB1*13:02:02", - "HLA-DRB4*01:03:04"] - - for seqname in refdata.hlaref: - if seqname not in test_list: - continue + # @ignore_warnings + # @unittest.skipUnless(conn(), "TestBioSeqAnn 015 Requires MySQL connection") + # def test_019_skipserv(self): + # # import logging + # # logging.basicConfig(format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s', + # # datefmt='%m/%d/%Y %I:%M:%S %p', + # # level=logging.INFO) + # server = BioSeqDatabase.open_database(driver="pymysql", + # user=biosqluser, + # passwd=biosqlpass, + # host=biosqlhost, + # db=biosqldb, + # port=biosqlport) + + # seqann1 = BioSeqAnn(verbose=False, verbosity=0) + # seqann = BioSeqAnn(server=server, verbose=False, verbosity=0) + # refdata = seqann1.refdata + + # # removed 'HLA-DRB1*04:04:01' because it's + # # # too large to test with travis + # # test_list = ['HLA-C*07:241', 'HLA-A*01:07', 'HLA-A*01:01:59', + # # 'HLA-A*01:09:01:01', 'HLA-A*02:545', 'HLA-B*40:02:05', + # # 'HLA-A*29:13', 'HLA-A*24:03:02', 'HLA-A*02:544', + # # 'HLA-DQA1*04:01:01:01', 'HLA-A*01:217', 'HLA-A*01:22N', + # # 'HLA-B*51:42', 'HLA-C*03:04:05', 'HLA-A*01:01:01:04', + # # 'HLA-A*01:09:01:01', 'HLA-B*82:01', 'HLA-A*03:04:01', + # # 'HLA-C*07:06:01:01', 'HLA-A*03:51', 'HLA-A*29:109', + # # 'HLA-A*02:01:130', 'HLA-B*07:271', "HLA-DRB1*13:247", + # # "HLA-DRB4*01:03:05", "HLA-DRB4*01:03:06", + # # "HLA-DQA1*05:01:02", "HLA-DRB1*13:02:02", + # # "HLA-DRB4*01:03:04"] + # test_list = ['HLA-C*07:241', 'HLA-A*01:07', 'HLA-A*01:01:59', + # 'HLA-A*01:09:01:01', 'HLA-A*02:545', 'HLA-B*40:02:05', + # 'HLA-A*29:13', 'HLA-A*24:03:02', 'HLA-A*02:544', + # 'HLA-DQA1*04:01:01:01', 'HLA-A*01:217', 'HLA-A*01:22N', + # 'HLA-B*51:42', 'HLA-C*03:04:05', 'HLA-A*01:01:01:04', + # 'HLA-A*01:09:01:01', 'HLA-B*82:01', 'HLA-A*03:04:01', + # 'HLA-C*07:06:01:01', 'HLA-A*03:51', 'HLA-A*29:109', + # 'HLA-A*02:01:130', 'HLA-B*07:271', "HLA-DRB1*13:247", + # "HLA-DRB4*01:03:05", "HLA-DRB4*01:03:06", + # "HLA-DQA1*05:01:02", "HLA-DRB1*13:02:02", + # "HLA-DRB4*01:03:04"] + # for seqname in refdata.hlaref: + # if seqname not in test_list: + # continue - seqrec = refdata.hlaref[seqname] - locus = seqrec.description.split("*")[0] - ann1 = seqann.annotate(seqrec, locus=locus) - ann2 = seqann.annotate(seqrec, locus=locus, skip=[seqname]) - self.assertTrue(ann1.exact) - self.assertEqual(len(ann2.annotation), len(ann1.annotation)) + # print(seqname) + # seqrec = refdata.hlaref[seqname] + # locus = seqrec.description.split("*")[0] + # ann1 = seqann.annotate(seqrec, locus=locus) + # ann2 = seqann.annotate(seqrec, locus=locus, skip=[seqname]) + # self.assertTrue(ann1.exact) + # self.assertEqual(len(ann2.annotation), len(ann1.annotation)) - #for feat in ann2.structure: - # self.assertIsInstance(feat, Feature) + # #for feat in ann2.structure: + # # self.assertIsInstance(feat, Feature) - for f in ann1.annotation: - self.assertTrue(f in ann2.annotation) - seq1 = str(ann1.annotation[f]) - # seq2 = '** NA **' - # if f in ann2.annotation: - # if f not in ann2.annotation: - # print(seqname, "MISSING", f) - # else: - # if f in ann2.annotation: - seq2 = str(ann2.annotation[f].seq) - # if seq1 != seq2: - # print(seqname, "NOT EQUAL", f) - #print(f, seq1, seq2) - self.assertEqual(seq1, seq2) - #print(f,seq2) + # for f in ann1.annotation: + # self.assertTrue(f in ann2.annotation) + # seq1 = str(ann1.annotation[f]) + # # seq2 = '** NA **' + # # if f in ann2.annotation: + # # if f not in ann2.annotation: + # # print(seqname, "MISSING", f) + # # else: + # # if f in ann2.annotation: + # seq2 = str(ann2.annotation[f].seq) + # # if seq1 != seq2: + # # print(seqname, "NOT EQUAL", f) + # #print(f, seq1, seq2) + # self.assertEqual(seq1, seq2) + # #print(f,seq2) - self.assertEqual(ann1.gfe, ann2.gfe) + # self.assertEqual(ann1.gfe, ann2.gfe) - server.close() - pass + # server.close() + # pass def test_020_skip(self): # import logging # logging.basicConfig(format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s', # datefmt='%m/%d/%Y %I:%M:%S %p', # level=logging.INFO) - seqann = BioSeqAnn(verbose=False, - verbosity=3) + seqann = BioSeqAnn(verbose=False) refdata = seqann.refdata test_list = ['HLA-C*07:241', 'HLA-A*01:07', 'HLA-A*01:01:59', 'HLA-A*01:09:01:01', 'HLA-A*02:545', @@ -927,153 +937,3 @@ def test_021_stringseq(self): server.close() pass - # def test_022_dataload(self): - # # import logging - # # logging.basicConfig(format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s', - # # datefmt='%m/%d/%Y %I:%M:%S %p', - # # level=logging.INFO) - # data_dir = '/Users/mhalagan/research/NGS/GFE/gfe/seqann/models' - # featurelength_file = data_dir + "/../data/feature_lengths.csv" - # allele_list = data_dir + '/../data/allele_lists/Allelelist.3310.txt' - # seqref_pickle = data_dir + '/../data/seqref.3310.pickle' - # hlaref_pickle = data_dir + '/../data/hlaref.3310.pickle' - - # with open(seqref_pickle, 'rb') as handle: - # seqdata = pickle.load(handle) - # handle.close() - # with open(hlaref_pickle, 'rb') as handle: - # hladata = pickle.load(handle) - # handle.close() - - # hla_names = [] - # with open(allele_list, 'r') as f: - # for line in f: - # line = line.rstrip() - # accession, name = line.split(" ") - # hla_names.append("HLA-" + name) - # f.close() - - # feature_lengths = {} - # columns = ['mean', 'std', 'min', 'max'] - # with open(featurelength_file, newline='') as csvfile: - # reader = csv.DictReader(csvfile) - # for row in reader: - # ldata = [row[c] for c in columns] - # if row['locus'] in feature_lengths: - # feature_lengths[row['locus']].update({row['feature']: ldata}) - # else: - # feature_lengths.update({row['locus']: {row['feature']: ldata}}) - # csvfile.close() - - # refdata = ReferenceData(seqdata=seqdata, - # hladata=hladata, - # alleles=hla_names, - # featuredata=feature_lengths, - # verbose=True) - - # seqann = BioSeqAnn(verbose=True, verbosity=3, refdata=refdata) - # self.assertIsInstance(seqann, BioSeqAnn) - - # refdata = seqann.refdata - # test_list = ['HLA-C*07:241', 'HLA-A*01:07', 'HLA-A*01:01:59'] - - # for seqname in refdata.hlaref: - # if seqname not in test_list: - # continue - - # seqrec = refdata.hlaref[seqname] - # locus = seqrec.description.split("*")[0] - # ann1 = seqann.annotate(seqrec, locus=locus) - # ann2 = seqann.annotate(seqrec, locus=locus, skip=[seqname]) - # self.assertTrue(ann1.exact) - # self.assertEqual(len(ann2.annotation), len(ann1.annotation)) - # self.assertEqual(ann1.gfe, ann2.gfe) - # self.assertGreater(len(ann2.structure), 1) - # for feat in ann2.structure: - # self.assertIsInstance(feat, Feature) - # for f in ann1.annotation: - # self.assertTrue(f in ann2.annotation) - # seq1 = str(ann1.annotation[f]) - # seq2 = str(ann2.annotation[f].seq) - # self.assertEqual(seq1, seq2) - # pass - - # def test_023_dataload(self): - # import logging - # logging.basicConfig(format='%(asctime)s - %(name)-35s - %(levelname)-5s - %(funcName)s %(lineno)d: - %(message)s', - # datefmt='%m/%d/%Y %I:%M:%S %p', - # level=logging.INFO) - # seqann = BioSeqAnn(verbose=True, verbosity=5) - # input_seq = self.data_dir + '/insertion_seqs.fasta' - # in_seqrec = list(SeqIO.parse(input_seq, "fasta"))[4] - # ann_seq = seqann.annotate(in_seqrec, "HLA-DRB1") - # print(ann_seq) - # self.assertEqual(1, 1) - - # TODO: Tests to add: - # - test safemode - # - test - - # def test_022_diffdbs(self): - - # # diff_d = { - # # 3250: ['HLA-DQB1*06:37'], - # # 3260: ['HLA-DQB1*03:01:17', 'HLA-DQB1*03:01:22', 'HLA-DQB1*03:10:02'] - # # } - # diff_d = { - # 3250: ['HLA-DQB1*06:37'], - # 3260: ['HLA-DQB1*03:01:17', 'HLA-DQB1*03:01:22', 'HLA-DQB1*03:10:02', - # 'HLA-DQB1*03:150', 'HLA-DQB1*03:211', 'HLA-DQB1*04:01:01', 'HLA-DQB1*04:02:01', 'HLA-DQB1*04:11', - # 'HLA-DQB1*04:32', 'HLA-DQB1*05:02:01', 'HLA-DQB1*05:02:07', 'HLA-DQB1*05:02:11', 'HLA-DQB1*05:106', - # 'HLA-DQB1*05:52', 'HLA-DQB1*05:57', 'HLA-DQB1*05:97', 'HLA-DQB1*06:02:01:02', 'HLA-DQB1*06:02:22', - # 'HLA-DQB1*06:03:12', 'HLA-DQB1*06:03:14', 'HLA-DQB1*06:79:01'], - # 3270: ['HLA-DQB1*02:84', 'HLA-DQB1*03:239', 'HLA-DQB1*05:04', 'HLA-DQB1*05:132Q', 'HLA-DQB1*05:134', - # 'HLA-DQB1*06:02:17', 'HLA-DQB1*06:10'], - # 3290: ['HLA-DQB1*05:03:01:03', 'HLA-DQB1*05:149'], - # 3300: ['HLA-DQB1*02:02:01:01', 'HLA-DQB1*02:02:01:03', 'HLA-DQB1*03:01:01:19', 'HLA-DQB1*03:02:01:04', - # 'HLA-DQB1*03:02:01:05', 'HLA-DQB1*03:02:01:06', 'HLA-DQB1*03:03:02:04', 'HLA-DQB1*03:10:02:01', - # 'HLA-DQB1*03:10:02:02', 'HLA-DQB1*03:12', 'HLA-DQB1*04:01:01:02', 'HLA-DQB1*05:12', 'HLA-DQB1*06:02:01:04', - # 'HLA-DQB1*06:09:07', 'HLA-DQB1*06:206:02'], - # 3310: ['HLA-DQB1*06:01:01:02', 'HLA-DQB1*06:243'] - # } - # for db in diff_d.keys(): - # prev_db = db - 10 - # seqann1 = BioSeqAnn(dbversion=str(db)) - # seqann2 = BioSeqAnn(dbversion=str(prev_db)) - # alleles = diff_d[db] - # for seqname in alleles: - # print("Running", str(db), seqname) - # seqrec = seqann1.refdata.hlaref[seqname] - # locus = seqrec.description.split("*")[0] - # ann1 = seqann1.annotate(seqrec, locus=locus) - # ann2 = seqann2.annotate(seqrec, locus=locus) - # if ann1.gfe != ann2.gfe: - # # self.assertTrue(ann1.exact) - # # self.assertEqual(len(ann2.annotation), len(ann1.annotation)) - # # self.assertEqual(ann1.gfe, ann2.gfe) - # # self.assertGreater(len(ann2.structure), 1) - # # for feat in ann2.structure: - # # self.assertIsInstance(feat, Feature) - # features = list(set(list(ann1.annotation.keys()) + list(ann1.annotation.keys()))) - # for f in features: - # seq1 = "** NA 1 **" - # seq2 = "** NA 2 **" - # if f in ann1.annotation: - # seq1 = str(ann1.annotation[f]) - - # if ann2.exact and f in ann2.annotation: - # seq2 = str(ann2.annotation[f]) - # else: - # if f in ann2.annotation: - # seq2 = str(ann2.annotation[f].seq) - - # if seq1 == seq2: - # print(seqname, f, 'EQUAL', len(seq1), len(seq2), ann1.gfe, ann2.gfe) - # else: - # print(seqname, f, 'DIFFER', len(seq1), len(seq2), ann1.gfe, ann2.gfe, seq1, seq2) - # # self.assertTrue(f in ann2.annotation) - # # seq1 = - # # seq2 = str(ann2.annotation[f].seq) - # # self.assertEqual(seq1, seq2) - # pass -