diff --git a/.travis.yml b/.travis.yml index 2d0e18a..23e1bd5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -61,7 +61,7 @@ install: - source activate test-environment # install pysam from conda because I'm having trouble installing Cython # for Python 3 on Travis - - conda install -c bioconda pysam=0.9.0 + - conda install -c bioconda pysam - pip install -r requirements.txt - pip install . - pip install coveralls diff --git a/requirements.txt b/requirements.txt index 6636654..33a49eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ numpy>=1.14.0 pandas pyensembl>=1.5.0 varcode>=0.5.9 -isovar>=0.8.5,<1.0.0 +isovar>=1.1.0 mhctools>=1.5.0 roman jinja2 diff --git a/setup.py b/setup.py index e3ddee7..3e3afa4 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,7 @@ 'pandas', 'pyensembl>=1.5.0', 'varcode>=0.5.9', - 'isovar>=0.8.5,<1.0.0', + 'isovar>=1.1.0', 'mhctools>=1.5.0', 'roman', 'jinja2', diff --git a/test/test_mutant_protein_sequence.py b/test/test_mutant_protein_sequence.py index 989b1dc..619bff6 100644 --- a/test/test_mutant_protein_sequence.py +++ b/test/test_mutant_protein_sequence.py @@ -13,11 +13,9 @@ from __future__ import absolute_import, print_function, division from nose.tools import eq_, assert_almost_equal -from vaxrank.core_logic import run_vaxrank +from vaxrank.cli import make_vaxrank_arg_parser, run_vaxrank_from_parsed_args +from vaxrank.mutant_protein_fragment import MutantProteinFragment from mhctools import RandomBindingPredictor -from isovar.cli.variant_sequences_args import make_variant_sequences_arg_parser -from isovar.cli.rna_args import allele_reads_generator_from_args -from varcode.cli import variant_collection_from_args from .testing_helpers import data_path @@ -49,25 +47,19 @@ def test_mutant_amino_acids_in_mm10_chrX_8125624_refC_altA_pS460I(): # there are two co-occurring variants in the RNAseq data but since # they don't happen in the same codon then we're considering the Varcode # annotation to be correct - # TODO: deal with phasing of variants explicitly so that both - # variant positions are considered mutated - arg_parser = make_variant_sequences_arg_parser() + # TODO: + # deal with phasing of variants explicitly so that both + # variant positions are considered mutated + arg_parser = make_vaxrank_arg_parser() args = arg_parser.parse_args([ "--vcf", data_path("b16.f10/b16.f10.Wdr13.vcf"), "--bam", data_path("b16.f10/b16.combined.sorted.bam"), + "--vaccine-peptide-length", "15", + "--padding-around-mutation", "5", + "--mhc-predictor", "random", + "--mhc-alleles", "HLA-A*02:01", ]) - reads_generator = allele_reads_generator_from_args(args) - variants = variant_collection_from_args(args) - results = run_vaxrank( - variants=variants, - reads_generator=reads_generator, - mhc_predictor=random_binding_predictor, - vaccine_peptide_length=15, - padding_around_mutation=5, - max_vaccine_peptides_per_variant=1, - min_alt_rna_reads=1, - min_variant_sequence_coverage=1, - variant_sequence_assembly=True) + results = run_vaxrank_from_parsed_args(args) ranked_list = results.ranked_vaccine_peptides for variant, vaccine_peptides in ranked_list: @@ -85,23 +77,16 @@ def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H(): # mentions the G>T variant but doesn't include the subsequent nucleotide # change T>G. To avoid having to think about phasing of variants I changed # the VCF in vaxrank to contain a GT>TG variant. - arg_parser = make_variant_sequences_arg_parser() + arg_parser = make_vaxrank_arg_parser() args = arg_parser.parse_args([ "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"), "--bam", data_path("b16.f10/b16.combined.sorted.bam"), + "--vaccine-peptide-length", "15", + "--padding-around-mutation", "5", + "--mhc-predictor", "random", + "--mhc-alleles", "HLA-A*02:01", ]) - reads_generator = allele_reads_generator_from_args(args) - variants = variant_collection_from_args(args) - results = run_vaxrank( - reads_generator=reads_generator, - mhc_predictor=random_binding_predictor, - variants=variants, - vaccine_peptide_length=15, - padding_around_mutation=5, - min_alt_rna_reads=1, - min_variant_sequence_coverage=1, - variant_sequence_assembly=True, - max_vaccine_peptides_per_variant=1) + results = run_vaxrank_from_parsed_args(args) ranked_list = results.ranked_vaccine_peptides for variant, vaccine_peptides in ranked_list: @@ -112,25 +97,19 @@ def test_mutant_amino_acids_in_mm10_chr9_82927102_refGT_altTG_pT441H(): mutant_protein_fragment) def test_keep_top_k_epitopes(): - arg_parser = make_variant_sequences_arg_parser() + arg_parser = make_vaxrank_arg_parser() + keep_k_epitopes = 3 args = arg_parser.parse_args([ "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"), "--bam", data_path("b16.f10/b16.combined.sorted.bam"), + "--vaccine-peptide-length", "15", + "--padding-around-mutation", "5", + "--num-epitopes-per-vaccine-peptide", str(keep_k_epitopes), + "--mhc-predictor", "netmhc", + "--mhc-alleles", "HLA-A*02:01", ]) - reads_generator = allele_reads_generator_from_args(args) - variants = variant_collection_from_args(args) - keep_k_epitopes = 3 - results = run_vaxrank( - reads_generator=reads_generator, - mhc_predictor=random_binding_predictor, - variants=variants, - vaccine_peptide_length=15, - padding_around_mutation=5, - min_alt_rna_reads=1, - min_variant_sequence_coverage=1, - variant_sequence_assembly=True, - max_vaccine_peptides_per_variant=1, - num_mutant_epitopes_to_keep=keep_k_epitopes) + results = run_vaxrank_from_parsed_args(args) + ranked_list = results.ranked_vaccine_peptides for variant, vaccine_peptides in ranked_list: @@ -141,3 +120,25 @@ def test_keep_top_k_epitopes(): mutant_epitope_score = sum( p.logistic_epitope_score() for p in vaccine_peptide.mutant_epitope_predictions) assert_almost_equal(mutant_epitope_score, vaccine_peptide.mutant_epitope_score) + +def test_mutant_protein_fragment_serialization(): + arg_parser = make_vaxrank_arg_parser() + keep_k_epitopes = 3 + args = arg_parser.parse_args([ + "--vcf", data_path("b16.f10/b16.f10.Phip.vcf"), + "--bam", data_path("b16.f10/b16.combined.sorted.bam"), + "--vaccine-peptide-length", "15", + "--padding-around-mutation", "5", + "--num-epitopes-per-vaccine-peptide", str(keep_k_epitopes), + "--mhc-predictor", "netmhc", + "--mhc-alleles", "HLA-A*02:01", + ]) + results = run_vaxrank_from_parsed_args(args) + + ranked_list = results.ranked_vaccine_peptides + + for _, vaccine_peptides in ranked_list: + mutant_protein_fragment = vaccine_peptides[0].mutant_protein_fragment + json_str = mutant_protein_fragment.to_json() + deserialized = MutantProteinFragment.from_json(json_str) + eq_(mutant_protein_fragment, deserialized) diff --git a/test/test_shell_script.py b/test/test_shell_script.py index 8f67c9b..8955228 100644 --- a/test/test_shell_script.py +++ b/test/test_shell_script.py @@ -30,7 +30,7 @@ "--mhc-predictor", "random", "--mhc-alleles", "H2-Kb,H2-Db", "--padding-around-mutation", "5", - "--include-mismatches-after-variant" + "--count-mismatches-after-variant", ] cli_args_for_b16_seqdata_real_predictor = [ @@ -41,7 +41,7 @@ "--mhc-alleles", "H2-Kb,H2-Db", "--mhc-epitope-lengths", "8", "--padding-around-mutation", "5", - "--include-mismatches-after-variant" + "--count-mismatches-after-variant" ] diff --git a/vaxrank/__init__.py b/vaxrank/__init__.py index c68196d..67bc602 100644 --- a/vaxrank/__init__.py +++ b/vaxrank/__init__.py @@ -1 +1 @@ -__version__ = "1.2.0" +__version__ = "1.3.0" diff --git a/vaxrank/cli.py b/vaxrank/cli.py index 8594d1c..bf09a2d 100644 --- a/vaxrank/cli.py +++ b/vaxrank/cli.py @@ -17,9 +17,8 @@ import pkg_resources from argparse import ArgumentParser -from isovar.cli.rna_args import allele_reads_generator_from_args -from isovar.cli.translation_args import add_translation_args -from isovar.cli.variant_sequences_args import make_variant_sequences_arg_parser + +from isovar.cli import (make_isovar_arg_parser, run_isovar_from_parsed_args,) from mhctools.cli import ( add_mhc_args, mhc_alleles_from_args, @@ -46,16 +45,19 @@ logger = logging.getLogger(__name__) -def new_run_arg_parser(): +def make_vaxrank_arg_parser(): + # create common parser with the --version flag + parent_parser = ArgumentParser('parent', add_help=False) + parent_parser.add_argument('--version', action='version', version='Vaxrank %s' % (__version__,)) + # inherit commandline options from Isovar - arg_parser = make_variant_sequences_arg_parser( + arg_parser = make_isovar_arg_parser( prog="vaxrank", description=( "Select personalized vaccine peptides from cancer variants, " "expression data, and patient HLA type."), + parents=[parent_parser], ) - add_version_args(arg_parser) - add_translation_args(arg_parser) add_mhc_args(arg_parser) add_vaccine_peptide_args(arg_parser) add_output_args(arg_parser) @@ -71,7 +73,6 @@ def cached_run_arg_parser(): "Select personalized vaccine peptides from cancer variants, " "expression data, and patient HLA type."), ) - add_version_args(arg_parser) arg_parser.add_argument( "--input-json-file", default="", @@ -82,13 +83,6 @@ def cached_run_arg_parser(): return arg_parser -def add_version_args(parser): - parser.add_argument( - "--version", - help="Print Vaxrank version and immediately exit", - default=False, - action="store_true") - # Lets the user specify whether they want to see particular sections in the report. def add_optional_output_args(arg_parser): @@ -97,6 +91,7 @@ def add_optional_output_args(arg_parser): "--include-manufacturability-in-report", dest="manufacturability", action="store_true") + manufacturability_args.add_argument( "--no-manufacturability-in-report", dest="manufacturability", @@ -165,12 +160,6 @@ def add_output_args(arg_parser): help="Path to XLSX neoepitope report, containing information focusing on short peptide " "sequences.") - output_args_group.add_argument( - "--num-epitopes-per-peptide", - type=int, - help="Number of top-ranking epitopes for each vaccine peptide to include in the " - "neoepitope report.") - output_args_group.add_argument( "--output-reviewed-by", default="", @@ -188,6 +177,7 @@ def add_output_args(arg_parser): output_args_group.add_argument( "--max-mutations-in-report", + default=None, type=int, help="Number of mutations to report") @@ -204,22 +194,25 @@ def add_vaccine_peptide_args(arg_parser): "--vaccine-peptide-length", default=25, type=int, - help="Number of amino acids in the vaccine peptides (default %(default)s)") + help="Number of amino acids in the vaccine peptides. (default: %(default)s)") vaccine_peptide_group.add_argument( "--padding-around-mutation", - default=0, + default=5, type=int, help=( "Number of off-center windows around the mutation to consider " - "as vaccine peptides (default %(default)s)" + "as vaccine peptides. (default: %(default)s)" )) vaccine_peptide_group.add_argument( "--max-vaccine-peptides-per-mutation", default=1, type=int, - help="Number of vaccine peptides to generate for each mutation") + help=( + "Number of vaccine peptides to generate for each mutation. " + "(default: %(default)s)" + )) vaccine_peptide_group.add_argument( "--min-epitope-score", @@ -227,7 +220,14 @@ def add_vaccine_peptide_args(arg_parser): type=float, help=( "Ignore predicted MHC ligands whose normalized binding score " - "falls below this threshold")) + "falls below this threshold. (default: %(default)s)")) + + vaccine_peptide_group.add_argument( + "--num-epitopes-per-vaccine-peptide", + type=int, + help=( + "Maximum number of mutant epitopes to consider when scoring " + "each vaccine peptide. (default: %(default)s)")) def add_supplemental_report_args(arg_parser): @@ -257,8 +257,25 @@ def check_args(args): "--output-json-file, " "--output-passing-variants-csv") +def run_vaxrank_from_parsed_args(args): + + mhc_predictor = mhc_binding_predictor_from_args(args) -def ranked_variant_list_with_metadata(args): + args.protein_sequence_length = ( + args.vaccine_peptide_length + 2 * args.padding_around_mutation + ) + # Vaxrank is going to evaluate multiple vaccine peptides containing + # the same mutation so need a longer sequence from Isovar + isovar_results = run_isovar_from_parsed_args(args) + return run_vaxrank( + isovar_results=isovar_results, + mhc_predictor=mhc_predictor, + vaccine_peptide_length=args.vaccine_peptide_length, + max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation, + min_epitope_score=args.min_epitope_score, + num_mutant_epitopes_to_keep=args.num_epitopes_per_vaccine_peptide) + +def ranked_vaccine_peptides_with_metadata_from_parsed_args(args): """ Computes all the data needed for report generation. @@ -272,8 +289,10 @@ def ranked_variant_list_with_metadata(args): - a dictionary of command-line arguments used to generate it - patient info object """ + if hasattr(args, 'input_json_file'): with open(args.input_json_file) as f: + data = serializable.from_json(f.read()) # the JSON data from the previous run will have the older args saved, which may need to # be overridden with args from this run (which all be output related) @@ -283,29 +302,14 @@ def ranked_variant_list_with_metadata(args): if len(data['variants']) > args.max_mutations_in_report: data['variants'] = data['variants'][:args.max_mutations_in_report] return data - # get various things from user args mhc_alleles = mhc_alleles_from_args(args) logger.info("MHC alleles: %s", mhc_alleles) + variants = variant_collection_from_args(args) logger.info("Variants: %s", variants) - # generator that for each variant gathers all RNA reads, both those - # supporting the variant and reference alleles - reads_generator = allele_reads_generator_from_args(args) - mhc_predictor = mhc_binding_predictor_from_args(args) - vaxrank_results = run_vaxrank( - variants=variants, - reads_generator=reads_generator, - mhc_predictor=mhc_predictor, - vaccine_peptide_length=args.vaccine_peptide_length, - padding_around_mutation=args.padding_around_mutation, - max_vaccine_peptides_per_variant=args.max_vaccine_peptides_per_mutation, - min_alt_rna_reads=args.min_alt_rna_reads, - min_variant_sequence_coverage=args.min_variant_sequence_coverage, - min_epitope_score=args.min_epitope_score, - num_mutant_epitopes_to_keep=args.num_epitopes_per_peptide, - variant_sequence_assembly=args.variant_sequence_assembly) + vaxrank_results = run_vaxrank_from_parsed_args(args) variants_count_dict = vaxrank_results.variant_counts() assert len(variants) == variants_count_dict['num_total_variants'], \ @@ -318,8 +322,9 @@ def ranked_variant_list_with_metadata(args): df = pd.DataFrame(variant_metadata_dicts) df.to_csv(args.output_passing_variants_csv, index=False) - ranked_list = vaxrank_results.ranked_vaccine_peptides - ranked_list_for_report = ranked_list[:args.max_mutations_in_report] + ranked_variants_with_vaccine_peptides = vaxrank_results.ranked_vaccine_peptides + ranked_variants_with_vaccine_peptides_for_report = \ + ranked_variants_with_vaccine_peptides[:args.max_mutations_in_report] patient_info = PatientInfo( patient_id=args.output_patient_id, vcf_paths=variants.sources, @@ -332,7 +337,10 @@ def ranked_variant_list_with_metadata(args): ) # return variants, patient info, and command-line args data = { - 'variants': ranked_list_for_report, + # TODO: + # change this field to 'ranked_variants_with_vaccine_peptides' but figure out + # how to do it in a backwards compatible way + 'variants': ranked_variants_with_vaccine_peptides_for_report, 'patient_info': patient_info, 'args': vars(args), } @@ -348,6 +356,23 @@ def ranked_variant_list_with_metadata(args): return data +def configure_logging(args): + logging.config.fileConfig( + pkg_resources.resource_filename( + __name__, + 'logging.conf'), + defaults={'logfilename': args.log_path}) + +def choose_arg_parser(args_list): + # TODO: replace this with a command sub-parser + if "--input-json-file" in args_list: + return cached_run_arg_parser() + else: + return make_vaxrank_arg_parser() + +def parse_vaxrank_args(args_list): + arg_parser = choose_arg_parser(args_list) + return arg_parser.parse_args(args_list) def main(args_list=None): """ @@ -364,27 +389,14 @@ def main(args_list=None): if args_list is None: args_list = sys.argv[1:] - if "--version" in args_list: - print("Vaxrank version: %s" % __version__) - return - - if "--input-json-file" in args_list: - arg_parser = cached_run_arg_parser() - else: - arg_parser = new_run_arg_parser() - - args = arg_parser.parse_args(args_list) - logging.config.fileConfig( - pkg_resources.resource_filename( - __name__, - 'logging.conf'), - defaults={'logfilename': args.log_path}) - + args = parse_vaxrank_args(args_list) + configure_logging(args) logger.info(args) check_args(args) - data = ranked_variant_list_with_metadata(args) - ranked_variant_list = data['variants'] + data = ranked_vaccine_peptides_with_metadata_from_parsed_args(args) + + ranked_variants_with_vaccine_peptides = data['variants'] patient_info = data['patient_info'] args_for_report = data['args'] @@ -393,14 +405,14 @@ def main(args_list=None): ################### if args.output_csv or args.output_xlsx_report: make_csv_report( - ranked_variant_list, + ranked_variants_with_vaccine_peptides, excel_report_path=args.output_xlsx_report, csv_report_path=args.output_csv) if args.output_neoepitope_report: make_minimal_neoepitope_report( - ranked_variant_list, - num_epitopes_per_peptide=args.num_epitopes_per_peptide, + ranked_variants_with_vaccine_peptides, + num_epitopes_per_peptide=args.num_epitopes_per_vaccine_peptide, excel_report_path=args.output_neoepitope_report) ######################## @@ -412,7 +424,7 @@ def main(args_list=None): input_json_file = args.input_json_file if hasattr(args, 'input_json_file') else None template_data_creator = TemplateDataCreator( - ranked_variants_with_vaccine_peptides=ranked_variant_list, + ranked_variants_with_vaccine_peptides=ranked_variants_with_vaccine_peptides, patient_info=patient_info, final_review=args.output_final_review, reviewers=args.output_reviewed_by, diff --git a/vaxrank/core_logic.py b/vaxrank/core_logic.py index 85d2f00..5738a2d 100644 --- a/vaxrank/core_logic.py +++ b/vaxrank/core_logic.py @@ -20,31 +20,22 @@ from .epitope_prediction import predict_epitopes, slice_epitope_predictions from .vaccine_peptide import VaccinePeptide from .vaxrank_results import VaxrankResults -from .protein_sequences import create_variant_to_protein_sequence_dict logger = logging.getLogger(__name__) def run_vaxrank( - variants, - reads_generator, + isovar_results, mhc_predictor, vaccine_peptide_length=25, - padding_around_mutation=5, max_vaccine_peptides_per_variant=1, - min_alt_rna_reads=1, - min_variant_sequence_coverage=1, - variant_sequence_assembly=True, num_mutant_epitopes_to_keep=10000, min_epitope_score=0.0): """ Parameters ---------- - variants : VariantCollection - Variant objects to evaluate for vaccine inclusion - - reads_generator : generator - Yields sequence of varcode.Variant objects paired with sequences of - AlleleRead objects that support that variant. + isovar_results : list of isovar.IsovarResult + Each IsovarResult corresponds to one somatic variant and its collection + of protein sequences determined from RNA. mhc_predictor : mhctools.BasePredictor Object with predict_peptides method, used for making pMHC binding @@ -53,26 +44,9 @@ def run_vaxrank( vaccine_peptide_length : int Length of vaccine SLP to construct - padding_around_mutation : int - Number of off-center windows around the mutation to consider as vaccine - peptides. - max_vaccine_peptides_per_variant : int Number of vaccine peptides to generate for each mutation. - min_alt_rna_reads : int - Drop variant sequences at loci with fewer than this number of reads - supporting the alt allele. - - min_variant_sequence_coverage : int - Trim variant sequences to positions supported by at least this number - of RNA reads. - - variant_sequence_assembly : bool - If True, then assemble variant cDNA sequences based on overlap of RNA - reads. If False, then variant cDNA sequences must be fully spanned and - contained within RNA reads. - num_mutant_epitopes_to_keep : int, optional Number of top-ranking epitopes for each vaccine peptide to include in computation. @@ -81,45 +55,33 @@ def run_vaxrank( Ignore peptides with binding predictions whose normalized score is less than this. """ - # total number of amino acids is the vaccine peptide length plus the - # number of off-center windows around the mutation - protein_fragment_sequence_length = ( - vaccine_peptide_length + 2 * padding_around_mutation) - variant_to_protein_sequences_dict = create_variant_to_protein_sequence_dict( - reads_generator=reads_generator, - sequence_length=protein_fragment_sequence_length, - min_alt_rna_reads=min_alt_rna_reads, - min_variant_sequence_coverage=min_variant_sequence_coverage, - variant_sequence_assembly=variant_sequence_assembly) variant_to_vaccine_peptides_dict = create_vaccine_peptides_dict( - protein_sequence_dict=variant_to_protein_sequences_dict, + isovar_results=isovar_results, mhc_predictor=mhc_predictor, vaccine_peptide_length=vaccine_peptide_length, - padding_around_mutation=padding_around_mutation, max_vaccine_peptides_per_variant=max_vaccine_peptides_per_variant, num_mutant_epitopes_to_keep=num_mutant_epitopes_to_keep, min_epitope_score=min_epitope_score) ranked_list = ranked_vaccine_peptides(variant_to_vaccine_peptides_dict) + return VaxrankResults( - variants=variants, - variant_to_protein_sequences_dict=variant_to_protein_sequences_dict, + isovar_results=isovar_results, variant_to_vaccine_peptides_dict=variant_to_vaccine_peptides_dict, ranked_vaccine_peptides=ranked_list) def create_vaccine_peptides_dict( - protein_sequence_dict, + isovar_results, mhc_predictor, vaccine_peptide_length=25, - padding_around_mutation=10, max_vaccine_peptides_per_variant=1, num_mutant_epitopes_to_keep=10 ** 5, min_epitope_score=0.0): """ Parameters ---------- - protein_sequence_dict : dict - Dictionary from varcode.Variant to isovar ProteinSequence + isovar_results : list of isovar.IsovarResult + List with one object per variant optionally containing protein sequences mhc_predictor : mhctools.BasePredictor Object with predict_peptides method, used for making pMHC binding @@ -128,10 +90,6 @@ def create_vaccine_peptides_dict( vaccine_peptide_length : int Length of vaccine SLP to construct - padding_around_mutation : int - Number of off-center windows around the mutation to consider as vaccine - peptides. - max_vaccine_peptides_per_variant : int Number of vaccine peptides to generate for each mutation. @@ -149,13 +107,12 @@ def create_vaccine_peptides_dict( VaccinePeptides. """ vaccine_peptides_dict = {} - for variant, isovar_protein_sequence in protein_sequence_dict.items(): + for isovar_result in isovar_results: + variant = isovar_result.variant vaccine_peptides = vaccine_peptides_for_variant( - variant=variant, - isovar_protein_sequence=isovar_protein_sequence, + isovar_result=isovar_result, mhc_predictor=mhc_predictor, vaccine_peptide_length=vaccine_peptide_length, - padding_around_mutation=padding_around_mutation, max_vaccine_peptides_per_variant=max_vaccine_peptides_per_variant, num_mutant_epitopes_to_keep=num_mutant_epitopes_to_keep, min_epitope_score=min_epitope_score) @@ -166,20 +123,16 @@ def create_vaccine_peptides_dict( return vaccine_peptides_dict def vaccine_peptides_for_variant( - variant, - isovar_protein_sequence, + isovar_result, mhc_predictor, vaccine_peptide_length, - padding_around_mutation, max_vaccine_peptides_per_variant, num_mutant_epitopes_to_keep=None, min_epitope_score=0.0): """ Parameters ---------- - variant : varcode.Variant - - isovar_protein_sequence : isovar. ProteinSequence + isovar_result : isovar.IsovarResult mhc_predictor : mhctools.BasePredictor Object with predict_peptides method, used for making pMHC binding @@ -188,10 +141,6 @@ def vaccine_peptides_for_variant( vaccine_peptide_length : int Length of vaccine SLP to construct - padding_around_mutation : int - Number of off-center windows around the mutation to consider as vaccine - peptides. - max_vaccine_peptides_per_variant : int Number of vaccine peptides to generate for each mutation. @@ -207,30 +156,30 @@ def vaccine_peptides_for_variant( ------- Sorted list of VaccinePeptide objects. If there are no suitable vaccine peptides (no strong MHC binder subsequences), returns an empty list. - - At this point, we know the variant has RNA support, as per the - isovar_protein_sequence. """ - protein_fragment = MutantProteinFragment.from_isovar_protein_sequence( - variant=variant, - protein_sequence=isovar_protein_sequence) + if not isovar_result.passes_all_filters: + # don't consider candidate vaccine peptides from variants which either + # failed their filters or don't have an RNA-derived protein sequence + return [] + + variant = isovar_result.variant + long_protein_fragment = MutantProteinFragment.from_isovar_result(isovar_result) logger.info( "Mutant protein fragment for %s: %s", variant, - protein_fragment) + long_protein_fragment) epitope_predictions = predict_epitopes( mhc_predictor=mhc_predictor, - protein_fragment=protein_fragment, + protein_fragment=long_protein_fragment, min_epitope_score=min_epitope_score, genome=variant.ensembl).values() candidate_vaccine_peptides = [] - for offset, candidate_fragment in protein_fragment.top_k_subsequences( - subsequence_length=vaccine_peptide_length, - k=2 * padding_around_mutation + 1): + for offset, candidate_fragment in long_protein_fragment.sorted_subsequences( + subsequence_length=vaccine_peptide_length): subsequence_epitope_predictions = slice_epitope_predictions( epitope_predictions, diff --git a/vaxrank/mutant_protein_fragment.py b/vaxrank/mutant_protein_fragment.py index 0b90c4b..4c85bbb 100644 --- a/vaxrank/mutant_protein_fragment.py +++ b/vaxrank/mutant_protein_fragment.py @@ -84,23 +84,34 @@ def __init__( n_alt_reads_supporting_protein_sequence @classmethod - def from_isovar_protein_sequence(cls, variant, protein_sequence): + def from_isovar_result(cls, isovar_result): + """ + Create a MutantProteinFragment from an isovar.IsovarResult object + + Parameters + ---------- + isovar_result : isovar.IsovarResult + + Returns + ------- + MutantProteinFragment + """ + protein_sequence = isovar_result.top_protein_sequence + if protein_sequence is None: + return None return cls( - variant=variant, - gene_name=";".join(protein_sequence.gene), + variant=isovar_result.variant, + gene_name=protein_sequence.gene_name, amino_acids=protein_sequence.amino_acids, - mutant_amino_acid_start_offset=protein_sequence.variant_aa_interval_start, - mutant_amino_acid_end_offset=protein_sequence.variant_aa_interval_end, - n_overlapping_reads=len(protein_sequence.overlapping_reads), - n_alt_reads=len(protein_sequence.alt_reads), - n_ref_reads=len(protein_sequence.ref_reads), - n_alt_reads_supporting_protein_sequence=len( - protein_sequence.alt_reads_supporting_protein_sequence), - supporting_reference_transcripts=[ - variant.ensembl.transcript_by_id(transcript_id) - for transcript_id in - protein_sequence.transcripts_supporting_protein_sequence - ]) + mutant_amino_acid_start_offset=protein_sequence.mutation_start_idx, + mutant_amino_acid_end_offset=protein_sequence.mutation_end_idx, + + # TODO: distinguish reads and fragments in Vaxrank? + n_overlapping_reads=isovar_result.num_total_fragments, + n_alt_reads=isovar_result.num_alt_fragments, + n_ref_reads=isovar_result.num_ref_fragments, + n_alt_reads_supporting_protein_sequence=protein_sequence.num_supporting_fragments, + supporting_reference_transcripts=protein_sequence.transcripts) def __len__(self): return len(self.amino_acids) @@ -174,22 +185,24 @@ def generate_subsequences(self, subsequence_length): supporting_reference_transcripts=self.supporting_reference_transcripts) yield subsequence_start_offset, subsequence_mutant_protein_fragment - def top_k_subsequences( + def sorted_subsequences( self, subsequence_length, - k, + limit=None, sort_key=lambda x: ( -x[1].mutation_distance_from_edge, -x[1].n_mutant_amino_acids)): """ - Returns k subsequences, paired with their offset from the start of the + Returns subsequences, paired with their offset from the start of the protein fragment. The default sort criterion is maximizing the mutation distance from the edge of the sequence and secondarily maximizing the number of mutant amino acids. """ subsequences = list(self.generate_subsequences(subsequence_length)) subsequences.sort(key=sort_key) - return subsequences[:k] + if limit: + subsequences = subsequences[:limit] + return subsequences def predicted_effect(self): effects = [ diff --git a/vaxrank/protein_sequences.py b/vaxrank/protein_sequences.py deleted file mode 100644 index ff3117f..0000000 --- a/vaxrank/protein_sequences.py +++ /dev/null @@ -1,92 +0,0 @@ -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import, print_function, division -import logging - -from isovar.protein_sequences import ( - reads_generator_to_protein_sequences_generator, -) - -logger = logging.getLogger(__name__) - -def create_variant_to_protein_sequence_dict( - reads_generator, - sequence_length, - min_alt_rna_reads, - min_variant_sequence_coverage, - variant_sequence_assembly=True): - """ - This function computes a dictionary of Variant objects to a - single isovar protein sequence that will be used to try to construct - VaccinePeptides. - - Parameters - ---------- - reads_generator : generator - Yields sequence of varcode.Variant objects paired with sequences of - AlleleRead objects that support that variant. - - sequence_length : int - Number of amino acids including mutant residues and padding on either - side. - - min_alt_rna_reads : int - Drop variant sequences at loci with fewer than this number of reads - supporting the alt allele. - - min_variant_sequence_coverage : int - Trim variant sequences to positions supported by at least this number - of RNA reads. - - variant_sequence_assembly : bool - If True, then assemble variant cDNA sequences based on overlap of RNA - reads. If False, then variant cDNA sequences must be fully spanned and - contained within RNA reads. - """ - - variant_to_protein_sequence_dict = {} - - # These sequences are only the ones that overlap the variant and support - # the mutation. - # Right now, this generator yields: - # - (variant, mutant protein sequences) if there's enough alt RNA support - # - # - (variant, None) if the variant is silent or there are ref reads overlapping the - # variant locus but inadequate alt RNA support. - # - does not return the variant if there's no RNA support for ref or alt, - # - # we may miss some coding variants this way unless we check for them - # explicitly - # - # Future intended behavior: returns all passing variants, with a protein - # sequences generator that is non empty if there are enough alt RNA reads - # supporting the variant - protein_sequences_generator = reads_generator_to_protein_sequences_generator( - reads_generator, - transcript_id_whitelist=None, - protein_sequence_length=sequence_length, - min_alt_rna_reads=min_alt_rna_reads, - min_variant_sequence_coverage=min_variant_sequence_coverage, - variant_sequence_assembly=variant_sequence_assembly, - max_protein_sequences_per_variant=1) - - for variant, isovar_protein_sequences in protein_sequences_generator: - if len(isovar_protein_sequences) == 0: - # variant RNA support is below threshold - logger.info("No protein sequences for %s", variant) - continue - - # use the first protein sequence - why? - variant_to_protein_sequence_dict[variant] = \ - isovar_protein_sequences[0] - return variant_to_protein_sequence_dict diff --git a/vaxrank/vaxrank_results.py b/vaxrank/vaxrank_results.py index 96c02b6..7f4f2a6 100644 --- a/vaxrank/vaxrank_results.py +++ b/vaxrank/vaxrank_results.py @@ -20,55 +20,70 @@ class VaxrankResults(Serializable): """ Data class used to represent all results captured by running Vaxrank. """ - def __init__( self, - variants, - variant_to_protein_sequences_dict, + isovar_results, variant_to_vaccine_peptides_dict, ranked_vaccine_peptides): """ Parameters ---------- - variants : varcode.VariantCollection - All variants without any filtering - - variant_to_protein_sequences_dict : dict - Dictionary mapping each variant to a list of candidate - protein sequences + isovar_results : list of isovar.IsovarResult + IsovarResult object for each variant without any filtering variant_to_vaccine_peptides_dict : dict Dictionary mapping variant to a list of possible vaccine peptides ranked_vaccine_peptides : list of VaccinePeptide """ - self.variants = variants - self.variant_to_protein_sequences_dict = variant_to_protein_sequences_dict + self.isovar_results = isovar_results self.variant_to_vaccine_peptides_dict = variant_to_vaccine_peptides_dict self.ranked_vaccine_peptides = ranked_vaccine_peptides + + @property + def variants(self): + """ + Unfiltered list of variants + + Returns + ------- + list of varcode.Variant + """ + return [isovar_result.variant for isovar_result in self.isovar_results] + + + @property + def variant_to_protein_sequences_dict(self): + # TODO: find out if we can safely get rid of this property + return { + isovar_result.variant: isovar_result.sorted_protein_sequences[0] + for isovar_result in self.isovar_results + if isovar_result.passes_all_filters + and len(isovar_result.sorted_protein_sequences) > 0 + } + def variant_counts(self): """ Summarize Vaxrank counts for total variants, variants with coding effects, variants with RNA support, and variants with associated vaccine peptides. - Returns ------- dict - """ # dictionary which will contain some overall variant counts for report display counts_dict = { - 'num_total_variants': len(self.variants), + 'num_total_variants': len(self.isovar_results), 'num_coding_effect_variants': 0, 'num_variants_with_rna_support': 0, 'num_variants_with_vaccine_peptides': 0, } - for variant in self.variants: - if len(variant.effects().drop_silent_and_noncoding()) > 0: + for isovar_result in self.isovar_results: + variant = isovar_result.variant + if isovar_result.predicted_effect_modifies_protein_sequence: counts_dict['num_coding_effect_variants'] += 1 - if variant in self.variant_to_protein_sequences_dict: + if isovar_result.has_mutant_protein_sequence_from_rna: counts_dict['num_variants_with_rna_support'] += 1 if variant in self.variant_to_vaccine_peptides_dict: counts_dict['num_variants_with_vaccine_peptides'] += 1 @@ -79,14 +94,6 @@ def variant_properties(self, gene_pathway_check=None): """ Parameters ---------- - variants : list of varcode.Variant - - protein_sequence_dict : dict - Dictionary from variants to isovar protein sequences - - vaccine_peptides_dict : dict - Dictionary from variants to VaccinePeptide objects - gene_pathway_check : GenePathwayCheck (optional) Used to look up whether a mutation or its affected gene are in some biologically important pathway.