From 2f82fa49bfe07ea06cfed8bcc7c201cbafd154ef Mon Sep 17 00:00:00 2001 From: Cyriac Kandoth Date: Tue, 8 Aug 2017 12:00:42 -0400 Subject: [PATCH] Fix: VCFs need semicolon delim FILTER data, and header lines --- maf2vcf.pl | 8 ++++++-- vcf2maf.pl | 4 ++-- vcf2vcf.pl | 4 ++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/maf2vcf.pl b/maf2vcf.pl index f58b678..bc0d40c 100644 --- a/maf2vcf.pl +++ b/maf2vcf.pl @@ -53,7 +53,7 @@ # Before anything, let's parse the headers of this supposed "MAF-like" file and do some checks my $maf_fh = IO::File->new( $input_maf ) or die "ERROR: Couldn't open input MAF: $input_maf!\n"; -my ( %uniq_regions, %flanking_bps, @tn_pair, %col_idx, $header_line ); +my ( %uniq_regions, %filter_tags, %flanking_bps, @tn_pair, %col_idx, $header_line ); while( my $line = $maf_fh->getline ) { # If the file uses Mac OS 9 newlines, quit with an error @@ -89,10 +89,12 @@ ( %col_idx ) or die "ERROR: Couldn't find a header line (must start with Hugo_Symbol, Chromosome, or Tumor_Sample_Barcode): $input_maf\n"; # For each variant in the MAF, parse out the locus for running samtools faidx later - my ( $chr, $pos, $ref ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele ); + my ( $chr, $pos, $ref, $filter ) = map{ my $c = lc; ( defined $col_idx{$c} ? $cols[$col_idx{$c}] : "" )} qw( Chromosome Start_Position Reference_Allele FILTER ); $ref =~ s/^(\?|-|0)+$//; # Blank out the dashes (or other weird chars) used with indels my $region = "$chr:" . ( $pos - 1 ) . "-" . ( $pos + length( $ref )); $uniq_regions{$region} = 1; + # Also track the unique FILTER tags seen, so we can construct VCF header lines for each + map{ $filter_tags{$_} = 1 unless( $_ eq "PASS" or $_ eq "." )} split( /,|;/, $filter ); } $maf_fh->close; @@ -152,6 +154,7 @@ $tn_vcf{$vcf_file} .= "##FORMAT=\n"; $tn_vcf{$vcf_file} .= "##FORMAT=\n"; $tn_vcf{$vcf_file} .= "##FORMAT=\n"; + $tn_vcf{$vcf_file} .= "##FILTER=\n" foreach ( sort keys %filter_tags ); $tn_vcf{$vcf_file} .= "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$t_id\t$n_id\n"; } @@ -304,6 +307,7 @@ $vcf_fh->print( "##FORMAT=\n" ); $vcf_fh->print( "##FORMAT=\n" ); $vcf_fh->print( "##FORMAT=\n" ); +$vcf_fh->print( "##FILTER=\n" ) foreach ( sort keys %filter_tags ); $vcf_fh->print( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" . join("\t", @vcf_cols) . "\n" ); # Write each variant into the multi-sample VCF diff --git a/vcf2maf.pl b/vcf2maf.pl index 7a0e134..17ab171 100644 --- a/vcf2maf.pl +++ b/vcf2maf.pl @@ -769,7 +769,7 @@ sub GetBiotypePriority { # Copy FILTER from input VCF, and tag calls with high allele counts in any ExAC subpopulation my $subpop_count = 0; # Remove existing common_variant tags from input, so it's redefined by our criteria here - $filter = join( ",", grep{ $_ ne "common_variant" } split( ",", $filter )); + $filter = join( ";", grep{ $_ ne "common_variant" } split( /,|;/, $filter )); foreach my $subpop ( qw( AFR AMR EAS FIN NFE OTH SAS )) { if( $maf_line{"ExAC_AC_AN_$subpop"} ) { my ( $subpop_ac ) = split( "/", $maf_line{"ExAC_AC_AN_$subpop"} ); @@ -777,7 +777,7 @@ sub GetBiotypePriority { } } if( $subpop_count > 0 ) { - $filter = (( $filter eq "PASS" or $filter eq "." or !$filter ) ? "common_variant" : "$filter,common_variant" ); + $filter = (( $filter eq "PASS" or $filter eq "." or !$filter ) ? "common_variant" : "$filter;common_variant" ); } $maf_line{FILTER} = $filter; diff --git a/vcf2vcf.pl b/vcf2vcf.pl index 812eaf6..bac2a34 100644 --- a/vcf2vcf.pl +++ b/vcf2vcf.pl @@ -300,7 +300,7 @@ # Add more filter tags to the FILTER field, if --add-filters was specified if( $add_filters ) { my %tags = (); - map{ $tags{$_} = 1 unless( $_ eq "PASS" or $_ eq "." )} split( ",", $filter ); + map{ $tags{$_} = 1 unless( $_ eq "PASS" or $_ eq "." )} split( /,|;/, $filter ); $tags{LowTotalDepth} = 1 if(( $tum_info{DP} ne "." and $tum_info{DP} < $min_tum_depth ) or ( $nrm_info{DP} ne "." and $nrm_info{DP} < $min_nrm_depth )); my @tum_depths = split( /,/, $tum_info{AD} ); my @nrm_depths = split( /,/, $nrm_info{AD} ); @@ -308,7 +308,7 @@ my $nrm_alt_depth = $nrm_depths[$var_allele_idx]; $tags{LowTumorSupport} = 1 if( $tum_alt_depth ne "." and $tum_alt_depth < $min_tum_support ); $tags{HighNormalSupport} = 1 if( $nrm_alt_depth ne "." and $nrm_alt_depth > $max_nrm_support ); - my $tags_to_add = join( ",", sort keys %tags ); + my $tags_to_add = join( ";", sort keys %tags ); $filter = ( $tags_to_add ? $tags_to_add : $filter ); }