Skip to content

Commit

Permalink
Merge pull request #51 from nf-core/50-fix-wittyer
Browse files Browse the repository at this point in the history
wittyer fixed
  • Loading branch information
kubranarci committed Jun 24, 2024
2 parents ef29e34 + aca8e59 commit d29cad9
Show file tree
Hide file tree
Showing 26 changed files with 422 additions and 389 deletions.
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,26 @@
1. Standardization of SVs in test VCF files
2. Normalization of SVs in test VCF files
3. Normalization of SVs in truth VCF files
4. SV stats and histograms
5. Germline benchmarking of SVs
6. Somatic benchmarking of SVs
7. Final report and comparisons
4. SV stats and histograms (Survivor)

5. Germline benchmarking of small variants
- Tools:
Happy
RTGtools
6. Germline benchmarking of SVs

- Tools:
Truvari
Svbenchmark
Wittyer: Only works with Truth files annotated with SVTYPE and SVLEN

7. Somatic benchmarking of small variants

- Tools:
Happy
RTGtools

8. Final report and comparisons

## Usage

Expand Down
6 changes: 6 additions & 0 deletions assets/samplesheet_HG002_hg37_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz,delly,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_DRAGEN_SV_hg19.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.sorted.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000

5 changes: 5 additions & 0 deletions assets/samplesheet_HG002_hg38_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/wittyer_test_cases/HG002.sv.with.corr.vcf.gz,unknown,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000

40 changes: 35 additions & 5 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,41 @@
"meta": ["chunksize"],
"default": 500
},
"pick": {
"type": "string",
"default": "ac",
"enum": ["single", "ac", "multi"],
"meta": ["pick"]
"bpDistance": {
"type": "integer",
"errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.",
"meta": ["bpDistance"],
"default": 500,
"minimum": 0
},
"percentThreshold": {
"type": "number",
"errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)",
"meta": ["percentThreshold"],
"default": 0.25,
"minimum": 0,
"maximum": 1
},
"absoluteThreshold": {
"type": "integer",
"errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)",
"meta": ["absoluteThreshold"],
"default": 10000,
"minimum": 0
},
"maxMatches": {
"type": "integer",
"errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).",
"meta": ["maxMatches"],
"default": 10
},
"similarityThreshold": {
"type": "number",
"errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.",
"meta": ["similarityThreshold"],
"default": 0.7,
"minimum": 0,
"maximum": 1
}
},
"required": ["test_vcf", "caller", "vartype"]
Expand Down
4 changes: 2 additions & 2 deletions bin/add_svtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@



in_vcf = pysam.VariantFile(args.graph)
out_name = os.path.basename(args.graph)
in_vcf = pysam.VariantFile(args.input)
out_name = os.path.basename(args.input)
if out_name.endswith('.gz'):
out_name = out_name[:-3]
if out_name.endswith('.vcf'):
Expand Down
38 changes: 36 additions & 2 deletions bin/merge_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def parse_args(args=None):
Description = "Merges svbenchmark or truvari bench reports from multiple samples"
Epilog = "Example usage: python merge_reports.py file1 file2 file3 -o merged_table.csv -b truvari/svbenchmark/happy/sompy -v snv/indel -a germline/somatic "
Epilog = "Example usage: python merge_reports.py file1 file2 file3 -o merged_table.csv -b truvari/svbenchmark/wittyer/happy/sompy -v snv/indel -a germline/somatic "

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("inputs", nargs="+", help="List of files to merge")
Expand Down Expand Up @@ -98,6 +98,37 @@ def get_truvari_resuls(file_paths):

return merged_df

def get_wittyer_resuls(file_paths):
    """Merge witty.er JSON reports from multiple samples into one DataFrame.

    Parameters
    ----------
    file_paths : list of str
        Paths to witty.er ``*.json`` report files. The tool name is taken
        from the part of each file name before the first ``.``.

    Returns
    -------
    pandas.DataFrame
        One row per (tool, StatsType) combination with TP/FP/FN counts and
        Precision/Recall/F1 columns. Empty DataFrame when ``file_paths`` is
        empty.
    """
    frames = []

    # Iterate over each witty.er report file
    for file in file_paths:
        # Tool name is the file-name prefix, e.g. "manta.wittyer.json" -> "manta"
        tool = os.path.basename(file).split(".")[0]

        # Read the json report into a DataFrame
        with open(file, 'r') as f:
            data = pd.read_json(f)

        relevant_data = []
        # PerSampleStats holds one entry per sample; each sample carries an
        # OverallStats list with one entry per stats type.
        for sample in data['PerSampleStats']:
            for stats in sample['OverallStats']:
                relevant_data.append({
                    "Tool": tool,
                    "StatsType": stats["StatsType"],
                    "TP_base": stats["TruthTpCount"],
                    "TP_comp": stats["QueryTpCount"],
                    "FP": stats["QueryFpCount"],
                    "FN": stats["TruthFnCount"],
                    "Precision": stats["Precision"],
                    "Recall": stats["Recall"],
                    "F1": stats["Fscore"]}
                )

        frames.append(pd.DataFrame(relevant_data))

    # Concatenate once at the end: repeated pd.concat inside the loop is
    # quadratic in the number of files. Preserve the original behaviour of
    # returning an empty DataFrame for empty input.
    return pd.concat(frames) if frames else pd.DataFrame()

def get_rtgtools_resuls(file_paths):
# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()
Expand Down Expand Up @@ -189,13 +220,16 @@ def main(args=None):
elif args.bench == "svbenchmark":
summ_table = get_svbenchmark_resuls(args.inputs)

elif args.bench == "wittyer":
summ_table = get_wittyer_resuls(args.inputs)

elif args.bench == "rtgtools":
summ_table = get_rtgtools_resuls(args.inputs)

elif args.bench == "happy":
summ_table = get_happy_resuls(args.inputs)
else:
raise ValueError('Only truvari/svbenchmark/rtgtools/happy results can be merged for germline analysis!!')
raise ValueError('Only truvari/svbenchmark/wittyer/rtgtools/happy results can be merged for germline analysis!!')

summ_table.reset_index(drop=True, inplace=True)
summ_table.to_csv(args.output + ".summary.txt", index=False)
Expand Down
25 changes: 8 additions & 17 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@ process {
}
withName: "BCFTOOLS_NORM" {
ext.prefix = { vcf.baseName - ".vcf" + ".norm"}
ext.args = {"--output-type z -N -m-any -c s" }
ext.args = {"--output-type z -m-any -c w" }
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
]
}

withName: "BCFTOOLS_DEDUP" {
ext.prefix = { vcf.baseName - ".vcf" + ".dedup"}
ext.args = {"--output-type z --rm-du exact -c s" }
ext.args = {"--output-type z --rm-du exact -c w" }
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
Expand Down Expand Up @@ -178,7 +179,7 @@ process {
}
withName: "TRUVARI_BENCH" {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --pick ${meta.pick} --chunksize ${meta.chunksize}"}
ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --chunksize ${meta.chunksize}"}
ext.when = { params.method.split(',').contains('truvari') }
publishDir = [
path: {"${params.outdir}/${meta.id}/truvari_bench"},
Expand All @@ -198,23 +199,15 @@ process {
}
withName: WITTYER {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"-em cts"}
ext.args = {"-em cts --ef [] --pt ${meta.percentThreshold} --at ${meta.absoluteThreshold} --bpd ${meta.bpDistance} --mm ${meta.maxMatches} --st ${meta.similarityThreshold}"}
ext.when = { params.method.split(',').contains('wittyer') }
publishDir = [
path: {"${params.outdir}/${meta.id}/wittyer_bench"},
pattern: "*{json,vcf.gz.tbi,vcf.gz}",
mode: params.publish_dir_mode
]
}
withName: VCFDIST {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"-v 0"}
publishDir = [
path: {"${params.outdir}/${meta.id}/vcfdist_bench"},
pattern: "*{.vcf,tsv}",
pattern: "*{.vcf.gz,tbi,json}",
mode: params.publish_dir_mode
]
}

withName: BAMSURGEON_EVALUATOR {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
publishDir = [
Expand Down Expand Up @@ -255,11 +248,9 @@ process {
mode: params.publish_dir_mode
]
}

withName: TABIX_BGZIP_BENCH{
withName: "TABIX_BGZIP*"{
ext.prefix = {input.toString() - ".vcf.gz"}
}

withName: SURVIVOR_MERGE {
ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"}
publishDir = [
Expand Down
3 changes: 1 addition & 2 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ params {

// Processes
analysis = 'germline'
method = 'happy,truvari,svanalyzer' //
similarity = 0
method = 'happy,truvari,svanalyzer,wittyer' //
preprocess = "normalization,deduplication,prepy"
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="."'
Expand Down
1 change: 0 additions & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ params {
// Processes
analysis = 'germline'
method = 'happy,truvari,svanalyzer' //
similarity = 0
preprocess = "normalization,deduplication,prepy"
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="."'
Expand Down
1 change: 0 additions & 1 deletion conf/test_hg19.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
analysis = 'germline' //somatic
method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist

similarity = 0 // determines the sequence similarity level in benchmarking.
standardization = true
preprocess = "normalization, deduplication"
//bnd_to_inv = true
Expand Down
1 change: 0 additions & 1 deletion conf/test_hg37.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
analysis = 'germline' //somatic
method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist

similarity = 0 // determines the sequence similarity level in benchmarking.
standardization = true
preprocess = "normalization, deduplication"
//bnd_to_inv = true
Expand Down
43 changes: 43 additions & 0 deletions conf/test_hg37_full.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/benchmark -profile <docker/singularity> -config conf/test_hg38.config --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

// Pipeline parameters for the full HG002 GRCh37 SV benchmarking test.
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
// NOTE(review): 16 cpus / 100 GB / 8 h exceed standard GitHub Actions runners;
// the comment above looks copied from the small test profile — confirm intent.
max_cpus = 16
max_memory = '100.GB'
max_time = '8.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'assets/samplesheet_HG002_hg37_full.csv'
outdir = 'results'

// Genome references
genome = 'GRCh37'

// Processes
analysis = 'germline' //somatic
method = 'truvari,svanalyzer,wittyer' // --not working for now : vcfdist
preprocess = "normalization, deduplication, filter_contigs"
min_sv_size = 30
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="PASS"'

sample = "HG002" // available samples: SEQC2, HG002
// NOTE(review): truth/high-confidence files are absolute paths on the author's
// machine — not reproducible elsewhere; see the test-datasets TODOs above.
truth_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz"
high_conf_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed"

}
3 changes: 1 addition & 2 deletions conf/test_hg38.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@ params {
// Processes
analysis = 'germline' //somatic
method = 'truvari,svanalyzer,wittyer' // --not working for now : vcfdist
similarity = 0 // determines the sequence similarity level in benchmarking.
preprocess = "normalization, deduplication"
min_sv_size = 30
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="PASS"'

sample = "HG002" // available samples: SEQC2, HG002
truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.vcf.gz"
truth_sv = "HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.annotated.vcf.gz"
high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.bed"

}
2 changes: 1 addition & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
},
"wittyer": {
"branch": "master",
"git_sha": "61f2ea506bd87ef436b0086f91a07abc6035fcd0",
"git_sha": "be84844983b332fa7d3d38dec06af6f953d83189",
"installed_by": ["modules"]
}
}
Expand Down
Loading

0 comments on commit d29cad9

Please sign in to comment.