Skip to content

Commit

Permalink
Merge pull request #51 from nf-core/50-fix-wittyer
Browse files Browse the repository at this point in the history
wittyer fixed
  • Loading branch information
kubranarci committed Jun 24, 2024
2 parents ef29e34 + aca8e59 commit d29cad9
Show file tree
Hide file tree
Showing 26 changed files with 422 additions and 389 deletions.
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,26 @@
1. Standardization of SVs in test VCF files
2. Normalization of SVs in test VCF files
3. Normalization of SVs in truth VCF files
4. SV stats and histograms
5. Germline benchmarking of SVs
6. Somatic benchmarking of SVs
7. Final report and comparisons
4. SV stats and histograms (Survivor)

5. Germline benchmarking of small variants
- Tools:
Happy
RTGtools
6. Germline benchmarking of SVs

- Tools:
Truvari
Svbenchmark
Wittyer: Only works with Truth files annotated with SVTYPE and SVLEN

7. Somatic benchmarking of small variants

- Tools:
Happy
RTGtools

8. Final report and comparisons

## Usage

Expand Down
6 changes: 6 additions & 0 deletions assets/samplesheet_HG002_hg37_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz,delly,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_DRAGEN_SV_hg19.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.sorted.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000

5 changes: 5 additions & 0 deletions assets/samplesheet_HG002_hg38_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/wittyer_test_cases/HG002.sv.with.corr.vcf.gz,unknown,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000

40 changes: 35 additions & 5 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,41 @@
"meta": ["chunksize"],
"default": 500
},
"pick": {
"type": "string",
"default": "ac",
"enum": ["single", "ac", "multi"],
"meta": ["pick"]
"bpDistance": {
"type": "integer",
"errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.",
"meta": ["bpDistance"],
"default": 500,
"minimum": 0
},
"percentThreshold": {
"type": "number",
"errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)",
"meta": ["percentThreshold"],
"default": 0.25,
"minimum": 0,
"maximum": 1
},
"absoluteThreshold": {
"type": "integer",
"errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)",
"meta": ["absoluteThreshold"],
"default": 10000,
"minimum": 0
},
"maxMatches": {
"type": "integer",
"errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).",
"meta": ["maxMatches"],
"default": 10
},
"similarityThreshold": {
"type": "number",
"errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.",
"meta": ["similarityThreshold"],
"default": 0.7,
"minimum": 0,
"maximum": 1
}
},
"required": ["test_vcf", "caller", "vartype"]
Expand Down
4 changes: 2 additions & 2 deletions bin/add_svtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@



in_vcf = pysam.VariantFile(args.graph)
out_name = os.path.basename(args.graph)
in_vcf = pysam.VariantFile(args.input)
out_name = os.path.basename(args.input)
if out_name.endswith('.gz'):
out_name = out_name[:-3]
if out_name.endswith('.vcf'):
Expand Down
38 changes: 36 additions & 2 deletions bin/merge_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def parse_args(args=None):
Description = "Merges svbenchmark or truvari bench reports from multiple samples"
Epilog = "Example usage: python merge_reports.py file1 file2 file3 -o merged_table.csv -b truvari/svbenchmark/happy/sompy -v snv/indel -a germline/somatic "
Epilog = "Example usage: python merge_reports.py file1 file2 file3 -o merged_table.csv -b truvari/svbenchmark/wittyer/happy/sompy -v snv/indel -a germline/somatic "

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("inputs", nargs="+", help="List of files to merge")
Expand Down Expand Up @@ -98,6 +98,37 @@ def get_truvari_resuls(file_paths):

return merged_df

def get_wittyer_resuls(file_paths):
    """Merge witty.er JSON reports from multiple samples into one DataFrame.

    Parameters
    ----------
    file_paths : list of str
        Paths to witty.er ``*.json`` report files. The tool name is taken
        from the part of each file name before the first ``.``.

    Returns
    -------
    pandas.DataFrame
        One row per (tool, StatsType) combination with TP/FP/FN counts and
        Precision/Recall/F1 columns. Empty DataFrame when ``file_paths`` is
        empty.
    """
    frames = []

    # Iterate over each witty.er report file
    for file in file_paths:
        # Tool name is the file-name prefix, e.g. "manta.wittyer.json" -> "manta"
        tool = os.path.basename(file).split(".")[0]

        # Read the json report into a DataFrame
        with open(file, 'r') as f:
            data = pd.read_json(f)

        relevant_data = []
        # PerSampleStats holds one entry per sample; each sample carries an
        # OverallStats list with one entry per stats type.
        for sample in data['PerSampleStats']:
            for stats in sample['OverallStats']:
                relevant_data.append({
                    "Tool": tool,
                    "StatsType": stats["StatsType"],
                    "TP_base": stats["TruthTpCount"],
                    "TP_comp": stats["QueryTpCount"],
                    "FP": stats["QueryFpCount"],
                    "FN": stats["TruthFnCount"],
                    "Precision": stats["Precision"],
                    "Recall": stats["Recall"],
                    "F1": stats["Fscore"]}
                )

        frames.append(pd.DataFrame(relevant_data))

    # Concatenate once at the end: repeated pd.concat inside the loop is
    # quadratic in the number of files. Preserve the original behaviour of
    # returning an empty DataFrame for empty input.
    return pd.concat(frames) if frames else pd.DataFrame()

def get_rtgtools_resuls(file_paths):
# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()
Expand Down Expand Up @@ -189,13 +220,16 @@ def main(args=None):
elif args.bench == "svbenchmark":
summ_table = get_svbenchmark_resuls(args.inputs)

elif args.bench == "wittyer":
summ_table = get_wittyer_resuls(args.inputs)

elif args.bench == "rtgtools":
summ_table = get_rtgtools_resuls(args.inputs)

elif args.bench == "happy":
summ_table = get_happy_resuls(args.inputs)
else:
raise ValueError('Only truvari/svbenchmark/rtgtools/happy results can be merged for germline analysis!!')
raise ValueError('Only truvari/svbenchmark/wittyer/rtgtools/happy results can be merged for germline analysis!!')

summ_table.reset_index(drop=True, inplace=True)
summ_table.to_csv(args.output + ".summary.txt", index=False)
Expand Down
25 changes: 8 additions & 17 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@ process {
}
withName: "BCFTOOLS_NORM" {
ext.prefix = { vcf.baseName - ".vcf" + ".norm"}
ext.args = {"--output-type z -N -m-any -c s" }
ext.args = {"--output-type z -m-any -c w" }
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
]
}

withName: "BCFTOOLS_DEDUP" {
ext.prefix = { vcf.baseName - ".vcf" + ".dedup"}
ext.args = {"--output-type z --rm-du exact -c s" }
ext.args = {"--output-type z --rm-du exact -c w" }
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
Expand Down Expand Up @@ -178,7 +179,7 @@ process {
}
withName: "TRUVARI_BENCH" {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --pick ${meta.pick} --chunksize ${meta.chunksize}"}
ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --chunksize ${meta.chunksize}"}
ext.when = { params.method.split(',').contains('truvari') }
publishDir = [
path: {"${params.outdir}/${meta.id}/truvari_bench"},
Expand All @@ -198,23 +199,15 @@ process {
}
withName: WITTYER {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"-em cts"}
ext.args = {"-em cts --ef [] --pt ${meta.percentThreshold} --at ${meta.absoluteThreshold} --bpd ${meta.bpDistance} --mm ${meta.maxMatches} --st ${meta.similarityThreshold}"}
ext.when = { params.method.split(',').contains('wittyer') }
publishDir = [
path: {"${params.outdir}/${meta.id}/wittyer_bench"},
pattern: "*{json,vcf.gz.tbi,vcf.gz}",
mode: params.publish_dir_mode
]
}
withName: VCFDIST {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"-v 0"}
publishDir = [
path: {"${params.outdir}/${meta.id}/vcfdist_bench"},
pattern: "*{.vcf,tsv}",
pattern: "*{.vcf.gz,tbi,json}",
mode: params.publish_dir_mode
]
}

withName: BAMSURGEON_EVALUATOR {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
publishDir = [
Expand Down Expand Up @@ -255,11 +248,9 @@ process {
mode: params.publish_dir_mode
]
}

withName: TABIX_BGZIP_BENCH{
withName: "TABIX_BGZIP*"{
ext.prefix = {input.toString() - ".vcf.gz"}
}

withName: SURVIVOR_MERGE {
ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"}
publishDir = [
Expand Down
3 changes: 1 addition & 2 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ params {

// Processes
analysis = 'germline'
method = 'happy,truvari,svanalyzer' //
similarity = 0
method = 'happy,truvari,svanalyzer,wittyer' //
preprocess = "normalization,deduplication,prepy"
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="."'
Expand Down
1 change: 0 additions & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ params {
// Processes
analysis = 'germline'
method = 'happy,truvari,svanalyzer' //
similarity = 0
preprocess = "normalization,deduplication,prepy"
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="."'
Expand Down
1 change: 0 additions & 1 deletion conf/test_hg19.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
analysis = 'germline' //somatic
method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist

similarity = 0 // determines the sequence similarity level in benchmarking.
standardization = true
preprocess = "normalization, deduplication"
//bnd_to_inv = true
Expand Down
1 change: 0 additions & 1 deletion conf/test_hg37.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
analysis = 'germline' //somatic
method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist

similarity = 0 // determines the sequence similarity level in benchmarking.
standardization = true
preprocess = "normalization, deduplication"
//bnd_to_inv = true
Expand Down
43 changes: 43 additions & 0 deletions conf/test_hg37_full.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.
Use as follows:
nextflow run nf-core/benchmark -profile <docker/singularity> -config conf/test_hg38.config --outdir <OUTDIR>
----------------------------------------------------------------------------------------
*/

// Pipeline parameters for the full HG002 GRCh37 SV benchmarking test.
params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
// NOTE(review): 16 cpus / 100 GB / 8 h exceed standard GitHub Actions runners;
// the comment above looks copied from the small test profile — confirm intent.
max_cpus = 16
max_memory = '100.GB'
max_time = '8.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'assets/samplesheet_HG002_hg37_full.csv'
outdir = 'results'

// Genome references
genome = 'GRCh37'

// Processes
analysis = 'germline' //somatic
method = 'truvari,svanalyzer,wittyer' // --not working for now : vcfdist
preprocess = "normalization, deduplication, filter_contigs"
min_sv_size = 30
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="PASS"'

sample = "HG002" // available samples: SEQC2, HG002
// NOTE(review): truth/high-confidence files are absolute paths on the author's
// machine — not reproducible elsewhere; see the test-datasets TODOs above.
truth_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz"
high_conf_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed"

}
3 changes: 1 addition & 2 deletions conf/test_hg38.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@ params {
// Processes
analysis = 'germline' //somatic
method = 'truvari,svanalyzer,wittyer' // --not working for now : vcfdist
similarity = 0 // determines the sequence similarity level in benchmarking.
preprocess = "normalization, deduplication"
min_sv_size = 30
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="PASS"'

sample = "HG002" // available samples: SEQC2, HG002
truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.vcf.gz"
truth_sv = "HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.annotated.vcf.gz"
high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.bed"

}
2 changes: 1 addition & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
},
"wittyer": {
"branch": "master",
"git_sha": "61f2ea506bd87ef436b0086f91a07abc6035fcd0",
"git_sha": "be84844983b332fa7d3d38dec06af6f953d83189",
"installed_by": ["modules"]
}
}
Expand Down
Loading

0 comments on commit d29cad9

Please sign in to comment.