Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wittyer fixed #51

Merged
merged 9 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,26 @@
1. Standardization of SVs in test VCF files
2. Normalization of SVs in test VCF files
3. Normalization of SVs in truth VCF files
4. SV stats and histograms
5. Germline benchmarking of SVs
6. Somatic benchmarking of SVs
7. Final report and comparisons
4. SV stats and histograms (Survivor)

5. Germline benchmarking of small variants
- Tools:
Happy
RTGtools
6. Germline benchmarking of SVs

- Tools:
Truvari
Svbenchmark
Wittyer: Only works with truth files annotated with SVTYPE and SVLEN

7. Somatic benchmarking of small variants

- Tools:
Happy
RTGtools

8. Final report and comparisons

## Usage

Expand Down
6 changes: 6 additions & 0 deletions assets/samplesheet_HG002_hg37_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_delly_SV_hg19.vcf.gz,delly,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_DRAGEN_SV_hg19.vcf.gz,dragen,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_lumpy_SV_hg19.sorted.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg37/dragen_paper/HG002_manta_SV_hg19_genotype.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000

5 changes: 5 additions & 0 deletions assets/samplesheet_HG002_hg38_full.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
test_vcf,caller,vartype,pctsize,pctseq,pctovl,refdist,chunksize,normshift,normdist,normsizediff,maxdist
/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/ajtrio.lumpy.svtyper.HG002.md.sorted.recal.vcf.gz,lumpy,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/hg38/GIAB_GRCh38_SVs_06252018/manta.HG002.vcf.gz,manta,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000
/Users/w620-admin/Desktop/nf-core/dataset/wittyer_test_cases/HG002.sv.with.corr.vcf.gz,unknown,sv,0.3,0,0,100000,100000,0.3,0.3,0.3,100000

40 changes: 35 additions & 5 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -91,11 +91,41 @@
"meta": ["chunksize"],
"default": 500
},
"pick": {
"type": "string",
"default": "ac",
"enum": ["single", "ac", "multi"],
"meta": ["pick"]
"bpDistance": {
"type": "integer",
"errorMessage": "bpDistance is a wittyer parameter. Upper bound of boundary distance when comparing truth and query. By default it is 500bp for all types except for Insertions, which are 100bp.Please note that if you set this value in the command line, it overrides all the defaults, so Insertions and other types will have the same bpd.",
kubranarci marked this conversation as resolved.
Show resolved Hide resolved
"meta": ["bpDistance"],
"default": 500,
"minimum": 0
},
"percentThreshold": {
"type": "number",
"errorMessage": "percentThreshold is a wittyer parameter. This is used for percentage thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use for large tandem repeats. For all other SVs, in order to match between query and truth, the distance between boundaries should be within a number thats proportional to total SV (default 0.25)",
kubranarci marked this conversation as resolved.
Show resolved Hide resolved
"meta": ["percentThreshold"],
"default": 0.25,
"minimum": 0,
"maximum": 1
},
"absoluteThreshold": {
"type": "integer",
"errorMessage": "absoluteThreshold is a wittyer parameter. This is used for absolute thresholding. For CopyNumberTandemRepeats, this determines how large of a RepeatUnitCount (RUC) threshold to use. For all other SVs, this is the upper bound of boundary distance when comparing truth and query. (default 10000)",
kubranarci marked this conversation as resolved.
Show resolved Hide resolved
"meta": ["absoluteThreshold"],
"default": 10000,
"minimum": 0
},
"maxMatches": {
"type": "integer",
"errorMessage": "maxMatches is a wittyer parameter. This is used for matching behaviour. Negative value means to match any number (for large SVs it is not recommended).",
"meta": ["maxMatches"],
"default": 10
},
"similarityThreshold": {
"type": "number",
"errorMessage": "similarityThreshold is a wittyer parameter. This is used for sequence similarity thresholding.",
kubranarci marked this conversation as resolved.
Show resolved Hide resolved
"meta": ["similarityThreshold"],
"default": 0.7,
"minimum": 0,
"maximum": 1
}
},
"required": ["test_vcf", "caller", "vartype"]
Expand Down
4 changes: 2 additions & 2 deletions bin/add_svtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@



in_vcf = pysam.VariantFile(args.graph)
out_name = os.path.basename(args.graph)
in_vcf = pysam.VariantFile(args.input)
out_name = os.path.basename(args.input)
if out_name.endswith('.gz'):
out_name = out_name[:-3]
if out_name.endswith('.vcf'):
Expand Down
38 changes: 36 additions & 2 deletions bin/merge_reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

def parse_args(args=None):
Description = "Merges svbenchmark or truvari bench reports from multiple samples"
Epilog = "Example usage: python merge_reports.py file1 file2 file3 -o merged_table.csv -b truvari/svbenchmark/happy/sompy -v snv/indel -a germline/somatic "
Epilog = "Example usage: python merge_reports.py file1 file2 file3 -o merged_table.csv -b truvari/svbenchmark/wittyer/happy/sompy -v snv/indel -a germline/somatic "

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("inputs", nargs="+", help="List of files to merge")
Expand Down Expand Up @@ -98,6 +98,37 @@ def get_truvari_resuls(file_paths):

return merged_df

def get_wittyer_resuls(file_paths):
    """Merge witty.er benchmark JSON reports into a single summary table.

    Each input file is a witty.er output JSON whose ``PerSampleStats``
    entries carry per-sample ``OverallStats`` records (TP/FP/FN counts plus
    precision, recall and F-score). The tool name reported per row is the
    first dot-separated token of the file's basename (e.g. ``delly`` from
    ``delly.HG002.json``).

    Args:
        file_paths: list of paths to witty.er ``*.json`` reports.

    Returns:
        pandas.DataFrame with one row per (file, StatsType) combination and
        columns Tool, StatsType, TP_base, TP_comp, FP, FN, Precision,
        Recall, F1. Empty DataFrame when ``file_paths`` is empty.
    """
    frames = []

    # Iterate over each report file
    for file in file_paths:
        tool = os.path.basename(file).split(".")[0]

        # Read the json into a DataFrame
        with open(file, 'r') as f:
            data = pd.read_json(f)

        relevant_data = []
        for sample in data['PerSampleStats']:
            for stats in sample['OverallStats']:
                relevant_data.append({
                    "Tool": tool,
                    "StatsType": stats["StatsType"],
                    "TP_base": stats["TruthTpCount"],
                    "TP_comp": stats["QueryTpCount"],
                    "FP": stats["QueryFpCount"],
                    "FN": stats["TruthFnCount"],
                    "Precision": stats["Precision"],
                    "Recall": stats["Recall"],
                    "F1": stats["Fscore"]}
                )

        frames.append(pd.DataFrame(relevant_data))

    # Concatenate once at the end: repeated pd.concat inside the loop is
    # quadratic in the number of input files.
    return pd.concat(frames) if frames else pd.DataFrame()


def get_rtgtools_resuls(file_paths):
# Initialize an empty DataFrame to store the merged data
merged_df = pd.DataFrame()
Expand Down Expand Up @@ -189,13 +220,16 @@ def main(args=None):
elif args.bench == "svbenchmark":
summ_table = get_svbenchmark_resuls(args.inputs)

elif args.bench == "wittyer":
summ_table = get_wittyer_resuls(args.inputs)

elif args.bench == "rtgtools":
summ_table = get_rtgtools_resuls(args.inputs)

elif args.bench == "happy":
summ_table = get_happy_resuls(args.inputs)
else:
raise ValueError('Only truvari/svbenchmark/rtgtools/happy results can be merged for germline analysis!!')
raise ValueError('Only truvari/svbenchmark/wittyer/rtgtools/happy results can be merged for germline analysis!!')

summ_table.reset_index(drop=True, inplace=True)
summ_table.to_csv(args.output + ".summary.txt", index=False)
Expand Down
25 changes: 8 additions & 17 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,16 @@ process {
}
withName: "BCFTOOLS_NORM" {
ext.prefix = { vcf.baseName - ".vcf" + ".norm"}
ext.args = {"--output-type z -N -m-any -c s" }
ext.args = {"--output-type z -m-any -c w" }
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
]
}

withName: "BCFTOOLS_DEDUP" {
ext.prefix = { vcf.baseName - ".vcf" + ".dedup"}
ext.args = {"--output-type z --rm-du exact -c s" }
ext.args = {"--output-type z --rm-du exact -c w" }
publishDir = [
path: { "${params.outdir}/test" },
enabled: false
Expand Down Expand Up @@ -178,7 +179,7 @@ process {
}
withName: "TRUVARI_BENCH" {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --pick ${meta.pick} --chunksize ${meta.chunksize}"}
ext.args = {"--pctsize ${meta.pctsize} --pctovl ${meta.pctovl} --pctseq ${meta.pctseq} --refdist ${meta.refdist} --chunksize ${meta.chunksize}"}
ext.when = { params.method.split(',').contains('truvari') }
publishDir = [
path: {"${params.outdir}/${meta.id}/truvari_bench"},
Expand All @@ -198,23 +199,15 @@ process {
}
withName: WITTYER {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"-em cts"}
ext.args = {"-em cts --ef [] --pt ${meta.percentThreshold} --at ${meta.absoluteThreshold} --bpd ${meta.bpDistance} --mm ${meta.maxMatches} --st ${meta.similarityThreshold}"}
ext.when = { params.method.split(',').contains('wittyer') }
publishDir = [
path: {"${params.outdir}/${meta.id}/wittyer_bench"},
pattern: "*{json,vcf.gz.tbi,vcf.gz}",
mode: params.publish_dir_mode
]
}
withName: VCFDIST {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
ext.args = {"-v 0"}
publishDir = [
path: {"${params.outdir}/${meta.id}/vcfdist_bench"},
pattern: "*{.vcf,tsv}",
pattern: "*{.vcf.gz,tbi,json}",
mode: params.publish_dir_mode
]
}

withName: BAMSURGEON_EVALUATOR {
ext.prefix = {"${meta.id}.${params.sample}.${meta.vartype}"}
publishDir = [
Expand Down Expand Up @@ -255,11 +248,9 @@ process {
mode: params.publish_dir_mode
]
}

withName: TABIX_BGZIP_BENCH{
withName: "TABIX_BGZIP*"{
ext.prefix = {input.toString() - ".vcf.gz"}
}

withName: SURVIVOR_MERGE {
ext.prefix = {"${meta.id}.${meta.vartype}.${meta.tag}"}
publishDir = [
Expand Down
3 changes: 1 addition & 2 deletions conf/test.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@ params {

// Processes
analysis = 'germline'
method = 'happy,truvari,svanalyzer' //
similarity = 0
method = 'happy,truvari,svanalyzer,wittyer' //
preprocess = "normalization,deduplication,prepy"
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="."'
Expand Down
1 change: 0 additions & 1 deletion conf/test_full.config
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ params {
// Processes
analysis = 'germline'
method = 'happy,truvari,svanalyzer' //
similarity = 0
preprocess = "normalization,deduplication,prepy"
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="."'
Expand Down
1 change: 0 additions & 1 deletion conf/test_hg19.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
analysis = 'germline' //somatic
method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist

similarity = 0 // determines the sequence similarity level in benchmarking.
standardization = true
preprocess = "normalization, deduplication"
//bnd_to_inv = true
Expand Down
1 change: 0 additions & 1 deletion conf/test_hg37.config
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ params {
analysis = 'germline' //somatic
method = 'truvari,svanalyzer' // --not working for now : wittyer, vcfdist

similarity = 0 // determines the sequence similarity level in benchmarking.
standardization = true
preprocess = "normalization, deduplication"
//bnd_to_inv = true
Expand Down
43 changes: 43 additions & 0 deletions conf/test_hg37_full.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Defines input files and everything required to run a fast and simple pipeline test.

Use as follows:
nextflow run nf-core/benchmark -profile <docker/singularity> -config conf/test_hg38.config --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
config_profile_name = 'Test profile'
config_profile_description = 'Minimal test dataset to check pipeline function'

// Limit resources so that this can run on GitHub Actions
max_cpus = 16
max_memory = '100.GB'
max_time = '8.h'

// Input data
// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
// TODO nf-core: Give any required params for the test so that command line flags are not needed
input = 'assets/samplesheet_HG002_hg37_full.csv'
outdir = 'results'

// Genome references
genome = 'GRCh37'

// Processes
analysis = 'germline' //somatic
method = 'truvari,svanalyzer,wittyer' // --not working for now : vcfdist
preprocess = "normalization, deduplication, filter_contigs"
min_sv_size = 30
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="PASS"'

sample = "HG002" // available samples: SEQC2, HG002
truth_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.vcf.gz"
high_conf_sv = "/Users/w620-admin/Desktop/nf-core/dataset/hg37/NIST_SV/HG002_SVs_Tier1_v0.6.bed"

}
3 changes: 1 addition & 2 deletions conf/test_hg38.config
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@ params {
// Processes
analysis = 'germline' //somatic
method = 'truvari,svanalyzer,wittyer' // --not working for now : vcfdist
similarity = 0 // determines the sequence similarity level in benchmarking.
preprocess = "normalization, deduplication"
min_sv_size = 30
//variant_filtering = "include" // null, include, exclude
//expression = 'FILTER="PASS"'

sample = "HG002" // available samples: SEQC2, HG002
truth_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.vcf.gz"
truth_sv = "HG002_GRCh38_difficult_medical_gene_SV_benchmark_v0.01.chr21.annotated.vcf.gz"
high_conf_sv = "https://raw.githubusercontent.com/kubranarci/benchmark_datasets/main/SV_testdata/hg38/truth/HG002_GRCh38_difficult_medical_gene_SV_benchmark_v01.ch21.bed"

}
2 changes: 1 addition & 1 deletion modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
},
"wittyer": {
"branch": "master",
"git_sha": "61f2ea506bd87ef436b0086f91a07abc6035fcd0",
"git_sha": "be84844983b332fa7d3d38dec06af6f953d83189",
"installed_by": ["modules"]
}
}
Expand Down
Loading