Skip to content

Commit

Permalink
Merge pull request #108 from jonasscheid/dsl2
Browse files Browse the repository at this point in the history
Update check_samplesheet.py script for new format #102
  • Loading branch information
christopher-mohr committed Oct 29, 2021
2 parents b001306 + 0950d01 commit f6dbaa9
Showing 1 changed file with 82 additions and 75 deletions.
157 changes: 82 additions & 75 deletions bin/check_samplesheet.py
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
#!/usr/bin/env python

# TODO nf-core: Update the script to check the samplesheet
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

import os
import sys
import errno
import argparse
import re


def parse_args(args=None):
Expand All @@ -15,19 +13,10 @@ def parse_args(args=None):

parser = argparse.ArgumentParser(description=Description, epilog=Epilog)
parser.add_argument("FILE_IN", help="Input samplesheet file.")
parser.add_argument("FILE_OUT", help="Output file.")
parser.add_argument("FILE_OUT", help="Output samplesheet file.")
return parser.parse_args(args)


def make_dir(path):
    """Create the directory *path*, including any missing parents.

    An empty *path* is ignored. An ``OSError`` whose errno is ``EEXIST``
    (the path already exists) is swallowed; any other ``OSError`` is
    re-raised to the caller.
    """
    if len(path) > 0:
        try:
            os.makedirs(path)
        except OSError as exc:
            # Only "already exists" is acceptable; everything else is fatal.
            already_exists = exc.errno == errno.EEXIST
            if not already_exists:
                raise exc


def print_error(error, context="Line", context_str=""):
error_str = "ERROR: Please check samplesheet -> {}".format(error)
if context != "" and context_str != "":
Expand All @@ -38,103 +27,121 @@ def print_error(error, context="Line", context_str=""):
sys.exit(1)


# TODO nf-core: Update the check_samplesheet function
# Compiled once at import time. A raw string is used so that "\*" is read as a
# regex escape instead of an invalid (deprecated) Python string escape.
_ALLELE_PATTERN = re.compile(r"^[A-Z]\*[0-9]{2}:[0-9]{2}$")


def check_allele_nomenclature(allele):
    """Return True if *allele* follows the ``<LETTER>*<NN>:<NN>`` nomenclature.

    Example of an accepted value: ``A*01:01``. Because ``$`` also matches just
    before a final newline, a value read straight from a text file
    (``"A*01:01\\n"``) is accepted as well.

    NOTE(review): only a single upper-case letter is accepted before ``*``,
    so names such as ``DRB1*01:01`` are rejected -- confirm this is intended.

    :param allele: Allele string to validate.
    :return: ``True`` when the string matches the pattern, ``False`` otherwise.
    """
    return _ALLELE_PATTERN.match(allele) is not None


def make_dir(path):
    """Ensure that *path* exists as a directory tree.

    Nothing is done for an empty *path*. If ``os.makedirs`` fails with errno
    ``EEXIST`` the error is ignored; every other ``OSError`` propagates.
    """
    if len(path) == 0:
        return
    try:
        os.makedirs(path)
    except OSError as err:
        if err.errno == errno.EEXIST:
            # The path is already there -- nothing left to create.
            return
        raise err


def _resolve_alleles(alleles, line):
    """Return the allele string for one row, expanding a ``.txt`` allele file.

    If *alleles* ends with ``.txt`` it is opened as a file with one allele per
    line; each line is validated with ``check_allele_nomenclature`` and the
    alleles are joined with ``;``. Any other value is returned unchanged.
    *line* is the raw samplesheet line, used only for error reporting.
    """
    if not alleles.endswith(".txt"):
        return alleles

    validated = []
    with open(alleles, "r") as af:
        for al in af:
            if not check_allele_nomenclature(al):
                print_error("Allele format is not matching the nomenclature", "Line", line)
            validated.append(al.strip("\n"))
    return ";".join(validated)


def _detect_annotation(filename, line):
    """Return the annotation type (``variant``, ``pep`` or ``prot``) of *filename*.

    ``.vcf`` files are variants; ``.tsv`` files are classified by the first
    column of their header (``id`` -> peptides, ``#chr`` -> variants); every
    other extension is treated as a protein file. *line* is the raw
    samplesheet line, used only for error reporting.
    """
    if filename.endswith(".vcf"):
        return "variant"
    if not filename.endswith(".tsv"):
        return "prot"

    ## Check if it is a variant annotation file or a peptide file.
    ## Only the header line is needed, so do not read the whole file.
    with open(filename, "r") as tsv:
        first_header_col = tsv.readline().rstrip("\r\n").split("\t")[0].lower()

    if first_header_col == "id":
        return "pep"
    if first_header_col == "#chr":
        return "variant"
    ## Previously this case left the annotation undefined (NameError) or
    ## silently reused the value of the preceding row.
    print_error(
        "Invalid tsv header: first column must be 'id' (peptides) or '#chr' (variants)!",
        "Line",
        line,
    )


def check_samplesheet(file_in, file_out):
    """
    Validate the input samplesheet and write an annotated copy to *file_out*.

    The samplesheet must follow this structure:

    sample,alleles,filename
    GBM_1,A*01:01;A*02:01;B*07:02;B*24:02;C*03:01;C*04:01,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
    GBM_2,A*02:01;A*24:01;B*07:02;B*08:01;C*04:01;C*07:01,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta

    or

    sample,alleles,filename
    GBM_1,gbm_1_alleles.txt,gbm_1_anno.vcf|gbm_1_peps.tsv|gbm_1_prot.fasta
    GBM_2,gbm_2_alleles.txt,gbm_2_anno.vcf|gbm_2_peps.tsv|gbm_2_prot.fasta

    where the filename column contains EITHER a vcf file, a tsv file (peptides or
    annotated variants), or a fasta file (proteins), and the alleles column contains
    EITHER a string of alleles separated by semicolons or the path to a text file
    containing one allele per line (no header).

    Further examples:
    - Class2 allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.DRB1_01_01.txt
    - Mouse allele format => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/alleles/alleles.H2.txt
    - pep.tsv => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/peptides/peptides.tsv
    - annotated_variants.tsv => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.tsv
    - annotated_variants.vcf => https://raw.githubusercontent.com/nf-core/test-datasets/epitopeprediction/testdata/variants/variants.vcf

    The output file has the columns ``sample,alleles,filename,anno`` where ``anno``
    is one of ``variant``, ``pep`` or ``prot``. Validation problems are reported via
    ``print_error`` (or a direct ``sys.exit(1)`` for a bad header).

    :param file_in: Path of the samplesheet to validate.
    :param file_out: Path of the validated samplesheet to write.
    """

    ## sample name -> list of [sample, alleles, filename, anno] rows
    sample_run_dict = {}
    with open(file_in, "r") as fin:

        ## Check header
        COL_NUM = 3
        HEADER = ["sample", "alleles", "filename"]
        header = [x.strip('"') for x in fin.readline().strip().split(",")]
        if header[: len(HEADER)] != HEADER:
            print("ERROR: Please check samplesheet header -> {} != {}".format("\t".join(header), "\t".join(HEADER)))
            sys.exit(1)

        ## Check sample entries
        for line in fin:
            lspl = [x.strip('"').replace(" ", "") for x in line.strip().split(",")]

            ## Check valid number of columns per row
            if len(lspl) != len(HEADER):
                print_error(
                    "Invalid number of columns (valid = {})!".format(len(HEADER)),
                    "Line",
                    line,
                )
            num_cols = len([x for x in lspl if x])
            if num_cols != COL_NUM:
                print_error(
                    "Invalid number of populated columns (valid = {})!".format(COL_NUM),
                    "Line",
                    line,
                )

            ## Check sample name entries
            sample, alleles, filename = lspl[: len(HEADER)]

            ## Check if the alleles given in the text file are in the right format
            alleles = _resolve_alleles(alleles, line)

            ## Get annotation of filename column
            anno = _detect_annotation(filename, line)

            sample_info = [sample, alleles, filename, anno]

            ## Create sample mapping dictionary
            if sample not in sample_run_dict:
                sample_run_dict[sample] = [sample_info]
            elif sample_info in sample_run_dict[sample]:
                print_error("Samplesheet contains duplicate rows!", "Line", line)
            else:
                sample_run_dict[sample].append(sample_info)

    ## Write validated samplesheet with appropriate columns
    ## NOTE(review): a samplesheet without data rows produces no output file and
    ## no error -- confirm whether this should be reported instead.
    if sample_run_dict:
        out_dir = os.path.dirname(file_out)
        make_dir(out_dir)
        with open(file_out, "w") as fout:
            fout.write(",".join(["sample", "alleles", "filename", "anno"]) + "\n")

            for sample in sorted(sample_run_dict):
                for val in sample_run_dict[sample]:
                    fout.write(",".join(val) + "\n")


def main(args=None):
Expand All @@ -143,4 +150,4 @@ def main(args=None):


if __name__ == "__main__":
    # Run the command-line entry point and hand its return value to
    # sys.exit() as the process exit status.
    exit_status = main()
    sys.exit(exit_status)

0 comments on commit f6dbaa9

Please sign in to comment.