Merge pull request #230 from RoanKanninga/master

added indexing.sh script and some changes in the pseudo bam and vcf scripts
molgenis · Mar 25, 2021 · b468dde · b468dde
2 parents c979aa9 + 254faed
commit b468dde
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 58 deletions.
diff --git a/indexing.sh b/indexing.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -eu
+
+
+### simple script that indexes all the available vcf files on prm in the umcg-gd group 
+## first 2 find commands are for searching through the GAVIN folder and the regular vcf files
+### third 'find' command is to search for all the available manta diploidSV files
+#
+# Script can be used as input with the pseudo_bam and pseudo_vcf.sh scripts in the ngs-utils repo
+
+mkdir "${HOME}/indexing"
+
+if [ -e "/groups/umcg-gd/prm06" ]
+then 
+	echo "yes there is prm"
+else 
+	echo "no prm available here, please run this on a cluster where prm is available" 
+	exit 1
+fi
+
+echo "starting with gavin vcfs"
+find /groups/umcg-gd/prm0*/projects/*/run01/results/variants/GAVIN/ -maxdepth 1 -name '*.vcf.gz' > "${HOME}/indexing/AllVCFs.txt"
+echo "starting with no Gavin files"
+find /groups/umcg-gd/prm0*/projects/*/run01/results/variants/ -maxdepth 1 -name '*final.vcf.gz' -o -name '*.final.vcf'  > "${HOME}/indexing/AllVCFs_noGAVIN.txt"
+echo "starting with manta files"
+find /groups/umcg-gd/prm0*/projects/*/run01/results/variants/cnv/ -maxdepth 1 -name '*_diploidSV.vcf.gz' > "${HOME}/indexing/AllVCFs_Manta.txt"
diff --git a/pseudo_bam.sh b/pseudo_bam.sh
@@ -6,32 +6,30 @@ set -u
 
 
 function reheader(){
-
 	workDir="${1}"
 	sampleName="${2}"
 	fileName="${3}"
 	pseudo="${4}"
 
 	if [ -f "${workDir}/${sampleName}.finished" ]
-        then
-                echo "skipping ${workDir}/input/${fileName} conversion, already converted"
-        else
-                samtools view -H "${workDir}/input/${fileName}" > "${workDir}/tmp/${sampleName}.header.sam"
-                perl -pi -e s"|$sampleName|$pseudo|"g "${workDir}/tmp/${sampleName}.header.sam"
-                echo "starting to reheader ${workDir}/input/${fileName} to ${pseudo}.reheader.bam"
-                samtools reheader -P "${workDir}/tmp/${sampleName}.header.sam" "${workDir}/input/${fileName}" >  "${workDir}/tmp/${pseudo}.reheader.bam"
-                mv -v "${workDir}/tmp/${pseudo}.reheader.bam" "${workDir}/output/${pseudo}.bam"
-                touch "${workDir}/${sampleName}.finished"
-        fi
-
+		then
+				echo "skipping ${workDir}/input/${fileName} conversion, already converted"
+		else
+				samtools view -H "${workDir}/input/${fileName}" > "${workDir}/tmp/${sampleName}.header.sam"
+				perl -pi -e s"|$sampleName|$pseudo|"g "${workDir}/tmp/${sampleName}.header.sam"
+				echo "starting to reheader ${workDir}/input/${fileName} to ${pseudo}.reheader.bam"
+				samtools reheader -P "${workDir}/tmp/${sampleName}.header.sam" "${workDir}/input/${fileName}" >  "${workDir}/tmp/${pseudo}.reheader.bam"
+				mv -v "${workDir}/tmp/${pseudo}.reheader.bam" "${workDir}/output/${pseudo}.bam"
+				touch "${workDir}/${sampleName}.finished"
+		fi
 
 }
 
 function showHelp() {
-        #
-        # Display commandline help on STDOUT.
-        #
-        cat <<EOH
+		#
+		# Display commandline help on STDOUT.
+		#
+		cat <<EOH
 ===============================================================================================================
 Script to pseudo anonimize vcf files. 
 
@@ -41,28 +39,31 @@ Usage:
 	$(basename $0) [-i FILE] [-f FILE] [-m]
 
 Options:
-        -h   Show this help.
+		-h   Show this help.
 
    required:
-        -s   search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
-	-d   database file (containing all the variant vcf files) (default is /groups/umcg-gd/tmp06/pseudo/ngs.csv)
-        -i   input folder containing bam files (not in combination with -s) 
-        -f   mapping file (in combination with -i) [
-        -g   which group (default = umcg-gd)
-
-    optional:
-        -p   which prm (e.g. prm06) default is both prm05 and prm06
+		-s   search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
+		-d   database file (containing all the variant vcf files) (default is /groups/umcg-gd/tmp06/pseudo/AllVcfs.txt)
+		-i   input folder containing bam files (not in combination with -s) 
+		-f   mapping file (in combination with -i) [
+		-g   which group (default = umcg-gd)
+
+	optional:
+		-p   which prm (e.g. prm06) default is both prm05 and prm06
 	-w   working directory
 
-   example:
+
+EXTRA INFO:
+there is the script for creating a database file (ngs-utils repo; indexing.sh)
+Then the file is copied to /groups/umcg-gd/tmp06/pseudo/
 ===============================================================================================================
 EOH
-        exit 0
+		exit 0
 }
 
 while getopts "i:g:s:d:f:p:w:h" opt;
 do
-        case $opt in h)showHelp;; i)input="${OPTARG}";; g)group="${OPTARG}";; s)search="${OPTARG}";; d)database="${OPTARG}";; f)mapping="${OPTARG}";;  p)prm="${OPTARG}";; w)workDir="${OPTARG}";;
+		case $opt in h)showHelp;; i)input="${OPTARG}";; g)group="${OPTARG}";; s)search="${OPTARG}";; d)database="${OPTARG}";; f)mapping="${OPTARG}";;  p)prm="${OPTARG}";; w)workDir="${OPTARG}";;
 		esac
 done
 
@@ -110,24 +111,23 @@ then
 		echo "workdir"
 		workDir="$(pwd)/pseudoTmp/bam/"
 	else
-		echo "yello"
 		workDir="${workDir}/pseudoTmp/bam/"
 	fi
-        mkdir -p "${workDir}"/{input,tmp,output}
+		mkdir -p "${workDir}/"{input,tmp,output}
 	module load BCFtools
 	## read input file
 	while read line
 	do
 		# get dna number from first column
-		dnaNumber=$(echo ${line} | awk '{print $1}')
+		dnaNumber=$(echo "${line}" | awk '{print $1}')
 		# get pseudo id from second column
-		pseudo=$(echo ${line} | awk '{print $2}')
-		# get
-		filePath=$(grep -m 1 ${dnaNumber} ${database} | awk '{print $1}')
-		vcfFile=$(basename ${filePath})
+		pseudo=$(echo "${line}" | awk '{print $2}')
+		filePath=$(grep -m 1 "${dnaNumber}" "${database}" | awk '{print $1}')
+		vcfFile=$(basename "${filePath}")
 		sample=${vcfFile%%.*}
 		notFound="false"
-		if grep -m 1 ${dnaNumber} ${database}
+
+		if grep -m 1 "${dnaNumber}" "${database}"
 		then
 			resultsDir=$(dirname "$(dirname ${filePath})")
 			if ssh -n chaperone "test -e ${resultsDir}/alignment/${sample}*bam"
@@ -146,8 +146,6 @@ then
 		sampleName=${fileName%%.*} ## 20000000_DNA12345_0000000_1231244
 
 		reheader "${workDir}" "${sampleName}" "${fileName}" "${pseudo}"
-
-
 	done<"${search}"
 fi
 
@@ -161,14 +159,14 @@ then
 	fi
 
 	mkdir -p "${workDir}"/{input,tmp,output}
-
+	mv "${input}/"*".bams" "${workDir}/input/"
 	module load SAMtools
 
 	while read line
 	do
-		oldKey=$(echo "${line}" | awk '{print $1}')     ## DNA12345
-		pseudo=$(echo "${line}" | awk '{print $2}')     ##sample1
-		bam=$(ls ${workDir}/input/*${oldKey}*.bam) ## 20000_DNA12345_000_12312.merged.bam
+		oldKey=$(echo "${line}" | awk '{print $1}')	 ## DNA12345
+		pseudo=$(echo "${line}" | awk '{print $2}')	 ##sample1
+		bam=$(ls "${workDir}/input/"*"${oldKey}"*".bam") ## 20000_DNA12345_000_12312.merged.bam
 		fileName=$(basename "${bam}") ## 20000000_DNA12345_0000000_1231244
 		sampleName=${fileName%%.*} ## 20000000_DNA12345_0000000_1231244
 		echo "BAM: ${bam} ${oldKey} ${pseudo} ${sampleName}"

diff --git a/pseudo_vcf.sh b/pseudo_vcf.sh
@@ -5,10 +5,10 @@ set -u
 
 
 function showHelp() {
-        #
-        # Display commandline help on STDOUT.
-        #
-        cat <<EOH
+	#
+	# Display commandline help on STDOUT.
+	#
+	cat <<EOH
 ===============================================================================================================
 Script to pseudo anonimize vcf files. 
 
@@ -18,23 +18,23 @@ Usage:
 	$(basename $0) [-i FILE] [-f FILE] [-m]
 
 Options:
-        -h   Show this help.
+	-h   Show this help.
 
    required:
-        -s   search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
+	-s   search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
 	-d   database file (containing all the variant vcf files) (default is /groups/umcg-gd/tmp06/pseudo/AllVcfs.txt)
-        -i   input folder containing vcf files (not in combination with -s) 
-        -f   mapping file (in combination with -i)
-        -g   which group (default = umcg-gd)
+	-i   input folder containing vcf files (not in combination with -s) 
+	-f   mapping file (in combination with -i)
+	-g   which group (default = umcg-gd)
 
     optional:
-        -m   manta data (default disabled) (manta file should be in the same place as the database file, naming should be {DATABASEFILE}_manta.txt e.g. AllVcfs_manta.txt)
-        -p   which prm (e.g. prm06) default is both prm05 and prm06
+	-m   manta data (default disabled) (manta file should be in the same place as the database file, naming should be {DATABASEFILE}_manta.txt e.g. AllVcfs_manta.txt)
+	-p   which prm (e.g. prm06) default is both prm05 and prm06
 	-w   working directory
 
 
 EXTRA INFO:
-database file is created on chaperone as umcg-gd-dm user, in /home/umcg-gd-dm/indexing/ there is the script for creating a database file
+there is the script for creating a database file (ngs-utils repo; indexing.sh)
 Then the file is copied to /groups/umcg-gd/tmp06/pseudo/
 ===============================================================================================================
 EOH
@@ -93,16 +93,16 @@ then
 	## red input file
 	while read line
 	do
-		echo "LINE: ${line}"
+
 		# get dna number from first column
 		dnaNumber=$(echo "${line}" | awk '{print $1}')
 		# get pseudo id from second column
 		pseudo=$(echo "${line}" | awk '{print $2}')
-		# get
 		echo "${dnaNumber} ${database}"
+
 		if grep -m 1 "${dnaNumber}" "${database}"
 		then
-			vcfFilePath=$(grep -m 1 ${dnaNumber} ${database} | awk '{print $1}')
+			vcfFilePath=$(grep -m 1 "${dnaNumber}" "${database}" | awk '{print $1}')
 			## first check if there is a Gavin file, if so then use it
 			resultsDir=$(dirname "${vcfFilePath}")
 			if ssh -n chaperone "test -e ${resultsDir}/GAVIN/*${dnaNumber}*vcf.gz"

diff --git a/tortilla.sh b/tortilla.sh
@@ -52,7 +52,7 @@ then
 elif [[ "${1}" == "--validateNGS" || "${1}" == "-n" ]]
 then
 	shift
-	${EBROOTNGSMINUTILS}/checkValidationNGS_DNA.sh ${@}
+	${EBROOTNGSMINUTILS}/checkValidationNGS_DNA_v2.sh ${@}
 elif [[ "${1}" == "--revertBamToFastQ" || "${1}" == "-r" ]]
 then
 	${EBROOTNGSMINUTILS}/revertFromBamToFastQ.sh ${@}