Skip to content

Commit

Permalink
Merge pull request #230 from RoanKanninga/master
Browse files Browse the repository at this point in the history
added indexing.sh script and some changes in the pseudo bam and vcf scripts
  • Loading branch information
BenjaminsM committed Mar 25, 2021
2 parents c979aa9 + 254faed commit b468dde
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 58 deletions.
26 changes: 26 additions & 0 deletions indexing.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
set -eu


### simple script that indexes all the available vcf files on prm in the umcg-gd group
## first 2 find commands are for searching through the GAVIN folder and the regular vcf files
### third 'find' command is to search for all the available manta diploidSV files
#
# Script can be used as input with the pseudo_bam and pseudo_vcf.sh scripts in the ngs-utils repo

mkdir "${HOME}/indexing"

if [ -e "/groups/umcg-gd/prm06" ]
then
echo "yes there is prm"
else
echo "no prm available here, please run this on a cluster where prm is available"
exit 1
fi

echo "starting with gavin vcfs"
find /groups/umcg-gd/prm0*/projects/*/run01/results/variants/GAVIN/ -maxdepth 1 -name '*.vcf.gz' > "${HOME}/indexing/AllVCFs.txt"
echo "starting with no Gavin files"
find /groups/umcg-gd/prm0*/projects/*/run01/results/variants/ -maxdepth 1 -name '*final.vcf.gz' -o -name '*.final.vcf' > "${HOME}/indexing/AllVCFs_noGAVIN.txt"
echo "starting with manta files"
find /groups/umcg-gd/prm0*/projects/*/run01/results/variants/cnv/ -maxdepth 1 -name '*_diploidSV.vcf.gz' > "${HOME}/indexing/AllVCFs_Manta.txt"
82 changes: 40 additions & 42 deletions pseudo_bam.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,32 +6,30 @@ set -u


function reheader(){

workDir="${1}"
sampleName="${2}"
fileName="${3}"
pseudo="${4}"

if [ -f "${workDir}/${sampleName}.finished" ]
then
echo "skipping ${workDir}/input/${fileName} conversion, already converted"
else
samtools view -H "${workDir}/input/${fileName}" > "${workDir}/tmp/${sampleName}.header.sam"
perl -pi -e s"|$sampleName|$pseudo|"g "${workDir}/tmp/${sampleName}.header.sam"
echo "starting to reheader ${workDir}/input/${fileName} to ${pseudo}.reheader.bam"
samtools reheader -P "${workDir}/tmp/${sampleName}.header.sam" "${workDir}/input/${fileName}" > "${workDir}/tmp/${pseudo}.reheader.bam"
mv -v "${workDir}/tmp/${pseudo}.reheader.bam" "${workDir}/output/${pseudo}.bam"
touch "${workDir}/${sampleName}.finished"
fi

then
echo "skipping ${workDir}/input/${fileName} conversion, already converted"
else
samtools view -H "${workDir}/input/${fileName}" > "${workDir}/tmp/${sampleName}.header.sam"
perl -pi -e s"|$sampleName|$pseudo|"g "${workDir}/tmp/${sampleName}.header.sam"
echo "starting to reheader ${workDir}/input/${fileName} to ${pseudo}.reheader.bam"
samtools reheader -P "${workDir}/tmp/${sampleName}.header.sam" "${workDir}/input/${fileName}" > "${workDir}/tmp/${pseudo}.reheader.bam"
mv -v "${workDir}/tmp/${pseudo}.reheader.bam" "${workDir}/output/${pseudo}.bam"
touch "${workDir}/${sampleName}.finished"
fi

}

function showHelp() {
#
# Display commandline help on STDOUT.
#
cat <<EOH
#
# Display commandline help on STDOUT.
#
cat <<EOH
===============================================================================================================
Script to pseudo anonimize vcf files.
Expand All @@ -41,28 +39,31 @@ Usage:
$(basename $0) [-i FILE] [-f FILE] [-m]
Options:
-h Show this help.
-h Show this help.
required:
-s search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
-d database file (containing all the variant vcf files) (default is /groups/umcg-gd/tmp06/pseudo/ngs.csv)
-i input folder containing bam files (not in combination with -s)
-f mapping file (in combination with -i) [
-g which group (default = umcg-gd)
optional:
-p which prm (e.g. prm06) default is both prm05 and prm06
-s search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
-d database file (containing all the variant vcf files) (default is /groups/umcg-gd/tmp06/pseudo/AllVcfs.txt)
-i input folder containing bam files (not in combination with -s)
-f mapping file (in combination with -i) [
-g which group (default = umcg-gd)
optional:
-p which prm (e.g. prm06) default is both prm05 and prm06
-w working directory
example:
EXTRA INFO:
there is the script for creating a database file (ngs-utils repo; indexing.sh)
Then the file is copied to /groups/umcg-gd/tmp06/pseudo/
===============================================================================================================
EOH
exit 0
exit 0
}

while getopts "i:g:s:d:f:p:w:h" opt;
do
case $opt in h)showHelp;; i)input="${OPTARG}";; g)group="${OPTARG}";; s)search="${OPTARG}";; d)database="${OPTARG}";; f)mapping="${OPTARG}";; p)prm="${OPTARG}";; w)workDir="${OPTARG}";;
case $opt in h)showHelp;; i)input="${OPTARG}";; g)group="${OPTARG}";; s)search="${OPTARG}";; d)database="${OPTARG}";; f)mapping="${OPTARG}";; p)prm="${OPTARG}";; w)workDir="${OPTARG}";;
esac
done

Expand Down Expand Up @@ -110,24 +111,23 @@ then
echo "workdir"
workDir="$(pwd)/pseudoTmp/bam/"
else
echo "yello"
workDir="${workDir}/pseudoTmp/bam/"
fi
mkdir -p "${workDir}"/{input,tmp,output}
mkdir -p "${workDir}/"{input,tmp,output}
module load BCFtools
## read input file
while read line
do
# get dna number from first column
dnaNumber=$(echo ${line} | awk '{print $1}')
dnaNumber=$(echo "${line}" | awk '{print $1}')
# get pseudo id from second column
pseudo=$(echo ${line} | awk '{print $2}')
# get
filePath=$(grep -m 1 ${dnaNumber} ${database} | awk '{print $1}')
vcfFile=$(basename ${filePath})
pseudo=$(echo "${line}" | awk '{print $2}')
filePath=$(grep -m 1 "${dnaNumber}" "${database}" | awk '{print $1}')
vcfFile=$(basename "${filePath}")
sample=${vcfFile%%.*}
notFound="false"
if grep -m 1 ${dnaNumber} ${database}

if grep -m 1 "${dnaNumber}" "${database}"
then
resultsDir=$(dirname "$(dirname ${filePath})")
if ssh -n chaperone "test -e ${resultsDir}/alignment/${sample}*bam"
Expand All @@ -146,8 +146,6 @@ then
sampleName=${fileName%%.*} ## 20000000_DNA12345_0000000_1231244

reheader "${workDir}" "${sampleName}" "${fileName}" "${pseudo}"


done<"${search}"
fi

Expand All @@ -161,14 +159,14 @@ then
fi

mkdir -p "${workDir}"/{input,tmp,output}

mv "${input}/"*".bams" "${workDir}/input/"
module load SAMtools

while read line
do
oldKey=$(echo "${line}" | awk '{print $1}') ## DNA12345
pseudo=$(echo "${line}" | awk '{print $2}') ##sample1
bam=$(ls ${workDir}/input/*${oldKey}*.bam) ## 20000_DNA12345_000_12312.merged.bam
oldKey=$(echo "${line}" | awk '{print $1}') ## DNA12345
pseudo=$(echo "${line}" | awk '{print $2}') ##sample1
bam=$(ls "${workDir}/input/"*"${oldKey}"*".bam") ## 20000_DNA12345_000_12312.merged.bam
fileName=$(basename "${bam}") ## 20000000_DNA12345_0000000_1231244
sampleName=${fileName%%.*} ## 20000000_DNA12345_0000000_1231244
echo "BAM: ${bam} ${oldKey} ${pseudo} ${sampleName}"
Expand Down
30 changes: 15 additions & 15 deletions pseudo_vcf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ set -u


function showHelp() {
#
# Display commandline help on STDOUT.
#
cat <<EOH
#
# Display commandline help on STDOUT.
#
cat <<EOH
===============================================================================================================
Script to pseudo anonimize vcf files.
Expand All @@ -18,23 +18,23 @@ Usage:
$(basename $0) [-i FILE] [-f FILE] [-m]
Options:
-h Show this help.
-h Show this help.
required:
-s search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
-s search database (in combination with -d) (file containing DNA numbers and mapping in second column, tab seperated) (not in combination with -m)
-d database file (containing all the variant vcf files) (default is /groups/umcg-gd/tmp06/pseudo/AllVcfs.txt)
-i input folder containing vcf files (not in combination with -s)
-f mapping file (in combination with -i)
-g which group (default = umcg-gd)
-i input folder containing vcf files (not in combination with -s)
-f mapping file (in combination with -i)
-g which group (default = umcg-gd)
optional:
-m manta data (default disabled) (manta file should be in the same place as the database file, naming should be {DATABASEFILE}_manta.txt e.g. AllVcfs_manta.txt)
-p which prm (e.g. prm06) default is both prm05 and prm06
-m manta data (default disabled) (manta file should be in the same place as the database file, naming should be {DATABASEFILE}_manta.txt e.g. AllVcfs_manta.txt)
-p which prm (e.g. prm06) default is both prm05 and prm06
-w working directory
EXTRA INFO:
database file is created on chaperone as umcg-gd-dm user, in /home/umcg-gd-dm/indexing/ there is the script for creating a database file
there is the script for creating a database file (ngs-utils repo; indexing.sh)
Then the file is copied to /groups/umcg-gd/tmp06/pseudo/
===============================================================================================================
EOH
Expand Down Expand Up @@ -93,16 +93,16 @@ then
## red input file
while read line
do
echo "LINE: ${line}"

# get dna number from first column
dnaNumber=$(echo "${line}" | awk '{print $1}')
# get pseudo id from second column
pseudo=$(echo "${line}" | awk '{print $2}')
# get
echo "${dnaNumber} ${database}"

if grep -m 1 "${dnaNumber}" "${database}"
then
vcfFilePath=$(grep -m 1 ${dnaNumber} ${database} | awk '{print $1}')
vcfFilePath=$(grep -m 1 "${dnaNumber}" "${database}" | awk '{print $1}')
## first check if there is a Gavin file, if so then use it
resultsDir=$(dirname "${vcfFilePath}")
if ssh -n chaperone "test -e ${resultsDir}/GAVIN/*${dnaNumber}*vcf.gz"
Expand Down
2 changes: 1 addition & 1 deletion tortilla.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ then
elif [[ "${1}" == "--validateNGS" || "${1}" == "-n" ]]
then
shift
${EBROOTNGSMINUTILS}/checkValidationNGS_DNA.sh ${@}
${EBROOTNGSMINUTILS}/checkValidationNGS_DNA_v2.sh ${@}
elif [[ "${1}" == "--revertBamToFastQ" || "${1}" == "-r" ]]
then
${EBROOTNGSMINUTILS}/revertFromBamToFastQ.sh ${@}
Expand Down

0 comments on commit b468dde

Please sign in to comment.