-
Notifications
You must be signed in to change notification settings - Fork 2
/
script.sh
executable file
·58 lines (49 loc) · 1.54 KB
/
script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/bin/bash
main() {
#
# Fetch and index genome
#
echo "Fetching & indexing reference genome"
dx download "$genome_fastagz" -o genome.fa.gz
gunzip genome.fa.gz
samtools faidx genome.fa
#
# Fetch and index mappings
#
echo "Fetching & indexing BAM"
input=""
for i in "${!sorted_bams[@]}"; do
dx download "${sorted_bams[$i]}" -o "input-$i.bam"
samtools index "input-$i.bam"
input="$input -b input-$i.bam"
done
# for parallelization: make list of reference sequences in order of
# decreasing size
# TODO: even better would be order of decreasing mapped reads, which
# we could get from samtools idxstats...tricky with multiple
# input bams, though.
( IFS=$'\n'; echo "${exclude_sequences[*]}" ) > /tmp/exclude_sequences
sort -k 2rn,2rn genome.fa.fai | cut -f1 | egrep -v -f /tmp/exclude_sequences > /tmp/rseqs
sleep 1
echo "Reference sequences:"
cat /tmp/rseqs
sleep 1
#
# Run freebayes
#
echo "Executing parallel freebayes"
xargs -a /tmp/rseqs -n 1 -P 8 -r -I XXX --verbose \
freebayes $input -v XXX.vcf -f genome.fa -r XXX $advanced_options
echo "Results:"
ls -1sh *.vcf
# for vcfcombine: make ordered list of .vcf file names
cut -f1 genome.fa.fai | egrep -v -f /tmp/exclude_sequences | awk '{printf("%s.vcf\n",$0)}' > /tmp/rseqs.vcfs
#
# Upload results
#
name=`dx describe "${sorted_bams[0]}" --name`
name="${name%.bam}"
echo "Uploading results"
file_id=`xargs -a /tmp/rseqs.vcfs -n 999999 vcfcombine | bgzip -c | dx upload -o "$name.vcf.gz" --brief -`
dx-jobutil-add-output "variants_vcfgz" "$file_id"
}