-
Notifications
You must be signed in to change notification settings - Fork 0
/
call
144 lines (114 loc) · 2.99 KB
/
call
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#####
# ragweed-selection
# Battlay et. al. 2023
# call variants
#####
# variant calling with GATK unified genotyper
cd ~/scratch/ragweed/
# list BAM files (n=359)
ls bam/*.bam > bam/bam-UG.list
# list the 19 'big' scaffolds; >1Mb in length
cat ref/ragweed-dipasm-hap1.fasta.fai | awk '$2 > 1000000 {print $1}' > UG-BIG-scafflist.txt
# list the 59 'small' scaffolds; <1Mb in length, but larger than 100000
cat ref/ragweed-dipasm-hap1.fasta.fai | awk '$2 < 1000000 {print $0}' | awk '$2 > 100000 {print $1}' > UG-SML-scafflist.txt
# array job to call variants on small scaffolds
# UG-SML.sh
#!/bin/bash
#SBATCH --job-name=UG-SML
#SBATCH --account=def-rieseber
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=10G
#SBATCH --time=8:00:00
#SBATCH --output=UG-SML-%a.out
#SBATCH --error=UG-SML-%a.err
#SBATCH --array=1-59
cd ~/scratch/ragweed/
N=$SLURM_ARRAY_TASK_ID
scaff=$(cat UG-SML-scafflist.txt | head -n $N | tail -n 1)
module purge
module load nixpkgs/16.09
module load gatk/3.8
echo $scaff > $scaff.list
java -jar $EBROOTGATK/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-nt 8 \
-R ref/ragweed-dipasm-hap1.fasta \
-I bam/bam-UG.list \
-L $scaff.list \
-glm BOTH \
-o vcf-SML/$scaff.raw.vcf
###
# array job to call variants on a big scaffold in 1Mb windows
# UG-BIG-h1s1.sh
#!/bin/bash
#SBATCH --job-name=UG-BIG-h1s1
#SBATCH --account=def-rieseber
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=10G
#SBATCH --time=12:00:00
#SBATCH --output=UG-BIG-h1s1-%a.out
#SBATCH --error=UG-BIG-h1s1-%a.err
#SBATCH --array=1-67
cd ~/scratch/ragweed/
scaff=h1s1
N=$SLURM_ARRAY_TASK_ID
start=$(((N-1)*1000000+1))
end=$((N*1000000))
module purge
module load nixpkgs/16.09
module load gatk/3.8
echo $scaff:$start-$end > $scaff-$N.list
java -jar $EBROOTGATK/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-nt 8 \
-R ref/ragweed-dipasm-hap1.fasta \
-I bam/bam-UG.list \
-L $scaff-$N.list \
-glm BOTH \
-o vcf-BIG/$scaff-$N.raw.vcf
###
# last part of a big scaffold; set scaffold end manually
# UG-BIG-h1s1-68.sh
#!/bin/bash
#SBATCH --job-name=UG-BIG-h1s1-68
#SBATCH --account=def-rieseber
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --mem-per-cpu=10G
#SBATCH --time=12:00:00
#SBATCH --output=UG-BIG-h1s1-68.out
#SBATCH --error=UG-BIG-h1s1-68.err
cd ~/scratch/ragweed/
scaff=h1s1
N=68
start=67000001
end=67187999
module purge
module load nixpkgs/16.09
module load gatk/3.8
echo $scaff:$start-$end > $scaff-$N.list
java -jar $EBROOTGATK/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-nt 8 \
-R ref/ragweed-dipasm-hap1.fasta \
-I bam/bam-UG.list \
-L $scaff-$N.list \
-glm BOTH \
-o vcf-BIG/$scaff-$N.raw.vcf
###
# check vcf parts are there
for i in h1s1 h1s2 h1s3 h1s4 h1s5 h1s6 h1s7 h1s8 h1s9 h1s10 h1s11 h1s12 h1s13 h1s14 h1s15 h1s16 h1s17 h1s18 h1s20
do
for j in {1..100}
do
ls vcf-BIG/$i-$j.raw.vcf.idx
done
done
# merge into single VCF
ls vcf-BIG/*.vcf > vcf-all.list
ls vcf-SML/*.vcf >> vcf-all.list