/
processReadsPipeline.sh
203 lines (166 loc) · 6.79 KB
/
processReadsPipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
wd=$(pwd)
#module load star
#module load umitools
#module load samtools
#module load bamtools
#module load cutadapt
#module load bedtools
trim() {
if [ ${S: -3} == ".gz" ]; then
zcat "$S" | umi_tools extract --bc-pattern="$bcPattern" -L extract.log > "$file".umi.fastq
else
cat "$S" | umi_tools extract --bc-pattern="$bcPattern" -L extract.log > "$file".umi.fastq
fi
cutadapt -a "$adapter" --minimum-length=23 --overlap=5 -o "$file".trim.fastq "$file".umi.fastq
}
runStar() {
mkdir "$file"
STAR --runThreadN "$nthreads" --genomeDir "$STARREF" --readFilesIn "$file".trim.fastq --outFileNamePrefix "$file"/"$file" --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --outSAMtype BAM SortedByCoordinate
samtools index "$file"/"$file"Aligned.sortedByCoord.out.bam "$file"/"$file"Aligned.sortedByCoord.out.bam.bai
}
dedup(){
#dedup on full aligned file is too big
#umitools dedup -I "$file"/"$file"Aligned.sortedByCoord.out.bam --spliced-is-unique -S "$file"/"$file".dedup.bam -L "$file"/dedup.log --output-stats="$file"/dedup.stats
cd "$wd"/"$file"
##take out unmapped reads
samtools view -b -F 4 "$file"Aligned.sortedByCoord.out.bam -o "$file"mapped.bam
samtools sort "$file"mapped.bam -o "$file"mapped.sort.bam
samtools index "$file"mapped.sort.bam "$file"mapped.sort.bam.bai
##split the bam file up by chromosome and dedup each separately
mkdir splitBam
mv "$file"mapped.sort.bam splitBam
cd splitBam
bamtools split -in "$file"mapped.sort.bam -reference -refPrefix REF_
mv "$file"mapped.sort.bam ../
mkdir dedup
##for each split bam, index it and dedup it
for bam in *.bam; do
samtools index "$bam" "$bam".bai
name=$(echo $bam | sed 's/.bam//')
loc=$(pwd)
umi_tools dedup -I "$bam" --spliced-is-unique --mapping-quality=3 -S dedup/"$name".sort.bam
cd dedup ##make sorted deduped bam:
#samtools view -q 3 "$name".bam -o temp.bam #Filter to leave out reads with poor mapping quality.
#samtools sort temp.bam -o "$name".sort.bam
samtools index "$name".sort.bam "$name".sort.bam.bai
cd ../
done
##re-combine split, deduped bams
cd ../
ls splitBam/dedup/*.sort.bam > deduped_bams.txt
samtools merge -O BAM -b deduped_bams.txt "$file".dedup.bam
cd $wd
}
mappingStats(){
start=$(wc -l "$file".umi.fastq | awk '{print $1/4}')
trim=$(wc -l "$file".trim.fastq | awk '{print $1/4}')
#-F 256 to avoid counting secondary alignments
mapped=$(samtools view -c -F 256 "$file"/"$file"mapped.bam)
uniq=$(samtools view -c -F 256 "$file"/"$file".dedup.bam)
yield=$(echo "scale=2; $uniq/$start*100" | bc)
duprate=$(echo "scale=2; (1-$uniq/$mapped)*100" | bc)
if [[ -f "$LOGFILE" ]]; then
echo "$file Starting:$start Trimmed:$trim Mapped:$mapped Unique:$uniq %Yield:$yield %Duplicates:$duprate" >> "$wd"/"$LOGFILE"
fi
}
genomeCov(){
mkdir "$wd"/"$file"/coverage/
cd "$wd"/"$file"/splitBam/dedup
for bam in *.sort.bam; do
chrom=$(echo $bam | sed s/"$file"mapped.sort.REF_// | sed s/.sort.bam//) #the chromosome name of this bam file
bedtools genomecov -split -strand + -dz -ibam "$bam" > "$wd"/"$file"/coverage/"$chrom"+.coverage
bedtools genomecov -split -strand - -dz -ibam "$bam" > "$wd"/"$file"/coverage/"$chrom"-.coverage
bedtools genomecov -5 -strand + -dz -ibam "$bam" > "$wd"/"$file"/coverage/"$chrom".5prime+.coverage
bedtools genomecov -5 -strand - -dz -ibam "$bam" > "$wd"/"$file"/coverage/"$chrom".5prime-.coverage
done
cd $wd
}
#########Begin command line options#########
if [ $# -lt 1 ] ; then
echo "Usage: processReadsPipeline.sh -i file1.fastq,...,filen.fastq -r STAR_reference [...]"
echo "processReadsPipeline.sh -h for more information"
exit
fi
usage()
{
cat << EOF
/bin/bash processReadsPipeline.sh -i file1.fastq,...filen.fastq -r STAR_reference [...]
This pipeline processes input .fastq files from fSHAPE libraries to produce coverage bedgraph files for each chromosome-strand.
Fastq files can also be in .gz format.
If .fastq files are very large it is recommended to run multiple processReadPipeline.sh instances in parallel, rather than providing
all samples as a input in a list. Very large files may take 48 hours to be processed.
This pipeline assumes an RT-stop SHAPE method was used (as opposed to mutational profiling (MaP) method), such that both total coverage
and 5' read end coverage is calculated for each chromosome and strand. (for ex: chr1+.cov, chr1+.5cov, chr1-.cov, chr1-.5cov).
Mutational profiling methods are also possible with fSHAPE, but require different analysis steps than implemented in this pipeline.
OPTIONS:
REQUIRED
-i, --inputs A .fastq file of fSHAPE sequencing reads or comma separated list of .fastq files: file1.fastq,...filen.fastq
-r, --reference Path to the STAR reference to align reads to.
OPTIONAL(has defaults)
-h, --help Print this help message.
-a, --adapter The adapter sequence that will be trimmed by CutAdapt. Default is AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG.
-u, --UMI The UMI sequence pattern to be recognized by UMItools. Default is NNNNCCCCNNNNN.
-p, --threads The number of threads to use during read aligning by STAR.
-l, --log Name of logfile that records completion status of major steps of pipeline. Path assumed to be current working directory.
EOF
}
file="" #use processReadsPipelineMaster.sh to run all samples at once
STARREF=""
bcPattern=NNNNCCCCNNNNN
adapter=AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG
nthreads=1
SAMPLES=""
LOGFILE=""
while [[ -n $1 ]]; do
case $1 in
-h | --help) usage; exit 1;;
-i | --input) shift; SAMPLES="$1";;
-r | --reference) shift; STARREF="$1";;
-a | --adapter) shift; adapter="$1" ;;
-u | --UMI) shift; bcPattern="$1";;
-p | --threads) shift; nthreads="$1";;
-l | --log) shift; LOGFILE="$1";;
-* | --*) usage; exit 1;;
*) echo "Argument $1 needs switch..."; exit ;;
esac
shift
done
###############End command line options###########
##echo "$STARREF $SAMPLES $nthreads $bcPattern $adapter"
if [[ "$SAMPLES" == "" ]]; then echo "No input files provided."
exit
fi
if [[ -f "$LOGFILE" ]]; then
rm $LOGFILE
fi
for S in $(echo $SAMPLES | sed "s/,/ /g"); do
if [[ ! -f "$S" ]]; then echo "Cannot find file $S. Please check filename/filepath."; exit ; fi
file=$(echo $S | sed "s/.fastq/ /" | awk '{print$1}' | tr '/' '\t' | rev | cut -f1 | rev)
cd $wd
trim
if [[ "$LOGFILE" != "" ]]; then
echo "TRIMMED $file" >> "$wd"/"$LOGFILE"
echo "" >> "$wd"/"$LOGFILE"
fi
cd $wd
runStar
if [[ -f "$LOGFILE" ]]; then
echo "MAPPED with STAR on $file" >> "$wd"/"$LOGFILE"
echo "STAR mapping stats are in $wd/$file/$fileLog.final.out" >> "$wd"/"$LOGFILE"
echo "" >> "$wd"/"$LOGFILE"
fi
cd $wd
dedup
if [[ -f "$LOGFILE" ]]; then
echo "DEDUPED $file with Umi_tools" >> "$wd"/"$LOGFILE"
echo "" >> "$wd"/"$LOGFILE"
fi
cd $wd
mappingStats
cd $wd
genomeCov
if [[ -f "$LOGFILE" ]]; then
echo "GENOMECOV on $file" >> "$wd"/"$LOGFILE"
echo "" >> "$wd"/"$LOGFILE"
fi
done