-
Notifications
You must be signed in to change notification settings - Fork 0
/
4.Genotyping_QC
502 lines (402 loc) · 19.6 KB
/
4.Genotyping_QC
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
#############################################################
Script written by: Ruth Chia (NIH/NIA/LNG)
Reference github: https://github.com/ruthchia/MyastheniaGravis_AnalysisCode/blob/main/01.QC_and_imputation/scripts/QC_preimputation_v2_keepRelated_part1of2.mg.sh
Updated on: 07-07-2020
#############################################################
plinkFILE=$1
outFILENAME=$2
email=$3
# Warning message to indicate argument requirement for script to run
if [ $# -eq 3 ]
then
echo "QC_preimputation_v2_keepRelated_1part1o2 running"
echo "This script should be executed in biowulf. If not, please terminate job."
echo "Additional file requirement: hapmap3_EUR_ASIA_AFR_only in .bed/.bim/fam format for PCA plotting and sample-level filtering for European samples"
echo "Notification email will be sent when task is complete."
else
echo "Need plink input file for merged case/control data and output file name"
echo "Note: This script is written to be executed in biowulf."
exit
fi
## How to use: sh QC_preimputation_v3_keepRelated_wgs.sh $plinkINPUT $outputNAME $email
## How to run as a swarm job. Be sure to specify processor that supports AVX2/Haswell
echo "sh ./scripts/QC_preimputation_v2_keepRelated_part1of2.updatedJuly2020.sh $plinkINPUT $outputNAME $email" > qc.swarm
swarm --file qc.swarm --logdir swarmOE_qc --gres=lscratch:200 -g 120 -t auto --time 03:00:00 --sbatch '--constraint=ibfdr'
##################
## QC OVERVIEW ###
##################
## PART 1of2 ###
STEP 1 : SAMPLE-LEVEL FILTERING (HETEROZYGOSITY, CALL RATE, GENDER CHECK, ANCESTRY)
STEP 1.1: Sample-level filtering (Heterozygosity check, removal of het outliers)
STEP 1.2: Sample-level filtering (sample call rate check, maximum missing calls per sample --mind 0.05)
STEP 1.3: Sample-level filtering (Gender check, remove samples that failed gender check)
STEP 1.4: Sample-level filtering (Ancestry check, remove samples that are not european)
STEP 1.5: Generate list of duplicates; remove duplicates
STEP 1.6: Generate list of related samples
STEP 2 : VARIANT-LEVEL FILTERING (Variant inclusion criteria to account for genotyping batch differences; then make final bfiles with --geno 0.05)
STEP 2.1: Case/control nonrandom missingness test \(i.e. exclude SNPs with P \< 1E-4\)
STEP 2.2: Haplotype-based test for non-random missing genotype data \(i.e. exclude SNPs with P \< 1E-4\)
STEP 2.3: Make list of varaints that failed HWE \(controls\) \(i.e. exclude SNPs with midP \< 1E-6\)
STEP 2.4: Final --geno 0.05 to account for final variant missingness
# Load necessary modules on biowulf
module load plink/1.9.0-beta4.4
module load R/3.5.2
module load GCTA/1.92.0beta3
module load flashpca/2.0
# Date stamp to record when the script run was started
start_date=$(date)
echo "Start date and time = $start_date"
res1=$(date +%s.%N)
mkdir QC
cd QC
## Sample cleaning inclusion:
NOTE do all samples have a gender??
NOTE do all your samples have an affection status??
# Make lists for plink to remove samples with uncertain/unknown gender, monomorphic SNPs, palindromic SNPs
echo List of samples with uncertain/unknown gender
awk '{if($6 == -9 || $5 == 0) print $1, $2}' ../${plinkFILE}.fam > SamplesToDelete.txt
echo List of monomorphic SNPs
awk '{if(($5 == $6) || $5 == 0 || $6 == 0) print $2}' ../${plinkFILE}.bim > MonomorphicBadSNPs.txt
echo List of palindromic SNPs
awk '{if(($5 == "A" && $6 == "T") || ($5 == "T" && $6 == "A") || ($5 == "G" && $6 == "C") || ($5 == "C" && $6 == "G")) print $2}' ../${plinkFILE}.bim > PalindromicSNPs.txt
echo Remove bad samples and bad SNPs and update bfiles ready for downstream QC of sample and SNP filtering
plink --bfile ../${plinkFILE} --remove SamplesToDelete.txt --exclude MonomorphicBadSNPs.txt --make-bed --out merged2
echo Remove palindromic SNPs
plink --bfile merged2 --exclude PalindromicSNPs.txt --make-bed --out merged3
Optional: TO GET REPORT FOR SAMPLE-BASED and VARIANT-BASED MISSING DATA
plink --bfile merged2 --missing --out call_rates
Rename file
mv call_rates.imiss CALL_RATES_ALL_SAMPLES.txt
rm call_rates.lmiss
rm call_rates.log
rm call_rates.hh
rm call_rates.nosex
mv call_rates.imiss CALL_RATES_ALL_SAMPLES.txt
# STEP 1: SAMPLE-LEVEL FILTERING (HETEROZYGOSITY, CALL RATE, GENDER CHECK, ANCESTRY)
echo
echo STEP 1.1: Sample-level filtering \(Heterozygosity check, removal of het outliers\)
echo
## What happens here:
## Check sample heterozygosity -> remove heterozygosity outliers --> remake binary files removing het_outliers
## No heterozygosity outliers: --het from LD pruned data > use F cut-off of -0.15 and <- 0.15 for inclusion
plink --bfile merged3 --geno 0.01 --maf 0.05 --indep-pairwise 50 5 0.5 --out pruning
plink --bfile merged3 --extract pruning.prune.in --make-bed --out pruned_data
plink --bfile pruned_data --het --out prunedHet
awk '{if ($6 <= -0.15 || $6 >= 0.15 ) print $1"\t"$2}' prunedHet.het > het_outliers.txt
OPTIONAL: rename .het file as a record of this step
mv prunedHet.het HETEROZYGOSITY_DATA.txt
# Remove samples with bad heterozygosity from genotype dataset, and make new bfiles
plink --bfile merged2 --remove het_outliers.txt --make-bed --out after_heterozyg
echo
echo STEP 1.2: Sample-level filtering \(sample call rate check, maximum missing calls per sample --mind 0.05\)
echo
## Filter out samples with missing call rates (i.e. maximum missing calls per sample) set at --mind 0.05 (i.e. maximum missing calls per sample is 5%)
## No call rate outliers
plink --bfile after_heterozyg --mind 0.05 --make-bed --out after_heterozyg_call_rate
OPTIONAL: rename .irem file as a record of this step
mv after_heterozyg_call_rate.irem CALL_RATE_OUTLIERS.txt
echo
echo STEP 1.3: Sample-level filtering \(Gender check, remove samples that failed gender check\)
echo
# (Note for neuroX data that does not have GWAS back bone, use F cut-off of 0.50 instead of 0.25/0.75 and only use the PAR region’s common variants always)
# PAR (hg19) = --chr 23 --from-bp 2699520 --to-bp 154931043 --maf 0.05 --geno 0.05 --hwe 1E-5
plink --bfile after_heterozyg_call_rate --check-sex 0.25 0.75 --maf 0.05 --out gender_check1
plink --bfile after_heterozyg_call_rate --chr 23 --from-bp 2699520 --to-bp 154931043 --maf 0.05 --geno 0.05 --hwe 1E-5 --check-sex 0.25 0.75 --out gender_check2
### Make list of samples (FID, IID) that failed gender check
grep "PROBLEM" gender_check1.sexcheck | awk '{print $1"\t"$2}' > failedGenderCheck_samples_to_remove.txt
grep "PROBLEM" gender_check2.sexcheck | awk '{print $1"\t"$2}' >> failedGenderCheck_samples_to_remove.txt
sort failedGenderCheck_samples_to_remove.txt | uniq > failedGenderCheck_samples_to_remove_noDups.txt
### Merge .fam file info with list of samples that failed gender check to figure which case or control failed
Rscript ../scripts/gender_checkINFO.R --no-save
R
require("data.table")
library(tidyverse)
fam <- fread("after_heterozyg_call_rate.fam", header = F)
check1 <- fread("gender_check1.sexcheck",header=T)
check2 <- fread("gender_check2.sexcheck",header=T)
merged_problem <- merge(check1,check2,by=c("FID","IID","PEDSEX","SNPSEX","STATUS")) %>% filter(STATUS == "PROBLEM")
merged_problemINFO <- merge(merged_problem, fam[,c(1,2,6)], by.x=c("FID","IID"),by.y=c("V1","V2"))
colnames(merged_problemINFO)[6:8] <- c("check1_F","check2_F","PHENO")
write.table(merged_problemINFO, "../failedGenderCheck_stats_INFO.txt",sep="\t",quote=F,col.names=T,row.names=F)
q()
n
# Remove samples that failed sex check --> remake plink2 binary files
plink --bfile after_heterozyg_call_rate --remove failedGenderCheck_samples_to_remove.txt --make-bed --out after_gender
OPTIONAL: rename .sexcheck file as a record of this step
mv gender_check1.sexcheck GENDER_CHECK1
mv gender_check2.sexcheck GENDER_CHECK1
echo
echo STEP 1.4: Sample-level filtering \(Ancestry check, remove samples that are not european\)
echo
## No ancestry outliers (i.e. europeans) -> based on Hapmap3 PCA plot, should be near combined CEU/TSI
# First, download hapmap3 data. Prep data to include only EUROPEAN (TSI,CEU), ASIAN (CHB, CHD, JPT) and AFRICAN (LWK, YRI, ASW) and update IID to a format that can be used with the PCA_in_R.R. The hapmap3 version that is ready for us is hapmap3_EUR_ASIA_AFR_only in .bed/.bim/fam format.
# The script used to generate the working hapmap3 version is hapmap3_prep_v1.sh
# Make a list of common SNPs from after_gender.bim and hapmap3.
awk '{print $2}' ../scripts/hapmap3_EUR_ASIA_AFR_only.bim > temp1.txt
awk '{print $2}' after_gender.bim >> temp1.txt
awk '{print $1}' temp1.txt | sort | uniq -d > commonSNPs.txt
# Pull common SNPs out from after_gender.bim
plink --bfile after_gender --extract commonSNPs.txt --make-bed --out temp2 --allow-no-sex
# Make SNP map update; then update common SNPs positions to build37/hg19
## hapmap3 is in build36. case/control data is in build37. Update hapmap positions based on case/control temp2.bim file
awk '{print $2, $4}' temp2.bim > updateMap.txt
plink --bfile ../scripts/hapmap3_EUR_ASIA_AFR_only --extract commonSNPs.txt --update-map updateMap.txt --make-bed --out hapmap3_hg19_new --allow-no-sex
# Merge hapmap3_EUR_ASIA_AFR_only bfiles with after_gender bfiles
plink --bfile after_gender --bmerge hapmap3_hg19_new --out hapmap3_DISEASE_snplis --make-bed --allow-no-sex
plink --bfile after_gender --flip hapmap3_DISEASE_snplis-merge.missnp --make-bed --out after_gender3 --allow-no-sex
plink --bfile after_gender3 --bmerge hapmap3_hg19_new --out hapmap3_DISEASE_snplis --make-bed --allow-no-sex
plink --bfile after_gender3 --exclude hapmap3_DISEASE_snplis-merge.missnp --out after_gender4 --make-bed --allow-no-sex
plink --bfile after_gender4 --bmerge hapmap3_hg19_new --out hapmap3_DISEASE_snplis --make-bed --allow-no-sex
# Generate pca with additional filter to account for genotype missingness set at --geno 0.05.
plink --bfile hapmap3_DISEASE_snplis --geno 0.05 --out pca --make-bed --pca --allow-no-sex
# Prepare files for input into R for PCA plots
grep "EUROPE" pca.eigenvec > eur.txt
grep "ASIA" pca.eigenvec > asia.txt
grep "AFRICA" pca.eigenvec > afri.txt
grep -v -f eur.txt pca.eigenvec | grep -v -f asia.txt | grep -v -f afri.txt | awk '{print "0 "$0}' > pca.eigenvec2
awk '{print "1 "$0}' eur.txt >> pca.eigenvec2
awk '{print "2 "$0}' asia.txt >> pca.eigenvec2
awk '{print "3 "$0}' afri.txt >> pca.eigenvec2
# Run R script for PCA plotting and filtering; then make new bfiles
echo
echo \(continued\) STEP 2.4: Run PCA_in_R.R to filter out non-europeans
echo
Rscript ../scripts/PCA_in_R_AncestryCheck_SD6.R --no-save
echo
plink --bfile after_gender --keep ../PCA_filtered_europeans.txt --make-bed --out after_gender_heterozyg_hapmap_keepEUR
echo
echo STEP 1.5: Generate list of duplicate; remove duplicates
echo
mkdir GRM
# Prune genotype for GRM
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR \
--geno 0.01 \
--maf 0.05 \
--hwe 5e-6 \
--indep-pairwise 50 5 0.5 \
--autosome \
--out GRM/temp.pruning
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR \
--extract GRM/temp.pruning.prune.in \
--keep-allele-order \
--allow-no-sex \
--make-bed \
--out GRM/pruned_temp
rm GRM/temp.pruning*
# Generate grm matrix with GCTA
gcta \
--bfile GRM/pruned_temp \
--make-grm-gz \
--out GRM/GRM_matrix \
--autosome \
--maf 0.05 \
--thread-num 32
# Identify duplicates (i.e. pihat > 0.8)
gcta --grm-cutoff 0.8 \
--grm-gz GRM/GRM_matrix \
--out GRM/GRM_matrix_08 \
--make-grm \
--thread-num 32
# Make list of duplicates
awk '{print $2}' GRM/GRM_matrix_08.grm.id > GRM/SampleList.noDups.IID.txt
# Run script to generate sample list of duplicates
Rscript ../scripts/grm_duplicates.R
R
!/usr/bin/env Rscript
require(data.table)
require(tidyverse)
Read in data
noDups <- fread("GRM/SampleList.noDups.IID.txt",header=F)
fam <- fread("GRM/pruned_temp.fam",header=F)
Dups <- subset(fam, !(fam$V2 %in% noDups$V1))
dim(noDups)
dim(fam)
dim(Dups)
write.table(Dups[,c("V1","V2")], "GRM/Dups.ToRemove.FIDspaceIID.txt", sep=" ", quote=F, col.name=F,row.names=F)
q()
n
# Remove dups (i.e. keep related samples) and remake plink binaries
grep -Ewf GRM/Dups.ToRemove.FIDspaceIID.txt after_gender_heterozyg_hapmap_keepEUR.fam | awk '{print $1,$2}' OFS=" " > Dups.ToRemove.FIDspaceIID.txt
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR \
--remove Dups.ToRemove.FIDspaceIID.txt \
--make-bed \
--out after_gender_heterozyg_hapmap_keepEUR_noDups
echo
echo STEP 1.6: Generate list of related samples
echo
# Remove dups from pruned data in Step 1.5 above, rerun gcta to generate GRM matrix of samples without duplicates)
plink \
--bfile GRM/pruned_temp \
--remove GRM/Dups.ToRemove.FIDspaceIID.txt \
--make-bed \
--out GRM/pruned_temp.noDups
gcta \
--bfile GRM/pruned_temp.noDups \
--make-grm-gz \
--out GRM/GRM_matrix.noDups \
--autosome \
--maf 0.05 \
--thread-num 32
gcta \
--grm-cutoff 0.125 \
--grm-gz GRM/GRM_matrix.noDups \
--out GRM/GRM_matrix.noDups_0125 \
--make-grm \
--thread-num 32
# Run script to generate sample list of related samples
Rscript ../scripts/grm_unrelated_updatedFeb2022.R
grep -Ewf GRM/FILTERED.to.remove.GRM_matrix.noDups.relatedSamples_IID.txt after_gender_heterozyg_hapmap_keepEUR.fam | awk '{print $1,$2}' OFS=" " > ../FILTERED.to.remove.GRM_matrix.noDups.relatedSamples_FIDspaceIID.txt
R
#!/usr/bin/env Rscript
# Load libraries
require(data.table)
require(tidyverse)
# Read in data, filter and save output
grmMASTER <- fread("GRM/GRM_matrix.noDups.grm.gz")
grmMASTER1 <- subset(grmMASTER, grmMASTER$V1 != grmMASTER$V2) %>% filter(V4 > 0.125)
write.table(grmMASTER1, "GRM/GRM_matrix.noDups_related.grm", quote=F, sep="\t", col.names=T,row.names=F)
grm <- fread("GRM/GRM_matrix.noDups_related.grm", header=T)
ids <- fread("GRM/GRM_matrix.noDups.grm.id", header = F)
ids$index <- as.numeric(row.names(ids))
grm1 <- subset(grm, grm$V1 != grm$V2) %>% rename(Sample1_idx = V1, Sample2_idx=V2)
fam <- fread("GRM/pruned_temp.noDups.fam", header = F)
grm0125 <- fread("GRM/GRM_matrix.noDups_0125.grm.id", header = F)
grm0125_related <- subset(fam, !fam$V2 %in% grm0125$V2)
merge1 <- merge(grm1,ids[,c(2,3)],by.x="Sample1_idx",by.y="index",all.x=T) %>% rename(Sample1 = V2)
merge2 <- merge(merge1,ids[,c(2,3)],by.x="Sample2_idx",by.y="index",all.x=T) %>% rename(Sample2 = V2)
merge3 <- merge(merge2[,c(5,6,3,4)],fam[,c(2,5,6)],by.x="Sample1",by.y="V2",all.x=T) %>% rename(Sample1_SEX=V5,Sample1_PHENO=V6)
merge4 <- merge(merge3,fam[,c(2,5,6)],by.x="Sample2",by.y="V2",all.x=T) %>% rename(Sample2_SEX=V5,Sample2_PHENO=V6)
merge4 <- merge4[,c(2,1,3:8)] %>% rename(No.Non.missing.SNP=V3,est.genetic.relatedness=V4)
merge4$BothDiffPHENO <- merge4$Sample1_PHENO != merge4$Sample2_PHENO
write.table(merge4,"GRM/FILTERED.to.remove.GRM_matrix.noDups_related.grm.SampleINFO.txt", sep="\t",quote=F,col.names=T,row.names=F)
list1 <- merge4[,"Sample1",drop=T] %>% data.frame() %>% distinct()
colnames(list1)[1] <- "Sample1"
list2 <- merge4[,"Sample2",drop=T] %>% data.frame() %>% distinct()
colnames(list2)[1] <- "Sample2"
mergeList <- merge(list1,list2,by.x="Sample1",by.y="Sample2") %>% rename(Sample = Sample1)
subset1 <- subset(merge4, !merge4$Sample1 %in% mergeList$Sample)
subset2 <- subset(subset1, !subset1$Sample2 %in% mergeList$Sample)
diffPHENO <- subset(subset2, subset2$BothDiffPHENO == "TRUE")
sampleToExclude1 <- subset(diffPHENO, diffPHENO$Sample1_PHENO == 2) %>% select(Sample2) %>% rename(Sample = Sample2)
sampleToExclude2 <- subset(diffPHENO, diffPHENO$Sample2_PHENO == 2) %>% select(Sample1) %>% rename(Sample = Sample1)
missingSEX1 <- subset(merge4, merge4$Sample1_SEX == "0") %>% select(Sample1) %>% rename(Sample=Sample1)
missingSEX2 <- subset(merge4, merge4$Sample2_SEX == "0") %>% select(Sample2) %>% rename(Sample=Sample2)
sampleToExclude_missingSEX <- unique(rbind(missingSEX1,missingSEX2))
sampleToExclude3 <- rbind(sampleToExclude1,sampleToExclude2,sampleToExclude_missingSEX)
temp01 <- subset(merge4, !merge4$Sample1 %in% sampleToExclude3$Sample)
temp02 <- subset(merge4, !merge4$Sample2 %in% sampleToExclude3$Sample)
temp03 <- merge(temp01,temp02, by=c("Sample1","Sample2","No.Non.missing.SNP","est.genetic.relatedness","Sample1_SEX","Sample1_PHENO","Sample2_SEX","Sample2_PHENO","BothDiffPHENO"))
temp04 <- subset(grm0125_related, grm0125_related$V2 %in% temp03$Sample1)
temp05 <- subset(grm0125_related, grm0125_related$V2 %in% temp03$Sample2)
list04 <- temp04[,"V2",drop=T] %>% rename(Sample=V2)
list05 <- temp05[,"V2",drop=T] %>% rename(Sample=V2)
list06 <- unique(rbind(list04,list05))
sampleToExcludeFINAL <- rbind(sampleToExclude3,list06)
write.table(sampleToExcludeFINAL,"GRM/FILTERED.to.remove.GRM_matrix.noDups.relatedSamples_IID.txt", sep=" ",quote=F,col.names=F,row.names=F)
q()
n
# STEP 2: Variant inclusion criteria to account for genotyping batch differences; then make final bfiles with --geno 0.05
echo
echo STEP 2. Variant inclusion criteria to account for genotyping batch differences
echo STEP 2.1: Case/control nonrandom missingness test \(i.e. exclude SNPs with P \< 1E-4\)
echo
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR \
--test-missing \
--out missing_snps
awk '{if ($5 <= 0.0001) print $2 }' missing_snps.missing > missing_snps_1E4.txt
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR_noDups \
--exclude missing_snps_1E4.txt \
--make-bed \
--out after_gender_heterozyg_hapmap_keepEUR_noDups_CaseConMissingness
sort -u missing_snps_1E4.txt > VARIANT_TEST_MISSING_SNPS.txt
echo
echo STEP 2.2: Haplotype-based test for non-random missing genotype data \(i.e. exclude SNPs with P \< 1E-4\)
echo
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR_noDups_CaseConMissingness \
--test-mishap \
--out missing_hap
awk '{if ($8 <= 0.0001) print $9 }' missing_hap.missing.hap > missing_haps_1E4.txt
sed 's/|/\
/g' missing_haps_1E4.txt > missing_haps_1E4_final.txt
sort -u missing_haps_1E4_final.txt > HAPLOTYPE_TEST_MISSING_SNPS.txt
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR_noDups_CaseConMissingness \
--exclude missing_haps_1E4_final.txt \
--make-bed \
--out after_gender_heterozyg_hapmap_keepEUR_noDups_CaseConMissingness_HaploMissingness
echo
echo STEP 2.3: Make list of varaints that failed HWE \(controls\) \(i.e. exclude SNPs with midP \< 1E-6\)
echo
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR_noDups_CaseConMissingness_HaploMissingness \
--filter-controls \
--hwe 1e-6 midp \
--write-snplist \
--out Vars.passhwecontrols1e-6
echo
echo STEP 2.4: Final --geno 0.05 to account for final variant missingness
echo
plink \
--bfile after_gender_heterozyg_hapmap_keepEUR_noDups_CaseConMissingness_HaploMissingness \
--extract Vars.passhwecontrols1e-6.snplist \
--geno 0.05 \
--make-bed \
--out ../FILTERED.${outFILENAME}.keepRelated
# Save log files in new log directory
mkdir logFiles
mv *.log ./logFiles
# Gives runtime
res2=$(date +%s.%N)
dt=$(echo "$res2 - $res1" | bc)
dd=$(echo "$dt/86400" | bc)
dt2=$(echo "$dt-86400*$dd" | bc)
dh=$(echo "$dt2/3600" | bc)
dt3=$(echo "$dt2-3600*$dh" | bc)
dm=$(echo "$dt3/60" | bc)
ds=$(echo "$dt3-60*$dm" | bc)
end_date=$(date)
echo "End date and time = $end_date"
printf "Total runtime: %d:%02d:%02d:%02.4f\n" $dd $dh $dm $ds
printf "Total runtime: %d:%02d:%02d:%02.4f\n" $dd $dh $dm $ds > Runtime_summary.txt
mailx -s "biowulf swarm alert message: QC_preimputation_v2_keepRelated_part1of2.updatedJuly2020.sh TASK COMPLETE" $email < Runtime_summary.txt
sleep 10s
rm Runtime_summary.txt
####### REMOVE A LOT...
### Remove intermediate files
rm merged2.*
rm merged3.*
rm pruning.*
rm pruned_data.*
rm prunedHet.*
rm after_heterozyg.*
rm after_heterozyg_call_rate.*
rm gender_check2.*
rm gender_check1.*
rm after_gender.*
rm commonSNPs.txt
rm temp1.txt
rm temp2.*
rm updateMap.txt
rm after_gender3.*
rm after_gender4.*
rm hapmap3_DISEASE_snplis*
rm afri.txt
rm asia.txt
rm eur.txt
rm pca.bed
rm pca.bim
rm pca.fam
rm pca.hh
rm pca.eigenval
rm after_gender_heterozyg_hapmap_keepEUR*
rm missing_snps_1E4.txt
rm missing_snps.*
rm missing_haps_1E4_final.txt
rm missing_haps_1E4.txt
rm missing_hap.*