-
Notifications
You must be signed in to change notification settings - Fork 0
/
abba_baba_freq.17-10-05.py
441 lines (305 loc) · 12 KB
/
abba_baba_freq.17-10-05.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
#!/usr/local/bin/python
# Python 2.7.6
# abba_baba_het.py M
# 02 Jan 2017
# Mathias Scharmann
# partly based on Dtest.py (pyRAD) by Deren Eaton, but IMHO much easier to input data and MUCH faster
# usage example
# python ../abba_baba_freq.17-01-02.py --i single_samples.derived_allele_freq.txt --tests chunk_${LSB_JOBINDEX} --o Dstats_out_${LSB_JOBINDEX}.txt
"""
new feature: will scan an eventual output file and only conduct the tests that have not yet been made in a previous run!
"""
import random
import itertools
import argparse, os
from math import erf, sqrt
# results / return values are:
# patterns_total = 1000 # count of all patterns analysed in all 4 taxa (roughly the number of sites)
# ABBA = 5 # count of ABBA sites
# BABA = 10 # count of BABA sites
# D = float((ABBA-BABA)) / (ABBA+BABA)
# pdisc = float((ABBA+BABA)) / patterns_total # proportion of discordant patterns
# STD = 0.5 # standard deviation of bootstrap resampled D-statistics; bootstrapping on the loci used to calculate D
# Z = (abs(D/STD)) # the Z statistic; absolute value of D divided by the STD
# p_value = p value as converted from the Z statistic
# input: a dictionary with taxa as keys and strings of SNPs as values:
########
########
# this function passes the flagged command line arguments to the script and checks that they are plausible / complete
def get_commandline_arguments ():
	"""
	Parse the command line flags (--i, --tests, --o) and return the
	argparse namespace; input files are validated via extant_file.
	"""
	cli = argparse.ArgumentParser()
	cli.add_argument("--i", required=True, type=extant_file,
		help="name and path of the derived allele frequency file", metavar="FILE")
	cli.add_argument("--tests", required=True, type=extant_file,
		help="""name and path of the tests to be done; one test per line, taxa tab-separated,\n
		p1 p2 p3 o""", metavar="FILE")
	cli.add_argument("--o", required=True, help="name and path of the output file", metavar="FILE")
	# argparse exits with a usage message if anything required is missing
	return cli.parse_args()
# this function checks whether a file exists and stops the script if it does not
def extant_file(x):
	"""
	'Type' for argparse - checks that file exists but does not open.
	"""
	if not os.path.exists(x):
		# parenthesized single-argument print behaves identically under Python 2 and 3
		print("Error: {0} does not exist".format(x))
		# BUGFIX: the original bare exit() terminated with status 0, so shell
		# pipelines could not detect the failure; exit non-zero instead
		raise SystemExit(1)
	return str(x)
def filter_input_data (data_indict, testorder):
	"""
	Keep only the sites that are usable for this test's four taxa.

	A site is kept when it has full data (no "-") for all four taxa AND
	it is not monomorphic ancestral in {p3, O} or in {p1, p2}: such sites
	make zero contribution to the D-statistic (Patterson et al. 2012).

	Returns (list of [p1, p2, p3, o] float-frequency lists, count of kept sites).
	"""
	print("filtering sites for test taxa")
	# restrict to the four taxa of this test
	indict = {tax: data_indict[tax] for tax in testorder}
	# BUGFIX/compat: the original indexed indict.values()[0], which crashes
	# under Python 3; take the site count from a known taxon instead
	n_sites = len(indict[testorder[0]])
	testorder_freqs = []
	for column_index in range(n_sites):
		freqs = [ indict[tax][column_index] for tax in testorder ]
		if "-" not in freqs: # require full data
			freqs = [float(x) for x in freqs]
			# drop sites with zero D-stat contribution (see docstring)
			if freqs[2] + freqs[3] != 0.0 and freqs[0] + freqs[1] != 0.0:
				testorder_freqs.append( freqs )
	return testorder_freqs, len(testorder_freqs)
def count_informative_sites (testorder_freqs):
	"""
	Count the ABBA-BABA informative sites.

	A site can show an ABBA or BABA pattern only if the derived allele is
	present in p3 AND both alleles segregate in (p1, p2).
	Rows in testorder_freqs are ordered [p1, p2, p3, o].
	"""
	cnt = 0
	for freqs in testorder_freqs:
		if freqs[2] <= 0.0:
			continue # derived allele absent from p3
		pair_total = freqs[0] + freqs[1]
		if 0.0 < pair_total < 2.0: # both alleles present in (p1, p2)
			cnt += 1
	return cnt
def get_Dstat_light (patterns):
	"""
	Fast D-statistic from pre-recorded per-site [numerator, denominator]
	pairs; used during bootstrapping so the per-site terms are not
	recomputed on every resample.  Returns 0.0 when the denominator
	sum is zero.
	"""
	num_total = 0
	den_total = 0
	for num, den in patterns:
		num_total += num
		den_total += den
	try:
		return num_total / den_total
	except ZeroDivisionError:
		return 0.0
def get_Dstat_Patterson (filtered_freq_data):
	"""
	D-statistic from per-site derived allele frequencies, rows ordered
	[p1, p2, p3, O].

	Formulas follow Patterson, N. J., Moorjani, P., Luo, Y., Mallick, S.,
	Rohland, N., Zhan, Y., ... Reich, D. (2012). Ancient Admixture in Human
	History. Genetics. doi:10.1534/genetics.112.145037, with taxa [y,z,w,x]:
		num  = (w - x) * (y - z)
		den  = (w + x - 2*w*x) * (y + z - 2*y*z)
		ABBA = (den - num) / 2
		BABA = num + (den - num) / 2

	Returns (D, f_d, ABBA_sum, BABA_sum, patterns); f_d is always "na" in
	this variant, and patterns records [num, den] per site for fast
	recomputation of D during bootstrap resampling.
	"""
	num_total = 0
	den_total = 0
	abba_total = 0 # illustrative only, not used for D
	baba_total = 0 # illustrative only, not used for D
	site_terms = [] # per-site [num, den], reused during resampling
	for y, z, w, x in filtered_freq_data:
		num = (w - x) * (y - z)
		den = (w + x - 2 * w * x) * (y + z - 2 * y * z)
		# rearranged from Patterson et al. p. 1702, top left
		half_gap = (den - num) / 2.0
		abba_total += half_gap
		baba_total += num + half_gap
		site_terms.append( [num, den] )
		num_total += num
		den_total += den
	try:
		D = num_total / den_total
	except ZeroDivisionError:
		D = 0.0
	return D, "na", abba_total, baba_total, site_terms
def get_ABBA_BABA_full (filtered_freq_data):
	"""
	D and f_d from per-site derived allele frequencies, rows ordered
	[p1, p2, p3, O].

	These formulas follow:
	Martin SH, Davey JW, Jiggins CD (2014) Evaluating the use of ABBA-BABA
	statistics to locate introgressed loci. Molecular Biology and Evolution.

	Returns (D, f_d, ABBA_sum, BABA_sum, patterns) where patterns records
	[diff, sum] per site for fast recomputation of D during bootstrapping;
	f_d is eq. 6 of Martin et al., the proportion of the genome shared
	through introgression.
	"""
	sum_diff_123O = 0 ## also called S(1,2,3,O)
	sum_sum_123O = 0
	sum_diff_1DDO = 0 ## also called S(1,D,D,O); denominator of f_d
	# only for illustrative purpose; not actually used in calc:
	ABBA_sum = 0
	BABA_sum = 0
	patterns = [] # record, for faster D-stat calc in resampling!
	for site in filtered_freq_data:
		# for D-stat
		count_ABBA_123O = ( 1-site[0] )* site[1] * site[2] * ( 1-site[3] )
		count_BABA_123O = site[0] * (1-site[1]) * site[2] * ( 1-site[3] )
		diff_123O = count_ABBA_123O - count_BABA_123O
		sum_123O = count_ABBA_123O + count_BABA_123O
		patterns.append( [ diff_123O, sum_123O ] )
		ABBA_sum += count_ABBA_123O
		BABA_sum += count_BABA_123O
		sum_diff_123O += diff_123O
		sum_sum_123O += sum_123O
		# for f_d: donor D = whichever of p2/p3 has the higher derived freq
		if count_ABBA_123O + count_BABA_123O > 0.0:
			max23 = max( site[1], site[2] )
			count_ABBA_1DDO = ( 1-site[0] )* max23 * max23 * ( 1-site[3] )
			count_BABA_1DDO = site[0] * (1-max23) * max23 * ( 1-site[3] )
			diff_1DDO = count_ABBA_1DDO - count_BABA_1DDO
			sum_diff_1DDO += diff_1DDO
	try:
		D = sum_diff_123O / sum_sum_123O
	except ZeroDivisionError:
		D = 0.0
	try:
		f_d = sum_diff_123O / sum_diff_1DDO # eq. 6 of Martin et al.
	except ZeroDivisionError:
		# BUGFIX: the original assigned D here, leaving f_d unbound and
		# raising NameError at the return whenever sum_diff_1DDO == 0
		f_d = 0.0
	return D, f_d, ABBA_sum, BABA_sum, patterns
def sample_wr(population, k):
	"""
	Bootstrap helper: draw k elements uniformly at random, with
	replacement, from population (a list of ints, a range of site
	indices); k is the length of that range.
	"""
	n = len(population)
	rand = random.random # bind once: avoids attribute lookup per draw
	return [int(rand() * n) for _ in range(k)]
def read_tests_file (testsfile):
	"""
	Read the tests file: one test per line, tab-separated taxon names
	(p1, p2, p3, o).  Returns a list of lists, one per test.
	"""
	with open(testsfile, "r") as INFILE:
		return [line.strip("\n").split("\t") for line in INFILE]
def read_allele_freqs (alignfile):
	"""
	Read the derived allele frequency table.

	First line: tab-separated population names (header).  Each further
	line: one value per population for one site.  Returns
	{pop: [value, ...]} with values kept as strings ("-" marks missing
	data; conversion to float happens later in filter_input_data).
	"""
	print("reading allele freqs")
	with open(alignfile, "r") as INFILE:
		pops = INFILE.readline().strip("\n").split("\t")
		data_indict = {pop : [] for pop in pops}
		for line in INFILE:
			fields = line.strip("\n").split("\t")
			# PERF/BUGFIX: the original used pops.index(pop) per cell, which
			# re-scans the header for every value (O(n) per lookup) and
			# mis-assigns columns when two populations share a name
			for col, pop in enumerate(pops):
				data_indict[pop].append( fields[col] )
	return data_indict
def z2p(z):
	"""
	Two-sided p-value for a z-score via the standard normal CDF,
	Phi(z) = 0.5 * (1 + erf(z / sqrt(2))); returns 2 * (1 - Phi(z)).
	"""
	upper_tail = 1 - 0.5*(1+erf(z/sqrt(2)))
	return 2 * upper_tail
def write_results_to_file(outfilename, ret):
	"""
	Append one result row to the output file: a leading newline followed
	by the tab-joined string representations of the items in ret.
	"""
	row = "\t".join(str(field) for field in ret)
	with open(outfilename, "a") as OUTFILE:
		OUTFILE.write("\n" + row)
def append_header(outfilename):
	"""
	Prepend the explanatory tree drawing and the column header to the
	output file, but only if they are not already present (i.e. only on
	the first run; resumed runs leave the file untouched).
	"""
	# NOTE(review): the backslashes of the ASCII art are escaped so that no
	# line ends in a bare backslash (which Python would treat as a line
	# continuation inside a non-raw string); the written bytes are unchanged
	drawing = """
# unrooted trees:
# /\\ /\\
# / \\ - or - /\\ \\
# /\\ /\\ /\\ \\ \\
# Y Z W X Y Z W X
#
# if polymorphism ascertained in {W,X} we expect D=0.
# if D != 0 then (W,X) and (Y,Z) are not clades in the population tree
# if D < 0 then BABA-excess = gene flow between Y-W or Z-X
# if D > 0 then ABBA-excess = gene flow between Z-W or Y-X
#
"""
	headeritems = ["Y", "Z", "W", "X", "informative_sites", "ABBA", "BABA", "D", "STD", "Z-score", "p_val", "dxy_YW", "dxy_ZX"]
	header = drawing + "\n" + "\t".join(headeritems)
	with open(outfilename, "r") as INFILE:
		fileconts = INFILE.read()
	# BUGFIX: the original guard was fileconts.startswith("p1"), but the
	# header written below never starts with "p1" (it starts with the
	# drawing), so a duplicate header was prepended on every run; test
	# for a marker that actually occurs in the header instead
	if "# unrooted trees:" not in fileconts:
		with open(outfilename, "w") as OUTFILE:
			OUTFILE.write(header)
			OUTFILE.write(fileconts)
def check_already_done_tests (tests, outfilename):
	"""
	Drop the tests whose four taxa already appear as the first four
	tab-separated columns of a line in the output file, so that an
	interrupted run can be resumed; returns the tests still to be done.
	"""
	# which tests have already been written to the outfile?
	try:
		with open(outfilename, "r") as INF:
			# first four fields of every line = one completed test; header /
			# drawing lines are also collected but can never match a test
			done_tests = [ x.strip("\n").split("\t")[0:4] for x in INF ]
	except IOError: # if output file does not yet exist
		done_tests = []
	missing_tests = [ x for x in tests if not x in done_tests ]
	print "remaining tests: ", len(missing_tests)
	return missing_tests
def make_dxy (all_loci_freqs):
	"""
	Mean pairwise dxy between (Y,W) and between (Z,X).

	Nei & Li (1979): unnumbered eq. between eqs. 24 and 25; Nei 1987
	eq. 10.20.  For a single biallelic SNP in two pops it simplifies to:
	dxy = pop1_freq1 * (1.0 - pop2_freq1) + pop2_freq1 * (1.0 - pop1_freq1)
	Input rows are ordered [Y, Z, W, X]; returns [str(dxy_YW), str(dxy_ZX)],
	with "na" entries when no usable sites exist.
	"""
	yw_terms = []
	zx_terms = []
	for s in all_loci_freqs:
		# NOTE(review): this guard is meant to exclude monomorphic sites, but
		# sum(s) ranges over [0, 4], so the upper bound of 1.0 also discards
		# many polymorphic sites -- confirm against the intended filter
		if 0.0 < sum( s ) < 1.0:
			yw_terms.append( s[0] * ( 1.0 - s[2] ) + s[2] * ( 1.0 - s[0] ) )
			zx_terms.append( s[1] * ( 1.0 - s[3] ) + s[3] * ( 1.0 - s[1] ) )
	try:
		dxy_YW = sum(yw_terms) / len(yw_terms)
	except ZeroDivisionError:
		dxy_YW = "na"
	try:
		dxy_ZX = sum(zx_terms) / len(zx_terms)
	except ZeroDivisionError:
		dxy_ZX = "na"
	return [str(dxy_YW), str(dxy_ZX)]
##### main #
############
############
# input: a dictionary with taxa as keys and strings of SNPs as values:
######
# parse CLI, load the frequency table and the list of requested tests
args = get_commandline_arguments ()
data_indict = read_allele_freqs (args.i)
# returns a list of lists for the different tests:
tests = read_tests_file (args.tests)
#tests = [["raff","hems","grac","perv"], ["raff","hems","grac","ghost"], ["perv","raff","grac","ghost"], ["perv","hems","grac","ghost"], ["raff","perv","grac","hems"]]
# removes the tests already present in the output file (resume support)
tests = check_already_done_tests (tests, args.o)
nboots = 10000 # bootstrap replicates per test
testcount = 0
for testorder in tests:
	testcount +=1
	print "\n*-*-*-*-*-*-*\nD-test {0}/{1}".format(testcount, len(tests))
	# assign tesorder and Y Z W X:
	Y=testorder[0]
	Z=testorder[1]
	W=testorder[2]
	X=testorder[3]
	# keep only full-data, D-informative sites for this taxon quartet
	filtered_freq_data, count_total_sites = filter_input_data (data_indict, testorder)
	[dxy_YW, dxy_ZX] = make_dxy(filtered_freq_data)
	obs_Dstat, obs_f_d, ABBA, BABA, ABBA_BABA_patterns = get_Dstat_Patterson (filtered_freq_data)
	# print obs_Dstat, obs_f_d, ABBA, BABA
	# now test significance of deviation of obs_Dstat from zero:
	print "bootstrapping"
	bs_Ds = []
	for i in range(nboots):
		# print "\r" + "bootstrapping\t" + str(i),
		# get a random list of the sites to choose:
		bs_sites = sample_wr( range( len(filtered_freq_data) ), len( filtered_freq_data ) )
		# reuse the recorded per-site [num, den] terms instead of recomputing
		bs_data = [ ABBA_BABA_patterns[bs_site] for bs_site in bs_sites ]
		bs_D = get_Dstat_light (bs_data)
		bs_Ds.append( bs_D )
	# with open("bs_Ds_for_hist.{}.txt".format( testcount ), "w" ) as OUTF:
	# OUTF.write("\n".join( [str(x) for x in bs_Ds ] ))
	# STD of the bootstrap distribution of D; Z = |D| / STD, then two-sided p
	mean_bs_D = sum(bs_Ds) / len(bs_Ds)
	STD = sqrt( sum( [ (x - mean_bs_D )**2 for x in bs_Ds ]) / len(bs_Ds) )
	try:
		Zscore = (abs(obs_Dstat/STD))
	except ZeroDivisionError:
		Zscore = 0.0
	p_val = z2p(Zscore)
	ret = [Y, Z, W, X, count_total_sites, ABBA, BABA, obs_Dstat, STD, Zscore, p_val, dxy_YW, dxy_ZX]
	write_results_to_file(args.o, ret)
# NOTE(review): indentation was lost in this copy of the file; append_header
# is placed after the loop here -- confirm placement against the original
append_header (args.o)