-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcf_to_f3_input.py
190 lines (128 loc) · 4.62 KB
/
vcf_to_f3_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/python
# python 2.7
# 15 Dec 2016
# Mathias Scharmann
# Command line outline
# usage example
# python
# Inputs
# Outputs
#module load python/2.7
import os
import sys
#######
# checks if file exists and breaks script if not
def extant_file(x):
"""
'Type' for argparse - checks that file exists but does not open.
"""
if not os.path.exists(x):
print "Error: {0} does not exist".format(x)
exit()
x = str(x)
return x
# checks for non-UNIX linebreaks
def linebreak_check(x):
if "\r" in open(x, "rb").readline():
print "Error: classic mac (CR) or DOS (CRLF) linebreaks in {0}".format(x)
exit()
# parses command line arguments
def get_commandline_arguments ():
import os
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--popmap", required=True, dest="popmapfile", type=extant_file, help="name and path of the popmap file: 1st column barcode/sample name separated by tab from second column indicating the population", metavar="FILE")
parser.add_argument("--vcf", required=True, dest="vcf", type=extant_file, help="vcf genotypes", metavar="FILE")
parser.add_argument("--max_missing", required=True, help="site-filter: maximum proportion of individuals with missing genotype per population to include a site, float value")
args = parser.parse_args()
print args
linebreak_check(args.popmapfile)
return args
########
def check_congruence (popmapfile, vcf):
popmapsamples = []
with open(popmapfile, "r") as INFILE:
for line in INFILE:
fields = line.strip("\n").split("\t")
popmapsamples.append(fields[0])
popmapsamples = set(popmapsamples)
with open(vcf, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
vcf_samples = header_line[9:]
for samp in popmapsamples:
if samp not in vcf_samples:
print "vcf does not contain all popmap samples"
exit()
for samp in vcf_samples:
if samp not in popmapsamples:
print "popmap does not contain all vcf samples"
exit()
print "samples in popmap and vcf fully congruent, good to go!"
#######
def read_popmap (popmapfile):
popdict = {}
with open(popmapfile,"r") as INFILE:
for line in INFILE:
fields = line.strip("\n").split("\t")
try:
popdict[fields[1]].append( fields[0] )
except KeyError:
popdict[fields[1]] = [ fields[0] ]
return popdict
def vcf_to_allele_freqs (vcf, popdict, max_missing):
presence_treshold = 1.0 - max_missing
with open(vcf, "r") as INFILE:
for line in INFILE:
if line.startswith("##"):
continue
if line.startswith("#"):
header_line = line.lstrip("#").strip("\n").split("\t")
break
samples = header_line[9:]
# E3379_L96 43 . ATCG ATTG,ATCA,GTCA,GTCG 4179.3 . AB=BLABLA 0/0:1:
popdict_vcf_idxes = {}
for k, v in popdict.items():
popdict_vcf_idxes[ k ] = [ header_line.index(x) for x in v ]
# now start the main business of walking through the vcf:
print "counting allele frequencies"
allele_freq_dict = {k : [] for k in popdict.keys() }
with open(vcf, "r") as INFILE:
cnt = 0
for line in INFILE:
cnt += 1
print str(cnt) + "\r",
if line.startswith("#"):
continue
fields = line.strip("\n").split("\t")
for pop, pop_idxs in popdict_vcf_idxes.items():
a = [ fields[x].split(":")[0].split("/") for x in pop_idxs ]
b = [x for genotype in a for x in genotype if x != "."]
pop_n = float( len( pop_idxs ) )*2
if len(b) >= presence_treshold * pop_n:
p = str( b.count("1")/float(len(b)) )
else:
p = "-"
allele_freq_dict[pop].append( p )
# clean allele_freq_dict from sites that are not present in at least three populations:
print "filtering sites in allele frequency table for presence in >= 3 populations"
pop_order = allele_freq_dict.keys()
output_lines = [pop_order]
for i in range( len( allele_freq_dict[allele_freq_dict.keys()[0]] ) ):
column = [ allele_freq_dict[pop][i] for pop in pop_order ]
if len( [ x for x in column if x != "-" ] ) >= 3:
output_lines.append( column ) # now its a row!
print "retained sites: {0} out of {1}".format( len( output_lines ) -1, len( allele_freq_dict[allele_freq_dict.keys()[0]] ) )
print "Done!"
with open(vcf + ".maxmiss_per_pop." + str(max_missing) + ".allele_freqs.txt", "w") as OUTF:
OUTF.write( "\n".join(["\t".join(x) for x in output_lines ]) )
######################## MAIN
args = get_commandline_arguments ()
check_congruence (args.popmapfile, args.vcf)
popdict = read_popmap (args.popmapfile)
max_missing = float( args.max_missing )
vcf_to_allele_freqs (args.vcf, popdict, max_missing)