-
Notifications
You must be signed in to change notification settings - Fork 3
/
fastPHASE2vcf.py
220 lines (160 loc) · 5.42 KB
/
fastPHASE2vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/usr/bin/env python
## This python script will convert a TASSEL hapmap to a file input
# used by fastPHASE
# Import libraries
import argparse
from datetime import datetime
import sys
#####
# Define the arguments
#####
# Description
DESC = """A Python program to convert the output from fastPHASE to a VCF.\n"""
# Argument parser
parser = argparse.ArgumentParser(description=DESC, add_help=True)
# Add arguments
# Input file
parser.add_argument('-i',
'--filein',
metavar = 'FILEIN',
help = 'The fastPHASE output file to be converted.',
required = True)
# The original TASSEL file
parser.add_argument('-v',
'--vcfin',
metavar = 'VCFI',
help = 'The original VCF file of unimputed genotypes (the number of sites in the VCF must match the number of sites in the fastPHASE output).',
required = True)
# Output file name
parser.add_argument('-o',
'--fileout',
metavar = 'FILEOUT',
help = 'The basename of the output VCF file.',
required = True)
# Parse the arguments
args = parser.parse_args()
# Define the output filename
outfile = str(args.fileout) + '.vcf'
# # Open a handle for writing
handle = open(outfile, 'w')
# Empty list to store the chromosome, position, ID, and ref/alt states of the sites
snp_info = []
# Read in the original fp input file to obtain snp positio
with open(args.vcfin, 'r') as vcfin:
# Iterate over lines
for line in vcfin:
# Skip the first line
if line.startswith('#'):
continue
else:
# We need the SNP name, allele, chromosome, and position
tmp = line.strip().split('\t')
# Append this information
snp_info.append(tmp[0:5])
## The snp_info will be used later to print the VCF
## Now we will parse the fastPHASE output file
# List to store column names / sample names
columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
# Empty list to store genotype matrix components (i.e. alleles)
geno_matrix = []
# Read in the fastPHASE file
with open(args.filein, 'r') as fp:
# Create a switch to activate parsing the genotypic information
line_switch = False
# Iterate over lines
for line in fp:
if not line_switch:
# If the line doesn't start with "BEGIN GENOTYPES", skip it
if line.startswith("BEGIN GENOTYPES"):
# Flip the switch
line_switch = True
continue
else:
continue
# If the line switch is on, begin parsing
else:
# If the line begins with #, it is a sample name
if line.startswith("#"):
# Strip and split on spaces
tmp = line.strip().split(' ')
# The sample name is the second entry
name = tmp[1]
# Append it to the columns list
columns.append(name)
# The last line contains "END GENOTYPES." Skip it
elif line.startswith("END GENOTYPES"):
continue
# Otherwise the line is genotypic information
else:
# Strip and split on spaces
tmp = line.strip().split(' ')
# Append to the genotype matrix list
geno_matrix.append(tmp)
# To format the geno_matrix for a VCF, we need to first iterate
## over entries, then iterate over sites.
# The number of entries is one-half of the length of the geno_matrix (for diploid)
n_entries = len(geno_matrix) / 2
# The number of sites
n_sites = len(snp_info)
# Empty list to store the per-site genotype calls
vcf_matrix = [['.' for i in range(n_entries)] for j in range(n_sites)]
# Iterate over entries
for i in range(n_entries):
# Create indicies to sample the haplotypes
ind1 = 2 * i
ind2 = (2 * i) + 1
# Pull out the alleles for the ith entry
hap_i1 = geno_matrix[ind1]
hap_i2 = geno_matrix[ind2]
# Iterate over the sites
for j in range(n_sites):
# Pull out the site info from the VCF
site_info = snp_info[j]
# Pull out the reference / alternate state
ref = site_info[3]
alt = site_info[4]
# Pull out the alleles in the haplotypes of the ith individual
allele_j1 = hap_i1[j]
allele_j2 = hap_i2[j]
# Empty genotype call
call_j = ''
# If one of the alleles has a bracket, set to missing
if '[' in allele_j1 or '[' in allele_j2:
call_j = './.'
# Otherwise, match the allele to the reference or alternate state
else:
# Match allele 1
if allele_j1 == ref:
call_j1 = '0'
else:
call_j1 = '1'
# Match allele 2
if allele_j2 == ref:
call_j2 = '0'
else:
call_j2 = '1'
# Combine into the call
call_j = call_j1 + '|' + call_j2
# Add the genotype call to the empty vector
vcf_matrix[j][i] = call_j
## Now we can print out the VCF file
handle = open(outfile, 'w')
# First print some header information
handle.write('##fileformat=VCFv4.2' + '\n')
# Date
date = datetime.now().date().strftime('%Y%m%d')
handle.write('##fileDate=' + date + '\n')
# Others
handle.write('##source=fastPHASEv1.4.8' + '\n')
handle.write('##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' + '\n')
handle.write('\t'.join(columns) + '\n')
# Iterate over sites
for j in range(n_sites):
# Start with the site information
toprint = snp_info[j]
# Add other information for QUAL, FILTER, INFO, and FORMAT
toprint.extend(['.', '.', 'PR', 'GT'])
# # Add genotype calls
toprint.extend(vcf_matrix[j])
# # Write to the handle
handle.write('\t'.join(toprint) + '\n')