-
Notifications
You must be signed in to change notification settings - Fork 3
/
fastPHASE2tassel.py
195 lines (140 loc) · 4.68 KB
/
fastPHASE2tassel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python
## This python script will convert a TASSEL hapmap to a file input
# used by fastPHASE
# Import libraries
import argparse
#####
# Define the arguments
#####
# Description
DESC = """A Python program to convert the output from fastPHASE to a TASSEL-encoded
hapmap file.\n"""
# Argument parser
parser = argparse.ArgumentParser(description=DESC, add_help=True)
# Add arguments
# Input file
parser.add_argument('-i',
'--filein',
metavar = 'FILEIN',
help = 'The fastPHASE output file to be converted.',
required = True)
# The original TASSEL file
parser.add_argument('-t',
'--tinput',
metavar = 'TIN',
help = 'The original TASSEL file converted to the fastPHASE input file.',
required = True)
# Should alleles filtered by the probability threshold be removed?
parser.add_argument('-f',
'--filter',
help = 'Flag: should those genotype calls that did not meet the fastPHASE probability threshold be removed?',
action = 'store_true')
# Output file name
parser.add_argument('-o',
'--fileout',
metavar = 'FILEOUT',
help = 'The name of the output file. Default extension is ".hmp.txt',
required = True)
# Parse the arguments
args = parser.parse_args()
# Define the output filename
outfile = str(args.fileout) + '.hmp.txt'
# # Open a handle for writing
handle = open(outfile, 'w')
# Empty list to store the initial TASSEL data
snp_info = []
# Read in the original fp input file to obtain snp positions
with open(args.tinput, 'r') as tin:
# Iterate over lines
for line in tin:
# Skip the first line
if line.startswith('rs#'):
continue
else:
# We need the SNP name, allele, chromosome, and position
tmp = line.strip().split('\t')
# Append this information
snp_info.append(tmp[0:4])
# List to store column names / sample names
columns = ['rs#', 'alleles', 'chrom', 'pos', 'strand', 'assembly#', 'center', 'protLSID', 'assayLSID', 'panelLSID', 'QCcode']
# Empty list to store matrix components
geno_matrix = []
# Read in the fastPHASE file
with open(args.filein, 'r') as fp:
# Create a switch to activate parsing the genotypic information
line_switch = False
# Iterate over lines
for line in fp:
if not line_switch:
# If the line doesn't start with "BEGIN GENOTYPES", skip it
if line.startswith("BEGIN GENOTYPES"):
# Flip the switch
line_switch = True
continue
else:
continue
# If the line switch is on, begin parsing
else:
# If the line begins with #, it is a sample name
if line.startswith("#"):
# Strip and split on spaces
tmp = line.strip().split(' ')
# The sample name is the second entry
name = tmp[1]
# Append it to the columns list
columns.append(name)
# The last line contains "END GENOTYPES." Skip it
elif line.startswith("END GENOTYPES"):
continue
# Otherwise the line is genotypic information
else:
# Strip and split on spaces
tmp = line.strip().split(' ')
# Append to the genotype matrix list
geno_matrix.append(tmp)
# Print the column names line
handle.write('\t'.join(columns) + '\n')
# The number of entries is one-half of the length of the geno_matrix (for diploid)
n_entries = len(geno_matrix) / 2
# The number of sites
n_sites = len(snp_info)
## Now lets play around with this new geno_matrix
# Iterate over the number of snps
for i in range(len(snp_info)):
# Empty toprint list
toprint = []
# Add the snp_info
toprint.extend(snp_info[i])
# Buffer with NAs
toprint.extend(['NA'] * 7)
# Collect the genotype information for the i-th snp across all samples
# Use slice notation to go every other entry
for j in range(len(geno_matrix))[::2]:
# Collect allele information
allele1 = geno_matrix[j][i]
allele2 = geno_matrix[j+1][i]
# If one of the alleles has a bracket, set to missing
if '[' in allele1 or '[' in allele2:
# If the flag is true, set to NN
if (args.filter):
geno = 'NN'
# Otherwise just remove the brackets
else:
allele1 = allele1.strip('[]')
allele2 = allele2.strip('[]')
geno = str(allele1) + str(allele2)
else:
# Convert to N if necessary
if allele1 == "?":
allele1 = "N"
if allele2 == "?":
allele1 = "N"
# Concatenate allele information
geno = str(allele1) + str(allele2)
# Extend the toprint list
toprint.append(geno)
# Write the list to the handle
# print '\t'.join(toprint)
handle.write('\t'.join(toprint) + '\n')
# Close the file
handle.close()