-
Notifications
You must be signed in to change notification settings - Fork 0
/
downsample.py
124 lines (105 loc) · 4.82 KB
/
downsample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import random
import sys
import gzip
import argparse
import subprocess
def lowcov(f, type, f_out):
# no enough data
with open(f_out, "w") as output:
if type == "gz":
with gzip.open(f, "rt") as myfile:
for line in myfile:
output.write(line)
output.close()
sys.exit("Expects more than original contains, no downsampling is performed")
else:
with open(f, "r") as myfile:
for line in myfile:
output.write(line)
output.close()
sys.exit("Expects more than original contains, no downsampling is performed")
def downsample(expected, ifsort, f, f_out):
filetype = f.split(".")[-1]
# r is the read length, i.e. how many base in 1 read
if filetype in ['fq', 'fastq', 'fasta', 'fa']:
# uncompressed file
myfile = open(f, "r")
x = myfile.readline() # skip first line
r = len(myfile.readline()) - 1 # read length
# with open(f, "r") as input:
# file_len = len(input.readlines())
process = subprocess.Popen(['wc', '-l', f], stdout=subprocess.PIPE)
stdout = process.communicate()[0]
file_len=int(stdout.split()[0].decode("utf-8", "ignore"))
elif filetype == 'gz':
# compressed file
filetype = f.split('.')[-2] #reset file type
myfile = gzip.open(f, "rt")
x = myfile.readline() # skip first line
r = len(myfile.readline()) - 1
# with open(f, "r") as input:
# file_len = len(input.readlines())
p1 = subprocess.Popen(['zcat', f], stdout=subprocess.PIPE)
p2 = subprocess.Popen(['wc', '-l'], stdin=p1.stdout, stdout=subprocess.PIPE)
stdout = p2.communicate()[0]
file_len=int(stdout.split()[0].decode("utf-8", "ignore"))
else:
sys.exit('file type is not supported {}'.format(f.split(".")[-2:]))
# get probabilities
if filetype in ['fq', 'fastq']:
p = (expected / r) / file_len * 4. # calculated expected probability based on #lines
print("Probability is {} \n b contained is {}".format(p, r*file_len/4.))
elif filetype in ['fa', 'fasta']:
p = (expected / r) / file_len * 2. # calculated expected probability based on #lines
print("Probability is {} \n b contained is {}".format(p, r * file_len / 2.))
# if expected data is larger than original contains, no downsampling is performed
if p > 1:
lowcov(f, f.split(".")[-1], f_out)
print("No down sample is performed on {}, file length is {}".format(f_out, file_len))
output = open(f_out, 'w') # open the output file
if ifsort.lower() in ['f', 'fasle']:
# if not sorted, pick first x lines of file is enough
if f.split()[-1] != "gz":
process = subprocess.Popen(['head', '-{}'.format(expected), f], stdout=subprocess.PIPE)
stdout = process.communicate()[0].decode("utf-8", "ignore")
with open(f_out, 'w') as myfile:
myfile.write(stdout)
else:
p1 = subprocess.Popen(['zcat', f], stdout=subprocess.PIPE)
p2 = subprocess.Popen(['head', '-{}'.format(expected)], stdin=p1.stdout, stdout=subprocess.PIPE)
stdout = p2.communicate()[0].decode("utf-8", "ignore")
with open(f_out, 'w') as myfile:
myfile.write(stdout)
elif ifsort.lower() in ['t', 'true']:
# if sorted, perform random downsample
if filetype in ['fq', 'fastq']:
i = 0
inOrNot = True
for line in myfile:
if i % 4 == 0:
inOrNot = random.random() <= p
if (inOrNot):
output.write(line)
i += 1
elif filetype in ['fa', 'fasta']:
for line in myfile:
# p = expected/(len(lines)*len(lines[1]))
out = ''
# for line in lines:
if line[0] == ">":
out = line
if random.random() <= p:
output.write(out)
else:
out += line
myfile.close()
output.close()
if __name__ == "__main__":
parser = argparse.ArgumentParser("Random downsample sequencing file:")
parser.add_argument("--expected", help="expected of total amount of nucleotides", type=int, default=10 ** 7)
parser.add_argument("--ifsort", help="if the sequence file is sorted or random, T/t/True/true or F/f/False/false", type=str)
# parser.add_argument("--len", help="# file line by $(wc -l $file)", type=int)
parser.add_argument("--input", help="# input file", type=str)
parser.add_argument("--output", help="# output file", type=str)
args = parser.parse_args()
downsample(args.expected, args.ifsort, args.input, args.output)