forked from Micaella/RNA-seq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
RNA-seq.py
123 lines (98 loc) · 4.47 KB
/
RNA-seq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python
# RNA-seq differential-expression pipeline orchestrated with Parsl:
# bowtie2 alignment -> samtools sort -> Picard split -> htseq-count ->
# per-sample column merge -> DESeq2.
import sys, os, parsl
from parsl import load, python_app, bash_app
# Alternate executor config: swap the two imports below to run on a local
# thread pool instead.
#from configs.config_ThreadPool import config
from configs.config_HTEx import config
from parsl.data_provider.files import File
from pathlib import Path
load(config)  # start the Parsl DataFlowKernel (HTEx config — presumably HighThroughputExecutor; see configs/)
@bash_app
def bowtie(multithread_parameter, base, infile, outputs=[], stderr=None):
    """Align one read file with bowtie2.

    Returns the shell command Parsl executes: `base` is the index basename,
    `infile` the reads, and the SAM alignment goes to outputs[0].
    """
    return f'bowtie2 -p {multithread_parameter} -x {base} -U {infile} -S {outputs[0]}'
@bash_app
def sort(multithread_parameter, inputs=[], outputs=[], stderr=None):
    """Sort an alignment with samtools, reading inputs[0] into outputs[0]."""
    unsorted_file = inputs[0]
    sorted_file = outputs[0]
    return f'samtools sort -@ {multithread_parameter} -o {sorted_file} {unsorted_file}'
@bash_app
def picard_split(infile, output_dir, split_to_n_files, prefix, inputs=[], outputs=[], stdout=None, stderr=None):
    """Split a sorted BAM into `split_to_n_files` chunks with Picard.

    Runs on the worker: the directory is created before the returned
    SplitSamByNumberOfReads command is executed; `$PICARD` must point at the
    Picard jar in the worker environment.
    """
    import os
    # BUGFIX: os.mkdir raised FileExistsError when the pipeline was re-run
    # over an existing output tree; makedirs(..., exist_ok=True) is idempotent
    # and also creates any missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    return 'java -jar $PICARD SplitSamByNumberOfReads I={} OUTPUT={} SPLIT_TO_N_FILES={} CREATE_INDEX=true OUT_PREFIX={}'.format(infile, output_dir, split_to_n_files, prefix)
@bash_app
def htSeq_count(gtf, diretorio, nprocesses, inputs=[], outputs=[], stderr=None):
    """Count reads per gene over every split BAM in `diretorio`.

    Globs the chunk directory on the worker and builds one htseq-count
    command covering all chunks; the combined count table is written to
    outputs[0].
    """
    from pathlib import Path
    # BUGFIX: Path.glob yields files in filesystem-dependent order, which made
    # the column order of the count table nondeterministic between runs;
    # sorting fixes the column <-> chunk mapping.
    bam_files = sorted(str(f) for f in Path(diretorio).glob('*.bam'))
    bigstr = ' '.join(bam_files)
    return 'htseq-count --stranded reverse --type=exon --idattr=gene_id --mode=union --nprocesses={0} -c {1} {2} {3}'.format(nprocesses, outputs[0], bigstr, gtf)
@python_app
def HTSeq_Merge(n_colummns, inputs=[], outputs=[], stderr=None):
    """Collapse a multi-column htseq-count table into one count column.

    inputs[0] is whitespace-separated: a gene id followed by one count column
    per BAM chunk.  The first `n_colummns` count columns (columns 1..n,
    matching the original range(1, n+1) loop) are summed per gene and written
    to outputs[0] as "<gene_id>\\t<total>\\n".

    Fixes: files are now opened with `with`, so handles are closed even if a
    line fails int() parsing (the originals leaked on any exception); the
    unused `import sys` was dropped.
    """
    n = int(n_colummns)
    with open(inputs[0], "r") as src, open(outputs[0], "w") as dst:
        for linha in src:
            valores = linha.split()
            soma = sum(int(v) for v in valores[1:n + 1])
            dst.write(valores[0] + '\t' + str(soma) + '\n')
@bash_app
def DESeq(scriptR, pathInputs, inputs=[], outputs=[], stdout = None, stderr=None):
    """Run the DESeq2 R script, passing it the directory of merged counts."""
    return f'{scriptR} {pathInputs}'
# ---------------------------------------------------------------------------
# Pipeline driver.
#
# Command line:
#   argv[1] bowtie2 index basename
#   argv[2] thread count (also reused as SPLIT_TO_N_FILES and --nprocesses)
#   argv[3] directory of input read files
#   argv[4] output directory (must already contain a stderr/ subdirectory —
#           the apps write their stderr logs there; TODO confirm it exists)
#   argv[5] GTF annotation file
#   argv[6] DESeq2 R script
# ---------------------------------------------------------------------------
base_bowtie = sys.argv[1]
multithread = sys.argv[2]
inputs_bowtie = sys.argv[3]
dir_outputs = sys.argv[4]
gtf = sys.argv[5]
script_DESeq2 = sys.argv[6]
fasta = list(Path(inputs_bowtie).glob('*'))  # every file in the directory is treated as input reads
bow_futures, sort_futures, split_futures, htseq_futures, merge_futures = [], [], [], [], []
# BOWTIE: launch one alignment per input file; apps run concurrently and
# stage-to-stage ordering is carried by Parsl File outputs/inputs.
for i in fasta:
    prefix = Path(Path(i).stem).stem  # double .stem strips two extensions, e.g. ".fastq.gz"
    output_bowtie = '{}/{}.sam'.format(dir_outputs, prefix)
    stderr_bowtie = '{}/stderr/{}.bowtie'.format(dir_outputs, prefix)
    bow_futures.append(bowtie(multithread, base_bowtie, str(i), outputs=[File(output_bowtie)], stderr=stderr_bowtie))
# SORT: each samtools sort waits on its bowtie SAM via inputs=[...].
for k in bow_futures:
    prefix = Path(k.outputs[0].filename).stem
    output_sort = '{}/{}.sort.bam'.format(dir_outputs, prefix)
    stderr_sort = '{}/stderr/{}.sort'.format(dir_outputs, prefix)
    sort_futures.append(sort(multithread, inputs=[k.outputs[0]], outputs=[File(output_sort)], stderr=stderr_sort))
dir_splited = list()
# PICARD: split each sorted BAM into `multithread` chunks.  (Loop variable
# renamed from `p`, which shadowed the Path object originally bound above.)
for sf in sort_futures:
    prefix = str(Path(Path(sf.outputs[0].filename).stem).stem)
    split_files_dir = '{}/{}_splited/'.format(dir_outputs, prefix)
    dir_splited.append(split_files_dir)
    stderr_split = '{}/stderr/{}.picard'.format(dir_outputs, prefix)
    split_futures.append(picard_split(sf.outputs[0], split_files_dir, multithread, prefix, stderr=stderr_split))
# HTSEQ: count reads per gene over each chunk directory.  zip() pairs each
# split future with its directory directly, replacing the original
# reverse()-then-pop() walk (both lists were built in the same order).
for l, diretorio in zip(split_futures, dir_splited):
    # BUGFIX: the original `name.split('_')[0]` truncated sample names that
    # contain '_' (e.g. "a_1_splited" -> "a"), making distinct samples write
    # the same .count file; rsplit('_', 1)[0] removes only the "_splited" tag.
    prefix = Path(diretorio).name.rsplit('_', 1)[0]
    saida_htseq = '{}/{}.count'.format(dir_outputs, prefix)
    stderr_htseq = '{}/stderr/{}.htseq_count'.format(dir_outputs, prefix)
    htseq_futures.append(htSeq_count(gtf, diretorio, multithread, inputs=[l], outputs=[File(saida_htseq)], stderr=stderr_htseq))
# HTSEQ-MERGE: collapse the per-chunk count columns into one column per sample.
for m in htseq_futures:
    prefix = Path(m.outputs[0].filename).stem
    output_merge = '{}/{}.merge.count'.format(dir_outputs, prefix)
    stderr_merge = '{}/stderr/{}.merge_htseq'.format(dir_outputs, prefix)
    merge_futures.append(HTSeq_Merge(multithread, inputs=[m.outputs[0]], outputs=[File(output_merge)], stderr=stderr_merge))
# Block until every merge has finished before launching the final DESeq2 step.
for j in merge_futures:
    j.result()
# DESEQ2: differential-expression analysis over all merged count files.
saida_DEseq = '{}/teste.deseq'.format(dir_outputs)
stderr_DESeq = '{}/stderr/all.deseq'.format(dir_outputs)
deseq_future = DESeq(File(script_DESeq2), dir_outputs, stdout=saida_DEseq, stderr=stderr_DESeq)
deseq_future.result()