/
bin_allc_files.py
executable file
·129 lines (95 loc) · 4.08 KB
/
bin_allc_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
import os
import multiprocessing as mp
import numpy as np
################
# CREATE THE BINNED DATA FILES FROM ALLC FILES
################
######### INPUT PARAMETERS #################
species = 'mouse' # or human
samples = [] # LIST OF SAMPLE NAMES
allc_dir = '' # DIRECTORY WITH ALLC FILES IN IT
# FILE NAME TO ALLC FILES WILL BE CONSTRUCTED AS: allc_dir + "/allc_" + sample[n] + "_" + chromosome + ".tsv"
# OUTPUT FILE NAME: "binc_" + sample[n] + "_" + str(bin_size) + "_" + chromosome + ".tsv",
def read_allc(fname, position_as_index=True, compressed=False):
# Read methylation data stored in an "allc" file
if compressed:
os.system("bgzip -cd " + fname + ".gz > " + fname)
if position_as_index == True:
df = pd.read_csv(fname, sep="\t", index_col=1, skiprows=1,
names=['chr','pos','strand','context','mc','c','methylated'])
else:
df = pd.read_csv(fname, sep="\t", skiprows=1,
names=['chr','pos','strand','context','mc','c','methylated'])
if compressed:
os.remove(fname)
return df
def get_mCH_contexts():
contexts = []
for base1 in ['A','C','T']:
for base2 in ['A','C','G','T']:
contexts.append('C' + base1 + base2)
return contexts
def get_mCG_contexts():
return ['CGA','CGC','CGG','CGT']
def get_mouse_chromosomes(include_x=True):
chromosomes = [str(x) for x in range(1,20)]
if include_x:
chromosomes.append('X')
return chromosomes
def get_human_chromosomes(include_x=True):
chromosomes = [str(x) for x in range(1,23)]
if include_x:
chromosomes.append('X')
return chromosomes
### FUNCTION FOR BINNING THE ALLC FILES
def bin_allc(sample, path='.', bin_sizes=[1000,5000,10000,25000,100000], chromosomes=None, outpath = '.', species='mouse', compressed=False):
if chromosomes == None:
if species == 'human':
chromosomes = get_human_chromosomes()
else:
chromosomes = get_mouse_chromosomes()
for chromosome in chromosomes:
fname = path + "/allc_" + sample + "_" + chromosome + ".tsv"
if compressed:
os.system("bgzip -cd " + fname + ".gz > " + fname)
if not os.path.isfile(fname):
print("bin_allc: " + fname + " does not exist.")
return
df = read_allc(fname)
if compressed:
os.remove(fname)
for bin_size in bin_sizes:
if species == 'human':
bins = np.arange(0, get_chrom_lengths_human()[chromosome], bin_size)
else:
bins = np.arange(0, get_chrom_lengths_mouse()[chromosome], bin_size)
# mCG
df_CG = df.loc[df.context.isin(get_mCG_contexts())]
groups = df_CG.groupby(pd.cut(df_CG.index, bins))
mCG = groups.sum().mc.fillna(0)
CG = groups.sum().c.fillna(0)
# mCH
df_CH = df.loc[df.context.isin(get_mCH_contexts())]
groups = df_CH.groupby(pd.cut(df_CH.index, bins))
mCH = groups.sum().mc.fillna(0)
CH = groups.sum().c.fillna(0)
data = np.array([bins[:len(bins)-1], mCG.values, CG.values, mCH.values, CH.values]).astype(int)
binned_allc = pd.DataFrame(data.transpose(), columns=['bin','mCG','CG','mCH','CH'])
binned_allc['chr'] = chromosome
binned_allc = binned_allc[['chr','bin','mCG','CG','mCH','CH']]
binned_allc.to_csv(outpath + "/binc_" + sample + "_" + str(bin_size) + "_" + chromosome + ".tsv",
sep="\t", header=False, index=False)
### PROCESS THE WILL CALL THE BINNING FUNCTION IN A PARALLEL FASHION
def process(samples):
for sample in samples:
print(sample)
if not os.path.exists(sample):
os.makedirs(sample)
mypy.bin_allc(sample, path=allc_dir+sample, outpath=sample, species=species, bin_sizes=[100000], compressed=False)
return True
procs = min(16, len(samples))
p = mp.Pool(processes=procs)
split_samples = np.array_split(samples,procs)
pool_results = p.map(process, split_samples)
p.close()
p.join()