-
Notifications
You must be signed in to change notification settings - Fork 25
/
input.py
729 lines (643 loc) ยท 25.8 KB
/
input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
# Copyright 2017 Marco Galardini and John Lees
'''Functions to read data into pyseer and iterate over instances'''
import sys
from .utils import set_env
import re
import gzip
# avoid numpy taking up more than one thread
with set_env(MKL_NUM_THREADS='1',
NUMEXPR_NUM_THREADS='1',
OMP_NUM_THREADS='1'):
import numpy as np
import pandas as pd
from sklearn import manifold
from pysam import VariantFile
import hashlib
import binascii
import pyseer.classes as var_obj
from .cmdscale import cmdscale
def load_phenotypes(infile, column):
    """Load phenotypes vector

    Args:
        infile (str)
            Matrix input file
        column (str or None)
            Phenotype column name or None to pick the last column

    Returns:
        p (pandas.Series)
            Phenotype vector (n, 1)
    """
    p = pd.read_csv(infile, index_col=0, sep='\t')
    if p.shape[1] < 1:
        sys.stderr.write('Phenotype file must contain at least one phenotype column\n')
        sys.exit(1)
    p.index = p.index.astype(str)
    if np.any(p.index.duplicated()):
        sys.stderr.write('Phenotype file contains duplicated sample names\n')
        sys.exit(1)
    if column is None:
        p = p[p.columns[-1]]
    else:
        # fail with a clear message rather than a raw KeyError,
        # consistent with the other input checks in this module
        if column not in p.columns:
            sys.stderr.write('Column "%s" not found in phenotype file\n' % column)
            sys.exit(1)
        p = p[column]
    # Remove missing values
    p = p.dropna()
    # Check type
    if not np.issubdtype(p.values.dtype, np.number):
        sys.stderr.write('Phenotypes must be numeric\n')
        sys.exit(1)
    return p
def load_structure(infile, p, max_dimensions, mds_type="classic", n_cpus=1,
                   seed=None):
    """Load population structure and apply multidimensional scaling

    Args:
        infile (str)
            Population structure (distance matrix) input file
        p (pandas.Series)
            Phenotype vector (n, 1)
        max_dimensions (int)
            Maximum dimensions to consider when applying
            `metric` or `non-metric` MDS
        mds_type (str)
            MDS algorithm to apply. One of `classic`,
            `metric` or `non-metric`. Any other input will trigger
            the `metric` MDS
        n_cpus (int)
            Number of CPUs to be used for the `metric` or `non-metric` MDS
        seed (int or None)
            Random seed for `metric` or `non-metric` MDS, None if not required

    Returns:
        m (pandas.DataFrame)
            Population structure after MDS (n, m)
    """
    m = pd.read_csv(infile,
                    index_col=0,
                    sep='\t')
    m.index = m.index.astype(str)
    if np.any(m.index.duplicated()):
        sys.stderr.write('Structure file contains duplicated sample names\n')
        sys.exit(1)
    sys.stderr.write("Structure matrix has dimension " + str(m.shape) + "\n")
    # Also take intersection here, so that MDS isn't done using samples not present
    # in sample
    intersecting_samples = p.index.intersection(m.index)
    m = m.loc[intersecting_samples, intersecting_samples]
    if len(intersecting_samples) == 0:
        sys.stderr.write('None of the phenotyped samples were found in ' +
                         'population structure matrix\n')
        sys.exit(1)
    # MDS
    if mds_type == "classic":
        projection, evals = cmdscale(m)
    else:
        metric_mds = True
        if mds_type == "non-metric":
            metric_mds = False
        elif mds_type != "metric":
            sys.stderr.write("Unsupported mds type chosen. Assuming metric\n")
        # use keyword arguments: in current scikit-learn, all MDS
        # parameters after n_components are keyword-only, so the old
        # positional call raises a TypeError
        mds = manifold.MDS(n_components=max_dimensions, metric=metric_mds,
                           n_jobs=n_cpus,
                           dissimilarity='precomputed',
                           random_state=seed)
        projection = mds.fit_transform(m.values)
    m = pd.DataFrame(projection,
                     index=m.index)
    # scale each component into [-1, 1]
    for i in range(m.shape[1]):
        scale = max(abs(m[i]))
        # guard against a degenerate all-zero component, which would
        # otherwise produce NaNs on division
        if scale > 0:
            m[i] = m[i] / scale
    return m
def load_lineage(infile, p):
    """Load custom lineage clusters definitions

    Args:
        infile (str)
            Input file for lineage clusters (whitespace separated:
            sample name, cluster label)
        p (pandas.Series)
            Phenotypes vector (n, 1)

    Returns:
        result (tuple of (numpy.array, list))
            Lineage binary matrix and cluster labels
    """
    # read the file once; the original opened it twice (for values and
    # index separately) and leaked both file handles
    samples = []
    clusters = []
    with open(infile) as lineage_file:
        for line in lineage_file:
            fields = line.rstrip().split()
            samples.append(fields[0])
            clusters.append(fields[1])
    lin = pd.Series(clusters, index=samples)
    lin.index = lin.index.astype(str)
    if np.any(lin.index.duplicated()):
        lin = lin.loc[~lin.index.duplicated()]  # tilde is unary inversion
    if (len(p.index.difference(lin.index)) > 0):
        sys.stderr.write("All samples with a phenotype must be present in lineage file\n")
        sys.exit(1)
    else:
        # align to the phenotype order
        lin = lin.loc[p.index.intersection(lin.index)]
    lineages = sorted(set(lin.values))
    lineage_design_mat = []
    lineage_assign = []
    # one binary indicator column per cluster label
    for categ in lineages:
        lineage_design_mat.append(pd.Series([1 if x == categ
                                             else 0
                                             for x in lin.values],
                                            index=lin.index))
        lineage_assign.append(categ)
    lineage_design_mat = pd.concat(lineage_design_mat, axis=1)
    assert(np.all(lineage_design_mat.index == p.index))
    return(lineage_design_mat.values, lineage_assign)
def load_covariates(infile, covariates, p):
    """Load and encode a covariates matrix

    Args:
        infile (str)
            Input file for the covariates matrix
        covariates (iterable or None)
            List of string indicating which columns to use and their
            interpretation. Example: `2q` indicates that the second column
            from the file is a quantitative variable, `2` indicates that
            that same column is categorical. If None, the matrix is loaded
            but nothing is done with it.
        p (pandas.Series)
            Phenotypes vector (n, 1)

    Returns:
        cov (pandas.DataFrame)
            Covariance matrix (n, m), or None on an invalid column request
    """
    c = pd.read_csv(infile,
                    index_col=0,
                    header=0,
                    sep='\t')
    c.index = c.index.astype(str)
    if np.any(c.index.duplicated()):
        sys.stderr.write('Covariate file contains duplicated sample names\n')
        sys.exit(1)
    if (len(p.index.difference(c.index)) > 0):
        sys.stderr.write("All samples with a phenotype must be present in covariate file\n")
        sys.exit(1)
    else:
        c = c.loc[p.index.intersection(c.index)]
    # which covariates to use?
    if covariates is None:
        cov = pd.DataFrame([])
    else:
        cov = []
        for col in covariates:
            cnum = int(col.rstrip('q'))
            # column 1 holds the sample names; valid data columns are
            # 2 .. ncol+1 (also reject 0 and negatives, which the old
            # `cnum == 1` check let through)
            if cnum < 2 or cnum > c.shape[1] + 1:
                sys.stderr.write('Covariates columns values should be '
                                 '> 1 and less than or equal to total number of ' +
                                 'columns (%d)\n' % (c.shape[1] + 1))
                return None
            if col[-1] == 'q':
                # quantitative
                cov.append(c.iloc[:, cnum-2])
            else:
                # categorical, dummy-encode it; sort before dropping the
                # baseline so the reference category is deterministic
                # (set.pop() removed an arbitrary element, which varied
                # between runs)
                categories = sorted(set(c.iloc[:, cnum-2]))
                categories = categories[1:]
                for i, categ in enumerate(categories):
                    cov.append(pd.Series([1 if x == categ
                                          else 0
                                          for x in
                                          c.iloc[:, cnum-2].values],
                                         index=c.index,
                                         name=c.columns[cnum-2] + "_" + str(i)))
        if len(cov) > 0:
            cov = pd.concat(cov, axis=1)
        else:
            cov = pd.DataFrame([])
    return cov
def load_burden(infile, burden_regions):
    """Load burden regions for VCF analysis

    Each input line is a region name followed by a comma-separated
    list of sections; appends (name, [section, ...]) tuples.

    Args:
        infile (str)
            Input file for burden regions
        burden_regions (list)
            List to be filled in-place
    """
    with open(infile, "r") as region_file:
        for entry in region_file:
            name, regions = entry.rstrip().split()
            burden_regions.append((name, regions.split(',')))
def open_variant_file(var_type, var_file, burden_file, burden_regions, uncompressed):
    """Open a variant file for use as an iterable

    Args:
        var_type (str)
            Type of variants file (kmers, vcf, Rtab)
        var_file (str)
            Location of file
        burden_file (str)
            File containing regions to group burden tests
        burden_regions (list)
            List of burden regions to be filled in-place
        uncompressed (bool)
            True if kmer file is not gzipped

    Returns:
        infile (opened file)
            Handle to the opened variant file
        sample_order (list)
            Sample names from the header (Rtab only; empty otherwise)
    """
    sample_order = []
    if var_type == "kmers":
        infile = open(var_file) if uncompressed else gzip.open(var_file, 'r')
    elif var_type == "vcf":
        infile = VariantFile(var_file)
        if burden_file:
            load_burden(burden_file, burden_regions)
    else:
        # Rtab files have a header, rather than sample names accessible by row
        infile = open(var_file)
        header_fields = infile.readline().rstrip().split()
        sample_order = [str(name) for name in header_fields[1:]]
    return infile, sample_order
def read_variant(infile, p, var_type, burden, burden_regions,
                 uncompressed, all_strains, sample_order,
                 keep_list = None, noparse = False):
    """Read input line and parse depending on input file type

    Return a variant name and pres/abs vector

    Args:
        infile (opened file)
            Handle to opened variant file
        p (pandas.Series)
            Phenotypes vector (n, 1)
        var_type (str)
            Variants type (one of: kmers, vcf or Rtab)
        burden (bool)
            Whether to slice a vcf file by burden regions
        burden_regions (collections.deque)
            Burden regions to slice the vcf with
        uncompressed (bool)
            Whether the kmers file is uncompressed
        all_strains (set-like)
            All sample labels that should be present
        sample_order (list)
            Samples order to interpret each Rtab line
        keep_list (dict)
            Variant names to properly read, any other will
            return None
            (default = None)
        noparse (bool)
            Set True to skip line without parsing and
            return None, irrespective of presence in
            keep_list
            (default = False)

    Returns:
        eof (bool)
            Whether we are at the end of the file
        k (numpy.array)
            Variant presence/absence vector
        var_name (str)
            Variant name
        kstrains (list)
            Samples in which the variant is present
        nkstrains (list)
            Samples in which the variant is absent
        af (float)
            Allele frequency
        missing (float)
            Missing frequency
    """
    if var_type not in {'kmers', 'vcf', 'Rtab'}:
        raise ValueError('Variants type not supported')

    # Step 1: fetch the next record (a text line for kmers/Rtab, a
    # pysam record for vcf, or a burden region tuple)
    if var_type == "vcf":
        # burden tests read through regions and slice vcf
        if burden:
            if len(burden_regions) > 0:
                line_in = burden_regions.popleft()
            else:  # Last; to raise exception on next loop
                line_in = None
        # read single vcf line
        else:
            try:
                line_in = next(infile)
            except StopIteration:
                line_in = None
    else:
        # kmers and Rtab plain text files
        line_in = infile.readline()

    # a falsy record (or an explicit skip request) signals EOF
    if not line_in or noparse:
        eof = True
        return(eof, None, None, None, None, None, None)
    else:
        eof = False

    # Step 2: parse the record into d, a sample -> presence dict
    # (1 = present, np.nan = missing call)
    d = {}
    if var_type == "kmers":
        if not uncompressed:
            # gzip handles yield bytes; decode before splitting
            line_in = line_in.decode()
        # line format: "<kmer> | sample1:count sample2:count ..."
        var_name, strains = (line_in.split()[0],
                             line_in.rstrip().split(
                                 '|')[1].lstrip().split())
        if keep_list != None and var_name not in keep_list:
            return(eof, None, None, None, None, None, None)
        else:
            # counts after ':' are ignored; presence only
            d = {str(x.split(':')[0]): 1
                 for x in strains}
    elif var_type == "vcf":
        if not burden:
            var_name = read_vcf_var(line_in, d, keep_list)
            if var_name is None:
                # filtered out (multi-allelic, non-PASS, or not in keep_list)
                return (eof, None, None, None, None, None, None)
        else:
            # burden test. Regions are named contig:start-end.
            # Start is non-inclusive, so start one before to include
            (var_name, regions) = line_in
            for region in regions:
                region = re.match('^(.+):(\d+)-(\d+)$', region)
                if region:
                    # Adds presence to d for every variant
                    # observation in region
                    for variant in infile.fetch(region.group(1),
                                                int(region.group(2)) - 1,
                                                int(region.group(3))):
                        var_sub_name = read_vcf_var(variant, d, keep_list)
                else:  # stop trying to make 'fetch' happen
                    sys.stderr.write("Could not parse region %s\n" %
                                     str(region))
                    return (eof, None, None, None, None, None, None)
    elif var_type == "Rtab":
        try:
            split_line = line_in.rstrip().split('\t')
        except TypeError:
            # python3 unicode corner case
            split_line = line_in.decode().rstrip().split('\t')
        var_name, strains = split_line[0], split_line[1:]
        if keep_list != None and var_name not in keep_list:
            return (eof, None, None, None, None, None, None)
        # sanity check
        if len(strains) == 0:
            raise ValueError('No sample data found; is this a Rtab file?')
        # sanity check
        if len(strains) != len(sample_order):
            raise ValueError('Unexpected mismatch between header and data row')
        for present, sample in zip(strains, sample_order):
            # sanity check
            if present not in {'0', '1', '.'}:
                raise ValueError('Rtab file not binary')
            if present == '1':
                d[sample] = 1
            elif present == ".":
                d[sample] = np.nan

    # Step 3: derive the summary statistics and the vector k ordered
    # by the phenotype index
    # Use common dictionary to format design matrix etc
    kstrains = sorted(set(d.keys()).intersection(all_strains))  # This will include missing
    nkstrains = sorted(all_strains.difference(set(kstrains)))
    # default for missing samples is absent kmer
    for x in nkstrains:
        d[x] = 0
    af = float(len(kstrains)) / len(all_strains)
    if len(kstrains) == 0:
        sys.stderr.write("No observations of " + var_name + " in selected samples\n")

    k = np.array([d[x] for x in p.index
                  if x in d])
    missing = float(np.sum(np.isnan(k))) / len(all_strains)
    # NOTE(review): missing is normalized by len(all_strains) while k is
    # restricted to samples in p.index — confirm these are the same set
    # at every call site

    return (eof, k, var_name, kstrains, nkstrains, af, missing)
def read_vcf_var(variant, d, keep_list = None):
    """Parses vcf variants from pysam

    Returns None if filtered variant. Mutates passed dictionary d

    Args:
        variant (pysam.libcbcf.VariantRecord)
            Variant to be parsed
        d (dict)
            Dictionary to be populated in-place
            (sample -> 1 for present, np.nan for missing call)
        keep_list (list)
            List of variants to read; None reads all variants

    Returns:
        var_name (str or None)
            Variant name (contig_pos_alleles), or None if the variant
            was filtered out
    """
    var_name = "_".join([variant.contig, str(variant.pos)] +
                        [str(allele) for allele in variant.alleles])
    # use identity comparison with None ('== None' is fragile and
    # unidiomatic)
    if keep_list is None or var_name in keep_list:
        # Do not support multiple alleles. Use 'bcftools norm' to split these
        if variant.alts is not None and len(variant.alts) > 1:
            sys.stderr.write("Multiple alleles at %s_%s. Skipping\n" %
                             (variant.contig, str(variant.pos)))
            var_name = None
        elif len(variant.filter.keys()) > 0 and "PASS" not in variant.filter.keys():
            # respect the FILTER column: anything other than PASS is dropped
            var_name = None
        else:
            for sample, call in variant.samples.items():
                # This is dominant encoding. Any instance of '1' will count as present
                # Could change to additive, summing instances, or recessive only counting
                # when all instances are 1.
                # Shouldn't matter for bacteria, but some people call hets
                for haplotype in call.get('GT', [None]):
                    # if sample already present in dictionary do not overwrite
                    # important for burden testing
                    if (haplotype is None or str(haplotype) == ".") and sample not in d:
                        d[sample] = np.nan
                    elif haplotype is not None and haplotype != 0 and str(haplotype) != ".":
                        d[sample] = 1
                        break
                    # if sample already present as missing and haplotype is '0'
                    # then remove
                    elif sample in d and np.isnan(d[sample]) and str(haplotype) != '.':
                        del d[sample]
    else:
        var_name = None
    return(var_name)
def iter_variants(p, m, cov, var_type, burden, burden_regions, infile,
                  all_strains, sample_order, lineage_effects, lineage_clusters,
                  min_af, max_af, max_missing, filter_pvalue, lrt_pvalue, null_fit,
                  firth_null, uncompressed, continuous):
    """Make an iterable to pass single variants to fixed effects regression

    Reads variants one at a time with read_variant and yields, for each,
    a tuple bundling the variant with the phenotype, structure,
    covariates and all fitting parameters.

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        m (numpy.array)
            Population structure matrix (n, k)
        cov (pandas.DataFrame)
            Covariates matrix (n, m)
        var_type (str)
            Variants type (one of: kmers, vcf or Rtab)
        burden (bool)
            Whether to slice a vcf file by burden regions
        burden_regions (collections.deque)
            Burden regions to slice the vcf with
        infile (opened file)
            Handle to opened variant file
        all_strains (set-like)
            All sample labels that should be present
        sample_order (list)
            Samples order to interpret each Rtab line
        lineage_effects (bool)
            Whether to fit lineage effects
        lineage_clusters (list)
            Lineage clusters indexes
        min_af (float)
            Minimum allele frequency (inclusive)
        max_af (float)
            Maximum allele frequency (inclusive)
        max_missing (float)
            Maximum missing frequency
        filter_pvalue (float)
            Pre-filtering p-value threshold
        lrt_pvalue (float)
            Filtering p-value threshold
        null_fit (float or statsmodels RegressionResultsWrapper)
            Null-fit likelihood (binary) or model (continuous)
        firth_null (float)
            Firth regression likelihood
        uncompressed (bool)
            Whether the kmers file is uncompressed
        continuous (bool)
            Whether the phenotype is continuous or not

    Yields:
        Tuple of (var_name, phenotypes, k, m, c, af, pattern,
        lineage_effects, lineage_clusters, filter_pvalue, lrt_pvalue,
        null_fit, firth_null, kstrains, nkstrains, continuous);
        phenotypes is None when the variant failed the af/missing
        filters.
    """
    while True:
        (eof, k, var_name, kstrains,
         nkstrains, af, missing) = read_variant(infile, p, var_type,
                                                burden, burden_regions,
                                                uncompressed, all_strains,
                                                sample_order)
        # variant file exhausted
        if eof:
            return

        pattern = hash_pattern(k) if k is not None else None

        # TODO: currently the way to know if a variant was
        # af filtered is to check that phenotypes vector is None
        # a more robust way could be better
        failed_filter = (k is None or
                         not (min_af <= af <= max_af) or
                         missing > max_missing)
        phenotypes = None if failed_filter else p.values

        yield (var_name, phenotypes, k, m, cov.values, af, pattern,
               lineage_effects, lineage_clusters,
               filter_pvalue, lrt_pvalue, null_fit, firth_null,
               kstrains, nkstrains, continuous)
def iter_variants_lmm(variant_iter, lmm, h2,
                      lineage, lineage_clusters,
                      covariates, continuous, filter_pvalue, lrt_pvalue):
    """Make an iterable to pass single variants to fixed effects regression

    Wraps each block from variant_iter together with the fitted LMM and
    filtering parameters; stops on an empty block or when the underlying
    iterator signals end-of-file.
    """
    for block in variant_iter:
        variants, variant_mat, eof = block
        if not len(variants):
            break
        yield (lmm, h2, variants, variant_mat, lineage,
               lineage_clusters, covariates,
               continuous, filter_pvalue, lrt_pvalue)
        if eof:
            break
# Loads a block of variants into memory for use with LMM
def load_var_block(var_type, p, burden, burden_regions, infile,
                   all_strains, sample_order, min_af, max_af,
                   max_missing, uncompressed, block_size):
    """Make an iterable to load blocks of variants for LMM

    Args:
        var_type (str)
            Variants type (one of: kmers, vcf or Rtab)
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        burden (bool)
            Whether to slice a vcf file by burden regions
        burden_regions (collections.deque)
            Burden regions to slice the vcf with
        infile (opened file)
            Handle to opened variant file
        all_strains (set-like)
            All sample labels that should be present
        sample_order (list)
            Samples order to interpret each Rtab line
        min_af (float)
            Minimum allele frequency (inclusive)
        max_af (float)
            Maximum allele frequency (inclusive)
        max_missing (float)
            Maximum missing frequency
        uncompressed (bool)
            Whether the kmers file is uncompressed
        block_size (int)
            How many variants to be loaded at once

    Yields:
        variants (iterable)
            A collection of pyseer.classes.LMM objects describing the
            loaded variants (n,)
        variant_mat (numpy.array)
            Variant block presence/absence matrix (n, block_size)
        eof (bool)
            Whether we are at the end of the file
    """
    while True:
        variants = []
        # pre-allocation of memory; filtered variants leave their
        # column as zeros
        variant_mat = np.zeros((len(p), block_size))
        for var_idx in range(block_size):
            eof, k, var_name, kstrains, nkstrains, af, missing = read_variant(
                infile, p, var_type,
                burden, burden_regions,
                uncompressed, all_strains,
                sample_order)
            # check for EOF
            if eof:
                break
            # pattern None marks a variant filtered by af/missingness;
            # only unfiltered variants are written into the matrix
            if k is None or af < min_af or af > max_af or missing > max_missing:
                pattern = None
            else:
                pattern = hash_pattern(k)
                variant_mat[:, var_idx] = k
            # placeholder LMM result object; statistics filled in later
            # by the fitting code
            variants.append((var_obj.LMM(var_name, pattern, af, np.nan,
                                         np.nan, np.nan, np.nan, np.nan,
                                         np.nan, kstrains, nkstrains,
                                         set(), True, True),
                             p.values, k))
        yield (variants, variant_mat, eof)
        if eof:
            break
    # final sentinel so consumers see an explicit end-of-stream marker
    yield None, None, True
def hash_pattern(k):
"""Calculates the hash of a presence/absence vector
Args:
k (numpy.array)
Variant presence/absence binary vector (n, 1)
Returns:
hash (byte)
Hashed pattern
"""
pattern = k.view(np.uint8)
hashed = hashlib.md5(pattern)
return (binascii.b2a_base64(hashed.digest()))
def file_hash(filename):
    """Calculates the hash of an entire file on disk

    Args:
        filename (str)
            Location of file on disk

    Returns:
        hash (str)
            SHA256 checksum
    """
    digest = hashlib.sha256()
    # read in 4 KiB chunks to keep memory use flat on large files
    with open(filename, "rb") as handle:
        while True:
            chunk = handle.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()