#!/usr/bin/env python
"""
Lifts intropolis over to hg38 and writes output including both hg19 coordinates
and hg38 coordinates.

Requires the liftOver executable, an unzipped hg19-to-hg38 liftover chain, and
intropolis.v1.hg19.tsv.gz .

Writes to stdout. We ran this script with the arguments
--liftover /path/to/liftOver
--chain /path/to/hg19ToHg38.over.chain
--intropolis /path/to/intropolis.v1.hg19.tsv.gz
| gzip >intropolis.v1.hg19_with_liftover_to_hg38.tsv.gz

Tab-separated output fields
1. hg19 chrom
2. hg19 start (1-based, inclusive)
3. hg19 end (1-based, inclusive)
4. hg19 strand
5. left motif (e.g., GT)
6. right motif (e.g., AG)
7. comma-separated list of indexes of samples from
intropolis in which junction was found
8. comma-separated list of numbers of reads in corresponding samples from
field 7 overlapping junction
9. hg38 chrom or NA if liftover unsuccessful
10. hg38 start or NA
11. hg38 end or NA
12. hg38 strand or NA
"""
import tempfile
import gzip
import shutil
import atexit
import subprocess
import os
# Entry point: the statements below run only when the file is executed as a
# script. This first stage defines and parses the command-line options.
if __name__ == '__main__':
import argparse
# Print file's docstring if -h is invoked
# NOTE(review): the ArgumentParser(...) call and every add_argument(...) call
# below are left unclosed in this view; their continuation lines appear to be
# missing -- confirm against the full file.
parser = argparse.ArgumentParser(description=__doc__,
# --liftover (required): path to the liftOver executable.
parser.add_argument('--liftover', type=str, required=True,
help=('path to liftOver executable available from '
# --chain (required): path to the unzipped liftover chain file.
parser.add_argument('--chain', type=str, required=True,
help=('path to unzipped liftover chain; this should be '
# --intropolis (required): path to intropolis.v1.hg19.tsv.gz.
parser.add_argument('--intropolis', type=str, required=True,
help='path to intropolis.v1.hg19.tsv.gz'
# --temp-dir (optional): directory in which temporary files are created.
parser.add_argument('--temp-dir', type=str, required=False,
help='where to store temporary files; defaults to TMPDIR'
args = parser.parse_args()
# Create a fresh scratch directory (inside --temp-dir when one was given) and
# build the paths of the intermediate BED files that live in it.
temp_dir = tempfile.mkdtemp(dir=args.temp_dir)
to_liftover = os.path.join(temp_dir, 'to_liftover.bed')
temp_hg38 = os.path.join(temp_dir, 'hg38.bed')
temp_hg19 = os.path.join(temp_dir, 'hg19.bed')
# Write every input junction to temp_hg19 as a six-column BED record.
# NOTE(review): the second context manager, which opens input_stream, is
# truncated in this view; presumably it reads args.intropolis -- confirm.
with open(temp_hg19, 'w') as hg19_stream,
) as input_stream:
for i, line in enumerate(input_stream):
tokens = line.strip().split('\t')
# Only the start is shifted down by one; the end token is used unchanged.
chrom, start, end, strand = (
tokens[0], str(int(tokens[1]) - 1),
tokens[2], tokens[3],
) # zero-based, half-open coordinates for BED
# Tack original junction onto junction name
# The name holds the input line index followed by the BED-style hg19
# chrom/start/end/strand, ';'-separated, so the hg19 record can be read back
# from the name column of the lifted-over output.
junction_name = ';'.join([str(i), chrom, start, end, strand])
# Python 2 print-to-file syntax. Columns written: chrom, start, end,
# 'info_' + junction_name, a constant score of 1, strand.
# NOTE(review): the closing parenthesis of format(...) is not visible here.
print >>hg19_stream, '{}\t{}\t{}\tinfo_{}\t1\t{}'.format(
chrom, start, end, junction_name, strand
# Convert junctions from hg19 to hg38
# NOTE(review): the list joined into this space-separated command is truncated
# in this view; presumably it names the liftOver executable, the chain file and
# the BED paths defined earlier -- confirm against the full file.
liftover_process =' '.join([
# Combined file that receives both the lifted-over records and the original
# intropolis lines, to be sorted together afterwards.
to_sort = os.path.join(temp_dir, 'intropolis_and_liftover.tsv.gz')
# NOTE(review): the expression that opens both_stream for writing is truncated;
# the '.tsv.gz' name suggests a gzip writer -- confirm.
with, 'w') as both_stream:
with open(temp_hg38) as hg38_stream:
for line in hg38_stream:
# Each lifted-over BED line is unpacked into six columns.
chrom, start, end, name, score, strand = line.strip().split(
# Recover the hg19 coordinates stored in the name column; the first
# ';'-separated piece is discarded.
(_, hg19_chrom, hg19_start,
hg19_end, hg19_strand) = name.split(';')
hg19_start, start = int(hg19_start), int(start)
# Emit the hg19 fields followed by the hg38 fields, shifting both starts back
# up by one, plus a trailing 'FAKE' column.
# NOTE(review): 'FAKE' presumably pads these rows so they are longer than the
# original intropolis rows, which the later length comparison relies on --
# confirm.
print >>both_stream, '\t'.join(
[hg19_chrom, str(hg19_start + 1), hg19_end,
hg19_strand, chrom, str(start + 1),
end, strand, 'FAKE']
# Append the intropolis lines unchanged; the trailing comma on the print
# statement stops Python 2 from adding a second newline.
# NOTE(review): the expression that opens intropolis_stream is truncated.
with as intropolis_stream:
for line in intropolis_stream:
print >>both_stream, line,
sorted_together = os.path.join(temp_dir, 'sorted_together.tsv.gz')
# Shell pipeline: decompress to_sort, sort on column 1 (text) and then columns
# 2 and 3 (numeric), and recompress into sorted_together.
# NOTE(review): the call that runs this pipeline is truncated; only its
# trailing arguments (shell=True, bufsize=-1) are visible.
'gzip -cd {} | sort -k1,1 -k2,2n -k3,3n | gzip >{}'.format(
to_sort, sorted_together
), shell=True, bufsize=-1
# Walk the sorted stream two records at a time and print one output line per
# junction: the hg19 record followed by either its hg38 coordinates or four
# 'NA' columns.
# NOTE(review): the expression that opens sorted_stream is truncated, and the
# 'else:' / 'break' lines implied by the comments below are not visible in
# this view -- confirm the branch structure against the full file.
with as sorted_stream:
# Prime the two-record window.
junction_1_tokens = sorted_stream.readline().strip().split('\t')
junction_2_tokens = sorted_stream.readline().strip().split('\t')
while True:
# Matching first four fields (hg19 chrom, start, end, strand) means the two
# records describe the same junction.
if junction_1_tokens[:4] == junction_2_tokens[:4]:
# Liftover available
# The record with more fields is taken to be the lifted-over one.
if len(junction_1_tokens) > len(junction_2_tokens):
hg38_tokens = junction_1_tokens
hg19_tokens = junction_2_tokens
hg38_tokens = junction_2_tokens
hg19_tokens = junction_1_tokens
# Fields 4..7 of the lifted-over record are the hg38 chrom, start, end and
# strand; they are appended to the hg19 record.
print '\t'.join(hg19_tokens + hg38_tokens[4:8])
# Both records are consumed, so refill the window with two fresh lines.
junction_1_tokens = sorted_stream.readline().strip()
if not junction_1_tokens:
# End of file; nothing to print
junction_1_tokens = junction_1_tokens.split('\t')
junction_2_tokens = sorted_stream.readline().strip()
if not junction_2_tokens:
# End of file; print junction 1 tokens and sign out
print '\t'.join(junction_1_tokens + ['NA'] * 4)
junction_2_tokens = junction_2_tokens.split('\t')
'''Liftover not available for junction 1, but have to check
junction 2 against next junction.'''
print '\t'.join(junction_1_tokens + ['NA'] * 4)
# Slide the window forward by one record.
junction_1_tokens = junction_2_tokens
junction_2_tokens = sorted_stream.readline().strip()
if not junction_2_tokens:
# End of file; print new junction 1 tokens and sign out
print '\t'.join(junction_1_tokens + ['NA'] * 4)
junction_2_tokens = junction_2_tokens.split('\t')