In [1]:
import pysam
import gzip

In [2]:
def from_genome_or_bam(filename):
    """Return the sorted (reference name, length) pairs of a BAM or FASTA file.

    A path ending in ".bam" is opened with ``pysam.AlignmentFile``; any other
    path is opened with ``pysam.FastaFile``.

    Parameters
    ----------
    filename : str
        Path to the input file.

    Returns
    -------
    list of tuple
        One ``(name, length)`` pair per entry of the file's ``references``,
        with the length converted to ``str``. The list is sorted ascending
        (tuple order: by name first, then by the length string).
    """
    opener = pysam.AlignmentFile if filename.endswith(".bam") else pysam.FastaFile
    # The context manager closes the underlying handle even if reading fails;
    # without it every call left the file open.
    with opener(filename) as handle:
        pairs = [
            (ref_name, str(handle.get_reference_length(ref_name)))
            for ref_name in handle.references
        ]
    pairs.sort()
    return pairs

In [3]:
def from_anno(filename):
    """Return the sorted, de-duplicated first-column values of a gzipped GTF.

    The file is read as gzip-compressed text. Blank lines and lines whose
    first non-whitespace character is "#" are skipped; on every other line
    anything from the first "#" onwards is discarded before the first
    tab-separated field is taken as the reference name.

    Parameters
    ----------
    filename : str
        Path to the gzip-compressed annotation file.

    Returns
    -------
    list of str
        Distinct reference names in ascending order.
    """
    ref_names = set()
    with gzip.open(filename, 'rt') as handle:
        for raw in handle:
            record = raw.strip()

            # skip empty lines and whole-line comments
            if not record or record.startswith("#"):
                continue

            # cut off a trailing inline comment; a no-op when there is none
            record = record.split("#")[0].strip()

            # the reference name is the first tab-separated field
            ref_names.add(record.split("\t")[0])

    return sorted(ref_names)

In [4]:
def comparison(*args, width=20):
    """Print the given sequences side by side, one right-aligned column each.

    Row ``i`` of the output holds element ``i`` of every sequence, joined
    with "|". Tuple elements are rendered as their items joined by a single
    space; any other element is formatted as-is. Sequences shorter than the
    longest one are shown with blank cells. The inputs are never modified.

    Parameters
    ----------
    *args : sequence
        Indexable, sized sequences (e.g. lists) of strings or tuples of
        strings.
    width : int, keyword-only, default 20
        Minimum width each cell is right-aligned to. Longer entries are not
        truncated, so they push the row out of alignment; raise ``width`` to
        accommodate them.

    Returns
    -------
    None
        The table is written to standard output. Nothing is printed when no
        sequences are given.
    """
    if not args:
        # max() of an empty iterable would raise; there is nothing to show.
        return

    n_rows = max(map(len, args))
    for row in range(n_rows):
        cells = []
        for column in args:
            # Render a blank for exhausted columns rather than appending
            # padding to (and thereby mutating) the caller's list.
            entry = column[row] if row < len(column) else ""
            if isinstance(entry, tuple):
                entry = " ".join(entry)
            cells.append(f"{entry:>{width}}")
        print("|".join(cells))

In [5]:
# dm3: reference names from the annotation next to the names and lengths
# reported by the genome FASTA and by the ENCFF393FYM BAM.
comparison(
    from_anno("../annotations/dm3.gtf.gz"),
    from_genome_or_bam("../genomes/dm3.fa"),
    from_genome_or_bam("../../_pyIPSA/data/dm3_ENCFF393FYM.bam"),
)

               chr2L|      chr2L 23011544|      chr2L 23011544
            chr2LHet|     chr2LHet 368872|     chr2LHet 368872
               chr2R|      chr2R 21146708|      chr2R 21146708
            chr2RHet|    chr2RHet 3288761|    chr2RHet 3288761
               chr3L|      chr3L 24543557|      chr3L 24543557
            chr3LHet|    chr3LHet 2555491|    chr3LHet 2555491
               chr3R|      chr3R 27905053|      chr3R 27905053
            chr3RHet|    chr3RHet 2517507|    chr3RHet 2517507
                chr4|        chr4 1351857|        chr4 1351857
                chrM|          chrM 19517|          chrM 19517
                chrU|       chrU 10049037|       chrU 10049037
           chrUextra|  chrUextra 29004656|       chrX 22422827
                chrX|       chrX 22422827|      chrXHet 204112
             chrXHet|      chrXHet 204112|      chrYHet 347038
             chrYHet|      chrYHet 347038|                    


In [6]:
# hg19: reference names from the annotation next to the names and lengths
# reported by the genome FASTA and by the PCAWG BAM.
comparison(
    from_anno("../annotations/hg19.gtf.gz"),
    from_genome_or_bam("../genomes/hg19.fa"),
    from_genome_or_bam("../../_pyIPSA/data/PCAWG.bam"),
)

                chr1|   GL000191.1 106433|         1 249250621
               chr10|   GL000192.1 547496|        10 135534747
               chr11|   GL000193.1 189789|        11 135006516
               chr12|   GL000194.1 191469|        12 133851895
               chr13|   GL000195.1 182896|        13 115169878
               chr14|    GL000196.1 38914|        14 107349540
               chr15|    GL000197.1 37175|        15 102531392
               chr16|    GL000198.1 90085|         16 90354753
               chr17|   GL000199.1 169874|         17 81195210
               chr18|   GL000200.1 187035|         18 78077248
               chr19|    GL000201.1 36148|         19 59128983
                chr2|    GL000202.1 40103|         2 243199373
               chr20|    GL000203.1 37498|         20 63025520
               chr21|    GL000204.1 81310|         21 48129895
               chr22|   GL000205.1 174588|         22 51304566
                chr3|    GL000206.1 41001|         3 19

In [7]:
# hg19: reference names from the annotation next to the names and lengths
# reported by the genome FASTA and by the ENCFF040ZWV BAM.
comparison(
    from_anno("../annotations/hg19.gtf.gz"),
    from_genome_or_bam("../genomes/hg19.fa"),
    from_genome_or_bam("../../_pyIPSA/data/ENCFF040ZWV.bam"),
)

                chr1|   GL000191.1 106433|     ERCC-00002 1061
               chr10|   GL000192.1 547496|     ERCC-00003 1023
               chr11|   GL000193.1 189789|      ERCC-00004 523
               chr12|   GL000194.1 191469|     ERCC-00007 1135
               chr13|   GL000195.1 182896|      ERCC-00009 984
               chr14|    GL000196.1 38914|      ERCC-00012 994
               chr15|    GL000197.1 37175|      ERCC-00013 808
               chr16|    GL000198.1 90085|     ERCC-00014 1957
               chr17|   GL000199.1 169874|      ERCC-00016 844
               chr18|   GL000200.1 187035|     ERCC-00017 1136
               chr19|    GL000201.1 36148|     ERCC-00018 1032
                chr2|    GL000202.1 40103|      ERCC-00019 644
               chr20|    GL000203.1 37498|      ERCC-00022 751
               chr21|    GL000204.1 81310|      ERCC-00023 273
               chr22|   GL000205.1 174588|      ERCC-00024 536
                chr3|    GL000206.1 41001|     ERCC-000

In [8]:
# hg19: reference names from the annotation next to the names and lengths
# reported by the genome FASTA and by the 1a1_S2_R1_001 coordinate-sorted BAM.
comparison(
    from_anno("../annotations/hg19.gtf.gz"),
    from_genome_or_bam("../genomes/hg19.fa"),
    from_genome_or_bam("../../_pyIPSA/data/1a1_S2_R1_001.Aligned.sortedByCoord.out.bam"),
)

                chr1|   GL000191.1 106433|      chr1 249250621
               chr10|   GL000192.1 547496|     chr10 135534747
               chr11|   GL000193.1 189789|     chr11 135006516
               chr12|   GL000194.1 191469|chr11_gl000202_random 40103
               chr13|   GL000195.1 182896|     chr12 133851895
               chr14|    GL000196.1 38914|     chr13 115169878
               chr15|    GL000197.1 37175|     chr14 107349540
               chr16|    GL000198.1 90085|     chr15 102531392
               chr17|   GL000199.1 169874|      chr16 90354753
               chr18|   GL000200.1 187035|      chr17 81195210
               chr19|    GL000201.1 36148|chr17_ctg5_hap1 1680828
                chr2|    GL000202.1 40103|chr17_gl000203_random 37498
               chr20|    GL000203.1 37498|chr17_gl000204_random 81310
               chr21|    GL000204.1 81310|chr17_gl000205_random 174588
               chr22|   GL000205.1 174588|chr17_gl000206_random 41001
                

In [9]:
# hg38: reference names from the annotation next to the names and lengths
# reported by the genome FASTA and by the ENCFF171TTJ BAM.
comparison(
    from_anno("../annotations/hg38.gtf.gz"),
    from_genome_or_bam("../genomes/hg38.fa"),
    from_genome_or_bam("../../_pyIPSA/data/hg38_ENCFF171TTJ.bam"),
)

          GL000009.2|   GL000008.2 209709|     ERCC-00002 1061
          GL000194.1|   GL000009.2 201709|     ERCC-00003 1023
          GL000195.1|   GL000194.1 191469|      ERCC-00004 523
          GL000205.2|   GL000195.1 182896|     ERCC-00007 1135
          GL000213.1|   GL000205.2 185591|      ERCC-00009 984
          GL000216.2|    GL000208.1 92689|      ERCC-00012 994
          GL000218.1|   GL000213.1 164239|      ERCC-00013 808
          GL000219.1|   GL000214.1 137718|     ERCC-00014 1957
          GL000220.1|   GL000216.2 176608|      ERCC-00016 844
          GL000225.1|   GL000218.1 161147|     ERCC-00017 1136
          KI270442.1|   GL000219.1 179198|     ERCC-00018 1032
          KI270711.1|   GL000220.1 161802|      ERCC-00019 644
          KI270713.1|   GL000221.1 155397|      ERCC-00022 751
          KI270721.1|   GL000224.1 179693|      ERCC-00023 273
          KI270726.1|   GL000225.1 211173|      ERCC-00024 536
          KI270727.1|    GL000226.1 15008|     ERCC-000