In [2]:
# dictionary mapping gene names to list of last included exons
five_prime_genes = {
    # MGA exon 22, NUTM1 exon 3, source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6318763/
    "MGA": [22], 
    
    # BRD4 exons 10 & 11, sources:
    # - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5378225/
    # - https://aacrjournals.org/cancerres/article/63/2/304/510577/BRD4-NUT-Fusion-OncogeneA-Novel-Mechanism-in
    "BRD4": [10, 11],
    
    # BRD3 exon 9, NUTM1 exon 2, source: https://www.nature.com/articles/1210852
    "BRD3": [9],
    
    
    # MXD4 exon 5, NUTM1 exons 2 & 3, sources:
    # - https://pubmed.ncbi.nlm.nih.gov/30338611/
    # - https://www.nature.com/articles/s41379-021-00792-z
    "MXD4": [5],
    
    # CIC exons 16-20, NUTM1 exons 2-5, source: https://hal.archives-ouvertes.fr/hal-01927040
    "CIC": [16, 17, 18, 19, 20],
    
    # SLC12A6 exon 2, NUTM1 exon 3, source: https://www.haematologica.org/article/view/9099
    "SLC12A6": [2],
    
    # YAP1 exon 3, NUTM1 exon 2, source: https://link.springer.com/article/10.1007/s12105-020-01173-9
    "YAP1": [3],
}

# even though the CIC paper claims there are fusions on exons 4 & 5, I don't see those in any other papers
NUTM1_start_exons = [2, 3] 

In [3]:
from pyensembl import ensembl_grch38

In [44]:
name_to_gene = {}
for name in set(five_prime_genes.keys()).union({"NUTM1"}):
    genes = ensembl_grch38.genes_by_name(name)
    assert len(genes) == 1
    name_to_gene[name] = genes[0]

In [45]:
def transcript_key(t):
    return (t.complete, -t.support_level if t.support_level else 100, len(t.protein_sequence) if t.protein_sequence else 0)

def pick_best_transcript(ts):
    return sorted(ts, key=transcript_key)[-1]

In [52]:
canonical_transcripts = {"NUTM1": "NUTM1-203"}

name_to_transcript = {}

for (name, gene) in name_to_gene.items():
    if name in canonical_transcripts:
        transcript_name = canonical_transcripts[name]
        transcript = ensembl_grch38.transcripts_by_name(transcript_name)[0]
    else:
        transcript = pick_best_transcript(gene.transcripts)
    name_to_transcript[name] = transcript

In [53]:
name_to_transcript

{'CIC': Transcript(transcript_id='ENST00000681038', transcript_name='CIC-209', gene_id='ENSG00000079432', biotype='protein_coding', contig='19', start=42269252, end=42295796, strand='+', genome='GRCh38'),
 'SLC12A6': Transcript(transcript_id='ENST00000676379', transcript_name='SLC12A6-221', gene_id='ENSG00000140199', biotype='protein_coding', contig='15', start=34230036, end=34337462, strand='-', genome='GRCh38'),
 'BRD4': Transcript(transcript_id='ENST00000679869', transcript_name='BRD4-213', gene_id='ENSG00000141867', biotype='protein_coding', contig='19', start=15235519, end=15332539, strand='-', genome='GRCh38'),
 'YAP1': Transcript(transcript_id='ENST00000282441', transcript_name='YAP1-201', gene_id='ENSG00000137693', biotype='protein_coding', contig='11', start=102110447, end=102233424, strand='+', genome='GRCh38'),
 'BRD3': Transcript(transcript_id='ENST00000303407', transcript_name='BRD3-201', gene_id='ENSG00000169925', biotype='protein_coding', contig='9', start=134030305, end

In [54]:
name_to_coding_exon_lengths = {}
for name, transcript in name_to_transcript.items():
    exon_coords = transcript.coding_sequence_position_ranges
    if transcript.strand == "-":
        exons = reversed([(end, start) for (start, end) in exon_coords])
    name_to_coding_exon_lengths[name] = [end - start + 1 for (start, end) in exon_coords]
        

In [55]:
name_to_coding_exon_lengths

{'CIC': [2794,
  150,
  235,
  130,
  183,
  166,
  203,
  226,
  104,
  1234,
  188,
  122,
  167,
  294,
  326,
  245,
  155,
  132,
  132,
  365],
 'SLC12A6': [271,
  45,
  95,
  132,
  147,
  55,
  131,
  242,
  215,
  159,
  99,
  58,
  175,
  119,
  99,
  120,
  105,
  169,
  196,
  170,
  132,
  108,
  185,
  134,
  32,
  33],
 'BRD4': [285,
  138,
  136,
  290,
  363,
  129,
  210,
  200,
  296,
  111,
  53,
  370,
  588,
  113,
  163,
  131,
  206,
  238,
  66],
 'YAP1': [321, 251, 116, 114, 182, 48, 131, 113, 236],
 'BRD3': [213, 138, 148, 215, 372, 129, 192, 236, 293, 129, 113],
 'MXD4': [64, 100, 30, 115, 163, 155],
 'MGA': [1064,
  949,
  79,
  96,
  132,
  105,
  659,
  346,
  227,
  186,
  73,
  518,
  151,
  291,
  1505,
  131,
  52,
  207,
  112,
  234,
  177,
  1274],
 'NUTM1': [6, 94, 709, 129, 137, 287, 117, 2001]}

In [73]:
name_to_5prime_utr_exons_count = {}
for name, t in name_to_transcript.items():
    
    if t.strand == "+":
        start_codon_pos = min(t.start_codon_positions)
        count = sum([
            end < start_codon_pos
            for (_, end) in t.exon_intervals])
    else:
        start_codon_pos = max(t.start_codon_positions)
        count = sum([
            start > start_codon_pos
            for (start, end) in t.exon_intervals])
    name_to_5prime_utr_exons_count[name] = count
name_to_5prime_utr_exons_count

{'CIC': 1,
 'SLC12A6': 0,
 'BRD4': 1,
 'YAP1': 0,
 'BRD3': 1,
 'MXD4': 0,
 'MGA': 1,
 'NUTM1': 0}

In [None]:

name_and_exon_to_cds_length = {}
for name, exon_numbers in list(five_prime_genes.items()) + [("NUTM1", NUTM1_start_exons)]:
    print(name, exon_numbers) 
    skip_exons = name_to_5prime_utr_exons_count[name]
    if name == "NUTM1":
        min_exon = min(exon_numbers)
        exon_numbers = [min_exon - 1] + exon_numbers
    for exon_number in exon_numbers:
        cds_length = sum(name_to_coding_exon_lengths[name][:exon_number])
        print("-- %s %d: %d (%d)" % (name, exon_number, cds_length, cds_length % 3))

In [58]:
name_to_transcript["YAP1"]

Transcript(transcript_id='ENST00000282441', transcript_name='YAP1-201', gene_id='ENSG00000137693', biotype='protein_coding', contig='11', start=102110447, end=102233424, strand='+', genome='GRCh38')

In [64]:
name_to_transcript["BRD4"]

Transcript(transcript_id='ENST00000679869', transcript_name='BRD4-213', gene_id='ENSG00000141867', biotype='protein_coding', contig='19', start=15235519, end=15332539, strand='-', genome='GRCh38')

In [67]:
t.start_codon_positions

[34343697, 34343698, 34343699]

In [68]:
t.start_codon_unspliced_offsets

[382, 383, 384]

In [71]:
t.exon_intervals

[(34343315, 34343702),
 (34345942, 34346035),
 (34347969, 34348677),
 (34350704, 34350832),
 (34353736, 34353872),
 (34354446, 34354732),
 (34355021, 34355137),
 (34355488, 34357735)]