In [3]:
# dictionary mapping gene names to list of last included exons
five_prime_genes = {
    # MGA exon 22, NUTM1 exon 3, source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6318763/
    "MGA": [22], 
    
    # BRD4 exons 10 & 11, sources:
    # - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5378225/
    # - https://aacrjournals.org/cancerres/article/63/2/304/510577/BRD4-NUT-Fusion-OncogeneA-Novel-Mechanism-in
    #
    # - exon 15, source: https://www.nature.com/articles/onc2012487
    "BRD4": [10, 11, 12, 13, 14, 15],
    
    # BRD3 exon 9, NUTM1 exon 2, source: https://www.nature.com/articles/1210852
    "BRD3": [9],
    
    
    # MXD4 exon 5, NUTM1 exons 2 & 3, sources:
    # - https://pubmed.ncbi.nlm.nih.gov/30338611/
    # - https://www.nature.com/articles/s41379-021-00792-z
    "MXD4": [5],
    
    # CIC exons 16-20, NUTM1 exons 2-5, source: https://hal.archives-ouvertes.fr/hal-01927040
    "CIC": [16, 17, 18, 19, 20],
    
    # SLC12A6 exon 2, NUTM1 exon 3, source: https://www.haematologica.org/article/view/9099
    "SLC12A6": [2],
    
    # YAP1 exon 3, NUTM1 exon 2, source: https://link.springer.com/article/10.1007/s12105-020-01173-9
    "YAP1": [3],
    
    # NSD3 exon 7, NUTM1 exon 2, sources:
    # * https://www.frontiersin.org/articles/10.3389/fonc.2022.860830/full
    # * https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4125436/
    "NSD3": [7],
}

# * even though the CIC paper claims there are fusions on exons 4 & 5, I don't see those in any other papers
# * OK, found one reference with BRD4 exon 11 fused onto NUTM1 exon 3, both of which have end/start phases of 1
#   source: 
# * Oy, found that older references missed what is now considered exon 1, so adding exon 6 which was the old exon 5
NUTM1_start_exons = [2, 3, 4, 5, 6] 

In [4]:
from pyensembl import ensembl_grch38

In [5]:
name_to_gene = {}
for name in set(five_prime_genes.keys()).union({"NUTM1"}):
    genes = ensembl_grch38.genes_by_name(name)
    assert len(genes) == 1
    name_to_gene[name] = genes[0]

In [6]:
def transcript_key(t):
    return (t.complete, -t.support_level if t.support_level else 100, len(t.protein_sequence) if t.protein_sequence else 0)

def pick_best_transcript(ts):
    return sorted(ts, key=transcript_key)[-1]

In [7]:
canonical_transcripts = {"NUTM1": "NUTM1-203"}

name_to_transcript = {}

for (name, gene) in name_to_gene.items():
    if name in canonical_transcripts:
        transcript_name = canonical_transcripts[name]
        transcript = ensembl_grch38.transcripts_by_name(transcript_name)[0]
    else:
        transcript = pick_best_transcript(gene.transcripts)
    name_to_transcript[name] = transcript

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/iskander/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/iskander/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/iskander/Library/Caches/pyensembl/GRCh38/ensembl109/Homo_sapiens.GRCh38.pep.all.fa.gz.pickle


In [18]:

name_to_transcripts = {}

for (name, gene) in name_to_gene.items():
    ts = sorted([t for t in gene.transcripts if t.biotype == "protein_coding" and t.complete])
    print(name, gene, ts)
    name_to_transcripts[name] = ts
    

MXD4 Gene(gene_id='ENSG00000123933', gene_name='MXD4', biotype='protein_coding', contig='4', start=2247432, end=2262109, strand='-', genome='GRCh38') [Transcript(transcript_id='ENST00000337190', transcript_name='MXD4-201', gene_id='ENSG00000123933', biotype='protein_coding', contig='4', start=2247432, end=2262109, strand='-', genome='GRCh38')]
SLC12A6 Gene(gene_id='ENSG00000140199', gene_name='SLC12A6', biotype='protein_coding', contig='15', start=34229784, end=34338060, strand='-', genome='GRCh38') [Transcript(transcript_id='ENST00000354181', transcript_name='SLC12A6-202', gene_id='ENSG00000140199', biotype='protein_coding', contig='15', start=34229784, end=34337781, strand='-', genome='GRCh38'), Transcript(transcript_id='ENST00000290209', transcript_name='SLC12A6-201', gene_id='ENSG00000140199', biotype='protein_coding', contig='15', start=34229996, end=34318779, strand='-', genome='GRCh38'), Transcript(transcript_id='ENST00000676379', transcript_name='SLC12A6-221', gene_id='ENSG0000

In [20]:
name_to_coding_exon_lengths = {}
for name, transcripts in name_to_transcripts.items():
    for t in transcripts:
        exon_coords = t.coding_sequence_position_ranges
        if t.strand == "-":
            exons = reversed([(end, start) for (start, end) in exon_coords])
        name_to_coding_exon_lengths[t.name] = [end - start + 1 for (start, end) in exon_coords]


In [21]:
name_to_coding_exon_lengths

{'MXD4-201': [64, 100, 30, 115, 163, 155],
 'SLC12A6-202': [271,
  45,
  95,
  132,
  147,
  55,
  131,
  242,
  215,
  159,
  99,
  58,
  175,
  119,
  99,
  120,
  105,
  169,
  196,
  170,
  132,
  108,
  185,
  134,
  89],
 'SLC12A6-201': [118,
  45,
  95,
  132,
  147,
  55,
  131,
  242,
  215,
  159,
  99,
  58,
  175,
  119,
  99,
  120,
  105,
  169,
  196,
  170,
  132,
  108,
  185,
  134,
  89],
 'SLC12A6-221': [271,
  45,
  95,
  132,
  147,
  55,
  131,
  242,
  215,
  159,
  99,
  58,
  175,
  119,
  99,
  120,
  105,
  169,
  196,
  170,
  132,
  108,
  185,
  134,
  32,
  33],
 'SLC12A6-204': [271,
  95,
  132,
  147,
  55,
  131,
  242,
  215,
  159,
  99,
  58,
  175,
  119,
  99,
  120,
  105,
  169,
  196,
  170,
  132,
  108,
  185,
  134,
  89],
 'SLC12A6-217': [271,
  45,
  95,
  132,
  147,
  55,
  131,
  242,
  215,
  159,
  99,
  58,
  175,
  119,
  99,
  120,
  105,
  169,
  196,
  170,
  132,
  108,
  185,
  134,
  89],
 'SLC12A6-205': [94,
  45,
  95,
  13

In [22]:
name_to_5prime_utr_exons_count = {}
for name, ts in name_to_transcripts.items():
    for t in ts:
        if t.strand == "+":
            start_codon_pos = min(t.start_codon_positions)
            count = sum([
                end < start_codon_pos
                for (_, end) in t.exon_intervals])
        else:
            start_codon_pos = max(t.start_codon_positions)
            count = sum([
                start > start_codon_pos
                for (start, end) in t.exon_intervals])
        name_to_5prime_utr_exons_count[t.name] = count
name_to_5prime_utr_exons_count

{'MXD4-201': 0,
 'SLC12A6-202': 1,
 'SLC12A6-201': 0,
 'SLC12A6-221': 0,
 'SLC12A6-204': 0,
 'SLC12A6-217': 0,
 'SLC12A6-205': 1,
 'SLC12A6-203': 1,
 'SLC12A6-216': 3,
 'SLC12A6-207': 1,
 'SLC12A6-206': 1,
 'NSD3-202': 1,
 'NSD3-203': 1,
 'NSD3-206': 1,
 'NSD3-201': 1,
 'MGA-207': 1,
 'MGA-201': 1,
 'MGA-214': 1,
 'BRD4-201': 1,
 'BRD4-213': 1,
 'BRD4-203': 1,
 'BRD4-202': 1,
 'BRD3-201': 1,
 'BRD3-202': 1,
 'NUTM1-203': 0,
 'NUTM1-204': 1,
 'NUTM1-202': 0,
 'NUTM1-201': 0,
 'YAP1-201': 0,
 'YAP1-204': 0,
 'YAP1-209': 0,
 'YAP1-202': 0,
 'YAP1-210': 0,
 'YAP1-211': 0,
 'YAP1-208': 0,
 'YAP1-203': 1,
 'CIC-203': 1,
 'CIC-209': 1,
 'CIC-201': 0,
 'CIC-206': 0}

In [58]:

name_and_exon_to_cds_length = {}

for gene_name, exon_numbers in list(five_prime_genes.items()) + [("NUTM1", NUTM1_start_exons)]:
    print(gene_name, exon_numbers) 
    exon_numbers = [min(exon_numbers) - 1] + exon_numbers
    for t in name_to_transcripts[gene_name]:

        num_utr_exons = name_to_5prime_utr_exons_count[t.name]
        num_exons = len(t.exons)
        for exon_number in exon_numbers:
            if exon_number < num_exons:

                cds_length = sum(name_to_coding_exon_lengths[t.name][:(exon_number - num_utr_exons)])
                print("-- %s %d: %d (%d)" % (t.name, exon_number, cds_length, cds_length % 3))

                name_and_exon_to_cds_length[(t.name, exon_number)] = cds_length
            else:
                print("??? %s has %d exons, can't truncate at exon %d" % (t, num_exons, exon_number))

MGA [22]
-- MGA-207 21: 7117 (1)
-- MGA-207 22: 7294 (1)
-- MGA-201 21: 7510 (1)
-- MGA-201 22: 7744 (1)
-- MGA-214 21: 7657 (1)
-- MGA-214 22: 7891 (1)
BRD4 [10, 11, 12, 13, 14, 15]
-- BRD4-201 9: 1751 (2)
-- BRD4-201 10: 2047 (1)
-- BRD4-201 11: 2158 (1)
-- BRD4-201 12: 2211 (0)
-- BRD4-201 13: 2581 (1)
-- BRD4-201 14: 3169 (1)
-- BRD4-201 15: 3282 (0)
-- BRD4-213 9: 1751 (2)
-- BRD4-213 10: 2047 (1)
-- BRD4-213 11: 2158 (1)
-- BRD4-213 12: 2211 (0)
-- BRD4-213 13: 2581 (1)
-- BRD4-213 14: 3169 (1)
-- BRD4-213 15: 3282 (0)
-- BRD4-203 9: 1751 (2)
-- BRD4-203 10: 2047 (1)
-- BRD4-203 11: 2158 (1)
??? Transcript(transcript_id='ENST00000371835', transcript_name='BRD4-203', gene_id='ENSG00000141867', biotype='protein_coding', contig='19', start=15247047, end=15280451, strand='-', genome='GRCh38') has 12 exons, can't truncate at exon 12
??? Transcript(transcript_id='ENST00000371835', transcript_name='BRD4-203', gene_id='ENSG00000141867', biotype='protein_coding', contig='19', start=152470

In [61]:

name_and_exon_start_phase = {}
name_and_exon_end_phase = {}
for gene_name, exon_numbers in list(five_prime_genes.items()) + [("NUTM1", NUTM1_start_exons)]:
    for exon_number in exon_numbers:
        for t in name_to_transcripts[gene_name]:
            key_prev = (t.name, exon_number - 1)
            if key_prev in name_and_exon_to_cds_length:
                
                cds_length_prev = name_and_exon_to_cds_length[(t.name, exon_number - 1)]
            else:
                print("Skipping %s" % (key_prev,))
                continue
            key = (t.name, exon_number)
            if key in name_and_exon_to_cds_length:
                cds_length = name_and_exon_to_cds_length[(t.name, exon_number)]
            else:
                print("Skipping %s" % (key,))
                continue
                
            start_phase = cds_length_prev % 3
            end_phase = cds_length % 3
            name_and_exon_start_phase[(t.name, exon_number)] = start_phase
            name_and_exon_end_phase[(t.name, exon_number)] = end_phase
            print("%s exon %d, start phase %d, end phase %d" % (t.name, exon_number, start_phase, end_phase))


MGA-207 exon 22, start phase 1, end phase 1
MGA-201 exon 22, start phase 1, end phase 1
MGA-214 exon 22, start phase 1, end phase 1
BRD4-201 exon 10, start phase 2, end phase 1
BRD4-213 exon 10, start phase 2, end phase 1
BRD4-203 exon 10, start phase 2, end phase 1
BRD4-202 exon 10, start phase 2, end phase 1
BRD4-201 exon 11, start phase 1, end phase 1
BRD4-213 exon 11, start phase 1, end phase 1
BRD4-203 exon 11, start phase 1, end phase 1
BRD4-202 exon 11, start phase 1, end phase 1
BRD4-201 exon 12, start phase 1, end phase 0
BRD4-213 exon 12, start phase 1, end phase 0
Skipping ('BRD4-203', 12)
Skipping ('BRD4-202', 12)
BRD4-201 exon 13, start phase 0, end phase 1
BRD4-213 exon 13, start phase 0, end phase 1
Skipping ('BRD4-203', 12)
Skipping ('BRD4-202', 12)
BRD4-201 exon 14, start phase 1, end phase 1
BRD4-213 exon 14, start phase 1, end phase 1
Skipping ('BRD4-203', 13)
Skipping ('BRD4-202', 13)
BRD4-201 exon 15, start phase 1, end phase 0
BRD4-213 exon 15, start phase 1, end 

In [64]:
valid_pairs = []

for five_prime_name, five_prime_exon_numbers in list(five_prime_genes.items()):
    for five_prime_exon_number in five_prime_exon_numbers:
        for five_prime_transcript in name_to_transcripts[five_prime_name]:
            five_prime_key = (five_prime_transcript.name, five_prime_exon_number)
            if five_prime_key in name_and_exon_end_phase:
                five_prime_phase = name_and_exon_end_phase[five_prime_key]
            else:
                print("Skipping %s" % (five_prime_key,))
                continue
            for three_prime_name, three_prime_exon_numbers in [("NUTM1", NUTM1_start_exons)]:
                for three_prime_transcript in name_to_transcripts[three_prime_name]:
                    for three_prime_exon_number in three_prime_exon_numbers:
                        three_prime_key = (three_prime_transcript.name, three_prime_exon_number)
                        if three_prime_key in name_and_exon_start_phase:
                            three_prime_phase = name_and_exon_start_phase[three_prime_key]
                        else:
                            print("Skipping %s" % (three_prime_key,))
                        if five_prime_phase == three_prime_phase:
                            print("%s exon %d -> %s exon %d" % (
                                five_prime_transcript.name, five_prime_exon_number, 
                                three_prime_transcript.name, three_prime_exon_number))
                            valid_pairs.append((
                                (five_prime_transcript.name, five_prime_exon_number),
                                (three_prime_transcript.name, three_prime_exon_number)
                            ))

MGA-207 exon 22 -> NUTM1-203 exon 3
MGA-207 exon 22 -> NUTM1-203 exon 6
MGA-207 exon 22 -> NUTM1-204 exon 3
MGA-207 exon 22 -> NUTM1-204 exon 6
MGA-207 exon 22 -> NUTM1-202 exon 2
MGA-207 exon 22 -> NUTM1-202 exon 5
MGA-207 exon 22 -> NUTM1-201 exon 2
MGA-207 exon 22 -> NUTM1-201 exon 5
MGA-201 exon 22 -> NUTM1-203 exon 3
MGA-201 exon 22 -> NUTM1-203 exon 6
MGA-201 exon 22 -> NUTM1-204 exon 3
MGA-201 exon 22 -> NUTM1-204 exon 6
MGA-201 exon 22 -> NUTM1-202 exon 2
MGA-201 exon 22 -> NUTM1-202 exon 5
MGA-201 exon 22 -> NUTM1-201 exon 2
MGA-201 exon 22 -> NUTM1-201 exon 5
MGA-214 exon 22 -> NUTM1-203 exon 3
MGA-214 exon 22 -> NUTM1-203 exon 6
MGA-214 exon 22 -> NUTM1-204 exon 3
MGA-214 exon 22 -> NUTM1-204 exon 6
MGA-214 exon 22 -> NUTM1-202 exon 2
MGA-214 exon 22 -> NUTM1-202 exon 5
MGA-214 exon 22 -> NUTM1-201 exon 2
MGA-214 exon 22 -> NUTM1-201 exon 5
BRD4-201 exon 10 -> NUTM1-203 exon 3
BRD4-201 exon 10 -> NUTM1-203 exon 6
BRD4-201 exon 10 -> NUTM1-204 exon 3
BRD4-201 exon 10 -> NUTM1

In [65]:
def translate(seq):
      
    table = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
    }
    protein = ""
    if len(seq)%3 == 0:
        for i in range(0, len(seq), 3):
            codon = seq[i:i + 3]
            protein+= table[codon]
    return protein

In [72]:
fused_coding_sequences = {}
fused_proteins = {}
breakpoint_offset = {}
for (k1, k2) in valid_pairs:
    (p5_name, p5_exon), (p3_name, p3_exon) = k1, k2
    p5_t = ensembl_grch38.transcripts_by_name(p5_name)[0]
    p5_full_cds = p5_t.coding_sequence
    p5_cds_length = name_and_exon_to_cds_length[(p5_name, p5_exon)]
    p5_stop_offset = min(p5_t.stop_codon_spliced_offsets)
    
    print("Truncating %s CDS (length %d, stop at %d) to exon %d (length %d)" % (
        p5_name,
        len(p5_full_cds),
        p5_stop_offset,
        p5_exon,
        p5_cds_length,
    ))
    if (p5_exon >= len(p5_t.exons)) or (p5_cds_length > p5_stop_offset):
        print("!!! Skipping %s" % (p5_name))
        continue
    
    p5_cds = p5_full_cds[:p5_cds_length]
    p3_t = ensembl_grch38.transcripts_by_name(p3_name)[0]
    p3_full_cds = p3_t.coding_sequence
    p3_cds_length_before = name_and_exon_to_cds_length[(p3_name, p3_exon - 1)]
    p3_cds = p3_full_cds[p3_cds_length_before:]
    print("Starting %s CDS (length %d) at exon %d (length %d)" % (
        p3_name,
        len(p3_full_cds),
        p3_exon,
        len(p3_cds)
    ))

    combined_cds = p5_cds + p3_cds
    assert len(combined_cds) % 3 == 0
    fused_coding_sequences[(k1, k2)] = combined_cds
    breakpoint_offset[(k1, k2)] = len(p5_cds)
    fused_proteins[(k1, k2)] = translate(combined_cds)
    
    

Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-203 CDS (length 3483) at exon 3 (length 3383)
Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-203 CDS (length 3483) at exon 6 (length 2408)
Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-204 CDS (length 3399) at exon 3 (length 3383)
Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-204 CDS (length 3399) at exon 6 (length 2408)
Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-202 CDS (length 3453) at exon 2 (length 3383)
Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-202 CDS (length 3453) at exon 5 (length 2408)
Truncating MGA-207 CDS (length 8571, stop at 8764) to exon 22 (length 7294)
Starting NUTM1-201 CDS (length 3399) at exon 2 (length 3383)
Truncating MGA-207 CDS (length 8571, stop

In [73]:
import pandas as pd
from collections import defaultdict
columns = [
    "upstream_gene",
    "upstream_transcript",
    "upstream_last_exon",
    "downstream_gene",
    "downstream_transcript",
    "downstream_first_exon",
    "upstream_cds",
    "downstream_cds",
    "protein_upstream",
    "protein_junction",
    "protein_downstream",
    "junction_creates_mutant_residue",
]
data = defaultdict(list)
for (k1, k2), cds in fused_coding_sequences.items():
    (upstream_transcript_name, upstream_exon), (downstream_transcript_name, downstream_exon) = k1, k2
    upstream_transcript = ensembl_grch38.transcripts_by_name(upstream_transcript_name)[0]
    upstream_gene = upstream_transcript.gene_name
    downstream_transcript = ensembl_grch38.transcripts_by_name(downstream_transcript_name)[0]
    downstream_gene = downstream_transcript.gene_name

    breakpoint = breakpoint_offset[(k1, k2)]
    protein = fused_proteins[(k1, k2)]
    data["upstream_gene"].append(upstream_gene)
    data["upstream_transcript"].append(upstream_transcript.name)
    data["upstream_last_exon"].append(upstream_exon)
    data["downstream_gene"].append(downstream_gene)
    data["downstream_transcript"].append(downstream_transcript.name)
    data["downstream_first_exon"].append(downstream_exon)
    data["upstream_cds"].append(cds[:breakpoint])
    data["downstream_cds"].append(cds[breakpoint:])
    if breakpoint % 3 == 0:
        data["junction_inside_codon"].append(False)
        data["protein_upstream"].append(protein[:breakpoint // 3])
        data["protein_junction"].append("")
        data["protein_downstream"].append(protein[breakpoint//3:])
        data["junction_same_upstream"].append(True)
        data["junction_same_downstream"].append(True)
        data["junction_same_in_both"].append(True)
        data["junction_same_in_either"].append(True)
        data["junction_mutant_residue"].append(False)
 
        
    else:
        data["junction_inside_codon"].append(True)
        data["protein_upstream"].append(protein[:breakpoint // 3])
        junction_aa = protein[breakpoint // 3]
        data["protein_junction"].append(junction_aa)
        data["protein_downstream"].append(protein[breakpoint // 3 + 1:])
        upstream_protein_sequence = upstream_transcript.protein_sequence
        downstream_protein_sequence = downstream_transcript.protein_sequence
        p3_name, p3_exon = k2
        cds_length_before_downstream_breakpoint = name_and_exon_to_cds_length[(p3_name, p3_exon - 1)]
        same_upstream = (junction_aa == upstream_protein_sequence[breakpoint//3])
        same_downstream = (junction_aa == downstream_protein_sequence[cds_length_before_downstream_breakpoint // 3])
        data["junction_same_upstream"].append(same_upstream)
        data["junction_same_downstream"].append(same_downstream)
        data["junction_same_in_both"].append(same_upstream and same_downstream)
        data["junction_same_in_either"].append(same_upstream or same_downstream)
        data["junction_mutant_residue"].append(not (same_upstream or same_downstream))
        
        
    
df = pd.DataFrame(data)

df

Unnamed: 0,upstream_gene,upstream_transcript,upstream_last_exon,downstream_gene,downstream_transcript,downstream_first_exon,upstream_cds,downstream_cds,junction_inside_codon,protein_upstream,protein_junction,protein_downstream,junction_same_upstream,junction_same_downstream,junction_same_in_both,junction_same_in_either,junction_mutant_residue
0,MGA,MGA-207,22,NUTM1,NUTM1-203,3,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
1,MGA,MGA-207,22,NUTM1,NUTM1-203,6,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
2,MGA,MGA-207,22,NUTM1,NUTM1-204,3,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
3,MGA,MGA-207,22,NUTM1,NUTM1-204,6,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
4,MGA,MGA-207,22,NUTM1,NUTM1-202,2,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,NSD3,NSD3-201,7,NUTM1,NUTM1-204,6,ATGGATTTCTCTTTCTCTTTCATGCAAGGGATCATGGGAAACACAA...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MDFSFSFMQGIMGNTIQQPPQLIDSANIRQEDAFDNNSDIAEDGGQ...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
464,NSD3,NSD3-201,7,NUTM1,NUTM1-202,2,ATGGATTTCTCTTTCTCTTTCATGCAAGGGATCATGGGAAACACAA...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MDFSFSFMQGIMGNTIQQPPQLIDSANIRQEDAFDNNSDIAEDGGQ...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
465,NSD3,NSD3-201,7,NUTM1,NUTM1-202,5,ATGGATTTCTCTTTCTCTTTCATGCAAGGGATCATGGGAAACACAA...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MDFSFSFMQGIMGNTIQQPPQLIDSANIRQEDAFDNNSDIAEDGGQ...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
466,NSD3,NSD3-201,7,NUTM1,NUTM1-201,2,ATGGATTTCTCTTTCTCTTTCATGCAAGGGATCATGGGAAACACAA...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MDFSFSFMQGIMGNTIQQPPQLIDSANIRQEDAFDNNSDIAEDGGQ...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False


In [74]:
df_short = pd.DataFrame()
df_short["upstream_transcript"] = df["upstream_transcript"]
df_short["upstream_exon"] = df["upstream_last_exon"]
df_short["downstream_transcript"] = df["downstream_transcript"]
df_short["downstream_exon"] = df["downstream_first_exon"]

df_short["protein_upstream"] = df['protein_upstream'].str.slice(-10, None)
df_short["protein_junction"] = df['protein_junction']
df_short["protein_downstream"] = df['protein_downstream'].str.slice(0, 10)
df_short["inside_codon"] = df["junction_inside_codon"]
df_short["same_upstream"] = df["junction_same_upstream"]
df_short["same_downstream"] = df["junction_same_downstream"]
df_short["junction_mutant"] = df["junction_mutant_residue"]
df_short["protein_combined"] = [
    row.protein_upstream + row.protein_junction + row.protein_downstream
    if row.inside_codon else row.protein_upstream +  row.protein_downstream
    for  (_, row)  in df_short.iterrows()
];
df_short

Unnamed: 0,upstream_transcript,upstream_exon,downstream_transcript,downstream_exon,protein_upstream,protein_junction,protein_downstream,inside_codon,same_upstream,same_downstream,junction_mutant,protein_combined
0,MGA-207,22,NUTM1-203,3,QVAGSAVALP,A,SALPGPDMSM,True,False,True,False,QVAGSAVALPASALPGPDMSM
1,MGA-207,22,NUTM1-203,6,QVAGSAVALP,V,YIPKKAASKT,True,False,True,False,QVAGSAVALPVYIPKKAASKT
2,MGA-207,22,NUTM1-204,3,QVAGSAVALP,A,SALPGPDMSM,True,False,True,False,QVAGSAVALPASALPGPDMSM
3,MGA-207,22,NUTM1-204,6,QVAGSAVALP,V,YIPKKAASKT,True,False,True,False,QVAGSAVALPVYIPKKAASKT
4,MGA-207,22,NUTM1-202,2,QVAGSAVALP,A,SALPGPDMSM,True,False,True,False,QVAGSAVALPASALPGPDMSM
...,...,...,...,...,...,...,...,...,...,...,...,...
463,NSD3-201,7,NUTM1-204,6,SSPEATSGST,V,YIPKKAASKT,True,False,True,False,SSPEATSGSTVYIPKKAASKT
464,NSD3-201,7,NUTM1-202,2,SSPEATSGST,A,SALPGPDMSM,True,False,True,False,SSPEATSGSTASALPGPDMSM
465,NSD3-201,7,NUTM1-202,5,SSPEATSGST,V,YIPKKAASKT,True,False,True,False,SSPEATSGSTVYIPKKAASKT
466,NSD3-201,7,NUTM1-201,2,SSPEATSGST,A,SALPGPDMSM,True,False,True,False,SSPEATSGSTASALPGPDMSM


In [75]:
df.to_csv("nutm1-fusion-sequences.csv", index=True)

In [76]:
df_short.to_csv("nutm1-fusion-sequences-compact.csv", index=True)

In [77]:
for p, g, in df_short.groupby("protein_combined"):
    print(p, g)

EGDIFTFDRTASALPGPDMSM     upstream_transcript  upstream_exon downstream_transcript  downstream_exon  \
232             CIC-201             18             NUTM1-203                3   
234             CIC-201             18             NUTM1-204                3   
236             CIC-201             18             NUTM1-202                2   
238             CIC-201             18             NUTM1-201                2   
240             CIC-206             18             NUTM1-203                3   
242             CIC-206             18             NUTM1-204                3   
244             CIC-206             18             NUTM1-202                2   
246             CIC-206             18             NUTM1-201                2   
248             CIC-203             19             NUTM1-203                3   
250             CIC-203             19             NUTM1-204                3   
252             CIC-203             19             NUTM1-202                2   
254   

In [241]:
name_to_transcript["BRD4"].protein_sequence

'MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPPPPETSNPNKPKRQTNQLQYLLRVVLKTLWKHQFAWPFQQPVDAVKLNLPDYYKIIKTPMDMGTIKKRLENNYYWNAQECIQDFNTMFTNCYIYNKPGDDIVLMAEALEKLFLQKINELPTEETEIMIVQAKGRGRGRKETGTAKPGVSTVPNTTQASTPPQTQTPQPNPPPVQATPHPFPAVTPDLIVQTPVMTVVPPQPLQTPPPVPPQPQPPPAPAPQPVQSHPPIIAATPQPVKTKKGVKRKADTTTPTTIDPIHEPPSLPPEPKTTKLGQRRESSRPVKPPKKDVPDSQQHPAPEKSSKVSEQLKCCSGILKEMFAKKHAAYAWPFYKPVDVEALGLHDYCDIIKHPMDMSTIKSKLEAREYRDAQEFGADVRLMFSNCYKYNPPDHEVVAMARKLQDVFEMRFAKMPDEPEEPVVAVSSPAVPPPTKVVAPPSSSDSSSDSSSDSDSSTDDSEEERAQRLAELQEQLKAVHEQLAALSQPQQNKPKKKEKDKKEKKKEKHKRKEEVEENKKSKAKEPPPKKTKKNNSSNSNVSKKEPAPMKSKPPPTYESEEEDKCKPMSYEEKRQLSLDINKLPGEKLGRVVHIIQSREPSLKNSNPDEIEIDFETLKPSTLRELERYVTSCLRKKRKPQAEKVDVIAGSSKMKGFSSSESESSSESSSSDSEDSETEMAPKSKKKGHPGREQKKHHHHHHQQMQQAPAPVPQQPPPPPQQPPPPPPPQQQQQPPPPPPPPSMPQQAAPAMKSSPPPFIATQVPVLEPQLPGSVFDPIGHFTQPILHLPQPELPPHLPQPPEHSTPPHLNQHAVVSPPALHNALPQQPSRPSNRAAALPPKPARPPAVSPALTQTPLLPQPPMAQPPQVLLEDEEPPAPPLTSMQMQLYLQQLQKVQPPTPLLPSVKVQSQPPPPLPPPPHPSVQQQLQQQPPPPPPPQPQPPPQQQHQPPPRPVHL

In [251]:
name_to_transcript["NUTM1"].protein_sequence[6:]

'PGPDCLILEASRQPQLVPKPERMASDGASALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMPSVFSPDNPLMLSAFPSSLLVTGDGGPCLSGAGAGKVIVKVKTEGGSAEPSQTQNFILTQTALNSTAPGTPCGGLEGPAPPFVTASNVKTILPSKAVGVSQEGPPGLPPQPPPPVAQLVPIVPLEKAWPGPHGTTGEGGPVATLSKPSLGDRSKISKDVYENFRQWQRYKALARRHLSQSPDTEALSCFLIPVLRSLARLKPTMTLEEGLPLAVQEWEHTSNFDRMIFYEMAERFMEFEAEEMQIQNTQLMNGSQGLSPATPLKLDPLGPLASEVCQQPVYIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEWLVGTHLATGESDGKQEEEGQQQEEEGMYPDPGLLSYINELCSQKVFVSKVEAVIHPQFLADLLSPEKQRDPLALIEELEQEEGLTLAQLVQKRLMALEEEEDAEAPPSFSGAQLDSSPSGSVEDEDGDGRLRPSPGLQGAGGAACLGKVSSSGKRAREVHGGQEQALDSPRGMHRDGNTLPSPSSWDLQPELAAPQGTPGPLGVERRGSGKVINQVSLHQDGHLGGAGPPGHCLVADRTSEALPLCWQGGFQPESTPSLDAGLAELAPLQGQGLEKQVLGLQKGQQTGGRGVLPQGKEPLAVPWEGSSGAMWGDDRGTPMAQSYDQNPSPRAAGERDDVCLSPGVWLSSEMDAVGLELPVQIEEVIESFQVEKCVTEYQEGCQGLGSRGNISLGPGETLVPGDTESSVIPCGGTVAAAALEKRNYCSLPGPLRANSPPLRSKENQEQSCETVGHPSDLWAEGCFPLLESGDSTLGSSKETLPPTCQGNLLIMGTEDASSLPEASQEAGSRGNSFSPLLETIEPVNILDVKDDCGLQLRVSEDTCPLNVHSYDPQGEGRVDPDLSKPKNLAPLQESQESYTTGTPKATSSHQGLGSTLPRR

In [249]:
df_short[df_short.protein_upstream.str.slice(-2, None) == "ST"]

Unnamed: 0,upstream_transcript,upstream_exon,downstream_transcript,downstream_exon,protein_upstream,protein_junction,protein_downstream,inside_codon,same_upstream,same_downstream,junction_mutant
9,BRD4-213,14,NUTM1-203,3,RHHKSDPYST,A,SALPGPDMSM,True,False,True,False
10,BRD4-213,14,NUTM1-203,6,RHHKSDPYST,V,YIPKKAASKT,True,False,True,False
19,CIC-209,18,NUTM1-203,3,SYRKKRKNST,A,SALPGPDMSM,True,False,True,False
20,CIC-209,18,NUTM1-203,6,SYRKKRKNST,V,YIPKKAASKT,True,False,True,False
29,NSD3-202,7,NUTM1-203,3,SSPEATSGST,A,SALPGPDMSM,True,False,True,False
30,NSD3-202,7,NUTM1-203,6,SSPEATSGST,V,YIPKKAASKT,True,False,True,False


In [250]:
df

Unnamed: 0,upstream_gene,upstream_transcript,upstream_last_exon,downstream_gene,downstream_transcript,downstream_first_exon,upstream_cds,downstream_cds,junction_inside_codon,protein_upstream,protein_junction,protein_downstream,junction_same_upstream,junction_same_downstream,junction_same_in_both,junction_same_in_either,junction_mutant_residue
0,MGA,MGA-207,22,NUTM1,NUTM1-203,3,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
1,MGA,MGA-207,22,NUTM1,NUTM1-203,6,ATGGAGGAGAAACAGCAGATTATATTGGCTAATCAAGATGGTGGAA...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MEEKQQIILANQDGGTVAGAAPTFFVILKQPGNGKTDQGILVTNQD...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
2,BRD4,BRD4-213,10,NUTM1,NUTM1-203,3,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,True,True,True,True,False
3,BRD4,BRD4-213,10,NUTM1,NUTM1-203,6,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
4,BRD4,BRD4-213,11,NUTM1,NUTM1-203,3,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
5,BRD4,BRD4-213,11,NUTM1,NUTM1-203,6,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
6,BRD4,BRD4-213,12,NUTM1,NUTM1-203,2,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,GTTACTCTGGGTCCTGGACCTGACTGCCTCATTCTGGAGGCTTCCA...,False,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,,VTLGPGPDCLILEASRQPQLVPKPERMASDGASALPGPDMSMKPSA...,True,True,True,True,False
7,BRD4,BRD4-213,13,NUTM1,NUTM1-203,3,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,True,True,True,True,False
8,BRD4,BRD4-213,13,NUTM1,NUTM1-203,6,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,TGTACATTCCGAAGAAGGCAGCCTCCAAGACACGGGCCCCCCGCCG...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,V,YIPKKAASKTRAPRRRQRKAQRPPAPEAPKEIPPEAVKEYVDIMEW...,False,True,False,True,False
9,BRD4,BRD4-213,14,NUTM1,NUTM1-203,3,ATGTCTGCGGAGAGCGGCCCTGGGACGAGATTGAGAAATCTGCCAG...,CATCTGCATTGCCGGGACCGGATATGAGCATGAAACCTAGTGCCGC...,True,MSAESGPGTRLRNLPVMGDGLETSQMSTTQAQAQPQPANAASTNPP...,A,SALPGPDMSMKPSAAPSPSPALPFLPPTSDPPDHPPREPPPQPIMP...,False,True,False,True,False
