In [9]:
import pandas as pd

In [10]:
df = pd.read_csv("nutm1-fusion-sequences-compact.csv");
df

Unnamed: 0.1,Unnamed: 0,upstream_transcript,upstream_exon,downstream_transcript,downstream_exon,protein_upstream,protein_junction,protein_downstream,inside_codon,same_upstream,same_downstream,junction_mutant,protein_combined
0,0,MGA-207,22,NUTM1-203,3,QVAGSAVALP,A,SALPGPDMSM,True,False,True,False,QVAGSAVALPASALPGPDMSM
1,1,MGA-207,22,NUTM1-203,6,QVAGSAVALP,V,YIPKKAASKT,True,False,True,False,QVAGSAVALPVYIPKKAASKT
2,2,MGA-207,22,NUTM1-204,3,QVAGSAVALP,A,SALPGPDMSM,True,False,True,False,QVAGSAVALPASALPGPDMSM
3,3,MGA-207,22,NUTM1-204,6,QVAGSAVALP,V,YIPKKAASKT,True,False,True,False,QVAGSAVALPVYIPKKAASKT
4,4,MGA-207,22,NUTM1-202,2,QVAGSAVALP,A,SALPGPDMSM,True,False,True,False,QVAGSAVALPASALPGPDMSM
...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,463,NSD3-201,7,NUTM1-204,6,SSPEATSGST,V,YIPKKAASKT,True,False,True,False,SSPEATSGSTVYIPKKAASKT
464,464,NSD3-201,7,NUTM1-202,2,SSPEATSGST,A,SALPGPDMSM,True,False,True,False,SSPEATSGSTASALPGPDMSM
465,465,NSD3-201,7,NUTM1-202,5,SSPEATSGST,V,YIPKKAASKT,True,False,True,False,SSPEATSGSTVYIPKKAASKT
466,466,NSD3-201,7,NUTM1-201,2,SSPEATSGST,A,SALPGPDMSM,True,False,True,False,SSPEATSGSTASALPGPDMSM


In [25]:
import numpy as np

def transcript_key(t):
    return (t.complete, -t.support_level if t.support_level else 100, len(t.protein_sequence) if t.protein_sequence else 0)

def pick_best_transcript(ts):
    return sorted(ts, key=transcript_key)[-1]

from pyensembl import ensembl_grch38
with open("nutm1-fusion-sequences.fasta", "w") as f:
    for p, group in df.groupby("protein_combined"):
        print(p, group)
        upstream_transcript_names = group["upstream_transcript"]
        upstream_exons = group["upstream_exon"]

        downstream_transcript_names = group["downstream_transcript"]
        downstream_exons = group["downstream_exon"]
        all_sources = [
            "%s-exon-%d-%s-exon-%d" % (
                ut, ue, dt, de
            ) 
            for (ut, ue, dt, de)
            in zip(upstream_transcript_names, upstream_exons, downstream_transcript_names, downstream_exons)
        ]
        concat_sources = ";".join(all_sources)
        
        downstream_transcripts = [
            ensembl_grch38.transcripts_by_name(t_name)[0]
            for t_name in downstream_transcript_names
        ]
        best_nutm1_transcript = pick_best_transcript(downstream_transcripts)
        downstream_mask = np.array([
            t.name == best_nutm1_transcript.name
            for t in downstream_transcripts
        ])
        upstream_names_subset = upstream_transcript_names[downstream_mask]
        upstream_transcripts = [
            ensembl_grch38.transcripts_by_name(t_name)[0]
            for t_name in upstream_names_subset
        ]
        best_upstream_transcript = pick_best_transcript(upstream_transcripts)
        upstream_mask = np.array([
            t_name == best_upstream_transcript.name
            for t_name in upstream_transcript_names
        ])
        combined_mask = upstream_mask & downstream_mask
        assert combined_mask.sum() == 1
       
        best_upstream_exon = upstream_exons[combined_mask].iloc[0]
        best_downstream_exon = downstream_exons[combined_mask].iloc[0]
        
        n_aa = len(p)
        line = ">%s-exon-%d-%s-exon-%d-%daa %s\n%s\n" % (
            best_upstream_transcript.name,
            best_upstream_exon,
            best_nutm1_transcript.name,
            best_downstream_exon,
            n_aa,
            concat_sources,
            p)
        print(line)
        f.write(line)

EGDIFTFDRTASALPGPDMSM      Unnamed: 0 upstream_transcript  upstream_exon downstream_transcript  \
232         232             CIC-201             18             NUTM1-203   
234         234             CIC-201             18             NUTM1-204   
236         236             CIC-201             18             NUTM1-202   
238         238             CIC-201             18             NUTM1-201   
240         240             CIC-206             18             NUTM1-203   
242         242             CIC-206             18             NUTM1-204   
244         244             CIC-206             18             NUTM1-202   
246         246             CIC-206             18             NUTM1-201   
248         248             CIC-203             19             NUTM1-203   
250         250             CIC-203             19             NUTM1-204   
252         252             CIC-203             19             NUTM1-202   
254         254             CIC-203             19             NUT

In [26]:
!cat nutm1-fusion-sequences.fasta

>CIC-209-exon-19-NUTM1-201-exon-2-21aa CIC-201-exon-18-NUTM1-203-exon-3;CIC-201-exon-18-NUTM1-204-exon-3;CIC-201-exon-18-NUTM1-202-exon-2;CIC-201-exon-18-NUTM1-201-exon-2;CIC-206-exon-18-NUTM1-203-exon-3;CIC-206-exon-18-NUTM1-204-exon-3;CIC-206-exon-18-NUTM1-202-exon-2;CIC-206-exon-18-NUTM1-201-exon-2;CIC-203-exon-19-NUTM1-203-exon-3;CIC-203-exon-19-NUTM1-204-exon-3;CIC-203-exon-19-NUTM1-202-exon-2;CIC-203-exon-19-NUTM1-201-exon-2;CIC-209-exon-19-NUTM1-203-exon-3;CIC-209-exon-19-NUTM1-204-exon-3;CIC-209-exon-19-NUTM1-202-exon-2;CIC-209-exon-19-NUTM1-201-exon-2
EGDIFTFDRTASALPGPDMSM
>CIC-209-exon-19-NUTM1-201-exon-5-21aa CIC-201-exon-18-NUTM1-203-exon-6;CIC-201-exon-18-NUTM1-204-exon-6;CIC-201-exon-18-NUTM1-202-exon-5;CIC-201-exon-18-NUTM1-201-exon-5;CIC-206-exon-18-NUTM1-203-exon-6;CIC-206-exon-18-NUTM1-204-exon-6;CIC-206-exon-18-NUTM1-202-exon-5;CIC-206-exon-18-NUTM1-201-exon-5;CIC-203-exon-19-NUTM1-203-exon-6;CIC-203-exon-19-NUTM1-204-exon-6;CIC-203-exon-19-NUTM1-202-exon-5;CIC-203

In [27]:
!cat nutm1-fusion-sequences.fasta

>CIC-209-exon-19-NUTM1-201-exon-2-21aa CIC-201-exon-18-NUTM1-203-exon-3;CIC-201-exon-18-NUTM1-204-exon-3;CIC-201-exon-18-NUTM1-202-exon-2;CIC-201-exon-18-NUTM1-201-exon-2;CIC-206-exon-18-NUTM1-203-exon-3;CIC-206-exon-18-NUTM1-204-exon-3;CIC-206-exon-18-NUTM1-202-exon-2;CIC-206-exon-18-NUTM1-201-exon-2;CIC-203-exon-19-NUTM1-203-exon-3;CIC-203-exon-19-NUTM1-204-exon-3;CIC-203-exon-19-NUTM1-202-exon-2;CIC-203-exon-19-NUTM1-201-exon-2;CIC-209-exon-19-NUTM1-203-exon-3;CIC-209-exon-19-NUTM1-204-exon-3;CIC-209-exon-19-NUTM1-202-exon-2;CIC-209-exon-19-NUTM1-201-exon-2
EGDIFTFDRTASALPGPDMSM
>CIC-209-exon-19-NUTM1-201-exon-5-21aa CIC-201-exon-18-NUTM1-203-exon-6;CIC-201-exon-18-NUTM1-204-exon-6;CIC-201-exon-18-NUTM1-202-exon-5;CIC-201-exon-18-NUTM1-201-exon-5;CIC-206-exon-18-NUTM1-203-exon-6;CIC-206-exon-18-NUTM1-204-exon-6;CIC-206-exon-18-NUTM1-202-exon-5;CIC-206-exon-18-NUTM1-201-exon-5;CIC-203-exon-19-NUTM1-203-exon-6;CIC-203-exon-19-NUTM1-204-exon-6;CIC-203-exon-19-NUTM1-202-exon-5;CIC-203