In [1]:
import pandas as pd
from glob import glob
import processor
import bioframe as bf
import numpy as np
import bioframe.vis
import matplotlib.pyplot as plt

In [2]:
# Input data directory and the directory holding this notebook's intermediate files.
# Both are combined with file names by plain string concatenation below,
# so the trailing '/' is required.
# NOTE(review): data_path is an absolute, cluster-specific path — consider making it configurable.
data_path = '/scratch/groups/horence/angelika/EM_expression/data/'
processed_path = './processed_files/'

In [3]:
# List everything directly under the data directory.
glob(data_path+'*')

['/scratch/groups/horence/angelika/EM_expression/data/iso',
 '/scratch/groups/horence/angelika/EM_expression/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_exon_reads.parquet',
 '/scratch/groups/horence/angelika/EM_expression/data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct.gz',
 '/scratch/groups/horence/angelika/EM_expression/data/gencode.v38.annotation.gff3.gz',
 '/scratch/groups/horence/angelika/EM_expression/data/GTEx_Analysis_2017-06-05_v8_STARv2.5.3a_junctions.gct.gz']

In [4]:
# Load the GENCODE v38 GFF3 annotation with bioframe's 'gff' schema.
# skiprows=7 skips the first seven lines of the file
# (presumably the GFF3 header comments — TODO confirm against the file).
genecode_path = data_path + 'gencode.v38.annotation.gff3.gz'
genecode = bf.read_table(genecode_path, schema='gff', skiprows=7)

# Drop rows that have no start coordinate (presumably non-feature lines such as
# '###' separators — TODO confirm), then cast both coordinates to int.
# Filtering and casting in one expression yields a fresh frame, which avoids
# assigning columns on a filtered slice (SettingWithCopyWarning).
genecode = (
    genecode
    .loc[genecode.start.notna()]
    .astype({'start': int, 'end': int})
)

genecode.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attributes
0,chr1,HAVANA,gene,11869,14409,.,+,.,ID=ENSG00000223972.5;gene_id=ENSG00000223972.5...
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.5;...
2,chr1,HAVANA,exon,11869,12227,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...
3,chr1,HAVANA,exon,12613,12721,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...
4,chr1,HAVANA,exon,13221,14409,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...


In [5]:
# List the files currently present in the processed-files directory.
glob(processed_path+'*')

['./processed_files/genes_filtered.tsv']

In [6]:
# Load the tab-separated gene table from the processed-files directory.
genes = pd.read_csv(processed_path+'genes_filtered.tsv', sep='\t')

In [7]:
# Preview the first rows of the gene table.
genes.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attributes,Description,Name,Parent,counts
0,chr1,HAVANA,gene,141474,173862,.,-,.,ID=ENSG00000241860.7;gene_id=ENSG00000241860.7...,RP11-34P13.13,ENSG00000241860.7,ENSG00000241860.7,6
1,chr1,HAVANA,gene,257864,359681,.,-,.,ID=ENSG00000228463.10;gene_id=ENSG00000228463....,AP006222.1,ENSG00000228463.10,ENSG00000228463.10,6
2,chr1,HAVANA,gene,365389,522928,.,-,.,ID=ENSG00000237094.12;gene_id=ENSG00000237094....,RP4-669L17.4,ENSG00000237094.12,ENSG00000237094.12,25
3,chr1,HAVANA,gene,586071,827796,.,-,.,ID=ENSG00000230021.10;gene_id=ENSG00000230021....,RP11-206L10.17,ENSG00000230021.10,ENSG00000230021.10,15
4,chr1,HAVANA,gene,778747,810065,.,+,.,ID=ENSG00000237491.10;gene_id=ENSG00000237491....,LINC01409,ENSG00000237491.10,ENSG00000237491.10,32


In [8]:
# Pull the 'transcript' records out of the annotation via the project helper,
# then keep only those whose Parent (gene ID) occurs in genes.Name.
# The inner join acts as the filter; the helper 'Name' column is dropped afterwards.
transcripts = processor.get_feature(genecode, 'transcript')
transcripts = (
    pd.merge(transcripts, genes['Name'], left_on='Parent', right_on='Name')
    .drop(columns='Name')
)

In [9]:
# Preview the filtered transcript table.
transcripts.head()

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attributes,Parent,tID
0,chr1,HAVANA,transcript,141474,149707,.,-,.,ID=ENST00000484859.1;Parent=ENSG00000241860.7;...,ENSG00000241860.7,ENST00000484859.1
1,chr1,HAVANA,transcript,142808,146831,.,-,.,ID=ENST00000490997.5;Parent=ENSG00000241860.7;...,ENSG00000241860.7,ENST00000490997.5
2,chr1,HAVANA,transcript,146386,173862,.,-,.,ID=ENST00000466557.6;Parent=ENSG00000241860.7;...,ENSG00000241860.7,ENST00000466557.6
3,chr1,HAVANA,transcript,165491,169210,.,-,.,ID=ENST00000662089.1;Parent=ENSG00000241860.7;...,ENSG00000241860.7,ENST00000662089.1
4,chr1,HAVANA,transcript,165889,168767,.,-,.,ID=ENST00000491962.1;Parent=ENSG00000241860.7;...,ENSG00000241860.7,ENST00000491962.1


In [10]:
# Pull the 'exon' records out of the annotation via the project helper,
# then keep only exons whose Parent (transcript ID) occurs in transcripts.tID.
# The inner join acts as the filter; the helper 'tID' column is dropped afterwards.
exons = processor.get_feature(genecode, 'exon')
exons = (
    pd.merge(exons, transcripts['tID'], left_on='Parent', right_on='tID')
    .drop(columns='tID')
)

In [11]:
def get_exons(gene, transcripts, exons):
    """Collect the exon rows of every transcript belonging to one gene.

    Parameters
    ----------
    gene
        Value compared against ``transcripts.Parent`` to pick the gene's transcripts.
    transcripts : pandas.DataFrame
        Needs ``Parent`` and ``tID`` columns.
    exons : pandas.DataFrame
        Needs a ``Parent`` column that is compared against transcript IDs.

    Returns
    -------
    list of pandas.DataFrame
        One frame per matching transcript row, in the row order of
        ``transcripts``. Each frame holds the first five columns of the
        matching ``exons`` rows (assumed to be chrom, source, feature, start,
        end — TODO confirm against ``processor.get_feature``). A transcript
        with no exons yields an empty frame; a gene with no transcripts
        yields an empty list.
    """
    # Iterate the ID column directly; iterrows() would build a full Series
    # per row just to read this one field.
    transcript_ids = transcripts.loc[transcripts.Parent == gene, 'tID']
    return [exons.loc[exons.Parent == tid].iloc[:, :5] for tid in transcript_ids]
        

In [12]:
def build_matrix(exons_list):
    """Build a 0/1 interval-by-transcript membership matrix.

    All frames in ``exons_list`` are concatenated and merged with
    ``bf.merge(..., min_dist=0)`` into a set of merged intervals (the rows).
    For the i-th frame, a column ``exon_i`` is added that is 1 where the
    merged interval overlaps at least one interval of that frame and 0
    otherwise.

    Parameters
    ----------
    exons_list : list of pandas.DataFrame
        Interval frames accepted by bioframe (chrom/start/end columns),
        e.g. the output of ``get_exons``.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``start``, ``end`` followed by ``exon_0`` ...
        ``exon_{n-1}`` (integer 0/1). For an empty ``exons_list`` an empty
        frame with only the three interval columns is returned.
    """
    interval_cols = ['chrom', 'start', 'end']

    # Nothing to merge: pd.concat([]) would raise, so return an empty matrix.
    if len(exons_list) == 0:
        return pd.DataFrame(columns=interval_cols)

    matrix = bf.merge(pd.concat(exons_list), min_dist=0).drop('n_intervals', axis=1)

    for i, exon_df in enumerate(exons_list):
        # Relies on bf.overlap's defaults: unmatched rows of `matrix` are kept
        # with missing values, and columns of `exon_df` get a '_' suffix.
        hits = bf.overlap(matrix, exon_df)

        # A merged interval can overlap several exons of the same transcript;
        # keep one row per interval. The full (chrom, start, end) key is used
        # so intervals on different chromosomes sharing a start are not lost.
        hits = hits.loc[~hits.duplicated(subset=interval_cols)]

        # A non-missing 'start_' means the interval overlapped something.
        overlapped = hits['start_'].notna()

        # Keep the interval columns plus the exon_* columns built so far
        # (the first 3 + i columns), then append this transcript's column.
        # .copy() gives an independent frame so the assignment is unambiguous.
        matrix = hits.iloc[:, :3 + i].copy()
        matrix['exon_' + str(i)] = overlapped.astype(int)

    return matrix

In [41]:
# Show the rows of the gene table that lie on chr2.
bf.select(genes, 'chr2')

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attributes,Description,Name,Parent,counts
1109,chr2,HAVANA,gene,217730,266398,.,-,.,ID=ENSG00000035115.22;gene_id=ENSG00000035115....,SH3YL1,ENSG00000035115.22,ENSG00000035115.22,29
1110,chr2,HAVANA,gene,264140,278283,.,+,.,ID=ENSG00000143727.16;gene_id=ENSG00000143727....,ACP1,ENSG00000143727.16,ENSG00000143727.16,13
1111,chr2,HAVANA,gene,279558,288851,.,-,.,ID=ENSG00000189292.16;gene_id=ENSG00000189292....,ALKAL2,ENSG00000189292.16,ENSG00000189292.16,8
1112,chr2,HAVANA,gene,290941,314373,.,-,.,ID=ENSG00000235779.8;gene_id=ENSG00000235779.8...,AC079779.5,ENSG00000235779.8,ENSG00000235779.8,39
1113,chr2,HAVANA,gene,558153,578145,.,+,.,ID=ENSG00000233633.2;gene_id=ENSG00000233633.2...,AC093326.3,ENSG00000233633.2,ENSG00000233633.2,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,chr2,HAVANA,gene,241675747,241686944,.,-,.,ID=ENSG00000168393.13;gene_id=ENSG00000168393....,DTYMK,ENSG00000168393.13,ENSG00000168393.13,7
1995,chr2,HAVANA,gene,241687085,241729478,.,+,.,ID=ENSG00000168395.16;gene_id=ENSG00000168395....,ING5,ENSG00000168395.16,ENSG00000168395.16,13
1996,chr2,HAVANA,gene,241734602,241768816,.,+,.,ID=ENSG00000180902.18;gene_id=ENSG00000180902....,D2HGDH,ENSG00000180902.18,ENSG00000180902.18,16
1997,chr2,HAVANA,gene,241809065,241817413,.,+,.,ID=ENSG00000204099.12;gene_id=ENSG00000204099....,NEU4,ENSG00000204099.12,ENSG00000204099.12,17


In [42]:
# Gene ID used as the worked example for the cells below.
# NOTE(review): the execution counts show this cell (In [42]) was run after the
# cells below it (In [37]-In [40]), and the saved matrix output lists chr14
# intervals although this gene is on chr2 in `genes`. The saved outputs appear
# to come from a different gene — restart the kernel and run all cells to refresh.
dummy_gene = 'ENSG00000180902.18'

In [37]:
# One exon frame per transcript of the example gene.
exons_list = get_exons(dummy_gene, transcripts, exons)

In [38]:
# Merged-interval x transcript 0/1 matrix for the example gene.
exons_matrix = build_matrix(exons_list)

In [39]:
# Display the matrix: one row per merged interval, one exon_<i> column per transcript.
exons_matrix

Unnamed: 0,chrom,start,end,exon_0,exon_1,exon_2,exon_3,exon_4,exon_5,exon_6,exon_7,exon_8,exon_9,exon_10
0,chr14,20343615,20343687,1,1,1,1,0,0,0,0,0,0,0
1,chr14,20344932,20345126,1,1,1,1,0,0,0,0,0,0,0
2,chr14,20345394,20345464,1,1,1,1,0,0,0,0,0,0,0
3,chr14,20346863,20346913,1,1,1,1,0,0,0,0,0,0,0
4,chr14,20350526,20350622,1,0,1,1,0,0,0,0,0,0,0
5,chr14,20350699,20351122,1,0,1,1,1,0,0,0,0,0,0
6,chr14,20352245,20352347,1,1,1,1,1,0,0,0,0,0,0
7,chr14,20352513,20353003,0,0,0,0,0,1,0,0,0,0,0
8,chr14,20354085,20354247,1,1,1,1,1,1,0,0,0,0,0
9,chr14,20354642,20354947,1,1,1,1,0,0,1,0,0,0,0


In [40]:
# Save the matrix as a tab-separated file named after the gene ID,
# in the processed-files directory, without the index column.
exons_matrix.to_csv(processed_path + dummy_gene+ ".tsv", sep='\t', index = False)