In [1]:
import pandas as pd
import numpy as np

# get tss regions
- read mm10 gtf
- extract tss

In [2]:
mm10gtf = "/cndd2/Public_Datasets/references/mm10/transcriptome/gencode.vM10.annotation_genes.gtf"

In [5]:
gtf = pd.read_csv(mm10gtf, sep ='\t', header=None)

In [6]:
gtf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC..."
1,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR..."
2,chr1,HAVANA,gene,3205901,3671498,.,-,.,"gene_id ""ENSMUSG00000051951.5""; gene_type ""pro..."
3,chr1,HAVANA,gene,3252757,3253236,.,+,.,"gene_id ""ENSMUSG00000102851.1""; gene_type ""pro..."
4,chr1,HAVANA,gene,3365731,3368549,.,-,.,"gene_id ""ENSMUSG00000103377.1""; gene_type ""TEC..."


In [10]:
# check these are all genes not transripts
gtf.shape, np.sum(gtf[2] == 'gene')

((48440, 9), 48440)

### extract mm10 gene name

In [7]:
gene_info = [i.split(';') for i in gtf[8]] # seperate elements of identification

In [8]:
gene_info[0] # check the fields of a row

['gene_id "ENSMUSG00000102693.1"',
 ' gene_type "TEC"',
 ' gene_status "KNOWN"',
 ' gene_name "4933401J01Rik"',
 ' level 2',
 ' havana_gene "OTTMUSG00000049935.1"',
 '']

In [9]:
# get ensemble id

In [11]:
gene_id = [i[0].split()[1] for i in gene_info] # get the second part of each string split by space
gene_id = [i.strip("'").strip('"') for i in gene_id] # strip away quotes
gene_id = [i.split('.')[0] for i in gene_id] # remove transcript ids

In [12]:
gtf['gene_id'] = gene_id

In [13]:
gtf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,gene_id
0,chr1,HAVANA,gene,3073253,3074322,.,+,.,"gene_id ""ENSMUSG00000102693.1""; gene_type ""TEC...",ENSMUSG00000102693
1,chr1,ENSEMBL,gene,3102016,3102125,.,+,.,"gene_id ""ENSMUSG00000064842.1""; gene_type ""snR...",ENSMUSG00000064842
2,chr1,HAVANA,gene,3205901,3671498,.,-,.,"gene_id ""ENSMUSG00000051951.5""; gene_type ""pro...",ENSMUSG00000051951
3,chr1,HAVANA,gene,3252757,3253236,.,+,.,"gene_id ""ENSMUSG00000102851.1""; gene_type ""pro...",ENSMUSG00000102851
4,chr1,HAVANA,gene,3365731,3368549,.,-,.,"gene_id ""ENSMUSG00000103377.1""; gene_type ""TEC...",ENSMUSG00000103377


### get TSS

In [20]:
def get_tss(gene_df):
    pos_strand = gene_df[6] == '+'
    neg_strand = gene_df[6] == '-'
    tss = gene_df[3]*pos_strand.astype(int) + gene_df[4]*neg_strand.astype(int) - neg_strand.astype(int)
    tss_up = gene_df[3]*pos_strand.astype(int) + pos_strand.astype(int) + gene_df[4]*neg_strand.astype(int)
    return tss, tss_up

In [21]:
tss, tss_up = get_tss(gtf)

### create bedfile of tss information

In [29]:
tss_bed = pd.DataFrame(data=np.array([gtf[0], tss.values, tss_up.values, gtf['gene_id']]).T,
                       columns=['chrom', 'start', 'end', 'gene_id'])

In [34]:
tss_bed.head()

Unnamed: 0,chrom,start,end,gene_id
0,chr1,3073253,3073254,ENSMUSG00000102693
1,chr1,3102016,3102017,ENSMUSG00000064842
2,chr1,3671497,3671498,ENSMUSG00000051951
3,chr1,3252757,3252758,ENSMUSG00000102851
4,chr1,3368548,3368549,ENSMUSG00000103377


In [37]:
tss_bed.to_csv('data/mm10_tss.bed', sep='\t', columns=None, header=None, index=None)

In [39]:
pd.read_csv('data/mm10_tss.bed', sep='\t', header=None).head()

Unnamed: 0,0,1,2,3
0,chr1,3073253,3073254,ENSMUSG00000102693
1,chr1,3102016,3102017,ENSMUSG00000064842
2,chr1,3671497,3671498,ENSMUSG00000051951
3,chr1,3252757,3252758,ENSMUSG00000102851
4,chr1,3368548,3368549,ENSMUSG00000103377


# run liftover to HG 38

In [40]:
!/cndd/bin/liftOver

liftOver - Move annotations from one assembly to another
usage:
   liftOver oldFile map.chain newFile unMapped
oldFile and newFile are in bed format by default, but can be in GFF and
maybe eventually others with the appropriate flags below.
The map.chain file has the old genome as the target and the new genome
as the query.

***********************************************************************
         assemblies of the same organism. It may not do what you want
         if you are lifting between different organisms. If there has
         been a rearrangement in one of the species, the size of the
         region being mapped may change dramatically after mapping.
***********************************************************************

options:
   -minMatch=0.N Minimum ratio of bases that must remap. Default 0.95
   -gff  File is in gff/gtf format.  Note that the gff lines are converted
         separately.  It would be good to have a separate check after this
         that the line

# liftover enhancers with less strict overlap requirement

In [41]:
!ls data

annotated_bins.tsc.gz		    mm10ToHg19.over.chain.gz
enhancer_bed.bed.gz		    mm10ToHg38.over.chain.gz
enhancer_bins.bed.gz		    mm10_tss.bed
enhancers_chromsort_slop1kb.bed.gz  promoters_lifted.bed.gz
enhancers_lifted.bed.gz		    promoter_sort.bed.gz
enhancers_unlifted.bed.gz	    promoters_unlifted.bed.gz
hi_c_bins.bed.gz		    tss_bins.bed.gz
human_genes.gz			    use_gene_bed.gz


In [44]:
!gunzip data/enhancer_bed.bed.gz

In [45]:
!/cndd/bin/liftOver -minMatch=0.8 data/enhancer_bed.bed data/mm10ToHg38.over.chain.gz enhancers_lifted.bed enahncers_unlifted.bed

Reading liftover chains
Mapping coordinates


In [46]:
!/cndd/bin/liftOver -minMatch=0.8 data/mm10_tss.bed data/mm10ToHg38.over.chain.gz tss_lifted.bed tss_unlifted.bed

Reading liftover chains
Mapping coordinates


In [47]:
!wc -l tss_lifted.bed

33814 tss_lifted.bed


In [48]:
!wc -l enhancers_lifted.bed

126134 enhancers_lifted.bed


In [49]:
!head enhancers_lifted.bed

chr8	55592914	55593008	5
chr8	55587811	55587820	7
chr8	55528337	55528612	10
chr8	55482940	55483094	17
chr8	55473255	55473498	19
chr8	55468084	55468527	20
chr8	55447899	55447923	23
chr8	55436441	55436592	24
chr8	55417801	55418057	28
chr8	55417241	55417460	29


In [50]:
! head tss_lifted.bed

chr8	55102323	55102324	ENSMUSG00000051951
chr4	83273518	83273519	ENSMUSG00000102851
chr8	55409874	55409875	ENSMUSG00000103377
chr8	55288183	55288184	ENSMUSG00000089699
chr8	55230212	55230213	ENSMUSG00000103201
chr12	68746378	68746379	ENSMUSG00000103147
chr8	54559208	54559209	ENSMUSG00000025900
chr8	54559302	54559303	ENSMUSG00000109048
chr8	54453872	54453873	ENSMUSG00000025902
chr8	54454829	54454830	ENSMUSG00000104238
