### 15 November 2019
# Bedtools intersect of transcript-merged slop bed files
### Pavlos Bousounis
***Last updated 2019-11-19***

In [1]:
from datetime import datetime
import os
import pandas as pd
import pybedtools
from pybedtools import BedTool
import re

In [4]:
# run-date stamp as a YYYY-MM-DD string
today = format(datetime.today(), '%Y-%m-%d')
today

'2019-11-19'

In [3]:
# experiment root; the two working folders below are resolved relative to it
basedir = '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery'
# slomp_dir -> merged-slop output folder, data_dir -> data folder
slomp_dir, data_dir = (
    os.path.join(basedir, subdir)
    for subdir in ('output/2019-11-18_slops_merged-by-tx', 'data')
)

In [5]:
# collect the entry names found in the merged-slop directory
# (os.listdir makes no promise about the order of the returned names)
slomp_files = [entry for entry in os.listdir(slomp_dir)]
slomp_files

['2019-11-18_RefSeqGRCh37_slop50_merged.bed',
 '2019-11-18_RefSeqGRCh37_slop45_merged.bed',
 '2019-11-18_RefSeqGRCh37_slop85_merged.bed']

In [9]:
# import merged slop bed file
# Select the file by its slop tag instead of by list position: os.listdir()
# returns names in arbitrary order, so slomp_files[1] is not guaranteed to be
# the same file on every run or machine.
slop_tag = 'slop45'
# (?!\d) stops e.g. 'slop4' from also matching 'slop45'
slop_matches = [fname for fname in slomp_files if re.search(slop_tag + r'(?!\d)', fname)]
if len(slop_matches) != 1:
    raise ValueError('expected exactly one %s file in %s, found: %r' % (slop_tag, slomp_dir, slop_matches))

# `file` is reused by later cells (BedTool input, slop tag extraction)
file = os.path.join(slomp_dir, slop_matches[0])
# tab-separated table with a header row; ordered by chromosome name, then start
df = pd.read_csv(file, sep='\t').sort_values(['chr', 'start'])

print('\n')
df.info()
print('\n')
df



<class 'pandas.core.frame.DataFrame'>
Int64Index: 696951 entries, 696948 to 3412
Data columns (total 5 columns):
chr      696951 non-null object
start    696951 non-null int64
end      696951 non-null int64
name     696951 non-null object
tx_id    696951 non-null object
dtypes: int64(2), object(3)
memory usage: 31.9+ MB




Unnamed: 0,chr,start,end,name,tx_id
696948,chr1,11829,12272,NR_046018:exon1(+),NR_046018
696949,chr1,12568,12766,NR_046018:exon2(+),NR_046018
696950,chr1,13176,14454,NR_046018:exon3(+),NR_046018
696937,chr1,14317,14874,NR_024540:exon1(-),NR_024540
696938,chr1,14925,15083,NR_024540:exon2(-),NR_024540
...,...,...,...,...,...
3415,chrY,59347190,59348341,NR_138048:exon1(-),NR_138048
3416,chrY,59349288,59349546,NR_138048:exon2(-),NR_138048
3410,chrY,59358284,59359553,NR_110561:exon1(-),NR_110561
3411,chrY,59359962,59360160,NR_110561:exon2(-),NR_110561


In [11]:
# load the gene symbol <-> RefSeq transcript accession lookup table (TSV),
# then order its rows by gene symbol
gene2refseq_id = os.path.join(basedir, 'output/2019-11-18_gene2refseq_tx.tsv')
gene2refseq = pd.read_csv(gene2refseq_id, sep='\t')
gene2refseq = gene2refseq.sort_values('gene')

# summary first, then the frame itself as the cell output
print('\n')
gene2refseq.info()
print('\n')
gene2refseq



<class 'pandas.core.frame.DataFrame'>
Int64Index: 70334 entries, 63285 to 2678
Data columns (total 2 columns):
gene     70334 non-null object
tx_id    70334 non-null object
dtypes: object(2)
memory usage: 1.6+ MB




Unnamed: 0,gene,tx_id
63285,A1BG,NR_015380
63284,A1BG,NM_130786
35850,A1CF,NM_001198819
35851,A1CF,NM_001370130
35849,A1CF,NM_001198818
...,...,...
28468,ZYX,NM_001362783
28467,ZYX,NM_003461
54563,ZZEF1,NM_015113
2679,ZZZ3,NM_015534


In [12]:
# load the ClinVar pathogenic / likely-pathogenic BED table
# (the path is kept in clinvar_bed_file because a later cell hands it to BedTool)
clinvar_bed_file = os.path.join(data_dir, '2019-11-07_ClinVar-GRCh37_path-likely_path.bed')
clinvar_bed = pd.read_csv(filepath_or_buffer=clinvar_bed_file, sep='\t')

# summary first, then the frame itself as the cell output
print('\n')
clinvar_bed.info()
print('\n')
clinvar_bed



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83696 entries, 0 to 83695
Data columns (total 4 columns):
chr      83696 non-null object
start    83696 non-null int64
end      83696 non-null int64
name     83696 non-null object
dtypes: int64(2), object(2)
memory usage: 2.6+ MB




Unnamed: 0,chr,start,end,name
0,chr1,949522,949523,183381_ISG15
1,chr1,949695,949696,161455_ISG15
2,chr1,949738,949739,161454_ISG15
3,chr1,957604,957605,243036_AGRN
4,chr1,957692,957693,243037_AGRN
...,...,...,...,...
83691,chrMT,15914,15915,690233_MT
83692,chrMT,15931,15932,143904_MT
83693,chrMT,15966,15967,9572_MT
83694,chrMT,15989,15990,9570_MT


In [14]:
# gene symbol = second '_'-separated token of the name field (e.g. '183381_ISG15' -> 'ISG15')
name_tokens = clinvar_bed['name'].str.split('_')
clinvar_bed['gene'] = name_tokens.str.get(1)
# per-row count of ClinVar records sharing that row's gene symbol
clinvar_bed['gene_recovery'] = clinvar_bed.groupby('gene')['gene'].transform('count')
# collapse to one row per (gene, count) pair, ordered by gene symbol
gene_recovery = (
    clinvar_bed.loc[:, ['gene', 'gene_recovery']]
    .drop_duplicates()
    .sort_values('gene')
)

print('\n')
gene_recovery.info()
print('\n')
gene_recovery



<class 'pandas.core.frame.DataFrame'>
Int64Index: 3997 entries, 43292 to 20935
Data columns (total 2 columns):
gene             3997 non-null object
gene_recovery    3997 non-null int64
dtypes: int64(1), object(1)
memory usage: 93.7+ KB




Unnamed: 0,gene,gene_recovery
43292,A2ML1,2
75895,A4GALT,1
44700,AAAS,19
55506,AAGAB,8
73388,AAR2,1
...,...,...
63009,ZNHIT3,1
39927,ZP1,4
58089,ZP2,3
27944,ZP3,2


In [15]:
# bedtools intersect -c: for every interval of the merged slop file, count the
# overlapping ClinVar records
slop_bedtool = BedTool(file)
cv_bedtool = BedTool(clinvar_bed_file)
ovl_counts = slop_bedtool.intersect(b=cv_bedtool, c=True)
ovl = ovl_counts.to_dataframe()
# the trailing column is the overlap count appended by -c
ovl.columns = ['chr', 'start', 'end', 'exon_name', 'tx_id', 'slop_recovery_ex']
# keep only intervals with a non-zero overlap count, ordered by position
has_overlap = ovl['slop_recovery_ex'].ne(0)
ovl = ovl.loc[has_overlap].sort_values(['chr', 'start'])

print('\n')
ovl.info()
print('\n')
ovl



<class 'pandas.core.frame.DataFrame'>
Int64Index: 73855 entries, 696772 to 3219
Data columns (total 6 columns):
chr                 73855 non-null object
start               73855 non-null int64
end                 73855 non-null int64
exon_name           73855 non-null object
tx_id               73855 non-null object
slop_recovery_ex    73855 non-null int64
dtypes: int64(3), object(3)
memory usage: 3.9+ MB




Unnamed: 0,chr,start,end,exon_name,tx_id,slop_recovery_ex
696772,chr1,949319,949965,NM_005101:exon2(+),NM_005101,3
696719,chr1,957536,957887,NM_198576:exon2(+),NM_198576,2
696744,chr1,957536,957887,NM_001305275:exon2(+),NM_001305275,2
696696,chr1,976508,977127,"NM_001364727:exon4(+),NM_001364727:exon5(+)",NM_001364727,4
696722,chr1,976508,977127,"NM_198576:exon5(+),NM_198576:exon6(+)",NM_198576,4
...,...,...,...,...,...,...
3546,chrX,154754072,154754338,NM_018196:exon6(-),NM_018196,1
3403,chrY,2654851,2655768,NM_003140:exon1(-),NM_003140,16
3183,chrY,6931893,6932235,NM_033284:exon8(+),NM_033284,1
3201,chrY,6931893,6932235,NM_134258:exon7(+),NM_134258,1


In [16]:
# add gene names and gene recovery values to intersected table
# This cell rebinds `ovl`, so make it safe to re-run: if 'gene'/'gene_recovery'
# are already present from an earlier execution, the first merge would produce
# 'gene_x'/'gene_y' and the second merge would fail on the missing 'gene' key.
# Dropping them first is a no-op on a fresh run (errors='ignore').
ovl = (
    ovl.drop(columns=['gene', 'gene_recovery'], errors='ignore')
    # tx_id -> gene symbol
    .merge(gene2refseq, how='left', on='tx_id')
    # gene symbol -> per-gene ClinVar record count; left join, so genes absent
    # from gene_recovery end up with NaN in 'gene_recovery'
    .merge(gene_recovery, how='left', on='gene')
)

print('\n')
ovl.info()
print('\n')
ovl



<class 'pandas.core.frame.DataFrame'>
Int64Index: 73855 entries, 0 to 73854
Data columns (total 8 columns):
chr                 73855 non-null object
start               73855 non-null int64
end                 73855 non-null int64
exon_name           73855 non-null object
tx_id               73855 non-null object
slop_recovery_ex    73855 non-null int64
gene                73855 non-null object
gene_recovery       73383 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 5.1+ MB




Unnamed: 0,chr,start,end,exon_name,tx_id,slop_recovery_ex,gene,gene_recovery
0,chr1,949319,949965,NM_005101:exon2(+),NM_005101,3,ISG15,3.0
1,chr1,957536,957887,NM_198576:exon2(+),NM_198576,2,AGRN,15.0
2,chr1,957536,957887,NM_001305275:exon2(+),NM_001305275,2,AGRN,15.0
3,chr1,976508,977127,"NM_001364727:exon4(+),NM_001364727:exon5(+)",NM_001364727,4,AGRN,15.0
4,chr1,976508,977127,"NM_198576:exon5(+),NM_198576:exon6(+)",NM_198576,4,AGRN,15.0
...,...,...,...,...,...,...,...,...
73850,chrX,154754072,154754338,NM_018196:exon6(-),NM_018196,1,TMLHE,3.0
73851,chrY,2654851,2655768,NM_003140:exon1(-),NM_003140,16,SRY,16.0
73852,chrY,6931893,6932235,NM_033284:exon8(+),NM_033284,1,TBL1Y,1.0
73853,chrY,6931893,6932235,NM_134258:exon7(+),NM_134258,1,TBL1Y,1.0


In [19]:
# transcript-level recovery table: take the annotation/count columns from the
# intersect result and add the per-transcript total of the interval counts
recovery_cols = ['gene', 'tx_id', 'gene_recovery', 'slop_recovery_ex']
tx_slop_recovery = ovl.loc[:, recovery_cols].copy()
# transform keeps one value per original row, so every row of a transcript
# carries that transcript's summed count
per_tx_total = tx_slop_recovery.groupby('tx_id')['slop_recovery_ex'].transform('sum')
tx_slop_recovery['slop_recovery_tx'] = per_tx_total

print('\n')
tx_slop_recovery.info()
print('\n')
tx_slop_recovery



<class 'pandas.core.frame.DataFrame'>
Int64Index: 73855 entries, 0 to 73854
Data columns (total 5 columns):
gene                73855 non-null object
tx_id               73855 non-null object
gene_recovery       73383 non-null float64
slop_recovery_ex    73855 non-null int64
slop_recovery_tx    73855 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 3.4+ MB




Unnamed: 0,gene,tx_id,gene_recovery,slop_recovery_ex,slop_recovery_tx
0,ISG15,NM_005101,3.0,3,3
1,AGRN,NM_198576,15.0,2,15
2,AGRN,NM_001305275,15.0,2,15
3,AGRN,NM_001364727,15.0,4,13
4,AGRN,NM_198576,15.0,4,15
...,...,...,...,...,...
73850,TMLHE,NM_018196,3.0,1,3
73851,SRY,NM_003140,16.0,16,16
73852,TBL1Y,NM_033284,1.0,1,1
73853,TBL1Y,NM_134258,1.0,1,1


In [20]:
# percent recovery per row: transcript total divided by the gene's ClinVar
# count, times 100 (NaN wherever gene_recovery is missing)
tx_total = tx_slop_recovery['slop_recovery_tx']
gene_total = tx_slop_recovery['gene_recovery']
tx_slop_recovery['slop_recovery%'] = tx_total.div(gene_total).mul(100)

print('\n')
tx_slop_recovery.info()
print('\n')
tx_slop_recovery



<class 'pandas.core.frame.DataFrame'>
Int64Index: 73855 entries, 0 to 73854
Data columns (total 6 columns):
gene                73855 non-null object
tx_id               73855 non-null object
gene_recovery       73383 non-null float64
slop_recovery_ex    73855 non-null int64
slop_recovery_tx    73855 non-null int64
slop_recovery%      73383 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 3.9+ MB




Unnamed: 0,gene,tx_id,gene_recovery,slop_recovery_ex,slop_recovery_tx,slop_recovery%
0,ISG15,NM_005101,3.0,3,3,100.000000
1,AGRN,NM_198576,15.0,2,15,100.000000
2,AGRN,NM_001305275,15.0,2,15,100.000000
3,AGRN,NM_001364727,15.0,4,13,86.666667
4,AGRN,NM_198576,15.0,4,15,100.000000
...,...,...,...,...,...,...
73850,TMLHE,NM_018196,3.0,1,3,100.000000
73851,SRY,NM_003140,16.0,16,16,100.000000
73852,TBL1Y,NM_033284,1.0,1,1,100.000000
73853,TBL1Y,NM_134258,1.0,1,1,100.000000


In [21]:
# add column with slop file ID
# Search the file name only: `file` is a full path, and a directory component
# containing 'slop<digits>' would otherwise be matched first.
slop_match = re.search(r'slop\d+', os.path.basename(file))
if slop_match is None:
    # fail with a clear message instead of an IndexError on an empty findall result
    raise ValueError('no slop<N> tag found in file name: %r' % file)
slop_length = slop_match.group(0)
# same tag on every row, so tables from different slop files can be told apart
tx_slop_recovery['slop_len'] = slop_length

print('\n')
tx_slop_recovery.info()
print('\n')
tx_slop_recovery



<class 'pandas.core.frame.DataFrame'>
Int64Index: 73855 entries, 0 to 73854
Data columns (total 7 columns):
gene                73855 non-null object
tx_id               73855 non-null object
gene_recovery       73383 non-null float64
slop_recovery_ex    73855 non-null int64
slop_recovery_tx    73855 non-null int64
slop_recovery%      73383 non-null float64
slop_len            73855 non-null object
dtypes: float64(2), int64(2), object(3)
memory usage: 4.5+ MB




Unnamed: 0,gene,tx_id,gene_recovery,slop_recovery_ex,slop_recovery_tx,slop_recovery%,slop_len
0,ISG15,NM_005101,3.0,3,3,100.000000,slop45
1,AGRN,NM_198576,15.0,2,15,100.000000,slop45
2,AGRN,NM_001305275,15.0,2,15,100.000000,slop45
3,AGRN,NM_001364727,15.0,4,13,86.666667,slop45
4,AGRN,NM_198576,15.0,4,15,100.000000,slop45
...,...,...,...,...,...,...,...
73850,TMLHE,NM_018196,3.0,1,3,100.000000,slop45
73851,SRY,NM_003140,16.0,16,16,100.000000,slop45
73852,TBL1Y,NM_033284,1.0,1,1,100.000000,slop45
73853,TBL1Y,NM_134258,1.0,1,1,100.000000,slop45


In [23]:
# sanity check: rows where the per-transcript overlap total exceeds the
# per-gene ClinVar count (recovery above 100%)
# NOTE(review): presumably records named for a different gene fall inside these
# transcripts' slop regions and are counted by the intersect — confirm
over_full = tx_slop_recovery['slop_recovery%'].gt(100)
tx_slop_recovery.loc[over_full]

Unnamed: 0,gene,tx_id,gene_recovery,slop_recovery_ex,slop_recovery_tx,slop_recovery%,slop_len
1635,TOE1,NM_025077,11.0,3,14,127.272727,slop45
1644,TOE1,NM_025077,11.0,1,14,127.272727,slop45
1645,TOE1,NM_025077,11.0,3,14,127.272727,slop45
1646,TOE1,NM_025077,11.0,7,14,127.272727,slop45
1647,MMACHC,NM_015506,51.0,8,53,103.921569,slop45
...,...,...,...,...,...,...,...
73718,G6PD,NM_001042351,26.0,2,27,103.846154,slop45
73719,G6PD,NM_000402,26.0,2,27,103.846154,slop45
73720,G6PD,NM_001360016,26.0,2,27,103.846154,slop45
73721,G6PD,NM_001042351,26.0,2,27,103.846154,slop45


In [None]:
# save to disk
# Write under basedir, like the inputs read from basedir/output, so the result
# does not depend on the notebook's current working directory.
# NOTE(review): the table is written with a header row and starts with
# gene/tx_id columns, so despite the .bed extension it is a plain TSV — confirm
# the extension is intended.
out_path = os.path.join(basedir, 'output/2019-11-18_slompdf-' + slop_length + '.bed')
tx_slop_recovery.to_csv(out_path, sep='\t', index=False)