In [111]:
from datetime import datetime
import glob
import os
import pandas as pd
import pybedtools
from pybedtools import BedTool
import re

# set base directories
basedir = '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery'
slomp_dir = os.path.join(basedir, 'output/2019-11-18_slops_merged-by-tx')
data_dir = os.path.join(basedir, 'data')

# today's date
today = datetime.today().strftime('%Y-%m-%d')
today

'2019-11-22'

In [112]:
# get list of merged slop tx files
slomp_files = glob.glob(slomp_dir + '/*RefSeqGRCh37_slop*bed')
slomp_files

['/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery/output/2019-11-18_slops_merged-by-tx/2019-11-18_RefSeqGRCh37_slop50_merged.bed',
 '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery/output/2019-11-18_slops_merged-by-tx/2019-11-08_RefSeqGRCh37_slop0.bed',
 '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery/output/2019-11-18_slops_merged-by-tx/2019-11-20_RefSeqGRCh37_slop25_merged.bed',
 '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery/output/2019-11-18_slops_merged-by-tx/2019-11-20_RefSeqGRCh37_slop40_merged.bed',
 '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery/output/2019-11-18_slops_merged-by-tx/2019-11-19_RefSeqGRCh37_slop90_merged.bed',
 '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_G

In [113]:
slomp_df = pd.DataFrame(columns=['gene', 'tx_id', 'gene_recovery', 'slop_recovery_ex', 'slop_recovery_tx', 'slop_recovery%', 'slop_len'])
for file in slomp_files:
    
    # import merged slop bed file
    #file = os.path.join(slomp_dir, f)
    df_names = ['chr', 'start', 'end', 'name', 'tx_id']
    df = pd.read_csv(file, sep='\t', low_memory=False,
                     header=None, names=df_names).sort_values(['chr', 'start'])

    # import gene symbol - RefSeq tx acc conversion table
    gene2refseq_id = os.path.join(basedir, 'output/2019-11-18_gene2refseq_tx.tsv')
    gene2refseq = pd.read_csv(gene2refseq_id, sep='\t').sort_values('gene')

    # import clinvar p-lp bed file
    clinvar_bed_file = os.path.join(data_dir, '2019-11-07_ClinVar-GRCh37_path-likely_path.bed')
    clinvar_bed = pd.read_csv(clinvar_bed_file, sep='\t')

    # extract gene names; calculate gene recovery
    clinvar_bed['gene'] = clinvar_bed.name.str.split('_').str[1]
    clinvar_bed['gene_recovery'] = clinvar_bed.groupby('gene')['gene'].transform('count')
    gene_recovery = clinvar_bed[['gene', 'gene_recovery']].drop_duplicates().sort_values('gene')

    # run bedtools intersect -c 
    slop_bedtool = BedTool(file)
    cv_bedtool = BedTool(clinvar_bed_file)
    ovl = slop_bedtool.intersect(b=cv_bedtool, c=True).to_dataframe()
    ovl.columns = ['chr', 'start', 'end', 'exon_name', 'tx_id', 'slop_recovery_ex']
    ovl = ovl[ovl.slop_recovery_ex != 0].sort_values(['chr', 'start'])

    # add gene names and gene recovery values to intersected table
    ovl = ovl.merge(gene2refseq, how='left', on='tx_id')
    ovl = ovl.merge(gene_recovery, how='left', on='gene')

    # build transcript slop recovery table
    tx_slop_recovery = ovl[['gene', 'tx_id', 'gene_recovery', 'slop_recovery_ex']].copy()
    tx_slop_recovery['slop_recovery_tx'] = tx_slop_recovery.groupby('tx_id')['slop_recovery_ex'].transform('sum')

    # calculate slopped transcript percent recovery by gene
    tx_slop_recovery['slop_recovery%'] = tx_slop_recovery.loc[:, 'slop_recovery_tx'] / tx_slop_recovery.loc[:, 'gene_recovery'] * 100

    # add column with slop file ID
    slop_length = re.findall(r'slop\d+', file)[0]
    tx_slop_recovery['slop_len'] = slop_length
    
    # append to master table
    slomp_df = pd.concat([slomp_df, tx_slop_recovery], sort=False)

In [114]:
slomp_tab = slomp_df.sort_values(['gene', 'tx_id']).drop(columns=['slop_recovery_ex'])
slomp_tab = slomp_tab.drop_duplicates()
slomp_tab

Unnamed: 0,gene,tx_id,gene_recovery,slop_recovery_tx,slop_recovery%,slop_len
14139,A2ML1,NM_001282424,2.0,1,50.0,slop50
14580,A2ML1,NM_001282424,2.0,1,50.0,slop25
14517,A2ML1,NM_001282424,2.0,1,50.0,slop40
13569,A2ML1,NM_001282424,2.0,1,50.0,slop90
13689,A2ML1,NM_001282424,2.0,1,50.0,slop80
...,...,...,...,...,...,...
49544,ZSWIM6,NM_020928,2.0,2,100.0,slop95
48787,ZSWIM6,NM_020928,2.0,2,100.0,slop115
51350,ZSWIM6,NM_020928,2.0,2,100.0,slop55
24200,ZSWIM7,NM_001042697,,1,,slop115


In [115]:
slomp_tab.slop_len.unique()

array(['slop50', 'slop25', 'slop40', 'slop90', 'slop80', 'slop45',
       'slop30', 'slop100', 'slop85', 'slop95', 'slop115', 'slop55',
       'slop0'], dtype=object)

## Write table to disk

In [116]:
tmp_slomp_out = '../output/{}_RefSeqGRCh37_tx-slop-0-25-30-40-45-50-55-80-85-90-95-100-115.tsv'.format(today)
slomp_tab.to_csv(tmp_slomp_out, sep='\t', index=None)

## (Slop0 file processing (RefSeq bed)

In [60]:
slop0_file = '../output/2019-11-18_slops_merged-by-tx/2019-11-08_RefSeqGRCh37_slop0.bed'
slop0_cols = ['chr', 'start', 'end', 'name']
tmp = pd.read_csv(slop0_file, sep='\t', header=None, names=slop0_cols, low_memory=False)

tmp['tx_id'] = tmp.loc[:, 'name'].str.extract(r'((NM|NR)_\d+)')[0]
tmp

Unnamed: 0,chr,start,end,name,tx_id
0,1,11874,12227,DDX11L1-NR_046018:exon1(+),NR_046018
1,1,12613,12721,DDX11L1-NR_046018:exon2(+),NR_046018
2,1,13221,14409,DDX11L1-NR_046018:exon3(+),NR_046018
3,1,29321,29370,WASH7P-NR_024540:exon11(-),NR_024540
4,1,24738,24891,WASH7P-NR_024540:exon10(-),NR_024540
...,...,...,...,...,...
713100,Y,59349333,59349501,WASIR1-NR_138048:exon2(-),NR_138048
713101,Y,59347235,59348296,WASIR1-NR_138048:exon1(-),NR_138048
713102,Y,59360501,59360854,DDX11L16-NR_110561:exon3(-),NR_110561
713103,Y,59360007,59360115,DDX11L16-NR_110561:exon2(-),NR_110561


In [66]:
tmp['chr'] = 'chr' + tmp['chr']
tmp

Unnamed: 0,chr,start,end,name,tx_id
0,chr1,11874,12227,DDX11L1-NR_046018:exon1(+),NR_046018
1,chr1,12613,12721,DDX11L1-NR_046018:exon2(+),NR_046018
2,chr1,13221,14409,DDX11L1-NR_046018:exon3(+),NR_046018
3,chr1,29321,29370,WASH7P-NR_024540:exon11(-),NR_024540
4,chr1,24738,24891,WASH7P-NR_024540:exon10(-),NR_024540
...,...,...,...,...,...
713100,chrY,59349333,59349501,WASIR1-NR_138048:exon2(-),NR_138048
713101,chrY,59347235,59348296,WASIR1-NR_138048:exon1(-),NR_138048
713102,chrY,59360501,59360854,DDX11L16-NR_110561:exon3(-),NR_110561
713103,chrY,59360007,59360115,DDX11L16-NR_110561:exon2(-),NR_110561


In [67]:
tmp.to_csv(slop0_file, sep='\t', index=False, header=None)