### 13 November, 2019
# Calculate recovery of ClinVar transcripts in RefSeq exonic regions by slop length
### Pavlos Bousounis
***Updated 2019-11-13***

### Import modules

In [6]:
from datetime import datetime
import gffutils
import gzip
import numpy as np
import os
import pandas as pd
import pybedtools
from pybedtools import BedTool
import shutil
import re

In [9]:
# Move into the experiment's data folder so that relative file names
# (e.g. the RefSeq BED file read below) resolve against it.
data_dir = '/Users/pbousounis/Experiments/2019-10-29_hg19mod/RefSeq-ClinVar_GRCh37_slop_region_recovery/data'
os.chdir(data_dir)

### Get today's date

In [10]:
# ISO-style date stamp (YYYY-MM-DD); used as a prefix for output file and folder names.
date_format = '%Y-%m-%d'
today = datetime.today().strftime(date_format)

# Echo the stamp so the notebook records which date the outputs were tagged with.
date_message = 'Today is: {}'.format(today)
print(date_message)

Today is: 2019-11-13


## Import RefSeq exonic regions

In [11]:
# Load the RefSeq exon BED file as a headerless, tab-separated table with four named columns.
rsbed_file = '2019-11-08_RefSeq-GRCh37_latest_genomicGFF3.bed'
bed_columns = ['chr', 'start', 'end', 'name']
rsbed = pd.read_csv(rsbed_file, sep='\t', names=bed_columns, low_memory=False)

region_names = rsbed['name']

# gene: everything in 'name' before the first '-'
# NOTE(review): a gene symbol that itself contains '-' would be cut at its first hyphen -- confirm
# against the actual 'name' format.
name_parts = region_names.str.split('-')
rsbed['gene'] = name_parts.str[0]

# rs_tx: the NM_/NR_ accession (digits only, no '.version' suffix) that follows a '<word>-' prefix;
# it is the second capture group of the pattern. Names that do not match the pattern get NaN.
tx_groups = region_names.str.extract(r'(\w+-)(N(M|R)_\d+)')
rsbed['rs_tx'] = tx_groups[1]

In [12]:
# bedtools groupby: summarise the exon-level regions per transcript

# BedTool over the full RefSeq table; columns are, in order (1-based for bedtools):
# 1=chr, 2=start, 3=end, 4=name, 5=gene, 6=rs_tx
rsbedtool = BedTool.from_dataframe(rsbed)

# `groupby` was misspelled as `gropuby` (AttributeError) and was called without any grouping or
# summary columns, which `bedtools groupby` needs in order to produce output.
# NOTE(review): the arguments below are an assumption about the intent -- one row per
# (chr, transcript) holding the smallest start and the largest end of its regions. Confirm or
# replace g/c/o with the summary that is actually wanted.
#
# bedtools groupby only merges adjacent rows that share the grouping columns, so the input is
# sorted by (chr, rs_tx) first; rows without a transcript ID (NaN rs_tx) are left out because
# they would be written as an empty field.
rsbed_tx_sorted = rsbed.dropna(subset=['rs_tx']).sort_values(['chr', 'rs_tx'])
rsbedtool_by_tx = BedTool.from_dataframe(rsbed_tx_sorted).groupby(
    g=[1, 6],
    c=[2, 3],
    o=['min', 'max'],
)

## Slop each transcript and save to transcript-specific subfolders

In [None]:
# Write, for every RefSeq transcript, one BED file per slop length into a transcript-specific
# sub-folder of a dated output directory.
#
# NOTE(review): `basedir` (root for the output directory) and `hg19_genome` (the genome argument
# handed to bedtools slop via `g=`) are not assigned in any cell visible in this notebook --
# define them before running this cell.

# create tx_slop file output directory
slop_tx_dir_out = os.path.join(basedir, (today + "_RefSeqGRCh37_exon_TX_slop_bed"))
# os.makedirs is used because `os` is imported at the top of the notebook while `pathlib` is not;
# exist_ok=True keeps re-runs from failing on an existing directory.
os.makedirs(slop_tx_dir_out, exist_ok=True)

# slop lengths to apply: 5, 10, ..., 150 bases (range end 155 is exclusive)
slop_lengths = range(5, 155, 5)

# groupby hands over each transcript's rows in a single pass instead of re-filtering the whole
# table per transcript, and it leaves out rows whose rs_tx is missing (NaN), which would
# otherwise break os.path.join below. sort=False keeps transcripts in order of first appearance.
for tx, tx_rows in rsbed.groupby('rs_tx', sort=False):

    # assign() builds a new frame, so the 'chr' prefix is never written into a slice of rsbed
    # (avoids pandas' SettingWithCopyWarning); astype(str) keeps the concatenation valid even
    # if the chromosome column was parsed as numbers.
    tmp_df = tx_rows.assign(chr='chr' + tx_rows['chr'].astype(str))

    bedtool_tx = BedTool.from_dataframe(tmp_df)

    # create tx output directory
    tx_dir_out = os.path.join(slop_tx_dir_out, tx)
    os.makedirs(tx_dir_out, exist_ok=True)

    for i in slop_lengths:

        # create the output filename
        file_out = os.path.join(tx_dir_out, '{}_RefSeqGRCh37_slop{}_{}.bed'.format(today, i, tx))

        # perform slop: add 'i' bases to each end of each region
        bedtool_tx.slop(g=hg19_genome, b=i).saveas(file_out)

        # check file save (a relative file_out is already resolved against the working directory)
        if os.path.isfile(file_out):
            print('\nSuccess! {} saved.\n'.format(file_out))
        else:
            print('\nWARNING: {} was not saved.\n'.format(file_out))

### List the contents of the slop output directory

In [None]:
# Walk the entries of the slop output directory.
# TODO(review): the loop was left without a ':' and without a body (SyntaxError); printing each
# entry name is a placeholder -- replace it with the intended per-entry processing.
for f in os.listdir(slop_tx_dir_out):
    print(f)

## TESTING

For every unique transcript ID in the RefSeq bed file:

* subset the RefSeq bed by transcript ID
* create an empty slop dataframe list
* for every `n` in `range(5, 155, 5)`:
    - add `n` base pairs of slop to each end of each region in the transcript
    - append the result to the transcript's slop dataframe list

GOAL: One list for each transcript, containing each slop dataframe as its elements

In [None]:
# Write, for every RefSeq transcript, one BED file per slop length into a transcript-specific
# sub-folder of a dated output directory.
#
# NOTE(review): this cell performs the same steps as the earlier slop cell in this notebook; it
# saves files and does not yet build the per-transcript lists of slop dataframes described in
# the TESTING notes.
# NOTE(review): `basedir` (root for the output directory) and `hg19_genome` (the genome argument
# handed to bedtools slop via `g=`) are not assigned in any cell visible in this notebook --
# define them before running this cell.

# create tx_slop file output directory
slop_tx_dir_out = os.path.join(basedir, (today + "_RefSeqGRCh37_exon_TX_slop_bed"))
# os.makedirs is used because `os` is imported at the top of the notebook while `pathlib` is not;
# exist_ok=True keeps re-runs from failing on an existing directory.
os.makedirs(slop_tx_dir_out, exist_ok=True)

# slop lengths to apply: 5, 10, ..., 150 bases (range end 155 is exclusive)
slop_lengths = range(5, 155, 5)

# groupby hands over each transcript's rows in a single pass instead of re-filtering the whole
# table per transcript, and it leaves out rows whose rs_tx is missing (NaN), which would
# otherwise break os.path.join below. sort=False keeps transcripts in order of first appearance.
for tx, tx_rows in rsbed.groupby('rs_tx', sort=False):

    # assign() builds a new frame, so the 'chr' prefix is never written into a slice of rsbed
    # (avoids pandas' SettingWithCopyWarning); astype(str) keeps the concatenation valid even
    # if the chromosome column was parsed as numbers.
    tmp_df = tx_rows.assign(chr='chr' + tx_rows['chr'].astype(str))

    bedtool_tx = BedTool.from_dataframe(tmp_df)

    # create tx output directory
    tx_dir_out = os.path.join(slop_tx_dir_out, tx)
    os.makedirs(tx_dir_out, exist_ok=True)

    for i in slop_lengths:

        # create the output filename
        file_out = os.path.join(tx_dir_out, '{}_RefSeqGRCh37_slop{}_{}.bed'.format(today, i, tx))

        # perform slop: add 'i' bases to each end of each region
        bedtool_tx.slop(g=hg19_genome, b=i).saveas(file_out)

        # check file save (a relative file_out is already resolved against the working directory)
        if os.path.isfile(file_out):
            print('\nSuccess! {} saved.\n'.format(file_out))
        else:
            print('\nWARNING: {} was not saved.\n'.format(file_out))