# Process bednot() output for RefSeq vs ClinVar

In [22]:
from datetime import datetime
import os
import numpy as np
import pandas as pd
import re

today = datetime.today().strftime('%Y-%m-%d')
os.chdir('/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-10-25_RefSeq_GFF3_download_validate')

In [5]:
ref2clin_file = '2019-11-05_GRCh37_latest_genomic_IN_2019-11-05_clinvar_path-l_RECOVERY.txt'
ref2clin = pd.read_csv(ref2clin_file, sep='\t', low_memory=False)

In [6]:
df = ref2clin

# extract RefSeq TX accessions from exon_ids
df['tx_id'] = df['exon_id'].str.split('-', expand=True)[1].str.split(':', expand=True)[0]
df['gene'] = df['exon_id'].str.split('-', expand=True)[0]
df.head()

Unnamed: 0,chr,start,end,exon_id,clinvar_recovery_count,tx_id,gene
0,1,12612,12720,DDX11L1-NR_046018:exon2(+),0,NR_046018,DDX11L1
1,1,13220,14408,DDX11L1-NR_046018:exon3(+),0,NR_046018,DDX11L1
2,1,29320,29369,WASH7P-NR_024540:exon11(-),0,NR_024540,WASH7P
3,1,24737,24890,WASH7P-NR_024540:exon10(-),0,NR_024540,WASH7P
4,1,18267,18365,WASH7P-NR_024540:exon9(-),0,NR_024540,WASH7P


In [46]:
cv_bed_file = '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-06_refseq_cv_slop_intersect/bed/2019-11-05_clinvar_path-l.bed'
cv_bed = pd.read_csv(cv_bed_file, sep='\t')
cv_bed.head()

Unnamed: 0,chr,start,end,name,gene,total_gene_vars
0,chr1,949522,949523,183381_ISG15,ISG15,3.0
1,chr1,949695,949696,161455_ISG15,ISG15,3.0
2,chr1,949738,949739,161454_ISG15,ISG15,3.0
3,chr1,957604,957605,243036_AGRN,AGRN,15.0
4,chr1,957692,957693,243037_AGRN,AGRN,15.0


### Join the clinvar total_gene_vars to the ref2clin table, by gene names

* Add transcript recovery and percent recovery columns

In [47]:
result = pd.merge(df, cv_bed, how='left', on=['gene'])
result.head()
# df['gene_recovery'] = 
# #df['gene_recovery'] = df['clinvar_recovery_count'].groupby(df['gene']).transform('sum')
# df['transcript_recovery'] = df['clinvar_recovery_count'].groupby(df['tx_id']).transform('sum')
# df['tx_perc_recovery'] = df['transcript_recovery'] / df['gene_recovery'] * 100
# df.head()

Unnamed: 0,chr_x,start_x,end_x,exon_id,clinvar_recovery_count,tx_id,gene,gene_recovery,transcript_recovery,tx_perc_recovery,chr_y,start_y,end_y,name,total_gene_vars
0,1,12612,12720,DDX11L1-NR_046018:exon2(+),0,NR_046018,DDX11L1,0.0,0.0,,,,,,
1,1,13220,14408,DDX11L1-NR_046018:exon3(+),0,NR_046018,DDX11L1,0.0,0.0,,,,,,
2,1,29320,29369,WASH7P-NR_024540:exon11(-),0,NR_024540,WASH7P,0.0,0.0,,,,,,
3,1,24737,24890,WASH7P-NR_024540:exon10(-),0,NR_024540,WASH7P,0.0,0.0,,,,,,
4,1,18267,18365,WASH7P-NR_024540:exon9(-),0,NR_024540,WASH7P,0.0,0.0,,,,,,


In [8]:
# save the file as tsv with date prefix
file_out = today + '_GRCh37_latest_genomic_VS_clinvar_path-l_tx-gene-recovery.tsv'
df.to_csv(file_out, sep='\t', header=True, index=False)

### Import clinvar bed file for total_gene_vars values

In [9]:
txdf = df[['tx_id', 'gene', 'gene_recovery', 'transcript_recovery', 'tx_perc_recovery']].dropna().drop_duplicates()
txdf.head()

Unnamed: 0,tx_id,gene,gene_recovery,transcript_recovery,tx_perc_recovery
194,NM_005101,ISG15,3.0,3.0,100.0
196,NM_001305275,AGRN,30.0,15.0,50.0
235,NM_198576,AGRN,30.0,15.0,50.0
345,NM_003327,TNFRSF4,1.0,1.0,100.0
366,NM_080605,B3GALT6,18.0,18.0,100.0


In [19]:
# rank transcripts by percent recovery
txdf['tx_rank'] = txdf.groupby("gene")["tx_perc_recovery"].rank("dense", ascending=False)
txdf.head(200)

Unnamed: 0,tx_id,gene,gene_recovery,transcript_recovery,tx_perc_recovery,tx_rank
194,NM_005101,ISG15,3.0,3.0,100.000000,1.0
196,NM_001305275,AGRN,30.0,15.0,50.000000,1.0
235,NM_198576,AGRN,30.0,15.0,50.000000,1.0
345,NM_003327,TNFRSF4,1.0,1.0,100.000000,1.0
366,NM_080605,B3GALT6,18.0,18.0,100.000000,1.0
584,NM_004421,DVL1,34.0,17.0,50.000000,1.0
599,NM_001330311,DVL1,34.0,17.0,50.000000,1.0
617,NM_001282582,MXRA8,5.0,1.0,20.000000,1.0
628,NM_001282583,MXRA8,5.0,1.0,20.000000,1.0
638,NM_001282584,MXRA8,5.0,1.0,20.000000,1.0


In [13]:
txdf_fileout = today + '_refseq_VS_clinvar_TX_GENE_RECOVERY.tsv'
txdf.to_csv(txdf_fileout, sep='\t', index=False, header=True)

In [14]:
# did file save succesfully?
os.path.isfile(txdf_fileout)

True

In [26]:
top_txdf = txdf[txdf.tx_rank == 1.0]
top_txdf = top_txdf[['gene', 'tx_id', 'gene_recovery', 'transcript_recovery', 'tx_perc_recovery', 'tx_rank']]

In [27]:
top_txdf.head()

Unnamed: 0,gene,tx_id,gene_recovery,transcript_recovery,tx_perc_recovery,tx_rank
194,ISG15,NM_005101,3.0,3.0,100.0,1.0
196,AGRN,NM_001305275,30.0,15.0,50.0,1.0
235,AGRN,NM_198576,30.0,15.0,50.0,1.0
345,TNFRSF4,NM_003327,1.0,1.0,100.0,1.0
366,B3GALT6,NM_080605,18.0,18.0,100.0,1.0


In [44]:
top_txdf_fileout = today + '_refseq_VS_clinvar_TOP_TX_BY_RECOVERY.tsv'
top_txdf.to_csv(top_txdf_fileout, sep='\t', header=True, index=False)

# did file save succesfully?
os.path.isfile(top_txdf_fileout)

True

In [43]:
os.getcwd()

'/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-10-25_RefSeq_GFF3_download_validate'

In [45]:
txdf.to_csv('2019-11-06_RefSeq_Ranked_TXs.tsv', sep='\t', header=True, index=False)