In [2]:
import os
import pandas as pd
import pybedtools
from pybedtools import BedTool

In [78]:
def slop_overlap(clinvar_bed_file, slop_file, gene2refseq_id):
    
    # import table to add refseq tx associated gene symbol
    gene2refseq = pd.read_csv(gene2refseq_id, sep='\t')
    
    # import clinvar
    clinvar_bed = pd.read_csv(clinvar_bed_file, sep='\t')

    # calculate gene recovery
    clinvar_bed['gene'] = clinvar_bed.name.str.split('_').str[1]
    clinvar_bed['gene_recovery'] = clinvar_bed.groupby('gene')['gene'].transform('count')
    gene_recovery = clinvar_bed[['gene', 'gene_recovery']].drop_duplicates()

    # run bedtools intersect -c 
    slop_bedtool = BedTool(slop_file)
    cv_bedtool = BedTool(clinvar_bed_file)
    ovl = slop_bedtool.intersect(b=cv_bedtool, c=True).to_dataframe()
    ovl.columns = ['chr', 'start', 'end', 'exon_name', 'tx_id', 'slop_recoveryEX']
    ovl = ovl[ovl.slop_recoveryEX != 0].sort_values(['chr', 'start'])

    # add gene names and gene recovery values to intersected table
    ovl = ovl.merge(gene2refseq, how='left', on='tx_id')
    ovl = ovl.merge(gene_recovery, how='left', on='gene')
    
    # build transcript slop recovery table
    tx_slop_recovery = ovl[['gene', 'tx_id', 'gene_recovery', 'slop_recoveryEX']]
    tx_slop_recovery['slop_recoveryTX'] = tx_slop_recovery.groupby('tx_id')['slop_recoveryEX'].transform('sum')

    # calculate slopped transcript percent recovery by gene
    tx_slop_recovery['slop_recovery%'] = tx_slop_recovery['slop_recoveryTX'] / tx_slop_recovery['gene_recovery'] * 100
    
    return(tx_slop_recovery)

In [86]:
# import table to add refseq tx associated gene symbol
gene2refseq = pd.read_csv(gene2refseq_id, sep='\t')
    
# import clinvar
clinvar_bed = pd.read_csv(clinvar_bed_file, sep='\t')

## Create gene recovery table

In [112]:
# calculate gene recovery
clinvar_bed['gene'] = clinvar_bed.loc[:, 'name'].str.split('_').str[1]
clinvar_bed['gene_recovery'] = clinvar_bed.groupby('gene')['gene'].transform('count')

# subset into new dataframe
gene_recovery = clinvar_bed[['gene', 'gene_recovery']].drop_duplicates()

print('\n')
gene_recovery.info()
print('\n')



<class 'pandas.core.frame.DataFrame'>
Int64Index: 3997 entries, 0 to 83389
Data columns (total 2 columns):
gene             3997 non-null object
gene_recovery    3997 non-null int64
dtypes: int64(1), object(1)
memory usage: 93.7+ KB




## Bedtools intersect of slop file and clinvar bed

In [90]:
# run bedtools intersect -c 
slop_bedtool = BedTool(slop_file)
cv_bedtool = BedTool(clinvar_bed_file)
ovl = slop_bedtool.intersect(b=cv_bedtool, c=True).to_dataframe()
ovl.columns = ['chr', 'start', 'end', 'exon_name', 'tx_id', 'slop_recoveryEX']
ovl = ovl[ovl.slop_recoveryEX != 0].sort_values(['chr', 'start'])

ovl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 686986 to 3208
Data columns (total 6 columns):
chr                72847 non-null object
start              72847 non-null int64
end                72847 non-null int64
exon_name          72847 non-null object
tx_id              72847 non-null object
slop_recoveryEX    72847 non-null int64
dtypes: int64(3), object(3)
memory usage: 3.9+ MB


In [91]:
# add gene names and gene recovery values to intersected table
ovl = ovl.merge(gene2refseq, how='left', on='tx_id')    
ovl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 0 to 72846
Data columns (total 7 columns):
chr                72847 non-null object
start              72847 non-null int64
end                72847 non-null int64
exon_name          72847 non-null object
tx_id              72847 non-null object
slop_recoveryEX    72847 non-null int64
gene               72847 non-null object
dtypes: int64(3), object(4)
memory usage: 4.4+ MB


In [92]:
ovl = ovl.merge(gene_recovery, how='left', on='gene')
ovl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 0 to 72846
Data columns (total 8 columns):
chr                72847 non-null object
start              72847 non-null int64
end                72847 non-null int64
exon_name          72847 non-null object
tx_id              72847 non-null object
slop_recoveryEX    72847 non-null int64
gene               72847 non-null object
gene_recovery      72372 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 5.0+ MB


In [93]:
# build transcript slop recovery table
tx_slop_recovery = ovl[['gene', 'tx_id', 'gene_recovery', 'slop_recoveryEX']]
tx_slop_recovery['slop_recoveryTX'] = tx_slop_recovery.groupby('tx_id')['slop_recoveryEX'].transform('sum')
tx_slop_recovery.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 0 to 72846
Data columns (total 5 columns):
gene               72847 non-null object
tx_id              72847 non-null object
gene_recovery      72372 non-null float64
slop_recoveryEX    72847 non-null int64
slop_recoveryTX    72847 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 3.3+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [94]:
# calculate slopped transcript percent recovery by gene
tx_slop_recovery['slop_recovery%'] = tx_slop_recovery.loc[:, 'slop_recoveryTX'] / tx_slop_recovery.loc[:, 'gene_recovery'] * 100
tx_slop_recovery.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 0 to 72846
Data columns (total 6 columns):
gene               72847 non-null object
tx_id              72847 non-null object
gene_recovery      72372 non-null float64
slop_recoveryEX    72847 non-null int64
slop_recoveryTX    72847 non-null int64
slop_recovery%     72372 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 3.9+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [113]:
tx_slop_recovery

Unnamed: 0,gene,tx_id,gene_recovery,slop_recoveryEX,slop_recoveryTX,slop_recovery%
0,ISG15,NM_005101,3.0,3,3,100.000000
1,AGRN,NM_198576,15.0,2,15,100.000000
2,AGRN,NM_001305275,15.0,2,15,100.000000
3,AGRN,NM_001364727,15.0,4,13,86.666667
4,AGRN,NM_198576,15.0,4,15,100.000000
...,...,...,...,...,...,...
72842,TMLHE,NM_018196,3.0,1,3,100.000000
72843,SRY,NM_003140,16.0,16,16,100.000000
72844,TBL1Y,NM_033284,1.0,1,1,100.000000
72845,TBL1Y,NM_134258,1.0,1,1,100.000000


In [99]:
clinvar_bed_file = 'data/2019-11-07_ClinVar-GRCh37_path-likely_path.bed'
slop_file = 'output/2019-11-17_RefSeqGRCh37_slop50_merged.bed'
gene2refseq_id = 'output/2019-11-18_gene2refseq_tx.tsv'

slop50 = slop_overlap(clinvar_bed_file, slop_file, gene2refseq_id)
slop50.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 0 to 72846
Data columns (total 6 columns):
gene               72847 non-null object
tx_id              72847 non-null object
gene_recovery      72372 non-null float64
slop_recoveryEX    72847 non-null int64
slop_recoveryTX    72847 non-null int64
slop_recovery%     72372 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 3.9+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [100]:
slop50.columns = ['gene', 'tx_id', 'gene_recovery', 'slop50_recoveryEX', 'slop50_recoveryTX', 'slop50_recovery%']
slop50 = slop50.drop(columns=['slop50_recoveryEX'])
#slop50 = slop50.drop_duplicates().sort_values('tx_id').head()
slop50.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72847 entries, 0 to 72846
Data columns (total 5 columns):
gene                 72847 non-null object
tx_id                72847 non-null object
gene_recovery        72372 non-null float64
slop50_recoveryTX    72847 non-null int64
slop50_recovery%     72372 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 3.3+ MB


In [103]:
slop50.sort_values('tx_id').tail()

Unnamed: 0,gene,tx_id,gene_recovery,slop50_recoveryTX,slop50_recovery%
8811,COX15,NR_164009,2.0,2,100.0
13138,DLAT,NR_164072,5.0,4,80.0
13177,DLAT,NR_164072,5.0,4,80.0
13163,DLAT,NR_164072,5.0,4,80.0
29849,LOC100505549,NR_164148,,1,


In [105]:
clinvar_bed_file = 'data/2019-11-07_ClinVar-GRCh37_path-likely_path.bed'
slop_file = 'output/2019-11-18_RefSeqGRCh37_slop45_merged.bed'
gene2refseq_id = 'output/2019-11-18_gene2refseq_tx.tsv'

slop45 = slop_overlap(clinvar_bed_file, slop_file, gene2refseq_id)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [107]:
slop45.columns = ['gene', 'tx_id', 'gene_recovery', 'slop45_recoveryEX', 'slop45_recoveryTX', 'slop45_recovery%']
#slop45 = slop45.drop(['slop45_recoveryEX'], axis=1).drop_duplicates().sort_values('tx_id').head()

slop45.sort_values('tx_id').head()

Unnamed: 0,gene,tx_id,gene_recovery,slop45_recoveryEX,slop45_recoveryTX,slop45_recovery%
2099,ACADM,NM_000016,74.0,7,74,100.0
2119,ACADM,NM_000016,74.0,4,74,100.0
2124,ACADM,NM_000016,74.0,19,74,100.0
2129,ACADM,NM_000016,74.0,3,74,100.0
2078,ACADM,NM_000016,74.0,4,74,100.0


In [110]:
slop45.sort_values('tx_id')

Unnamed: 0,gene,tx_id,gene_recovery,slop45_recoveryEX,slop45_recoveryTX,slop45_recovery%
2099,ACADM,NM_000016,74.0,7,74,100.0
2119,ACADM,NM_000016,74.0,4,74,100.0
2124,ACADM,NM_000016,74.0,19,74,100.0
2129,ACADM,NM_000016,74.0,3,74,100.0
2078,ACADM,NM_000016,74.0,4,74,100.0
...,...,...,...,...,...,...
8891,COX15,NR_164009,2.0,2,2,100.0
13364,DLAT,NR_164072,5.0,1,4,80.0
13325,DLAT,NR_164072,5.0,1,4,80.0
13350,DLAT,NR_164072,5.0,2,4,80.0


In [67]:
slop45_50 = slop45.merge(slop50, on=['tx_id', 'gene', 'gene_recovery'])
slop45_50

Unnamed: 0,gene,tx_id,gene_recovery,slop45_recoveryTX,slop45_recovery%,slop50_recoveryTX,slop50_recovery%
0,ACADM,NM_000016,74,74,100.0,74,100.0
1,ACADS,NM_000017,30,30,100.0,30,100.0
2,ACADVL,NM_000018,97,97,100.0,97,100.0
3,ACAT1,NM_000019,31,30,96.774194,30,96.774194
4,ACVRL1,NM_000020,130,129,99.230769,129,99.230769


In [69]:
slop45_50

Unnamed: 0,gene,tx_id,gene_recovery,slop45_recoveryTX,slop45_recovery%,slop50_recoveryTX,slop50_recovery%
0,ACADM,NM_000016,74,74,100.0,74,100.0
1,ACADS,NM_000017,30,30,100.0,30,100.0
2,ACADVL,NM_000018,97,97,100.0,97,100.0
3,ACAT1,NM_000019,31,30,96.774194,30,96.774194
4,ACVRL1,NM_000020,130,129,99.230769,129,99.230769
