In [2]:
import os
import pandas as pd
import pybedtools
from pybedtools import BedTool

basedir = '/Users/pbousounis/Experiments/2019-10-29_hg19mod/2019-11-10_RefSeq-ClinVar_GRCh37_slop_region_recovery'
os.chdir(basedir)

## Get RefSeq slopped bed file

In [6]:
slop45_file = 'output/2019-11-18_slompdf-slop45.bed'
slop45 = pd.read_csv(slop45_file, sep='\t')

print('\n')
slop45.info()
print('\n')
slop45



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73855 entries, 0 to 73854
Data columns (total 7 columns):
gene                73855 non-null object
tx_id               73855 non-null object
gene_recovery       73383 non-null float64
slop_recovery_ex    73855 non-null int64
slop_recovery_tx    73855 non-null int64
slop_recovery%      73383 non-null float64
slop_len            73855 non-null object
dtypes: float64(2), int64(2), object(3)
memory usage: 3.9+ MB




Unnamed: 0,gene,tx_id,gene_recovery,slop_recovery_ex,slop_recovery_tx,slop_recovery%,slop_len
0,ISG15,NM_005101,3.0,3,3,100.000000,slop45
1,AGRN,NM_198576,15.0,2,15,100.000000,slop45
2,AGRN,NM_001305275,15.0,2,15,100.000000,slop45
3,AGRN,NM_001364727,15.0,4,13,86.666667,slop45
4,AGRN,NM_198576,15.0,4,15,100.000000,slop45
...,...,...,...,...,...,...,...
73850,TMLHE,NM_018196,3.0,1,3,100.000000,slop45
73851,SRY,NM_003140,16.0,16,16,100.000000,slop45
73852,TBL1Y,NM_033284,1.0,1,1,100.000000,slop45
73853,TBL1Y,NM_134258,1.0,1,1,100.000000,slop45


## Get ClinVar bed file

In [139]:
# import clinvar bed
clinvar_bed_file = 'data/2019-11-07_ClinVar-GRCh37_path-likely_path.bed'
clinvar_bed = pd.read_csv(clinvar_bed_file, sep='\t')
clinvar_bed.to_csv(clinvar_bed_file, sep='\t', index=None)

In [140]:
clinvar_bed.info()
print('\n')
clinvar_bed.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83696 entries, 0 to 83695
Data columns (total 4 columns):
chr      83696 non-null object
start    83696 non-null int64
end      83696 non-null int64
name     83696 non-null object
dtypes: int64(2), object(2)
memory usage: 2.6+ MB




Unnamed: 0,chr,start,end,name
0,chr1,949522,949523,183381_ISG15
1,chr1,949695,949696,161455_ISG15
2,chr1,949738,949739,161454_ISG15
3,chr1,957604,957605,243036_AGRN
4,chr1,957692,957693,243037_AGRN


## Gene recovery table

In [144]:
clinvar_bed['gene'] = clinvar_bed.name.str.split('_').str[1]
clinvar_bed['gene_recovery'] = clinvar_bed.groupby('gene')['gene'].transform('count')

# get gene recovery table 
gene_recovery = clinvar_bed[['gene', 'gene_recovery']].drop_duplicates()

print('\n')
gene_recovery.info()
print('\n')
gene_recovery



<class 'pandas.core.frame.DataFrame'>
Int64Index: 3997 entries, 0 to 83389
Data columns (total 2 columns):
gene             3997 non-null object
gene_recovery    3997 non-null int64
dtypes: int64(1), object(1)
memory usage: 93.7+ KB




Unnamed: 0,gene,gene_recovery
0,ISG15,3
3,AGRN,15
18,B3GALT6,9
27,DVL1,17
44,MXRA8,1
...,...,...
83364,RAB39B,5
83369,TMLHE,3
83372,SRY,16
83388,TBL1Y,1


## Bedtools intersect slop - clinvar

In [183]:
slop_bedtool = BedTool(slop45_file)
cv_bedtool = BedTool(clinvar_bed_file)

ovl = slop_bedtool.intersect(b=cv_bedtool, c=True).to_dataframe().sort_values(['chrom', 'start'])
ovl.columns = ['chr', 'start', 'end', 'exon_id', 'tx_id', 'tx_recovery']

print('\n')
ovl.info()
print('\n')
ovl



<class 'pandas.core.frame.DataFrame'>
Int64Index: 696951 entries, 696948 to 3412
Data columns (total 6 columns):
chr            696951 non-null object
start          696951 non-null int64
end            696951 non-null int64
exon_id        696951 non-null object
tx_id          696951 non-null object
tx_recovery    696951 non-null int64
dtypes: int64(3), object(3)
memory usage: 37.2+ MB




Unnamed: 0,chr,start,end,exon_id,tx_id,tx_recovery
696948,chr1,11829,12272,NR_046018:exon1(+),NR_046018,0
696949,chr1,12568,12766,NR_046018:exon2(+),NR_046018,0
696950,chr1,13176,14454,NR_046018:exon3(+),NR_046018,0
696937,chr1,14317,14874,NR_024540:exon1(-),NR_024540,0
696938,chr1,14925,15083,NR_024540:exon2(-),NR_024540,0
...,...,...,...,...,...,...
3415,chrY,59347190,59348341,NR_138048:exon1(-),NR_138048,0
3416,chrY,59349288,59349546,NR_138048:exon2(-),NR_138048,0
3410,chrY,59358284,59359553,NR_110561:exon1(-),NR_110561,0
3411,chrY,59359962,59360160,NR_110561:exon2(-),NR_110561,0


## Merge intersect table with gene to RefSeq table

In [184]:
ovl_gene = ovl.merge(gene2rs, how='left', on='tx_id')

print('\n')
ovl_gene.info()
print('\n')
ovl_gene



<class 'pandas.core.frame.DataFrame'>
Int64Index: 696951 entries, 0 to 696950
Data columns (total 7 columns):
chr            696951 non-null object
start          696951 non-null int64
end            696951 non-null int64
exon_id        696951 non-null object
tx_id          696951 non-null object
tx_recovery    696951 non-null int64
gene           696951 non-null object
dtypes: int64(3), object(4)
memory usage: 42.5+ MB




Unnamed: 0,chr,start,end,exon_id,tx_id,tx_recovery,gene
0,chr1,11829,12272,NR_046018:exon1(+),NR_046018,0,DDX11L1
1,chr1,12568,12766,NR_046018:exon2(+),NR_046018,0,DDX11L1
2,chr1,13176,14454,NR_046018:exon3(+),NR_046018,0,DDX11L1
3,chr1,14317,14874,NR_024540:exon1(-),NR_024540,0,WASH7P
4,chr1,14925,15083,NR_024540:exon2(-),NR_024540,0,WASH7P
...,...,...,...,...,...,...,...
696946,chrY,59347190,59348341,NR_138048:exon1(-),NR_138048,0,WASIR1
696947,chrY,59349288,59349546,NR_138048:exon2(-),NR_138048,0,WASIR1
696948,chrY,59358284,59359553,NR_110561:exon1(-),NR_110561,0,DDX11L16
696949,chrY,59359962,59360160,NR_110561:exon2(-),NR_110561,0,DDX11L16


## Merge intersect table with gene recovery table

In [178]:
gene_recovery.sort_values('gene_recovery').head()

Unnamed: 0,gene,gene_recovery
14457,ATP2B2,1
16660,PRICKLE2,1
37670,AFAP1L2,1
37667,SHOC2,1
16765,ROBO2,1


In [186]:
ovl_gene_rec = ovl_gene.merge(gene_recovery, how='left', on='gene')

print('\n')
ovl_gene_rec.info()
print('\n')
ovl_gene_rec



<class 'pandas.core.frame.DataFrame'>
Int64Index: 696951 entries, 0 to 696950
Data columns (total 8 columns):
chr              696951 non-null object
start            696951 non-null int64
end              696951 non-null int64
exon_id          696951 non-null object
tx_id            696951 non-null object
tx_recovery      696951 non-null int64
gene             696951 non-null object
gene_recovery    210116 non-null float64
dtypes: float64(1), int64(3), object(4)
memory usage: 47.9+ MB




Unnamed: 0,chr,start,end,exon_id,tx_id,tx_recovery,gene,gene_recovery
0,chr1,11829,12272,NR_046018:exon1(+),NR_046018,0,DDX11L1,
1,chr1,12568,12766,NR_046018:exon2(+),NR_046018,0,DDX11L1,
2,chr1,13176,14454,NR_046018:exon3(+),NR_046018,0,DDX11L1,
3,chr1,14317,14874,NR_024540:exon1(-),NR_024540,0,WASH7P,
4,chr1,14925,15083,NR_024540:exon2(-),NR_024540,0,WASH7P,
...,...,...,...,...,...,...,...,...
696946,chrY,59347190,59348341,NR_138048:exon1(-),NR_138048,0,WASIR1,
696947,chrY,59349288,59349546,NR_138048:exon2(-),NR_138048,0,WASIR1,
696948,chrY,59358284,59359553,NR_110561:exon1(-),NR_110561,0,DDX11L16,
696949,chrY,59359962,59360160,NR_110561:exon2(-),NR_110561,0,DDX11L16,


In [1]:
ovl_gene_rec.gene_recovery.dropna().unique()

NameError: name 'ovl_gene_rec' is not defined

In [109]:
# drop rows with gene_recovery == 0
ovl = ovl.dropna()
ovl['gene_recovery'] = ovl.loc[:, 'gene_recovery'].astype('int')
ovl.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,chr,start,end,exon_name,tx_id,tx_recovery,gene,gene_recovery
0,chr1,949319,949965,NM_005101:exon2(+),NM_005101,3,ISG15,3
1,chr1,957536,957887,NM_198576:exon2(+),NM_198576,2,AGRN,15
2,chr1,957536,957887,NM_001305275:exon2(+),NM_001305275,2,AGRN,15
3,chr1,976508,977127,"NM_001364727:exon4(+),NM_001364727:exon5(+)",NM_001364727,4,AGRN,15
4,chr1,976508,977127,"NM_198576:exon5(+),NM_198576:exon6(+)",NM_198576,4,AGRN,15


In [110]:
ovl['slop45_recovery%'] = ovl['tx_recovery'] / ovl['gene_recovery'] * 100
ovl.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,chr,start,end,exon_name,tx_id,tx_recovery,gene,gene_recovery,slop45_recovery%
0,chr1,949319,949965,NM_005101:exon2(+),NM_005101,3,ISG15,3,100.0
1,chr1,957536,957887,NM_198576:exon2(+),NM_198576,2,AGRN,15,13.333333
2,chr1,957536,957887,NM_001305275:exon2(+),NM_001305275,2,AGRN,15,13.333333
3,chr1,976508,977127,"NM_001364727:exon4(+),NM_001364727:exon5(+)",NM_001364727,4,AGRN,15,26.666667
4,chr1,976508,977127,"NM_198576:exon5(+),NM_198576:exon6(+)",NM_198576,4,AGRN,15,26.666667


In [118]:
tx_slop45_Recovery = ovl[['gene', 'tx_id', 'gene_recovery', 'tx_recovery', 'slop45_recovery%']].drop_duplicates()
tx_slop45_Recovery.head(50)

Unnamed: 0,gene,tx_id,gene_recovery,tx_recovery,slop45_recovery%
0,ISG15,NM_005101,3,3,100.0
1,AGRN,NM_198576,15,2,13.333333
2,AGRN,NM_001305275,15,2,13.333333
3,AGRN,NM_001364727,15,4,26.666667
4,AGRN,NM_198576,15,4,26.666667
5,AGRN,NM_001305275,15,4,26.666667
6,AGRN,NM_001364727,15,2,13.333333
9,AGRN,NM_001364727,15,1,6.666667
10,AGRN,NM_198576,15,1,6.666667
11,AGRN,NM_001305275,15,1,6.666667


In [115]:
tmp.head()

Unnamed: 0,chr,start,end,exon_name,tx_id,tx_recovery,gene,gene_recovery,slop45_recovery%
14328,chr12,9020781,9020998,NM_001282424:exon20(+),NM_001282424,1,A2ML1,2,50.0
14327,chr12,8975172,8975354,NM_144670:exon1(+),NM_144670,1,A2ML1,2,50.0
14329,chr12,9020781,9020998,NM_144670:exon31(+),NM_144670,1,A2ML1,2,50.0
44425,chr22,43088073,43090048,NM_001318038:exon1(-),NM_001318038,1,A4GALT,1,100.0
44426,chr22,43088082,43090048,NM_017436:exon1(-),NM_017436,1,A4GALT,1,100.0
