# allele_specific_analyses(10)

1/27/2021

Run GATK (see OneNote for details).

Now check whether the allele-specific calls overlap with the MPRA results.

In [29]:
import pandas as pd 
import os,sys, glob
import pybedtools
import scipy.stats as stats
import re
import numpy as np
from statsmodels.stats.multitest import fdrcorrection


In [103]:
def read_vcf(path):
    """Load an uncompressed VCF file into a DataFrame of string columns.

    Meta-information lines (starting with ``##``) and blank lines are skipped.
    The first remaining line is used as the column header, with ``#CHROM``
    renamed to ``CHROM``.  If the first record's chromosome does not start
    with ``chr``, the prefix is added to every row.  A ``rowname`` column of
    the form ``<CHROM>_<POS>`` is appended.

    Parameters
    ----------
    path : str
        Path to a tab-delimited VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per record; a header-only file yields an empty frame that
        still has all columns (including ``rowname``).

    Raises
    ------
    ValueError
        If the file has no lines other than ``##`` meta lines.
    """
    with open(path, 'r') as f:
        # Stream the file instead of materialising it twice; blank lines would
        # otherwise become rows padded with missing values.
        lines = [l.strip().split('\t') for l in f
                 if l.strip() and not l.startswith('##')]
    if not lines:
        raise ValueError('no header line found in VCF: ' + str(path))

    df = pd.DataFrame(lines[1:], columns=lines[0]).rename(columns={'#CHROM': 'CHROM'})
    # Only look at the first record when there is one (header-only VCFs are valid).
    if not df.empty and not df['CHROM'].iloc[0].startswith('chr'):
        df['CHROM'] = 'chr' + df.CHROM.map(str)

    df['rowname'] = df.CHROM + '_' + df.POS.astype(str)
    print('read',path)
    print('shape',df.shape)
    return df

def preprocess_vcf(row,tissue):
    """Extract quality metrics and per-replicate genotype/allele depths from one VCF row.

    Parameters
    ----------
    row : pandas.Series
        One VCF record. Must contain ``INFO`` plus the sample columns; sample
        columns are those whose name ends in ``B1``, ``B2`` or ``B3`` (the
        first two found are used).  If none match, a single column named
        ``sample`` is used for both replicates.
    tissue : str
        Currently unused; kept for call compatibility.

    Returns
    -------
    pandas.Series
        Indexed by ``DP``, ``SOR``, ``FS`` (strings from INFO), ``GT_1``,
        ``GT_2`` (genotype strings) and ``AD_1_ref``, ``AD_1_alt``,
        ``AD_2_ref``, ``AD_2_alt`` (ints).

    Raises
    ------
    KeyError
        If INFO lacks ``DP``, ``SOR`` or ``FS``.
    Exception
        Any error while parsing the sample fields is re-raised after the
        offending row and the selected columns are printed.
    """
    # INFO is a ';'-separated list of KEY=VALUE entries and bare flags (no '=').
    # Parse field by field: splitting on both '=' and ';' and pairing even/odd
    # items shifts every key after the first flag.
    info = dict(field.partition('=')[::2] for field in row['INFO'].split(';'))
    qual = pd.Series({key: info[key] for key in ('DP', 'SOR', 'FS')}, dtype=object)

    # Replicate columns are recognised by a B1/B2/B3 suffix.
    tissue_cols = [x for x in row.index if re.search('B[1-3]$',x) is not None]
    if len(tissue_cols)==0:
        tissue_cols=['sample','sample']

    try:
        # Sample fields are ':'-separated; the code reads GT from position 0
        # and the "ref,alt" allele depths from position 1.
        sample_1_info = row[tissue_cols[0]].split(':')
        sample_2_info = row[tissue_cols[1]].split(':')

        qual['GT_1'] = sample_1_info[0]
        qual['GT_2'] = sample_2_info[0]
        ad_1 = sample_1_info[1].split(',')
        ad_2 = sample_2_info[1].split(',')
        qual['AD_1_ref'] = int(ad_1[0])
        qual['AD_1_alt'] = int(ad_1[1])
        qual['AD_2_ref'] = int(ad_2[0])
        qual['AD_2_alt'] = int(ad_2[1])
    except Exception:
        # Show the offending record before propagating the error.
        print(row)
        print(tissue_cols)
        raise
    return qual
#     format_arr = row['FORMAT'].split(':')
#     
#     format_series = pd.Series()
#     for idx,sample in enumerate(tissue_cols):
#         format_arr_sample = [x+"_"+str(idx)for x in format_arr]
#         format_series=pd.concat([format_series,pd.Series(dict(zip(format_arr_sample,row[sample].split(':'))))])
#     return pd.concat([qual,format_series])

def filter_vcf(vcf_df):
    """Keep heterozygous sites with adequate depth and allele support.

    A row is kept when all of the following hold:
      * total depth ``DP`` (cast to int) is at least 10;
      * both replicates have genotype ``0/1``;
      * in each replicate the smaller of the ref/alt allele counts is >= 2.

    Parameters
    ----------
    vcf_df : pandas.DataFrame
        Must contain ``DP``, ``GT_1``, ``GT_2``, ``AD_1_ref``, ``AD_1_alt``,
        ``AD_2_ref`` and ``AD_2_alt``.  The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with two extra columns, ``AD_1_min`` and
        ``AD_2_min``.
    """
    depth_ok = vcf_df.DP.map(int) >= 10
    het_in_both = (vcf_df.GT_1 == '0/1') & (vcf_df.GT_2 == '0/1')

    # Work on an explicit copy so the new columns are not written onto a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    kept = vcf_df[depth_ok & het_in_both].copy()
    kept['AD_1_min'] = kept[['AD_1_ref','AD_1_alt']].min(axis=1)
    kept['AD_2_min'] = kept[['AD_2_ref','AD_2_alt']].min(axis=1)

    # minimum reference or alternative allele count >= 2 in each replicate
    return kept[(kept.AD_1_min >= 2) & (kept.AD_2_min >= 2)]

def preprocess_and_filter(row,tissue,pool_replicates=False):
    """Filter one VCF row and return its allelic-imbalance binomial p-value.

    Combines the parsing and filtering steps in a single pass over a row.
    A row is rejected (return value ``-1``) when any of these hold:
      * INFO ``DP`` is below 10;
      * either replicate's genotype is not ``0/1``;
      * any of the four ref/alt allele counts is below 2;
      * the record cannot be parsed or tested (the row is printed).

    Parameters
    ----------
    row : pandas.Series
        One VCF record with ``INFO`` and the sample columns.  Sample columns
        are those whose name ends in ``B1``/``B2``/``B3`` (first two used);
        if none match, the column ``sample`` is used for both replicates.
    tissue : str
        Currently unused; kept for call compatibility.
    pool_replicates : bool, default False
        ``False`` keeps the historical test: successes = alt count of
        replicate 1, trials = floor of the mean depth of both replicates.
        ``True`` pools both replicates: successes = alt_1 + alt_2,
        trials = total reads.
        NOTE(review): the historical form mixes a single-replicate count with
        a two-replicate mean; confirm which statistic is intended.

    Returns
    -------
    float or int
        Two-sided binomial p-value against p=0.5, or ``-1`` for rejected rows
        (so callers can filter with ``binom_p >= 0``).
    """
    def _two_sided_binom_p(successes, trials):
        # SciPy >= 1.7 provides binomtest; the older binom_test was removed
        # in SciPy 1.12, so prefer the new API and fall back for old installs.
        if hasattr(stats, 'binomtest'):
            return stats.binomtest(successes, trials, p=0.5, alternative='two-sided').pvalue
        return stats.binom_test(x=successes, n=trials, p=0.5, alternative='two-sided')

    # INFO is a ';'-separated list of KEY=VALUE entries and bare flags.  Parse
    # per field so a flag (no '=') cannot shift the key/value pairing.
    qual_dict = dict(field.partition('=')[::2] for field in row['INFO'].split(';'))

    # Replicate columns are recognised by a B1/B2/B3 suffix.
    tissue_cols = [x for x in row.index if re.search('B[1-3]$',x) is not None]
    if len(tissue_cols)==0:
        tissue_cols=['sample','sample']

    try:
        # filter depth count (DP) >= 10
        if int(qual_dict['DP']) < 10:
            return -1

        # only heterozygous sites (GT 0/1) in both replicates
        sample_1_info = row[tissue_cols[0]].split(':')
        sample_2_info = row[tissue_cols[1]].split(':')
        if sample_1_info[0] != '0/1' or sample_2_info[0] != '0/1':
            return -1

        # allele depths: [AD_1_ref, AD_1_alt, AD_2_ref, AD_2_alt]
        ad_1 = sample_1_info[1].split(',')
        ad_2 = sample_2_info[1].split(',')
        AD_arr = [int(ad_1[0]), int(ad_1[1]), int(ad_2[0]), int(ad_2[1])]

        # minimum reference or alternative allele count >= 2; checked before
        # the (more expensive) binomial test since failing rows are discarded.
        if min(AD_arr) < 2:
            return -1

        if pool_replicates:
            successes, trials = AD_arr[1] + AD_arr[3], sum(AD_arr)
        else:
            # Floor division reproduces the integer truncation the legacy
            # scipy binom_test applied to a fractional n.
            successes, trials = AD_arr[1], sum(AD_arr) // 2
        return _two_sided_binom_p(successes, trials)
    except Exception as err:
        # Best-effort: skip unparseable/untestable rows, but say why.
        print('error, warning****, skipped..')
        print(repr(err))
        print(row)
        print(tissue_cols)
        return -1

In [104]:
# Output directory for the analysis tables written by this notebook.
save_dir = '../data/processed/vcf_files/analyses'
# exist_ok=True creates the directory only if needed, without a separate
# exists() check that could race with another process.
os.makedirs(save_dir, exist_ok=True)

In [105]:
# option1: library SNPs from a headerless, tab-separated BED-style file
lib_df1  = pd.read_csv('../data/external/GWAS/cancer_mpra_lib_snps_bedformat_121520.bed',sep='\t',header=None)
lib_df1.columns = ['chr','start','stop','rsid']
# rowname = "<chr>_<start>"; the VCF tables are merged against this key further down
lib_df1['rowname'] = lib_df1.chr + '_' + lib_df1['start'].map(str)
lib_df_bed_df1 = lib_df1[['chr','start','stop','rowname']]
# option2: combined SNP list (CSV; first column is the index), one position per SNP
lib_df  = pd.read_csv('../data/external/GWAS/snp_list_combined.csv',index_col=0)
lib_df.columns = ['rsid','chr','start','disease']
# single-base interval: stop is derived as start + 1
lib_df['stop'] = lib_df['start'] + 1 
lib_df['rowname'] = lib_df.chr + '_' + lib_df['start'].map(str)
lib_df_bed_df = lib_df[['chr','start','stop','rowname']]
print('lib_df_bed_df',lib_df_bed_df.shape,'lib_df_bed_df1', lib_df_bed_df1.shape)
# Stack both sources and drop rows identical in all four columns.
# NOTE(review): option1 is BED-formatted, so its `start` may be 0-based while
# option2's `start` may be 1-based — confirm both use the same convention,
# otherwise the same SNP gets two different rownames.
lib_df_bed_df = pd.concat([lib_df_bed_df,lib_df_bed_df1]).drop_duplicates()
print('lib_df_bed_df shape post merge',lib_df_bed_df.shape)
display(lib_df_bed_df[:5])
# common processing: BedTool version of the merged library, used for the peak intersections
lib_df_bed = pybedtools.BedTool.from_dataframe(lib_df_bed_df)
# print(lib_df.disease.unique())
# lib_df[:5]

lib_df_bed_df (35203, 4) lib_df_bed_df1 (5016, 4)
lib_df_bed_df shape post merge (38031, 4)


Unnamed: 0,chr,start,stop,rowname
0,chr1,44171211,44171212,chr1_44171211
1,chr1,8086527,8086528,chr1_8086527
2,chr1,108330356,108330357,chr1_108330356
3,chr5,44876507,44876508,chr5_44876507
4,chr5,50735307,50735308,chr5_50735307


In [106]:
# Per-SNP MPRA result table (tab-separated, first column is the index).
# `rowname` is built as "chr<Chr_37>_<Start_37>" to match the other tables.
mpra_info_df = (
    pd.read_csv('../data/external/GWAS/cancer_mpra_snps_results_120320.tsv', sep='\t', index_col=0)
    .assign(rowname=lambda d: 'chr' + d['Chr_37'].map(str) + '_' + d['Start_37'].map(str))
)
mpra_info_df[['Causal_SNP', 'rowname','disease']]

Unnamed: 0_level_0,Causal_SNP,rowname,disease
locus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
locus_1000,rs4866783,chr5_44876507,hmec
locus_1001,rs75282042,chr5_50735307,hmec
locus_1002,rs115010969,chr5_52535211,hmec
locus_1003,rs17343328,chr5_52615305,hmec
locus_1004,rs10940312,chr5_52667982,hmec
...,...,...,...
locus_996,rs16901937,chr5_44709141,hmec
locus_997,rs1866406,chr5_44809945,hmec
locus_999,rs10070037,chr5_44870237,hmec
locus_99,rs4915073,chr1_108325879,thy


In [107]:
# Significant MPRA hits with their coordinates (tab-separated, first column is the index).
# `rowname` is built as "chr<Chr_37>_<Start_37>".
mpra_res_df = (
    pd.read_csv('../data/external/GWAS/cancer_mpra_significant_hits_coords_1.27.21.tsv', sep='\t', index_col=0)
    .assign(rowname=lambda d: 'chr' + d['Chr_37'].map(str) + '_' + d['Start_37'].map(str))
)
# mpra_res_df = mpra_res_df.merge(lib_df[['Linked_SNP','rowname']],how='left',on='rowname' )
mpra_res_df.iloc[:5]

Unnamed: 0_level_0,Chr_37,Start_37,disease,hit_tissue,rowname
Causal_SNP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
rs111640187,5,82428995,airway,airway,chr5_82428995
rs34064842,6,27688625,"airway,hmec",airway,chr6_27688625
rs71546575,6,27805022,"airway,hmec",airway,chr6_27805022
rs35353359,6,28324378,"airway,hmec",airway,chr6_28324378
rs3095311,6,31051675,airway,airway,chr6_31051675


In [108]:
# Size of the significant-hit table.
print(mpra_res_df.shape)

# ATAC tissue name -> tissue label used in the MPRA table's `hit_tissue` column
mpra_tissue_mapper = {
    'Airway':'airway',
    'Astrocytes':'ast',
    'Colon':'colon',
    'Esophageal':'eso',
    'GM12878':'gm',
    'HMEC':'hmec',
    'GDSD6':'kc',
    'Melanocytes':'mc',
    'Ovarian':'ov',
    'Pancreas':'panc',
    'Prostate':'pros',
    'Renal':'renal',
    'Thyroid':'thy',
    'Uterine':'endo'
}
# Reverse lookup: MPRA label -> ATAC tissue name
inv_mpra_tissue_mapper = dict(zip(mpra_tissue_mapper.values(), mpra_tissue_mapper.keys()))
mpra_res_df['mapped_tissue'] = mpra_res_df['hit_tissue'].map(inv_mpra_tissue_mapper)
# Number of significant hits per mapped tissue
mpra_res_df['mapped_tissue'].value_counts()

(451, 5)


HMEC           147
Airway          68
Prostate        51
Colon           31
Ovarian         24
Melanocytes     23
Uterine         22
GDSD6           19
Pancreas        17
GM12878         16
Thyroid         16
Astrocytes      10
Esophageal       4
Renal            3
Name: mapped_tissue, dtype: int64

In [109]:
# mpra_rownames_sig = list(mpra_res_df.rowname.unique())
# len(mpra_rownames_sig)


In [110]:
# num_mpra_sig_hits = mpra_res_df.rowname.unique().shape[0]
# Number of distinct SNP positions (unique rownames) in lib_df; used as the
# denominator of the "library tested" fractions printed in the per-tissue loop.
# NOTE(review): this counts lib_df only, whereas the overlaps are computed
# against lib_df_bed_df (lib_df concatenated with lib_df1) — confirm which
# denominator is intended.
num_lib_tested = lib_df.rowname.unique().shape[0]
# print(num_mpra_sig_hits, num_lib_tested)

# 1. atac

In [111]:
# Directory holding one merged ATAC peak BED file per tissue.
atac_dir = '../data/interim/merged/atac/'
# Every BED file currently present in that directory.
atac_bed_files = glob.glob(os.path.join(atac_dir, '*bed'))
# Explicit list of the tissues analysed here, put in alphabetical order below.
atac_bed_files_sel = [
    '../data/interim/merged/atac/Renal_merged.bed',
    '../data/interim/merged/atac/Prostate_merged.bed',
    '../data/interim/merged/atac/Uterine_merged.bed',
    '../data/interim/merged/atac/GDSD6_merged.bed',
    '../data/interim/merged/atac/Pancreas_merged.bed',
    '../data/interim/merged/atac/Melanocytes_merged.bed',
    '../data/interim/merged/atac/Ovarian_merged.bed',
    '../data/interim/merged/atac/Bladder_merged.bed',
    '../data/interim/merged/atac/GDSD3_merged.bed',
    '../data/interim/merged/atac/Esophageal_merged.bed',
    '../data/interim/merged/atac/GM12878_merged.bed',
    '../data/interim/merged/atac/Astrocytes_merged.bed',
    '../data/interim/merged/atac/Colon_merged.bed',
    '../data/interim/merged/atac/GDSD0_merged.bed',
    '../data/interim/merged/atac/Thyroid_merged.bed',
    '../data/interim/merged/atac/Airway_merged.bed',
    '../data/interim/merged/atac/HMEC_merged.bed',
]
atac_bed_files_sel.sort()
atac_bed_files_sel

['../data/interim/merged/atac/Airway_merged.bed',
 '../data/interim/merged/atac/Astrocytes_merged.bed',
 '../data/interim/merged/atac/Bladder_merged.bed',
 '../data/interim/merged/atac/Colon_merged.bed',
 '../data/interim/merged/atac/Esophageal_merged.bed',
 '../data/interim/merged/atac/GDSD0_merged.bed',
 '../data/interim/merged/atac/GDSD3_merged.bed',
 '../data/interim/merged/atac/GDSD6_merged.bed',
 '../data/interim/merged/atac/GM12878_merged.bed',
 '../data/interim/merged/atac/HMEC_merged.bed',
 '../data/interim/merged/atac/Melanocytes_merged.bed',
 '../data/interim/merged/atac/Ovarian_merged.bed',
 '../data/interim/merged/atac/Pancreas_merged.bed',
 '../data/interim/merged/atac/Prostate_merged.bed',
 '../data/interim/merged/atac/Renal_merged.bed',
 '../data/interim/merged/atac/Thyroid_merged.bed',
 '../data/interim/merged/atac/Uterine_merged.bed']

In [112]:
# #testing
# atac_file = '/Users/mguo123/Documents/pan_omics_psych/data/interim/merged/atac/SL_D0_merged.bed'
# tissue_mapper = {
#     'H9_D2':'H9D2',
#     'SLC_D0':'SLC',
#     'SL_D0':'SL',
#     'Astrocytes':'AST1',
#     'H9_D10':'H9D10',
#     'H9_D0':'H9D0'
# }
# tissue = os.path.basename(atac_file).split('_merged')[0]
# atac_bed = pybedtools.BedTool(atac_file)
# lib_df_bed_atac = lib_df_bed.intersect(atac_bed).to_dataframe()
# lib_df_bed_atac  =lib_df_bed_atac[['name']].drop_duplicates().reset_index(drop=True)
# lib_df_bed_atac['tissue'] = tissue_mapper[tissue]
# lib_df_bed_atac.name[lib_df_bed_atac.name.str.startswith('chr3_503')]

In [113]:
# lib_atac_df.rowname[lib_atac_df.rowname.str.startswith('chr3_503')]

In [114]:
# Aside: build a table listing, for every library SNP that falls in an ATAC
# peak, which tissues' peaks it overlaps.

# Accumulates one (name, tissue) row per SNP/tissue overlap.
lib_atac_df = pd.DataFrame()
for atac_file in sorted(atac_bed_files_sel):
    # Tissue label = file name up to "_merged"
    tissue = os.path.basename(atac_file).split('_merged')[0]
    print(tissue)
#     if tissue not in tissue_mapper.keys():
#         print(tissue, 'not considered')
#         continue
    atac_bed = pybedtools.BedTool(atac_file)
    # Library SNP intervals overlapping this tissue's peaks; the `name` column
    # carries the 4th BED field, i.e. the library rowname.
    lib_df_bed_atac = lib_df_bed.intersect(atac_bed).to_dataframe()
    # One row per SNP even if it overlaps several peaks
    lib_df_bed_atac  =lib_df_bed_atac[['name']].drop_duplicates().reset_index(drop=True)
    lib_df_bed_atac['tissue'] = tissue#tissue_mapper[tissue]
    lib_atac_df = pd.concat([lib_atac_df, lib_df_bed_atac])
    # Running (cumulative) shape across the tissues processed so far
    print('lib_atac_df', lib_atac_df.shape)
    
# Collapse to one row per SNP, joining its tissues with '|'
lib_atac_df = lib_atac_df.groupby('name').agg({'tissue':'|'.join}).reset_index()
lib_atac_df.columns = ['rowname','atac_tissues']
# Every SNP in this table overlapped at least one peak
lib_atac_df['bool_in_atac_pk'] = True
# The DataFrame index is written too (to_csv default), as an unnamed first column
lib_atac_df.to_csv(os.path.join(save_dir,'lib_atac_annotation.csv'))

Airway
lib_atac_df (3067, 2)
Astrocytes
lib_atac_df (4414, 2)
Bladder
lib_atac_df (7692, 2)
Colon
lib_atac_df (11160, 2)
Esophageal
lib_atac_df (14751, 2)
GDSD0
lib_atac_df (17860, 2)
GDSD3
lib_atac_df (21137, 2)
GDSD6
lib_atac_df (24326, 2)
GM12878
lib_atac_df (27188, 2)
HMEC
lib_atac_df (30094, 2)
Melanocytes
lib_atac_df (31847, 2)
Ovarian
lib_atac_df (33669, 2)
Pancreas
lib_atac_df (37299, 2)
Prostate
lib_atac_df (39143, 2)
Renal
lib_atac_df (42699, 2)
Thyroid
lib_atac_df (46083, 2)
Uterine
lib_atac_df (50089, 2)


In [115]:
# Preview the per-SNP ATAC annotation, then show its shape next to the full library table's.
display(lib_atac_df.iloc[:5])
(lib_atac_df.shape, lib_df_bed_df.shape)

Unnamed: 0,rowname,atac_tissues,bool_in_atac_pk
0,chr10_101860630,Uterine,True
1,chr10_101860631,Uterine,True
2,chr10_101946033,Airway|Astrocytes|Bladder|Colon|Esophageal|GDS...,True
3,chr10_101989509,Airway|Astrocytes|Bladder|Colon|Esophageal|GDS...,True
4,chr10_101989510,Airway|Astrocytes|Bladder|Colon|Esophageal|GDS...,True


((7125, 3), (38031, 4))

In [116]:
# All VCF files in the ATAC directory, in alphabetical order.
vcf_files = glob.glob('../data/processed/vcf_files/atac/*vcf')
vcf_files.sort()
vcf_files

['../data/processed/vcf_files/atac/Airway_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Astrocytes_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Bladder_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Colon_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Esophageal_postvqsr.vcf',
 '../data/processed/vcf_files/atac/GDSD0_postvqsr.vcf',
 '../data/processed/vcf_files/atac/GDSD3_postvqsr.vcf',
 '../data/processed/vcf_files/atac/GDSD6_postvqsr.vcf',
 '../data/processed/vcf_files/atac/GM12878_postvqsr.vcf',
 '../data/processed/vcf_files/atac/HMEC_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Melanocytes_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Ovarian_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Pancreas_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Prostate_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Renal_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Thyroid_postvqsr.vcf',
 '../data/processed/vcf_files/atac/Uterine_postvqsr.vcf']

In [117]:
# Tissue label for each VCF = file name up to "_post"; print each as it is collected.
tissues = []
for vcf_file in vcf_files:
    tissue = os.path.basename(vcf_file).partition('_post')[0]
    print(tissue)
    tissues += [tissue]

Airway
Astrocytes
Bladder
Colon
Esophageal
GDSD0
GDSD3
GDSD6
GM12878
HMEC
Melanocytes
Ovarian
Pancreas
Prostate
Renal
Thyroid
Uterine


In [118]:
# Identity mapping from VCF tissue label to ATAC peak-file tissue label.
atac_tissue_mapper = {name: name for name in tissues}
atac_tissue_mapper

{'Airway': 'Airway',
 'Astrocytes': 'Astrocytes',
 'Bladder': 'Bladder',
 'Colon': 'Colon',
 'Esophageal': 'Esophageal',
 'GDSD0': 'GDSD0',
 'GDSD3': 'GDSD3',
 'GDSD6': 'GDSD6',
 'GM12878': 'GM12878',
 'HMEC': 'HMEC',
 'Melanocytes': 'Melanocytes',
 'Ovarian': 'Ovarian',
 'Pancreas': 'Pancreas',
 'Prostate': 'Prostate',
 'Renal': 'Renal',
 'Thyroid': 'Thyroid',
 'Uterine': 'Uterine'}

In [119]:
### FROM TESTING
# vcf_df[:5]

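Hypergeometric test for the overlap between two sets (upper tail, P(X >= overlap)):
- R: `phyper(overlap-1, list1, PopSize-list1, list2, lower.tail = FALSE, log.p = FALSE)`
- Python equivalent: `scipy.stats.hypergeom.sf(overlap-1, PopSize, list1, list2)`, i.e. `1 - scipy.stats.hypergeom.cdf(overlap-1, PopSize, list1, list2)`
- Example: `phyper(88, 598, 23000-598, 5500, lower.tail = FALSE, log.p = FALSE)`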

In [120]:
def hypergeometric_test(x, M, n, N):
    """
    Upper-tail hypergeometric p-value: the probability of seeing at least
    ``x`` Type I objects when ``N`` objects are drawn without replacement.

    - M is the total number of objects in the population
    - n is the number of Type I objects in the population
    - N is the number of objects drawn
    - x is the observed number of Type I objects among the N drawn

    Returns P(X >= x), computed as the survival function evaluated at x - 1.
    Raises AssertionError unless n <= M, x <= n and N <= M.

    - http://en.wikipedia.org/wiki/Hypergeometric_distribution
    - https://www.biostars.org/p/66729/
    - http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.hypergeom.html
    - http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.hypergeometric.html
    - http://stackoverflow.com/questions/6594840/what-are-equivalents-to-rs-phyper-function-in-python
    """
    # Sanity checks on the counts, in the order (n vs M), (x vs n), (N vs M).
    for smaller, larger in ((n, M), (x, n), (N, M)):
        assert smaller <= larger

    # sf(k) = P(X > k), so sf(x - 1) = P(X >= x); sf can be more accurate than 1 - cdf.
    upper_tail = stats.hypergeom.sf(x - 1, M, n, N)
    return upper_tail


Hypergeometric test 1
- check the overlap between the set of all asATAC SNP locations and the set of all library SNPs, over a background of the total number of SNPs(?)
- check the overlap between the set of asATAC SNPs (tissue-specific) in the library and the MPRA hits in the library (tissue-specific)
- check the overlap between the set of asATAC SNPs (over all tissues) in the library and the MPRA hits in the library (all tissues)


In [121]:
%%time
asatac_lib_df = pd.DataFrame()
vcf_df_all = pd.DataFrame()
vcf_bed_df_atac_all = pd.DataFrame()
vcf_df_lib_all = pd.DataFrame()
vcf_df_lib_atac_all = pd.DataFrame()
for vcf_file in sorted(vcf_files):
    tissue = os.path.basename(vcf_file).split('_post')[0]
    print('************')
    print(tissue)
   # read vcf
    vcf_df = read_vcf(vcf_file)
    num_vcf_hits = vcf_df.shape[0]
    print('num_vcf_hits PRE FILTER', num_vcf_hits)  
    
    # filter vcf for those that meet the heterogenous requiremnts
    vcf_df['binom_p'] = vcf_df.apply(lambda row: preprocess_and_filter(row,tissue), axis=1)
    vcf_df = vcf_df[vcf_df.binom_p>=0]
    vcf_df['tissue'] = tissue
    num_vcf_hits_postfilt = vcf_df.shape[0]
    print('num_vcf_hits POST FILTER', num_vcf_hits_postfilt)      
    
    
    # binomial test
    vcf_df['pass_filt'], vcf_df['p_adjust'] = fdrcorrection(vcf_df.binom_p, alpha=0.05)
    vcf_df = vcf_df[vcf_df.pass_filt]
    num_vcf_hits_passfdr = vcf_df.shape[0]
    print('num_vcf_hits POST FDR', num_vcf_hits_passfdr)      

    vcf_bed_df = vcf_df[['CHROM','POS','rowname']]
    vcf_bed_df['stop'] = vcf_df.POS.map(int) + 1
    vcf_bed_df = vcf_bed_df[['CHROM','POS','stop','rowname']]
    vcf_bed_df.columns = ['chr','start','stop','name']
    vcf_df_all = pd.concat([vcf_df_all, vcf_df])

    vcf_df_lib = vcf_df.merge(lib_df_bed_df, how='inner',on='rowname')
    vcf_df_lib_all = pd.concat([vcf_df_lib_all, vcf_df_lib])
    lib_hits = vcf_df_lib.rowname.unique()
    num_lib_hits = lib_hits.shape[0]
    print('background library hits that are asATAC', num_lib_tested,num_lib_hits, num_lib_hits/ num_lib_tested)
    
#     if tissue in mpra_tissue_mapper:
#         mpra_res_df_tissue = mpra_res_df[mpra_res_df.tissue==mpra_tissue_mapper[tissue]]
#         mpra_hits_tissue = mpra_res_df_tissue.rowname.unique()
#         num_mpra_hits_tissue= mpra_hits_tissue.shape[0]
# #         mpra_sig_df_lib = vcf_df.merge(mpra_res_df_tissue, how='inner',on='rowname')

#         num_mpra_asatac_hits = len(set(lib_hits).intersection(set(mpra_hits_tissue)))
#         asatac_lib_df_tissue=pd.DataFrame()
#         asatac_lib_df_tissue['rowname'] = sorted(set(lib_hits).intersection(set(mpra_hits_tissue)))
#         asatac_lib_df_tissue['tissue'] =  mpra_tissue_mapper[tissue]
#         asatac_lib_df = pd.concat([asatac_lib_df, asatac_lib_df_tissue])
#         # do a hypergeometric test between being asATAC and being an mpra hit over a background of being in the mpra dataset
    
#         print('mpra sig library hits in tissue ', num_mpra_hits_tissue)
#         print('overlap:', num_mpra_asatac_hits)
# #         oddsratio, pvalue = stats.fisher_exact([[num_lib_hits, num_lib_tested- num_lib_hits],
# #                                                 [num_mpra_hits, num_mpra_sig_hits - num_mpra_hits]])
# #         print('fisher for mpra and as annotation association', pvalue, oddsratio)
#         phyper =  hypergeometric_test(num_mpra_asatac_hits, num_lib_tested, num_mpra_hits_tissue, num_lib_hits)
#         #stats.hypergeom.cdf(num_mpra_asatac_hits, num_lib_tested, num_lib_hits, num_mpra_hits)
#         print('hypergeometric test between being asATAC and MPRA hit over backgroun of being in mpra_dataset', phyper )
#         phyper =  hypergeometric_test(num_mpra_asatac_hits, num_vcf_hits_postfilt, num_mpra_hits_tissue, num_lib_hits)
#         print('hypergeometric test between being asATAC and MPRA hit over backgroun of being in an asatac', phyper )
    
    ## filter through atac
    atac_tissue = atac_tissue_mapper[tissue]
    atac_bed_file = os.path.join(atac_dir, atac_tissue+'_merged.bed')
    if not os.path.exists(atac_bed_file):
        print('atac file',atac_bed_file, 'does not exist')
        continue
    atac_bed = pybedtools.BedTool(atac_bed_file)

    vcf_bed_df_atac = pybedtools.BedTool.from_dataframe(vcf_bed_df).intersect(atac_bed).to_dataframe()
    num_atac_vcf = vcf_bed_df_atac.name.unique().shape[0]
    vcf_bed_df_atac['tissue'] = tissue
    vcf_bed_df_atac_all = pd.concat([vcf_bed_df_atac_all, vcf_bed_df_atac])
    print('atac filter', num_atac_vcf, 'out of', vcf_bed_df.shape[0], 'allele-specific atac', num_atac_vcf/vcf_bed_df.shape[0])
    lib_df_bed_atac = lib_df_bed.intersect(atac_bed).to_dataframe()
    lib_atac  = lib_df_bed_atac.name.unique()
    num_lib_atac = lib_atac.shape[0]
    print('atac filter', num_lib_atac, 'out of', num_lib_tested, 'mpra alleles tested', num_lib_atac/num_lib_tested)
#     mpra_atac = lib_df_bed_atac[lib_df_bed_atac.name.isin(mpra_rownames_sig)].name.unique()
#     num_mpra_atac = mpra_atac.shape[0]
#     print('atac filter', num_mpra_atac, 'out of', num_mpra_sig_hits, 'mpra alleles sig',num_mpra_atac/num_mpra_sig_hits)
    
    vcf_df_lib_atac = vcf_bed_df_atac.merge(lib_df_bed_df, how='inner',left_on='name',right_on='rowname')
    num_lib_hits_atac = vcf_df_lib_atac.rowname.unique().shape[0]
    vcf_df_lib_atac_all = pd.concat([vcf_df_lib_atac_all, vcf_df_lib_atac])
#     mpra_sig_df_lib_atac = vcf_bed_df_atac.merge(mpra_res_df, how='inner',left_on='name',right_on='rowname')
#     num_mpra_hits_atac = mpra_sig_df_lib_atac.rowname.unique().shape[0]
    
    print('background library hits-atac filt', num_lib_atac,num_lib_hits_atac, num_lib_hits_atac/ num_lib_atac)
#     print('mpra sig library hits-atac filt', num_mpra_atac,num_mpra_hits_atac, num_mpra_hits_atac/ num_mpra_atac)
#     if tissue in mpra_tissue_mapper:
#         mpra_hits_tissue_atac = set(mpra_hits_tissue).intersection(set(lib_atac))
#         num_mpra_hits_tissue_atac= mpra_hits_tissue.shape[0]
# #         mpra_sig_df_lib = vcf_df.merge(mpra_res_df_tissue, how='inner',on='rowname')

#         num_mpra_asatac_filtatac_hits = len(set(lib_hits).intersection(set(mpra_hits_tissue_atac)))
#         # do a hypergeometric test between being asATAC and being an mpra hit over a background of being in the mpra dataset

#         print('mpra sig library hits in tissue atac filt ', num_mpra_hits_tissue_atac)
#         print('overlap atac:', num_mpra_asatac_filtatac_hits)
# #         oddsratio, pvalue = stats.fisher_exact([[num_lib_hits, num_lib_tested- num_lib_hits],
# #                                                 [num_mpra_hits, num_mpra_sig_hits - num_mpra_hits]])
# #         print('fisher for mpra and as annotation association', pvalue, oddsratio)
#         phyper =  hypergeometric_test(num_mpra_asatac_filtatac_hits, num_atac_vcf, num_mpra_hits_tissue_atac, num_lib_atac)
#         #stats.hypergeom.cdf(num_mpra_asatac_hits, num_lib_tested, num_lib_hits, num_mpra_hits)
#         print('hypergeometric test between being asATAC and MPRA hit over backgroun of being asatac', phyper )


        

************
Airway
read ../data/processed/vcf_files/atac/Airway_postvqsr.vcf
shape (75646, 11)
num_vcf_hits PRE FILTER 75646
num_vcf_hits POST FILTER 31211
num_vcf_hits POST FDR 1772
background library hits that are asATAC 33066 34 0.001028246537228573


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1605 out of 1772 allele-specific atac 0.9057562076749436
atac filter 3067 out of 33066 mpra alleles tested 0.09275388616705982
background library hits-atac filt 3067 32 0.010433648516465601
************
Astrocytes
read ../data/processed/vcf_files/atac/Astrocytes_postvqsr.vcf
shape (59122, 11)
num_vcf_hits PRE FILTER 59122
num_vcf_hits POST FILTER 21337
num_vcf_hits POST FDR 2017
background library hits that are asATAC 33066 51 0.0015423698058428597
atac filter 587 out of 2017 allele-specific atac 0.29102627664848785


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1347 out of 33066 mpra alleles tested 0.04073670840137906
background library hits-atac filt 1347 14 0.010393466963622866
************
Bladder
read ../data/processed/vcf_files/atac/Bladder_postvqsr.vcf
shape (76857, 11)
num_vcf_hits PRE FILTER 76857
num_vcf_hits POST FILTER 26112
num_vcf_hits POST FDR 1214
background library hits that are asATAC 33066 24 0.0007258210851025222


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1118 out of 1214 allele-specific atac 0.9209225700164745
atac filter 3278 out of 33066 mpra alleles tested 0.09913506320691949
background library hits-atac filt 3278 22 0.006711409395973154
************
Colon
read ../data/processed/vcf_files/atac/Colon_postvqsr.vcf
shape (81035, 11)
num_vcf_hits PRE FILTER 81035
num_vcf_hits POST FILTER 26331
num_vcf_hits POST FDR 7390
background library hits that are asATAC 33066 192 0.005806568680820178


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 6632 out of 7390 allele-specific atac 0.8974289580514209
atac filter 3468 out of 33066 mpra alleles tested 0.10488114679731446
background library hits-atac filt 3468 177 0.05103806228373702
************
Esophageal
read ../data/processed/vcf_files/atac/Esophageal_postvqsr.vcf
shape (80599, 11)
num_vcf_hits PRE FILTER 80599
num_vcf_hits POST FILTER 26104
num_vcf_hits POST FDR 8674
background library hits that are asATAC 33066 216 0.0065323897659227


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 7884 out of 8674 allele-specific atac 0.9089232188148489
atac filter 3591 out of 33066 mpra alleles tested 0.10860097985846488
background library hits-atac filt 3591 199 0.05541631857421331
************
GDSD0
read ../data/processed/vcf_files/atac/GDSD0_postvqsr.vcf
shape (66677, 11)
num_vcf_hits PRE FILTER 66677
num_vcf_hits POST FILTER 24220
num_vcf_hits POST FDR 2215
background library hits that are asATAC 33066 41 0.0012399443537168089


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1578 out of 2215 allele-specific atac 0.7124153498871332
atac filter 3109 out of 33066 mpra alleles tested 0.09402407306598923
background library hits-atac filt 3109 30 0.00964940495336121
************
GDSD3
read ../data/processed/vcf_files/atac/GDSD3_postvqsr.vcf
shape (72690, 11)
num_vcf_hits PRE FILTER 72690
num_vcf_hits POST FILTER 26235
num_vcf_hits POST FDR 2749
background library hits that are asATAC 33066 63 0.001905280348394121


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 2062 out of 2749 allele-specific atac 0.7500909421607858
atac filter 3277 out of 33066 mpra alleles tested 0.0991048206617069
background library hits-atac filt 3277 38 0.011595971925541654
************
GDSD6
read ../data/processed/vcf_files/atac/GDSD6_postvqsr.vcf
shape (71290, 11)
num_vcf_hits PRE FILTER 71290
num_vcf_hits POST FILTER 25634
num_vcf_hits POST FDR 2497
background library hits that are asATAC 33066 50 0.0015121272606302546


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1895 out of 2497 allele-specific atac 0.7589106928313977
atac filter 3189 out of 33066 mpra alleles tested 0.09644347668299764
background library hits-atac filt 3189 36 0.011288805268109126
************
GM12878
read ../data/processed/vcf_files/atac/GM12878_postvqsr.vcf
shape (63362, 11)
num_vcf_hits PRE FILTER 63362
num_vcf_hits POST FILTER 23755
num_vcf_hits POST FDR 1407
background library hits that are asATAC 33066 36 0.0010887316276537834


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1310 out of 1407 allele-specific atac 0.9310589907604833
atac filter 2862 out of 33066 mpra alleles tested 0.08655416439847577
background library hits-atac filt 2862 34 0.011879804332634521
************
HMEC
read ../data/processed/vcf_files/atac/HMEC_postvqsr.vcf
shape (59800, 11)
num_vcf_hits PRE FILTER 59800
num_vcf_hits POST FILTER 22109
num_vcf_hits POST FDR 2272
background library hits that are asATAC 33066 52 0.0015726123510554648


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


atac filter 1779 out of 2272 allele-specific atac 0.7830105633802817
atac filter 2906 out of 33066 mpra alleles tested 0.0878848363878304
background library hits-atac filt 2906 43 0.014796971782518927
************
Melanocytes
read ../data/processed/vcf_files/atac/Melanocytes_postvqsr.vcf
shape (66427, 11)
num_vcf_hits PRE FILTER 66427
num_vcf_hits POST FILTER 36822
num_vcf_hits POST FDR 16189


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 563 0.017026552954696666
atac filter 9662 out of 16189 allele-specific atac 0.5968250046327753
atac filter 1753 out of 33066 mpra alleles tested 0.053015181757696726
background library hits-atac filt 1753 354 0.20193953223046207
************
Ovarian
read ../data/processed/vcf_files/atac/Ovarian_postvqsr.vcf
shape (55824, 11)
num_vcf_hits PRE FILTER 55824
num_vcf_hits POST FILTER 17853
num_vcf_hits POST FDR 3233


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 99 0.0029940119760479044
atac filter 1580 out of 3233 allele-specific atac 0.4887101763068358
atac filter 1822 out of 33066 mpra alleles tested 0.05510191737736648
background library hits-atac filt 1822 53 0.029088913282107574
************
Pancreas
read ../data/processed/vcf_files/atac/Pancreas_postvqsr.vcf
shape (82065, 11)
num_vcf_hits PRE FILTER 82065
num_vcf_hits POST FILTER 27878
num_vcf_hits POST FDR 7837


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 211 0.006381177039859675
atac filter 7120 out of 7837 allele-specific atac 0.9085109097869083
atac filter 3630 out of 33066 mpra alleles tested 0.10978043912175649
background library hits-atac filt 3630 195 0.05371900826446281
************
Prostate
read ../data/processed/vcf_files/atac/Prostate_postvqsr.vcf
shape (41596, 11)
num_vcf_hits PRE FILTER 41596
num_vcf_hits POST FILTER 11277
num_vcf_hits POST FDR 2092


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 59 0.0017843101675437004
atac filter 1344 out of 2092 allele-specific atac 0.6424474187380497
atac filter 1844 out of 33066 mpra alleles tested 0.055767253372043794
background library hits-atac filt 1844 52 0.028199566160520606
************
Renal
read ../data/processed/vcf_files/atac/Renal_postvqsr.vcf
shape (79237, 11)
num_vcf_hits PRE FILTER 79237
num_vcf_hits POST FILTER 26721
num_vcf_hits POST FDR 7856


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 208 0.006290449404221859
atac filter 7170 out of 7856 allele-specific atac 0.9126782077393075
atac filter 3556 out of 33066 mpra alleles tested 0.10754249077602371
background library hits-atac filt 3556 193 0.05427446569178852
************
Thyroid
read ../data/processed/vcf_files/atac/Thyroid_postvqsr.vcf
shape (79958, 11)
num_vcf_hits PRE FILTER 79958
num_vcf_hits POST FILTER 26163
num_vcf_hits POST FDR 7366


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 197 0.005957781406883203
atac filter 6550 out of 7366 allele-specific atac 0.8892207439587293
atac filter 3384 out of 33066 mpra alleles tested 0.10234077299945564
background library hits-atac filt 3384 176 0.05200945626477541
************
Uterine
read ../data/processed/vcf_files/atac/Uterine_postvqsr.vcf
shape (96367, 11)
num_vcf_hits PRE FILTER 96367
num_vcf_hits POST FILTER 44282
num_vcf_hits POST FDR 4929


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


background library hits that are asATAC 33066 110 0.00332667997338656
atac filter 4309 out of 4929 allele-specific atac 0.8742138364779874
atac filter 4006 out of 33066 mpra alleles tested 0.12115163612169601
background library hits-atac filt 4006 101 0.025212181727408887
CPU times: user 5min 32s, sys: 3.48 s, total: 5min 35s
Wall time: 5min 40s


We can see from the percentages that the MPRA-significant hits are enriched relative to the library background.
- The choice of library background could be changed to all non-cancer GWAS hits (open question; will do next).



That means that the MPRA signal is partly affected by ... (TODO: this sentence was left unfinished in the original notes)

In [124]:
1332/33066

0.040283070223189985

In [125]:
vcf_df_all.columns

Index(['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
       'sample', 'rowname', 'binom_p', 'tissue', 'pass_filt', 'p_adjust'],
      dtype='object')

In [126]:
# Persist the ATAC allele-specific tables to save_dir as CSV.
# The shape is printed only for the first two tables.
for _frame, _csv_name, _show_shape in (
    (vcf_df_all, 'vcf_df_all_atac.csv', True),
    (vcf_bed_df_atac_all, 'vcf_bed_df_atac_all.csv', True),
    (vcf_df_lib_all, 'vcf_df_lib_all_atac.csv', False),
    (vcf_df_lib_atac_all, 'vcf_df_lib_atac_all.csv', False),
):
    if _show_shape:
        print(_frame.shape)
    _frame.to_csv(os.path.join(save_dir, _csv_name))


(81709, 15)
(248925, 5)


In [127]:
vcf_df_lib_atac_all

Unnamed: 0,chrom,start_x,end,name,tissue,chr,start_y,stop,rowname
0,chr1,38461821,38461822,chr1_38461821,Airway,chr1,38461821,38461822,chr1_38461821
1,chr1,154839799,154839800,chr1_154839799,Airway,chr1,154839799,154839800,chr1_154839799
2,chr1,209979635,209979636,chr1_209979635,Airway,chr1,209979635,209979636,chr1_209979635
3,chr2,29119930,29119931,chr2_29119930,Airway,chr2,29119930,29119931,chr2_29119930
4,chr2,110316285,110316286,chr2_110316285,Airway,chr2,110316285,110316286,chr2_110316285
...,...,...,...,...,...,...,...,...,...
346,chr19,49217305,49217306,chr19_49217305,Uterine,chr19,49217305,49217306,chr19_49217305
347,chr19,49217305,49217306,chr19_49217305,Uterine,chr19,49217305,49217306,chr19_49217305
348,chr19,49217305,49217306,chr19_49217305,Uterine,chr19,49217305,49217306,chr19_49217305
349,chr19,49217305,49217306,chr19_49217305,Uterine,chr19,49217305,49217306,chr19_49217305


In [128]:
# One row per library SNP that is allele-specific in ATAC (post ATAC-peak filter),
# with the tissues in which it was called joined by '|' (alphabetical order).
#
# Only the (rowname, tissue) pair matters for the annotation, so de-duplicate on
# exactly those two columns. The previous version left-merged against
# vcf_df_all[['rowname', 'tissue']] (which adds no columns and only repeats rows)
# and then dropped fully identical rows; rows for the same SNP/tissue that differed
# in a coordinate column would survive and repeat the tissue in the joined string.
asatac_lib_df = (
    vcf_df_lib_atac_all[['rowname', 'tissue']]
    .drop_duplicates()
    .sort_values('tissue')
    .groupby('rowname')
    .agg({'tissue': '|'.join})
    .reset_index()
)
asatac_lib_df.columns = ['rowname', 'tissue_asatac']
asatac_lib_df['bool_is_asatac'] = True
# attach the library coordinates (chr / start / stop) for each SNP
asatac_lib_df = asatac_lib_df.merge(lib_df_bed_df, how='left', on='rowname')
asatac_lib_df.to_csv(os.path.join(save_dir, 'lib_asatac_annotation_postfilter.csv'))
print(asatac_lib_df.shape)
asatac_lib_df[:5]

(898, 6)


Unnamed: 0,rowname,tissue_asatac,bool_is_asatac,chr,start,stop
0,chr10_104239100,Renal,True,chr10,104239100,104239101
1,chr10_104262628,Colon|Esophageal|Melanocytes|Pancreas|Prostate...,True,chr10,104262628,104262629
2,chr10_104263675,Colon|Esophageal|Melanocytes|Pancreas|Renal|Th...,True,chr10,104263675,104263676
3,chr10_104269217,Renal,True,chr10,104269217,104269218
4,chr10_104269301,Colon|Renal,True,chr10,104269301,104269302


In [129]:
# Number of distinct SNP names in vcf_bed_df_atac_all that also occur in
# mpra_res_df.rowname; the count is shown as the 1-tuple shape of the unique array.
vcf_bed_df_atac_all.name[vcf_bed_df_atac_all.name.isin(mpra_res_df.rowname.unique())].unique().shape

(29,)

# 1B. ATAC hypergeometric test

In [130]:
# Reload the tables written by the ATAC section so this section can be run from a
# fresh kernel without repeating the slow per-tissue VCF loop.
vcf_df_all = pd.read_csv(os.path.join(save_dir, 'vcf_df_all_atac.csv'), index_col=0)
vcf_df_lib_all = pd.read_csv(os.path.join(save_dir, 'vcf_df_lib_all_atac.csv'), index_col=0)
vcf_bed_df_atac_all = pd.read_csv(os.path.join(save_dir, 'vcf_bed_df_atac_all.csv'), index_col=0)
# vcf_df_lib_atac_all is saved alongside the tables above and is read by the overlap
# cells of this section; it was previously not reloaded, so those cells only worked
# when the variable was still alive in the kernel.
vcf_df_lib_atac_all = pd.read_csv(os.path.join(save_dir, 'vcf_df_lib_atac_all.csv'), index_col=0)
asatac_lib_df = pd.read_csv(os.path.join(save_dir, 'lib_asatac_annotation_postfilter.csv'), index_col=0)


In [160]:
# Load the MPRA SNP library and keep only the linked (non-lead) SNPs, keyed by
# '<seqnames>_<start>' so they can be matched against the VCF-derived rownames.
# NOTE(review): absolute, user-specific path -- this cell only runs on the original
# author's machine; consider pointing it at the project's relative data directory.
lib_df = pd.read_csv('/Users/mguo123/Google Drive/1_khavari/omics_project-LD/pan_omics/data/external/GWAS/cancer_mpra_snps_withlead_annon.csv')
# .copy() so the column assignment below writes to an independent frame rather than
# a slice of the full table (avoids SettingWithCopyWarning)
lib_df = lib_df[lib_df.snp_type=='linked_snp'].copy()
lib_df['rowname'] = lib_df.seqnames + '_'+lib_df.start.map(str)
lib_df#.shape

Unnamed: 0.1,Unnamed: 0,disease,hit,snp_type,rsid,seqnames,start,end,width,strand,REF,ALT,chr,seq_id,rowname
0,1,hmec,,linked_snp,rs4866783,chr5,44876507,44876507,1,*,C,T,5,5:44876507,chr5_44876507
3,4,hmec,,linked_snp,rs75282042,chr5,50735307,50735307,1,*,T,C,5,5:50735307,chr5_50735307
6,7,hmec,,linked_snp,rs115010969,chr5,52535211,52535211,1,*,T,A,5,5:52535211,chr5_52535211
9,10,hmec,,linked_snp,rs17343328,chr5,52615305,52615305,1,*,G,A,5,5:52615305,chr5_52615305
10,11,hmec,,linked_snp,rs10940312,chr5,52667982,52667982,1,*,A,T,5,5:52667982,chr5_52667982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7663,7664,hmec,,linked_snp,rs16901937,chr5,44709141,44709141,1,*,A,G,5,5:44709141,chr5_44709141
7664,7665,hmec,,linked_snp,rs1866406,chr5,44809945,44809945,1,*,G,C,5,5:44809945,chr5_44809945
7667,7668,hmec,,linked_snp,rs10070037,chr5,44870237,44870237,1,*,A,T,5,5:44870237,chr5_44870237
7668,7669,thy,,linked_snp,rs4915073,chr1,108325879,108325879,1,*,A,T,1,1:108325879,chr1_108325879


In [161]:
mpra_res_df

Unnamed: 0_level_0,Chr_37,Start_37,disease,hit_tissue,rowname,mapped_tissue
Causal_SNP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
rs111640187,5,82428995,airway,airway,chr5_82428995,Airway
rs34064842,6,27688625,"airway,hmec",airway,chr6_27688625,Airway
rs71546575,6,27805022,"airway,hmec",airway,chr6_27805022,Airway
rs35353359,6,28324378,"airway,hmec",airway,chr6_28324378,Airway
rs3095311,6,31051675,airway,airway,chr6_31051675,Airway
...,...,...,...,...,...,...
rs34992253,14,36647163,thy,thy,chr14_36647163,Thyroid
rs17293632,15,67442596,thy,thy,chr15_67442596,Thyroid
rs13406698,2,218283085,"hmec,thy",thy,chr2_218283085,Thyroid
rs16857609,2,218296508,"hmec,thy",thy,chr2_218296508,Thyroid


In [162]:
#         phyper =  hypergeometric_test(num_mpra_asatac_hits, num_lib_tested, num_mpra_hits_tissue, num_lib_hits)
#         #stats.hypergeom.cdf(num_mpra_asatac_hits, num_lib_tested, num_lib_hits, num_mpra_hits)
#         print('hypergeometric test between being asATAC and MPRA hit over backgroun of being in mpra_dataset', phyper )
#         phyper =  hypergeometric_test(num_mpra_asatac_hits, num_vcf_hits_postfilt, num_mpra_hits_tissue, num_lib_hits)
#         print('hypergeometric test between being asATAC and MPRA hit over backgroun of being in an asatac', phyper )
 #         # do a hypergeometric test between being asATAC and being an mpra hit over a background of being in the mpra dataset
#     - M is total number of objects
#     - n is total number of Type I objects. 
#     - x (random variate) represents the number of Type I objects in N drawn without replacement from the total population


In [163]:
# How many MPRA hit rows have a rowname that is present in lib_df (True) vs. absent (False).
mpra_res_df.rowname.isin(lib_df.rowname).value_counts()

True     403
False     48
Name: rowname, dtype: int64

In [164]:
# Library SNPs that are allele-specific in ATAC (post peak filter) AND are MPRA hits.
# The MPRA rownames are materialised once as a set: the earlier version rebuilt
# `.unique()` and scanned it linearly on every loop iteration.
_mpra_rowname_set = set(mpra_res_df.rowname.unique())
asatac_snps = []
for rowname in vcf_df_lib_atac_all.rowname.unique():
    if rowname in _mpra_rowname_set:
        asatac_snps.append(rowname)
        print(rowname)
len(asatac_snps)

chr3_194872824
chr6_167370211
chr12_48395043
chr2_202223566
chr13_31314455
chr17_76165104
chr2_70141519
chr7_97914305
chr10_38645117
chr15_67442596
chr4_164415734
chr7_97924226
chr11_18343822
chr19_13950294
chr1_155197268
chr5_67615731
chr6_4403131
chr19_41857404
chr2_67624302
chr10_38383147
chr10_38383793
chr10_96122826
chr11_69354931
chr15_40337462
chr16_50100253
chr2_67878108
chr19_13955161
chr1_10556447
chr6_117803538


29

In [165]:
# asATAC SNPs (post peak filter) that are part of the linked-SNP library in lib_df.
# Membership is tested against a set built once, instead of recomputing and scanning
# lib_df.rowname.unique() for every candidate.
_lib_rowname_set = set(lib_df.rowname.unique())
asatac_snps_lib = []
for rowname in vcf_df_lib_atac_all.rowname.unique():
    if rowname in _lib_rowname_set:
        asatac_snps_lib.append(rowname)
len(asatac_snps_lib)

168

In [135]:
# Rebuild the '<CHROM>_<POS>' key on vcf_df_all (in place), then write the rows whose
# key is in asatac_snps, together with their p-values, to CSV.
vcf_df_all['rowname'] = vcf_df_all['CHROM'] + '_' + vcf_df_all['POS'].map(str)
_is_asatac_mpra_hit = vcf_df_all['rowname'].isin(asatac_snps)
_asatac_mpra_csv = os.path.join(save_dir, 'asatac-mprahits-wpval.csv')
vcf_df_all.loc[_is_asatac_mpra_hit].to_csv(_asatac_mpra_csv)

In [67]:
# asATAC SNPs (post peak filter) that are part of the library in lib_df.
# The original loop recomputed lib_df.rowname.unique() and scanned it for every
# candidate, which was slow enough that the run was interrupted; a set built once
# makes each membership test constant-time.
_lib_rowname_lookup = set(lib_df.rowname.unique())
asatac_lib_snps = []
for rowname in vcf_df_lib_atac_all.rowname.unique():
    if rowname in _lib_rowname_lookup:
        asatac_lib_snps.append(rowname)
len(asatac_lib_snps)

KeyboardInterrupt: 

In [37]:

# Only the last expression of a cell is rendered: the column selection on the first
# line is evaluated and discarded, and the cell displays asatac_lib_df.
mpra_info_df[['Causal_SNP', 'rowname','disease']]
asatac_lib_df#.columns

Unnamed: 0,rowname,tissue_asatac,bool_is_asatac,chr,start,stop
0,chr10_104239100,Renal|Thyroid,True,chr10,104239100,104239101
1,chr10_104262628,Bladder|Colon|Esophageal|Melanocytes|Pancreas|...,True,chr10,104262628,104262629
2,chr10_104263675,Colon|Esophageal|Melanocytes|Ovarian|Pancreas|...,True,chr10,104263675,104263676
3,chr10_104269217,Renal,True,chr10,104269217,104269218
4,chr10_104269301,Airway|Colon|Renal,True,chr10,104269301,104269302
...,...,...,...,...,...,...
1342,chr9_33817617,Melanocytes,True,chr9,33817617,33817618
1343,chr9_34074476,Prostate|Uterine,True,chr9,34074476,34074477
1344,chr9_35538626,GDSD0|GDSD3|GDSD6,True,chr9,35538626,35538627
1345,chr9_6008571,Melanocytes,True,chr9,6008571,6008572


In [39]:
# Per-tissue overlap between MPRA hits and allele-specific ATAC (asATAC) SNPs.
# For every tissue in mpra_tissue_mapper, count the SNPs that are both an MPRA hit
# mapped to that tissue and asATAC in that tissue, then pass the counts to
# hypergeometric_test (defined elsewhere in the notebook).
tissue_num_hits_dict = {}

for tissue in sorted(mpra_tissue_mapper):
    print(tissue)
    mpra_tissue = mpra_tissue_mapper[tissue]

    # library SNPs whose disease label contains this tissue's MPRA name
    # (computed here but not used by the test below)
    lib_tested_rownames = set(
        mpra_info_df.loc[mpra_info_df.disease.str.contains(mpra_tissue), 'rowname']
    )
    # MPRA hits mapped to this tissue
    mpra_hit_rownames = set(
        mpra_res_df.loc[mpra_res_df.mapped_tissue == tissue, 'rowname']
    )
    # library SNPs whose '|'-joined asATAC tissue list mentions this tissue
    asatac_hit_rowanmes = set(
        asatac_lib_df.loc[asatac_lib_df.tissue_asatac.str.contains(tissue), 'rowname']
    )

    num_mpra_asatac_hits = len(mpra_hit_rownames & asatac_hit_rowanmes)
    # background size is the row count of lib_df1, not len(lib_tested_rownames)
    num_lib_tested = lib_df1.shape[0]
    num_mpra_hits = len(mpra_hit_rownames)
    num_asatac_hits = len(asatac_hit_rowanmes)
    print(num_mpra_asatac_hits,  num_lib_tested, num_mpra_hits, num_asatac_hits)

    phyper = hypergeometric_test(
        num_mpra_asatac_hits, num_lib_tested, num_mpra_hits, num_asatac_hits
    )
    print('hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset', phyper )

    tissue_num_hits_dict[tissue] = num_mpra_asatac_hits
    

Airway
2 5016 68 133
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 0.5428690370111467
Astrocytes
0 5016 10 45
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 1.0
Colon
1 5016 31 252
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 0.7986728154382556
Esophageal
0 5016 4 267
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 1.0
GDSD6
0 5016 19 93
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 1.0
GM12878
0 5016 16 113
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 1.0
HMEC
2 5016 138 96
hypergeometric test between being asATAC and MPRA hit in a tissue over background of being in mpra_dataset 0.7477135595069265
Melanocytes
1 5016 23 412
hyp

In [40]:
tissue_num_hits_dict

{'Airway': 2,
 'Astrocytes': 0,
 'Colon': 1,
 'Esophageal': 0,
 'GDSD6': 0,
 'GM12878': 0,
 'HMEC': 2,
 'Melanocytes': 1,
 'Ovarian': 0,
 'Pancreas': 1,
 'Prostate': 0,
 'Renal': 0,
 'Thyroid': 1,
 'Uterine': 1}

In [41]:
# Left-join the MPRA SNP id and disease label onto lib_df1 by rowname; library rows
# without a match keep NaN in Causal_SNP / disease. Display only, nothing is assigned.
lib_df1.merge(mpra_info_df[['Causal_SNP', 'rowname','disease']],how='left',on='rowname')

Unnamed: 0,chr,start,stop,rsid,rowname,Causal_SNP,disease
0,chr1,46505785,46505785,rs71062735,chr1_46505785,rs71062735,hmec
1,chr1,46358009,46358010,rs4539075,chr1_46358009,,
2,chr1,46361176,46361177,rs7512395,chr1_46361176,,
3,chr1,88423171,88423172,rs7514001,chr1_88423171,,
4,chr1,108368897,108368902,rs11278684,chr1_108368897,,
...,...,...,...,...,...,...,...
5023,chr22,40436972,40436973,rs11704416,chr22_40436972,,
5024,chr22,38568832,38568833,rs738321,chr22_38568832,,
5025,chr22,30236242,30236243,rs2105870,chr22_30236242,,
5026,chr22,40948874,40948875,rs55662398,chr22_40948874,,


# 2. Hichip

In [42]:
# Collect the HiChIP anchor BED files (one '<sample>_sort.bed' per sample).
hichip_peaks_dir = '../data/interim/merged/anchors_bed_sort'
_anchor_bed_pattern = os.path.join(hichip_peaks_dir, '*_sort.bed')
hichip_bed_files = glob.glob(_anchor_bed_pattern)
hichip_bed_files

['../data/interim/merged/anchors_bed_sort/COLO_SCR_DMSO_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Ovarian_sort.bed',
 '../data/interim/merged/anchors_bed_sort/GDSD3_sort.bed',
 '../data/interim/merged/anchors_bed_sort/COLO_shMITF_DMSO_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Melanocytes_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Pancreas_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Colon_sort.bed',
 '../data/interim/merged/anchors_bed_sort/SCC13-CTRLi_sort.bed',
 '../data/interim/merged/anchors_bed_sort/CAL27-CTRLi_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Thyroid_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Uterine_sort.bed',
 '../data/interim/merged/anchors_bed_sort/D6-p63i_sort.bed',
 '../data/interim/merged/anchors_bed_sort/GM12878_sort.bed',
 '../data/interim/merged/anchors_bed_sort/HMEC_sort.bed',
 '../data/interim/merged/anchors_bed_sort/A431-p63i_sort.bed',
 '../data/interim/merged/anchors_bed_sort/Astrocytes_sort.bed'

In [43]:
# for hichip_file in sorted(hichip_bed_files):
#     tissue = os.path.basename(hichip_file).split('_diffloop')[0]
#     print(tissue)
    

In [44]:
# Aside: build a table listing, for every library SNP, whether it falls inside a
# HiChIP anchor and in which tissues ('|'-joined), and save it to save_dir.
tissue_mapper = atac_tissue_mapper

# Per-tissue results are collected in a list and concatenated once after the loop
# (growing a DataFrame with pd.concat inside the loop copies it on every iteration).
lib_hichip_frames = []
num_lib_hichip_rows = 0
for hichip_file in sorted(hichip_bed_files):
    tissue = os.path.basename(hichip_file).split('_sort')[0]
    print(tissue)
    if tissue not in tissue_mapper:
        print(tissue, 'not considered')
        continue

    hichip_anchor_df = pybedtools.BedTool(hichip_file).to_dataframe()
    if hichip_anchor_df.empty:
        print(tissue, 'anchor file is empty')
        continue
    # Anchor files whose chromosomes lack the 'chr' prefix (e.g. 1, 2, X) get it added
    # so they can intersect with the 'chr'-prefixed library. The previous check,
    # type(value) == int, is never true for values taken from a pandas column
    # (they are numpy integers), so unprefixed files were silently left as-is.
    if not str(hichip_anchor_df['chrom'].iloc[0]).startswith('chr'):
        hichip_anchor_df['chrom'] = 'chr' + hichip_anchor_df.chrom.map(str)
        hichip_bed = pybedtools.BedTool.from_dataframe(hichip_anchor_df)
    else:
        hichip_bed = pybedtools.BedTool(hichip_file)

    lib_df_bed_hichip = lib_df_bed.intersect(hichip_bed).to_dataframe()
    if lib_df_bed_hichip.empty:
        print(tissue, 'no library SNPs overlap a HiChIP anchor')
        continue
    lib_df_bed_hichip = lib_df_bed_hichip[['name']].drop_duplicates().reset_index(drop=True)
    lib_df_bed_hichip['tissue'] = tissue_mapper[tissue]
    lib_hichip_frames.append(lib_df_bed_hichip)
    num_lib_hichip_rows += lib_df_bed_hichip.shape[0]
    # same running (rows, columns) report the accumulated frame used to give
    print('lib_hichip_df', (num_lib_hichip_rows, 2))

if lib_hichip_frames:
    lib_hichip_df = pd.concat(lib_hichip_frames, ignore_index=True)
else:
    lib_hichip_df = pd.DataFrame(columns=['name', 'tissue'])
# drop_duplicates guards against two files mapping to the same tissue label, which
# would otherwise repeat that label in the joined string
lib_hichip_df = (
    lib_hichip_df
    .drop_duplicates()
    .groupby('name')
    .agg({'tissue': '|'.join})
    .reset_index()
)
lib_hichip_df.columns = ['rowname','hichip_tissues']
lib_hichip_df['bool_in_hichip_pk'] = True
lib_hichip_df.to_csv(os.path.join(save_dir, 'lib_hichip_annotation.csv'))

A431-CTRLi
A431-CTRLi not considered
A431-p63i
A431-p63i not considered
Airway
lib_hichip_df (7094, 2)
Astrocytes
lib_hichip_df (19811, 2)
Bladder
lib_hichip_df (32719, 2)
CAL27-CTRLi
CAL27-CTRLi not considered
CAL27-p63i
CAL27-p63i not considered
COLO_SCR_DMSO
COLO_SCR_DMSO not considered
COLO_SCR_PLX
COLO_SCR_PLX not considered
COLO_shMITF_DMSO
COLO_shMITF_DMSO not considered
COLO_shMITF_PLX
COLO_shMITF_PLX not considered
Colon
lib_hichip_df (48275, 2)
D0-CTRLi
D0-CTRLi not considered
D0-p63i
D0-p63i not considered
D3-CTRLi
D3-CTRLi not considered
D3-p63i
D3-p63i not considered
D6-CTRLi
D6-CTRLi not considered
D6-p63i
D6-p63i not considered
Esophageal
lib_hichip_df (62259, 2)
GDSD0
lib_hichip_df (75640, 2)
GDSD3
lib_hichip_df (89399, 2)
GDSD6
lib_hichip_df (102456, 2)
GM12878
lib_hichip_df (115908, 2)
HMEC
lib_hichip_df (126392, 2)
Melanocytes
lib_hichip_df (135862, 2)
Ovarian
lib_hichip_df (150199, 2)
Pancreas
lib_hichip_df (162783, 2)
Prostate
lib_hichip_df (179240, 2)
Renal
lib_hi

In [45]:
# VCF files produced for the HiChIP samples (anything ending in 'vcf' in that folder).
_hichip_vcf_pattern = '../data/processed/vcf_files/hichip/*vcf'
vcf_files_hichip = glob.glob(_hichip_vcf_pattern)
vcf_files_hichip

['../data/processed/vcf_files/hichip/GDSD3_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/GM12878_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Airway_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Bladder_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Astrocytes_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Esophageal_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Ovarian_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Melanocytes_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Colon_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Thyroid_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/HMEC_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Uterine_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/GDSD0_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/GDSD6_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Renal_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Pancreas_postvqsr.vcf',
 '../data/processed/vcf_files/hichip/Prosta

In [46]:
# Show the tissue label encoded in each VCF file name (the text before '_post').
for vcf_file in vcf_files_hichip:
    tissue, _sep, _suffix = os.path.basename(vcf_file).partition('_post')
    print(tissue)

GDSD3
GM12878
Airway
Bladder
Astrocytes
Esophageal
Ovarian
Melanocytes
Colon
Thyroid
HMEC
Uterine
GDSD0
GDSD6
Renal
Pancreas
Prostate


In [47]:
hichip_tissue_mapper =atac_tissue_mapper

In [48]:
%%time
ashichip_lib_df = pd.DataFrame()
vcf_df_all_hichip = pd.DataFrame()
vcf_bed_df_all_hichip = pd.DataFrame()
vcf_df_lib_all_hichip = pd.DataFrame()
vcf_df_lib_hichip_all_hichip = pd.DataFrame()
for vcf_file in sorted(vcf_files_hichip):
    tissue = os.path.basename(vcf_file).split('_post')[0]
    print('************')
    print('tissue', tissue)
    
   # read vcf
    vcf_df = read_vcf(vcf_file)
    num_vcf_hits = vcf_df.shape[0]
    print('num_vcf_hits PRE FILTER', num_vcf_hits)  
    
    # filter vcf for those that meet the heterogenous requiremnts
    vcf_df['binom_p'] = vcf_df.apply(lambda row: preprocess_and_filter(row,tissue), axis=1)
    vcf_df = vcf_df[vcf_df.binom_p>=0]
    vcf_df['tissue'] = tissue
    num_vcf_hits_postfilt = vcf_df.shape[0]
    print('num_vcf_hits POST FILTER', num_vcf_hits_postfilt)      
    
    
    # binomial test
    vcf_df['pass_filt'], vcf_df['p_adjust'] = fdrcorrection(vcf_df.binom_p, alpha=0.05)
    vcf_df = vcf_df[vcf_df.pass_filt]
    num_vcf_hits_passfdr = vcf_df.shape[0]
    print('num_vcf_hits POST FDR', num_vcf_hits_passfdr)      

    vcf_bed_df = vcf_df[['CHROM','POS','rowname']]
    vcf_bed_df['stop'] = vcf_df.POS.map(int) + 1
    vcf_bed_df = vcf_bed_df[['CHROM','POS','stop','rowname']]
    vcf_bed_df.columns = ['chr','start','stop','name']
    vcf_df_all_hichip = pd.concat([vcf_df_all_hichip, vcf_df])

    vcf_df_lib = vcf_df.merge(lib_df_bed_df, how='inner',on='rowname')
    vcf_df_lib_all_hichip = pd.concat([vcf_df_lib_all_hichip, vcf_df_lib])
    lib_hits = vcf_df_lib.rowname.unique()
    num_lib_hits = lib_hits.shape[0]
    print('background library hits that are asHiChIP', num_lib_tested,num_lib_hits, num_lib_hits/ num_lib_tested)
    
#     if tissue in mpra_tissue_mapper:
#         mpra_res_df_tissue = mpra_res_df[mpra_res_df.tissue==mpra_tissue_mapper[tissue]]
#         mpra_hits_tissue = mpra_res_df_tissue.rowname.unique()
#         num_mpra_hits_tissue= mpra_hits_tissue.shape[0]
# #         mpra_sig_df_lib = vcf_df.merge(mpra_res_df_tissue, how='inner',on='rowname')

#         num_mpra_ashichip_hits = len(set(lib_hits).intersection(set(mpra_hits_tissue)))
#         ashichip_lib_df_tissue=pd.DataFrame()
#         ashichip_lib_df_tissue['rowname'] = sorted(set(lib_hits).intersection(set(mpra_hits_tissue)))
#         ashichip_lib_df_tissue['tissue'] =  mpra_tissue_mapper[tissue]
#         ashichip_lib_df = pd.concat([ashichip_lib_df, ashichip_lib_df_tissue])
#         # do a hypergeometric test between being asHiChIP and being an mpra hit over a background of being in the mpra dataset
    
#         print('mpra sig library hits in tissue ', num_mpra_hits_tissue)
#         print('overlap:', num_mpra_ashichip_hits)
# #         oddsratio, pvalue = stats.fisher_exact([[num_lib_hits, num_lib_tested- num_lib_hits],
# #                                                 [num_mpra_hits, num_mpra_sig_hits - num_mpra_hits]])
# #         print('fisher for mpra and as annotation association', pvalue, oddsratio)
#         phyper =  hypergeometric_test(num_mpra_ashichip_hits, num_lib_tested, num_mpra_hits_tissue, num_lib_hits)
#         #stats.hypergeom.cdf(num_mpra_ashichip_hits, num_lib_tested, num_lib_hits, num_mpra_hits)
#         print('hypergeometric test between being asHiChIP and MPRA hit over backgroun of being in mpra_dataset', phyper )
#         phyper =  hypergeometric_test(num_mpra_ashichip_hits, num_vcf_hits, num_mpra_hits_tissue, num_lib_hits)
#         print('hypergeometric test between being asHiChIP and MPRA hit over backgroun of being in an ashichip', phyper )
    
    ## filter through hichip
    if  tissue not in hichip_tissue_mapper:
        print(tissue,'not in hichip tissue mapper')
        continue
    hichip_tissue = hichip_tissue_mapper[tissue]
    hichip_bed_file = os.path.join(hichip_peaks_dir, hichip_tissue+'_sort.bed')
    if not os.path.exists(hichip_bed_file):
        print('hichip file',hichip_bed_file, 'does not exist')
        continue
    hichip_bed = pybedtools.BedTool(hichip_bed_file).to_dataframe()
    if type(hichip_bed['chrom'][0])==int:
        hichip_bed['chrom'] = 'chr' + hichip_bed.chrom.map(str)
        hichip_bed = pybedtools.BedTool.from_dataframe(hichip_bed)
    else:
        hichip_bed = pybedtools.BedTool(hichip_bed_file)
        
    vcf_bed_df_hichip = pybedtools.BedTool.from_dataframe(vcf_bed_df).intersect(hichip_bed).to_dataframe()
    num_hichip_vcf = vcf_bed_df_hichip.name.unique().shape[0]
    vcf_bed_df_hichip['tissue'] = tissue
    vcf_bed_df_all_hichip = pd.concat([vcf_bed_df_all_hichip,vcf_bed_df_hichip])
    print('hichip filter', num_hichip_vcf, 'out of', vcf_bed_df.shape[0], 'allele-specific hichip', num_hichip_vcf/vcf_bed_df.shape[0])
    lib_df_bed_hichip = lib_df_bed.intersect(hichip_bed).to_dataframe()
    lib_hichip  = lib_df_bed_hichip.name.unique()
    num_lib_hichip = lib_hichip.shape[0]
    print('hichip filter', num_lib_hichip, 'out of', num_lib_tested, 'mpra alleles tested', num_lib_hichip/num_lib_tested)
#     mpra_hichip = lib_df_bed_hichip[lib_df_bed_hichip.name.isin(mpra_rownames_sig)].name.unique()
#     num_mpra_hichip = mpra_hichip.shape[0]
#     print('hichip filter', num_mpra_hichip, 'out of', num_mpra_sig_hits, 'mpra alleles sig',num_mpra_hichip/num_mpra_sig_hits)
    
    vcf_df_lib_hichip = vcf_bed_df_hichip.merge(lib_df_bed_df, how='inner',left_on='name',right_on='rowname')
    num_lib_hits_hichip = vcf_df_lib_hichip.rowname.unique().shape[0]
    vcf_df_lib_hichip_all_hichip = pd.concat([vcf_df_lib_hichip_all_hichip, vcf_df_lib_hichip])
#     mpra_sig_df_lib_hichip = vcf_bed_df_hichip.merge(mpra_res_df, how='inner',left_on='name',right_on='rowname')
#     num_mpra_hits_hichip = mpra_sig_df_lib_hichip.rowname.unique().shape[0]
    
    print('background library hits-hichip filt', num_lib_hichip,num_lib_hits_hichip, num_lib_hits_hichip/ num_lib_hichip)
#     print('mpra sig library hits-hichip filt', num_mpra_hichip,num_mpra_hits_hichip, num_mpra_hits_hichip/ num_mpra_hichip)
#     if tissue in mpra_tissue_mapper:
#         mpra_hits_tissue_hichip = set(mpra_hits_tissue).intersection(set(lib_hichip))
#         num_mpra_hits_tissue_hichip= mpra_hits_tissue.shape[0]
# #         mpra_sig_df_lib = vcf_df.merge(mpra_res_df_tissue, how='inner',on='rowname')

#         num_mpra_ashichip_filthichip_hits = len(set(lib_hits).intersection(set(mpra_hits_tissue_hichip)))
#         # do a hypergeometric test between being asHiChIP and being an mpra hit over a background of being in the mpra dataset

#         print('mpra sig library hits in tissue hichip filt ', num_mpra_hits_tissue_hichip)
#         print('overlap hichip:', num_mpra_ashichip_filthichip_hits)
# #         oddsratio, pvalue = stats.fisher_exact([[num_lib_hits, num_lib_tested- num_lib_hits],
# #                                                 [num_mpra_hits, num_mpra_sig_hits - num_mpra_hits]])
# #         print('fisher for mpra and as annotation association', pvalue, oddsratio)
#         phyper =  hypergeometric_test(num_mpra_ashichip_filthichip_hits, num_hichip_vcf, num_mpra_hits_tissue_hichip, num_lib_hichip)
#         #stats.hypergeom.cdf(num_mpra_ashichip_hits, num_lib_tested, num_lib_hits, num_mpra_hits)
#         print('hypergeometric test between being asHiChIP and MPRA hit over backgroun of being ashichip', phyper )



************
tissue Airway
read ../data/processed/vcf_files/hichip/Airway_postvqsr.vcf
shape (1173693, 12)
num_vcf_hits PRE FILTER 1173693
num_vcf_hits POST FILTER 110841
num_vcf_hits POST FDR 1431
background library hits that are asHiChIP 5016 12 0.0023923444976076554
hichip filter 702 out of 1431 allele-specific hichip 0.49056603773584906


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


hichip filter 7094 out of 5016 mpra alleles tested 1.414274322169059
background library hits-hichip filt 7094 4 0.000563856780377784
************
tissue Astrocytes
read ../data/processed/vcf_files/hichip/Astrocytes_postvqsr.vcf
shape (1348725, 12)
num_vcf_hits PRE FILTER 1348725
num_vcf_hits POST FILTER 135366
num_vcf_hits POST FDR 3697
background library hits that are asHiChIP 5016 19 0.003787878787878788


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




hichip filter 2270 out of 3697 allele-specific hichip 0.6140113605626183
hichip filter 12717 out of 5016 mpra alleles tested 2.535287081339713
background library hits-hichip filt 12717 13 0.0010222536761814894
************
tissue Bladder
read ../data/processed/vcf_files/hichip/Bladder_postvqsr.vcf
shape (1253005, 12)
num_vcf_hits PRE FILTER 1253005
num_vcf_hits POST FILTER 518064
num_vcf_hits POST FDR 11810
background library hits that are asHiChIP 5016 50 0.009968102073365232


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




hichip filter 7576 out of 11810 allele-specific hichip 0.6414902624894158
hichip filter 12908 out of 5016 mpra alleles tested 2.573365231259968
background library hits-hichip filt 12908 45 0.003486210102262163
************
tissue Colon
read ../data/processed/vcf_files/hichip/Colon_postvqsr.vcf
shape (953402, 12)
num_vcf_hits PRE FILTER 953402
num_vcf_hits POST FILTER 310610
num_vcf_hits POST FDR 81574


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 667 0.1329744816586922
hichip filter 42577 out of 81574 allele-specific hichip 0.5219432662367911
hichip filter 15556 out of 5016 mpra alleles tested 3.101275917065391
background library hits-hichip filt 15556 470 0.03021342247364361
************
tissue Esophageal
read ../data/processed/vcf_files/hichip/Esophageal_postvqsr.vcf
shape (866052, 12)
num_vcf_hits PRE FILTER 866052
num_vcf_hits POST FILTER 211493
num_vcf_hits POST FDR 58226


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 447 0.08911483253588516
hichip filter 29118 out of 58226 allele-specific hichip 0.5000858722907292
hichip filter 13984 out of 5016 mpra alleles tested 2.787878787878788
background library hits-hichip filt 13984 280 0.02002288329519451
************
tissue GDSD0
read ../data/processed/vcf_files/hichip/GDSD0_postvqsr.vcf
shape (1308563, 12)
num_vcf_hits PRE FILTER 1308563
num_vcf_hits POST FILTER 148582
num_vcf_hits POST FDR 8920


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 38 0.007575757575757576
hichip filter 4821 out of 8920 allele-specific hichip 0.5404708520179372
hichip filter 13381 out of 5016 mpra alleles tested 2.667663476874003
background library hits-hichip filt 13381 19 0.0014199237725132652
************
tissue GDSD3
read ../data/processed/vcf_files/hichip/GDSD3_postvqsr.vcf
shape (1276709, 12)
num_vcf_hits PRE FILTER 1276709
num_vcf_hits POST FILTER 140246
num_vcf_hits POST FDR 8306


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 28 0.005582137161084529
hichip filter 4638 out of 8306 allele-specific hichip 0.558391524199374
hichip filter 13759 out of 5016 mpra alleles tested 2.743022328548644
background library hits-hichip filt 13759 17 0.0012355549095137727
************
tissue GDSD6
read ../data/processed/vcf_files/hichip/GDSD6_postvqsr.vcf
shape (1247380, 12)
num_vcf_hits PRE FILTER 1247380
num_vcf_hits POST FILTER 133980
num_vcf_hits POST FDR 6080


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 26 0.00518341307814992
hichip filter 3527 out of 6080 allele-specific hichip 0.5800986842105263
hichip filter 13057 out of 5016 mpra alleles tested 2.6030701754385963
background library hits-hichip filt 13057 17 0.0013019836103239642
************
tissue GM12878
read ../data/processed/vcf_files/hichip/GM12878_postvqsr.vcf
shape (1015175, 12)
num_vcf_hits PRE FILTER 1015175
num_vcf_hits POST FILTER 394674
num_vcf_hits POST FDR 14080


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 102 0.02033492822966507
hichip filter 7859 out of 14080 allele-specific hichip 0.5581676136363637
hichip filter 13452 out of 5016 mpra alleles tested 2.6818181818181817
background library hits-hichip filt 13452 62 0.004608980077311924
************
tissue HMEC
read ../data/processed/vcf_files/hichip/HMEC_postvqsr.vcf
shape (1300369, 12)
num_vcf_hits PRE FILTER 1300369
num_vcf_hits POST FILTER 145379
num_vcf_hits POST FDR 3849


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 16 0.003189792663476874
hichip filter 1906 out of 3849 allele-specific hichip 0.4951935567679917
hichip filter 10484 out of 5016 mpra alleles tested 2.0901116427432216
background library hits-hichip filt 10484 12 0.0011446012972148034
************
tissue Melanocytes
read ../data/processed/vcf_files/hichip/Melanocytes_postvqsr.vcf
shape (1314344, 12)
num_vcf_hits PRE FILTER 1314344
num_vcf_hits POST FILTER 264814
num_vcf_hits POST FDR 85511


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 789 0.15729665071770335
hichip filter 27448 out of 85511 allele-specific hichip 0.3209879430716516
hichip filter 9470 out of 5016 mpra alleles tested 1.8879585326953747
background library hits-hichip filt 9470 320 0.0337909186906019
************
tissue Ovarian
read ../data/processed/vcf_files/hichip/Ovarian_postvqsr.vcf
shape (885465, 12)
num_vcf_hits PRE FILTER 885465
num_vcf_hits POST FILTER 255422
num_vcf_hits POST FDR 55842


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 560 0.11164274322169059
hichip filter 28052 out of 55842 allele-specific hichip 0.5023459045163139
hichip filter 14337 out of 5016 mpra alleles tested 2.8582535885167464
background library hits-hichip filt 14337 367 0.025598102810908838
************
tissue Pancreas
read ../data/processed/vcf_files/hichip/Pancreas_postvqsr.vcf
shape (999275, 12)
num_vcf_hits PRE FILTER 999275
num_vcf_hits POST FILTER 208362
num_vcf_hits POST FDR 54619


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 457 0.09110845295055821
hichip filter 25437 out of 54619 allele-specific hichip 0.46571705816657205
hichip filter 12584 out of 5016 mpra alleles tested 2.508771929824561
background library hits-hichip filt 12584 283 0.022488874761602034
************
tissue Prostate
read ../data/processed/vcf_files/hichip/Prostate_postvqsr.vcf
shape (1060235, 12)
num_vcf_hits PRE FILTER 1060235
num_vcf_hits POST FILTER 441257
num_vcf_hits POST FDR 15885


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 69 0.01375598086124402
hichip filter 10906 out of 15885 allele-specific hichip 0.686559647466163
hichip filter 16457 out of 5016 mpra alleles tested 3.280901116427432
background library hits-hichip filt 16457 57 0.0034635717323935103
************
tissue Renal
read ../data/processed/vcf_files/hichip/Renal_postvqsr.vcf
shape (1188479, 12)
num_vcf_hits PRE FILTER 1188479
num_vcf_hits POST FILTER 93193
num_vcf_hits POST FDR 6535


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 48 0.009569377990430622
hichip filter 4173 out of 6535 allele-specific hichip 0.6385615914307574
hichip filter 15816 out of 5016 mpra alleles tested 3.15311004784689
background library hits-hichip filt 15816 35 0.002212948912493677
************
tissue Thyroid
read ../data/processed/vcf_files/hichip/Thyroid_postvqsr.vcf
shape (983295, 12)
num_vcf_hits PRE FILTER 983295
num_vcf_hits POST FILTER 313153
num_vcf_hits POST FDR 74753


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 645 0.12858851674641147
hichip filter 34351 out of 74753 allele-specific hichip 0.45952670795820905
hichip filter 14176 out of 5016 mpra alleles tested 2.8261562998405103
background library hits-hichip filt 14176 400 0.028216704288939052
************
tissue Uterine
read ../data/processed/vcf_files/hichip/Uterine_postvqsr.vcf
shape (1248162, 12)
num_vcf_hits PRE FILTER 1248162
num_vcf_hits POST FILTER 110783
num_vcf_hits POST FDR 8598


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




background library hits that are asHiChIP 5016 44 0.008771929824561403
hichip filter 6865 out of 8598 allele-specific hichip 0.798441498022796
hichip filter 19559 out of 5016 mpra alleles tested 3.899322169059011
background library hits-hichip filt 19559 37 0.0018917122552277723
CPU times: user 58min 22s, sys: 23.4 s, total: 58min 45s
Wall time: 58min


In [49]:
# Report the dimensions of the two main frames, then write all four frames to
# `save_dir` as CSV (default `to_csv` settings, so the index column is written).
for _frame in (vcf_df_all_hichip, vcf_bed_df_all_hichip):
    print(_frame.shape)

_hichip_csv_outputs = (
    ('vcf_df_all_hichip.csv', vcf_df_all_hichip),
    ('vcf_bed_df_all_hichip.csv', vcf_bed_df_all_hichip),
    ('vcf_df_lib_all_hichip.csv', vcf_df_lib_all_hichip),
    ('vcf_df_lib_hichip_all_hichip.csv', vcf_df_lib_hichip_all_hichip),
)
for _csv_name, _frame in _hichip_csv_outputs:
    _frame.to_csv(os.path.join(save_dir, _csv_name))


(499716, 48)
(242226, 5)


In [51]:
# Build one annotation row per variant (`rowname`) listing, '|'-joined and in
# alphabetical order, every tissue in which that variant appears in
# `vcf_df_lib_hichip_all_hichip`.
#
# Only the (rowname, tissue) pairs matter here, so they are selected and
# de-duplicated directly.  A left merge against another frame's
# ['rowname', 'tissue'] columns on those same keys can neither add columns nor
# drop rows, and de-duplicating on whole rows would still let a tissue be
# listed twice for a variant whenever two rows differ in some other column.
ashichip_lib_df = (
    vcf_df_lib_hichip_all_hichip[['rowname', 'tissue']]
    .drop_duplicates()
    .sort_values('tissue')          # fixes the order of tissues inside each joined string
    .groupby('rowname')
    .agg({'tissue': '|'.join})
    .reset_index()
)
ashichip_lib_df.columns = ['rowname', 'tissue_ashichip']
# Constant flag: every row in this table is a hit by construction.
# NOTE(review): presumably used when this table is later left-joined onto the
# full library -- confirm against the downstream consumer.
ashichip_lib_df['bool_is_ashichip'] = True
# Attach the columns of `lib_df_bed_df`, matched on `rowname`; variants missing
# from it keep their row with NaN in the added columns (left join).
ashichip_lib_df = ashichip_lib_df.merge(lib_df_bed_df, how='left', on='rowname')
ashichip_lib_df.to_csv(os.path.join(save_dir, 'lib_ashichip_annotation_post_filt.csv'))


In [52]:
# Display the per-variant annotation table (one row per `rowname`, with the
# '|'-joined tissue list); the notebook renders a truncated view of long frames.
ashichip_lib_df

Unnamed: 0,rowname,tissue_ashichip,bool_is_ashichip,chr,start,stop
0,chr1_101328083,Colon|Esophageal|Ovarian,True,chr1,101328083,101328084
1,chr1_101328230,Thyroid,True,chr1,101328230,101328231
2,chr1_101390090,Melanocytes,True,chr1,101390090,101390091
3,chr1_101393409,Bladder|Colon|Esophageal|Ovarian|Thyroid,True,chr1,101393409,101393410
4,chr1_101466054,Esophageal|Thyroid,True,chr1,101466054,101466055
...,...,...,...,...,...,...
1243,chr9_22112241,Melanocytes,True,chr9,22112241,22112242
1244,chr9_35519359,Melanocytes,True,chr9,35519359,35519360
1245,chr9_73863654,Melanocytes,True,chr9,73863654,73863655
1246,chr9_93958422,Melanocytes,True,chr9,93958422,93958423
