07/28/2020

Get the unique HiChIP loops and anchors for each tissue and save them as BED files.

Also get the differential ATAC files (copy them from Sherlock and write them to BED).

In [25]:
import os, glob
import pandas as pd
import numpy as np
import pybedtools
import matplotlib.pyplot as plt
import re
import seaborn as sns

In [3]:
# Input locations, relative to the notebook's working directory.
# Every glob pattern is built from its directory variable so the directory
# and the file list cannot drift apart when a path is edited.
loop_dir = '../data/interim/merged/loops'
loop_files = glob.glob(os.path.join(loop_dir, '*.loops.csv'))
anchor_annon_dir = '../data/interim/annon/anchor_atac'
anchor_annon_files = glob.glob(os.path.join(anchor_annon_dir, '*_annon.bed'))
promoter_annon_dir = '../data/interim/annon/promoter_anchors/'
promoter_annon_files = glob.glob(os.path.join(promoter_annon_dir, 'promoter_*_annon.bed'))
anchor_dir = '../data/interim/merged/anchors/'
anchor_files = glob.glob(os.path.join(anchor_dir, '*.anchors.csv'))

In [4]:
# Tissues treated as "normal". The loading loop further down skips any loop
# file whose tissue name is not in this list.
normal_tissues = [
    'Airway',
    'Astrocytes',
    'Bladder',
    'Colon',
    'Esophageal',
    'GDSD0',
    'GDSD3',
    'GDSD6',
    'GM12878',
    'HMEC',
    'Melanocytes',
    'Ovarian',
    'Pancreas',
    'Prostate',
    'Renal',
    'Thyroid',
    'Uterine',
]



# 1. Unique info

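For each tissue (run once for normal tissues only and once for all tissues), get the following stats:
- num unique loops (loops found in exactly one tissue)
- num unique anchors (anchors found in exactly one tissue)
- num common loops (loops found in every tissue)
- num common anchors (anchors found in every tissue)
- num loops
- num anchors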

Summary stats (across all tissues):
- total number of distinct anchors
- total number of distinct loops

In [None]:
# Output directory for the per-tissue unique loop / anchor BED files.
# NOTE: `save_dir` is reassigned in the differential-ATAC section further down,
# so re-run this cell before re-running the writers in this section.
save_dir = '../data/processed/fig2_hichip/unique'
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(save_dir, exist_ok=True)

In [6]:
%%time
# first get set of all loops
# all_loops, all_loops_normal = set(),set()
# all_anchors, all_anchors_normal = set(),set()

tissues = []
loops_all_df = pd.DataFrame(columns=['loop_name','tissue'])
anchors_all_df = pd.DataFrame(columns=['anchors','tissue'])

# tissue_info_dict = {}
for loop_file in sorted(loop_files):
    loop_filename = os.path.basename(loop_file)
    tissue = loop_filename.split('.')[0]
    
    #### NORMAL only####
    if tissue not in normal_tissues:
        continue
    print(tissue)
    tissues.append(tissue)
    anchor_file = os.path.join(anchor_dir, tissue+'.anchors.csv')
    loop_df = pd.read_csv(loop_file, index_col=0)
    loop_df['loop_name'] = loop_df.apply(lambda row: '::'.join(sorted([row.source,row.target])),axis=1)
    loop_df['tissue'] = tissue
    
    anchor_df = pd.read_csv(anchor_file, index_col=0)
    anchor_df['tissue'] = tissue
   
    loops_all_df = pd.concat([loops_all_df, loop_df[['loop_name', 'tissue']]])
    anchors_all_df = pd.concat([anchors_all_df, anchor_df[['anchors', 'tissue']]])
    print(tissue, 'done')
    
#     tissue_info_dict[tissue] = [loop_df,anchor_df,anchor_to_count_dict]
    
#     all_loops  = all_loops.union(set(loop_df.loop_name))
#     all_anchors  = all_anchors.union(set(anchor_df.anchors))
                                         
#     if tissue in normal_tissues:
#         all_loops_normal = all_loops_normal.union(set(loop_df.loop_name))
#         all_anchors_normal = all_anchors_normal.union(set(anchor_df.anchors))                            
    

Airway
Airway done
Astrocytes
Astrocytes done
Bladder
Bladder done
Colon
Colon done
Esophageal
Esophageal done
GDSD0
GDSD0 done
GDSD3
GDSD3 done
GDSD6
GDSD6 done
GM12878
GM12878 done
HMEC
HMEC done
Melanocytes
Melanocytes done
Ovarian
Ovarian done
Pancreas
Pancreas done
Prostate
Prostate done
Renal
Renal done
Thyroid
Thyroid done
Uterine
Uterine done
CPU times: user 1min 46s, sys: 1.14 s, total: 1min 47s
Wall time: 1min 19s


In [8]:
# Distinct loop names and anchor names pooled over every loaded tissue.
all_loops = loops_all_df['loop_name'].unique()
all_anchors = anchors_all_df['anchors'].unique()


In [14]:
# Presence/absence matrices: one row per loop (or anchor), one column per
# tissue, holding 1 where the feature occurs in that tissue and 0 elsewhere.
# pivot_table aggregates with the mean by default, so a (name, tissue) pair
# that appears more than once still yields 1.
# 'num_tissues' is the row sum, i.e. how many tissues contain the feature.
loops_all_df['count'] = 1
loop_counts = pd.pivot_table(
    loops_all_df,
    index='loop_name',
    columns='tissue',
    values='count',
    fill_value=0,
)
loop_counts['num_tissues'] = loop_counts.sum(axis=1)

anchors_all_df['count'] = 1
anchor_counts = pd.pivot_table(
    anchors_all_df,
    index='anchors',
    columns='tissue',
    values='count',
    fill_value=0,
)
anchor_counts['num_tissues'] = anchor_counts.sum(axis=1)


In [19]:
# Per-tissue number of loops that occur in exactly one tissue.
# The 'num_tissues' entry of the result is the total across tissues.
loop_counts.loc[loop_counts['num_tissues'] == 1].sum()

tissue
Airway           1414
Astrocytes      56819
Bladder         21783
Colon           34185
Esophageal      33479
GDSD0           19421
GDSD3           27291
GDSD6           27896
GM12878         75373
HMEC             7802
Melanocytes     16755
Ovarian         17720
Pancreas        12998
Prostate        47947
Renal           49578
Thyroid         18234
Uterine        133498
num_tissues    602193
dtype: int64

In [23]:
# Per-tissue number of anchors that occur in exactly one tissue.
# The 'num_tissues' entry of the result is the total across tissues.
anchor_counts.loc[anchor_counts['num_tissues'] == 1].sum()

tissue
Airway           109
Astrocytes      6753
Bladder         2280
Colon           2881
Esophageal      1955
GDSD0           2819
GDSD3           3599
GDSD6           2906
GM12878         8383
HMEC             997
Melanocytes     2681
Ovarian         1597
Pancreas        1139
Prostate        8051
Renal           6408
Thyroid         1669
Uterine        20013
num_tissues    74240
dtype: int64

In [22]:
# Rows of the presence matrices restricted to features seen in a single tissue.
is_unique_loop = loop_counts['num_tissues'] == 1
is_unique_anchor = anchor_counts['num_tissues'] == 1
unique_loops_df = loop_counts[is_unique_loop]
unique_anchors_df = anchor_counts[is_unique_anchor]

In [35]:
# Write one tab-separated file per tissue containing the loops found only in
# that tissue, as <save_dir>/<tissue>_loops.bed.
# 'num_tissues' is a summary column rather than a tissue. It is skipped so that
# no 'num_tissues_loops.bed' (all unique loops pooled together) is written.
for tissue in [col for col in unique_loops_df.columns if col != 'num_tissues']:
    print(tissue)
    out_path = os.path.join(save_dir, tissue+'_loops.bed')
    tissue_loop_names = pd.Series(unique_loops_df[tissue][unique_loops_df[tissue]==1].index)
    if tissue_loop_names.empty:
        # Splitting an empty Series produces no columns, so drop(3) below
        # would fail; emit an empty file to keep one output per tissue.
        with open(out_path, 'w'):
            pass
        print(out_path)
        continue
    # Loop names are '<anchor>::<anchor>'. Splitting on ':' and '_' leaves an
    # empty field (column 3) where the '::' was, which is dropped.
    # NOTE(review): assumes each anchor name splits into exactly three fields
    # (e.g. chrom_start_end) -- confirm against the anchor naming scheme.
    loop_df = tissue_loop_names.str.split('[:_]',expand=True)
    loop_df.drop(3,axis=1).to_csv(out_path,header=None,index=None, sep='\t')
    print(out_path)

Airway
../data/processed/fig2_hichip/unique/Airway_loops.bed
Astrocytes
../data/processed/fig2_hichip/unique/Astrocytes_loops.bed
Bladder
../data/processed/fig2_hichip/unique/Bladder_loops.bed
Colon
../data/processed/fig2_hichip/unique/Colon_loops.bed
Esophageal
../data/processed/fig2_hichip/unique/Esophageal_loops.bed
GDSD0
../data/processed/fig2_hichip/unique/GDSD0_loops.bed
GDSD3
../data/processed/fig2_hichip/unique/GDSD3_loops.bed
GDSD6
../data/processed/fig2_hichip/unique/GDSD6_loops.bed
GM12878
../data/processed/fig2_hichip/unique/GM12878_loops.bed
HMEC
../data/processed/fig2_hichip/unique/HMEC_loops.bed
Melanocytes
../data/processed/fig2_hichip/unique/Melanocytes_loops.bed
Ovarian
../data/processed/fig2_hichip/unique/Ovarian_loops.bed
Pancreas
../data/processed/fig2_hichip/unique/Pancreas_loops.bed
Prostate
../data/processed/fig2_hichip/unique/Prostate_loops.bed
Renal
../data/processed/fig2_hichip/unique/Renal_loops.bed
Thyroid
../data/processed/fig2_hichip/unique/Thyroid_loops.

In [41]:
# Write one sorted BED file per tissue containing the anchors found only in
# that tissue, as <save_dir>/<tissue>_anchors.bed.
# The tissue names come from the anchor table itself (not from the loop table),
# and the 'num_tissues' summary column is skipped so that no
# 'num_tissues_anchors.bed' is written.
for tissue in [col for col in unique_anchors_df.columns if col != 'num_tissues']:
    print(tissue)
    out_path = os.path.join(save_dir, tissue+'_anchors.bed')
    # Split each anchor name on ':' and '_' into separate BED columns.
    # NOTE(review): assumes anchor names split into chrom/start/end -- confirm.
    anchor_df = pd.Series(unique_anchors_df[tissue][unique_anchors_df[tissue]==1].index).str.split('[:_]',expand=True)
    pybedtools.BedTool.from_dataframe(anchor_df).sort().saveas(out_path)
    print(out_path)

Airway
../data/processed/fig2_hichip/unique/Airway_anchors.bed
Astrocytes
../data/processed/fig2_hichip/unique/Astrocytes_anchors.bed
Bladder
../data/processed/fig2_hichip/unique/Bladder_anchors.bed
Colon
../data/processed/fig2_hichip/unique/Colon_anchors.bed
Esophageal
../data/processed/fig2_hichip/unique/Esophageal_anchors.bed
GDSD0
../data/processed/fig2_hichip/unique/GDSD0_anchors.bed
GDSD3
../data/processed/fig2_hichip/unique/GDSD3_anchors.bed
GDSD6
../data/processed/fig2_hichip/unique/GDSD6_anchors.bed
GM12878
../data/processed/fig2_hichip/unique/GM12878_anchors.bed
HMEC
../data/processed/fig2_hichip/unique/HMEC_anchors.bed
Melanocytes
../data/processed/fig2_hichip/unique/Melanocytes_anchors.bed
Ovarian
../data/processed/fig2_hichip/unique/Ovarian_anchors.bed
Pancreas
../data/processed/fig2_hichip/unique/Pancreas_anchors.bed
Prostate
../data/processed/fig2_hichip/unique/Prostate_anchors.bed
Renal
../data/processed/fig2_hichip/unique/Renal_anchors.bed
Thyroid
../data/processed/fig

In [140]:
# Per-tissue summary table with one row per tissue and, for loops and anchors:
#   num_*        -> features present in the tissue
#   num_unique_* -> features present in this tissue only (num_tissues == 1)
#   num_common_* -> features present in every tissue
# The count matrices carry one extra column ('num_tissues'), so the number of
# tissues is shape[1] - 1.
all_counts_results_df = pd.DataFrame()
for feature, counts in (('loops', loop_counts), ('anchors', anchor_counts)):
    n_tissue_cols = counts.shape[1] - 1
    in_one_tissue = counts.num_tissues == 1
    in_every_tissue = counts.num_tissues == n_tissue_cols
    all_counts_results_df['num_' + feature] = counts.sum(axis=0)
    all_counts_results_df['num_unique_' + feature] = counts[in_one_tissue].sum()
    all_counts_results_df['num_common_' + feature] = counts[in_every_tissue].sum()
# Column sums also produce a 'num_tissues' row, which is not a tissue.
all_counts_results_df = all_counts_results_df.drop('num_tissues')

In [142]:
# all_counts_results_df.to_csv(os.path.join(save_dir, 'all_counts_results_df.csv'))


In [143]:

# Overall totals derived from the per-tissue summary table.
# NOTE(review): the labels say "all tissues", but the loading cell currently
# keeps only `normal_tissues`; confirm the label matches the run before
# reporting these numbers.
print('Total # of loops all tissues: ',all_counts_results_df.num_loops.sum())
print('Total # unique loops all tissues: ',all_counts_results_df.num_unique_loops.sum())
# The common count is identical in every tissue row, so take the first row.
# .iloc[0] is explicit positional access; plain [0] on a string-labelled Series
# relies on the deprecated positional fallback of Series.__getitem__.
print('Total # common loops all tissues: ',all_counts_results_df.num_common_loops.iloc[0])

print('Total # of anchors all samples: ',all_counts_results_df.num_anchors.sum())
print('Total # unique anchors all tissues: ',all_counts_results_df.num_unique_anchors.sum())
print('Total # common anchors all tissues: ',all_counts_results_df.num_common_anchors.iloc[0])

print()


Total # of loops all tissues:  13249551
Total # unique loops all tissues:  1235518
Total # common loops all tissues:  1529
Total # of anchors all samples:  4646665
Total # unique anchors all tissues:  49665
Total # common anchors all tissues:  5904

Total # of loops normal tissues :  3180706
Total # unique loops normal tissues:  602193
Total # common loops normal tissues:  3567
Total # of anchors all samples normal:  1616223
Total # unique anchors normal tissues:  74240
Total # common anchors normal tissues:  9916


# 2. differential atac

Run on the local computer:

```
# The path contains a space ("Google Drive"), so it must be quoted.
cd "/Users/mguo123/Google Drive/1_khavari/omics_project-LD/pan_omics/data/processed/fig2_hichip/unique_atac"

# Quote the remote spec so the wildcard is expanded on the remote host, not by
# the local shell (zsh aborts on an unmatched local glob).
scp 'mguo123@login.sherlock.stanford.edu:/oak/stanford/groups/khavari/users/lkhd/project/3D/ATAC/hiseq/epithelia/ggr-project/by_celltype/*differential.txt' .

```
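Then manually rename the HMEC and GDS files so the tissue part of each file name matches the tissue names used above (`HMEC`, `GDSD0`, `GDSD3`, `GDSD6`):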
```
# Drop the '.00' suffix from the HMEC file name.
mv epithelia.HMEC.00.differential.txt epithelia.HMEC.differential.txt
# Collapse 'GDS.D<day>' to 'GDSD<day>' for the three time points.
for day in 0 3 6; do
    mv "epithelia.GDS.D${day}.differential.txt" "epithelia.GDSD${day}.differential.txt"
done
```


In [42]:
# Directory holding the downloaded differential-ATAC tables; the BED files
# derived from them are written here as well.
# NOTE: this rebinds `save_dir`, which the unique-loop/anchor section also uses.
save_dir = '../data/processed/fig2_hichip/unique_atac'
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of `if not os.path.exists(...)`.
os.makedirs(save_dir, exist_ok=True)

In [45]:
# Show the differential-ATAC tables present in save_dir, in name order.
diff_txt_paths = glob.glob(os.path.join(save_dir, '*txt'))
sorted(diff_txt_paths)

['../data/processed/fig2_hichip/unique_atac/epithelia.Airway.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.Astrocytes.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.Bladder.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.Colon.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.Esophageal.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.GDSD0.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.GDSD3.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.GDSD6.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.GM12878.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.HMEC.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.Melanocytes.differential.txt',
 '../data/processed/fig2_hichip/unique_atac/epithelia.Ovarian.differential.txt',
 '../data/processed/fig2_hichi

In [57]:
# Convert every differential-ATAC table in save_dir into a sorted BED file
# named <tissue>_atac_diff.bed, printing the tissue and its row count.
diff_txt_files = sorted(glob.glob(os.path.join(save_dir, '*txt')))
for file in diff_txt_files:
    # The tissue is the second dot-separated field of the file name.
    tissue = os.path.basename(file).split('.')[1]

    # Headerless, tab-separated, two columns: region name and score.
    df = pd.read_csv(file, header=None, sep='\t')
    print(tissue, df.shape[0])
    df.columns = ['name','score']

    # Split the region name on ':' and '-' into chr / start / stop.
    df[['chr','start','stop']] = df['name'].str.split('[:-]',expand=True)
    df = df.loc[:, ['chr','start','stop','score']]

    # The score is written as the 4th column of the BED file.
    save_path = os.path.join(save_dir, tissue+'_atac_diff.bed')
    pybedtools.BedTool.from_dataframe(df).sort().saveas(save_path)

Airway 23032
Astrocytes 25097
Bladder 27452
Colon 15185
Esophageal 18687
GDSD0 29132
GDSD3 30743
GDSD6 32995
GM12878 41327
HMEC 35742
Melanocytes 28917
Ovarian 13274
Pancreas 15659
Prostate 9180
Renal 20003
Thyroid 15144
Uterine 24126


Unnamed: 0,chrom,start,end,name
0,chr1,56063,56511,1.605055
1,chr1,739766,740428,2.694908
2,chr1,752616,753326,2.068369
3,chr1,903320,903691,1.118400
4,chr1,917404,917860,1.428870
...,...,...,...,...
24121,chrX,153322200,153322492,1.087908
24122,chrX,153788624,153788880,1.093475
24123,chrX,153872059,153872294,1.219003
24124,chrX,154340323,154340949,1.957704
