In [2]:
import inferelator_ng.prior as priors
import os
import pandas as pd
import seaborn as sns
import matplotlib.py2plot as plt
%matplotlib inline

## Generate ChipSeq priors from metadata and mm10 references
This notebook loops over the references (gene bodies and TSS), the MACS peak sets, the distances to each reference, the samples, and the gene for each sample to create priors. Priors are also combined across samples.

In [3]:
# Root folder of this prior-generation run; the input and output folders are derived from it.
base_dir = (
    '/Users/ndeveaux/Dropbox (Simons Foundation)/atac_parameter_search/ChipSeq/chipseq_priors/chipseq_mm10_th17_prior_gen_2017_01'
)

In [4]:
# Results are written under <base_dir>/output; reference BEDs, MACS folders and
# the metadata table are read from <base_dir>/references.
output_base_dir = os.path.join(base_dir, 'output')
input_base_dir = os.path.join(base_dir, 'references')

In [5]:
# Create the output folder (and any missing parents) on first run only.
output_dir_missing = not os.path.exists(output_base_dir)
if output_dir_missing:
    os.makedirs(output_base_dir)

In [6]:
# MACS output folders, each mapped to the label that is embedded in the prior keys.
macs_dict = {
    '2016_11_28_MACS_out': 'macs_pval_1e-10',
    '2016_11_15_MACS_out': 'macs_pval_1e-5',
}
# Folder iteration order is kept explicit rather than derived from the dict.
macs_folders = ['2016_11_28_MACS_out', '2016_11_15_MACS_out']
# Reference BED files, looked up under input_base_dir.
references = ['tss_mm10.bed', 'genebodies_mm10.bed']

In [7]:
# Tab-separated ChIP metadata; the prior loop reads its sampleName, compare2 and gene columns.
metadata_file = os.path.join(input_base_dir, 'metadata_ChIP.txt')
metadata = pd.read_csv(metadata_file, delimiter='\t')

In [8]:
# Values passed as max_distance when each prior is built.
dists = [5000, 10000]

In [9]:
# Cache of results: computed_priors[key][sample] -> prior table.
# Re-running this cell discards everything computed so far.
computed_priors = dict()

In [None]:
# compute priors
# For every metadata row (one sample/gene pair) build a prior for each
# reference x MACS run x distance combination, and cache the result in
# computed_priors[key][sample] where key = <ref>_<macs label>_<dist>_<gene>.

# Read each reference BED once instead of once per metadata row.
reference_tables = {}
for ref in references:
    ref_bed = os.path.join(input_base_dir, ref)
    reference_tables[ref] = (ref_bed, pd.read_csv(ref_bed, sep='\t', header=None))

for row_index, row_content in metadata.iterrows():
    sample = '_'.join([row_content['sampleName'], row_content['compare2']])
    gene = row_content['gene']
    for ref in references:
        ref_bed, ref_df = reference_tables[ref]
        # get human readable name
        ref_name = ref.split('.')[0]
        # The BED is read without a header, so column label 3 holds the target names.
        targets = list(ref_df[3])
        for mac in macs_folders:
            mac_name = macs_dict[mac]
            # The peak file depends only on the MACS run and the sample, not on the distance.
            motif_bed = os.path.join(input_base_dir, mac, sample, '_'.join([sample, 'peaks.bed']))
            for dist in dists:
                key = '_'.join([ref_name, mac_name, str(dist), gene])
                # Function-call form prints the same text on Python 2 and Python 3.
                print(key + ': ' + sample)
                sample_priors = computed_priors.setdefault(key, {})
                # Only build the Prior when this sample has no cached result yet.
                if sample not in sample_priors:
                    p = priors.Prior(motif_bed, ref_bed, regulators=[gene], targets=targets,
                                     max_distance=dist, single_tf=True, mode='window')
                    sample_priors[sample] = p.make_prior()

tss_mm10_macs_pval_1e-10_5000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-10_10000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-5_5000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-5_10000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-10_5000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-10_10000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-5_5000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-5_10000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-10_5000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-10_10000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-5_5000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-5_10000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-10_5000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-10_10000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-5_5000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-5_10000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-10_5000_Batf: SL10570_SL10564
tss_mm10_macs_pval_1e-10_10000_Batf: SL10570_SL10564
tss_mm10_macs_pval_1e-5_5000_Batf: SL10

In [53]:
# combine priors into dataframe, sum and save
# One table per key: a column per sample plus a 'combined_<gene>' column holding
# the row-wise sum; each table is written to <output_base_dir>/<key>.csv.
dataframes = {}
for key, priors_by_sample in computed_priors.items():
    # The gene name is the last underscore-separated token of the key.
    gene = key.split('_')[-1]
    sample_rows = [priors_by_sample[name][gene].rename(name) for name in priors_by_sample]
    # Each sample enters as a row; flip so that samples become the columns.
    result = pd.DataFrame(sample_rows).T
    result['combined_' + gene] = result.sum(axis=1)
    result.to_csv(os.path.join(output_base_dir, key + '.csv'))
    dataframes[key] = result

In [42]:
# Rebuild the per-sample table for one key by hand, to inspect a single case.
gene = 'Batf'
k = 'tss_mm10_macs_pval_1e-5_5000_Batf'
# One row per sample at this point (not yet transposed).
df = pd.DataFrame([prior[gene].rename(name) for name, prior in computed_priors[k].items()])

In [48]:
# Flip the table so samples become the columns, then show one row across all samples.
# Re-running this cell flips the table back, so run it exactly once.
df = df.T
df.loc['uc007afe.3_up_1_chr1_4785727_r']

SL10570_SL10564    2
SL3033_SL3036      2
SL3037_SL3036      2
Name: uc007afe.3_up_1_chr1_4785727_r, dtype: int64

In [50]:
# Row-wise sum over the sample columns. Any existing combined column is left out
# of the sum so that re-running this cell does not fold the previous total back in.
df['combined_' + gene] = df.drop('combined_' + gene, axis=1, errors='ignore').sum(axis=1)

In [51]:
# Same row as before, now including the combined column.
df.loc['uc007afe.3_up_1_chr1_4785727_r']

SL10570_SL10564    2
SL3033_SL3036      2
SL3037_SL3036      2
combined_Batf      6
Name: uc007afe.3_up_1_chr1_4785727_r, dtype: int64

In [164]:
# Start the comparison table from the Batf prior of sample SL3033_SL3036
# (2016_11_15 MACS run, distance 5000).
# The original call ended in a dangling `columns = ` argument, which is a syntax
# error; it is dropped here. Presumably the selected value is a Series named
# 'Batf', which yields the 'Batf' column that later cells read — confirm.
# NOTE(review): `priors` must be a mapping of prior tables here, but the import
# cell binds `priors` to the inferelator_ng.prior module — confirm where the
# mapping is defined before running.
combined_df = pd.DataFrame(priors['2016_11_15_MACS_out_SL3033_SL3036_5000_Batf']['Batf'])

In [165]:
# Batf prior for sample SL3033_SL3036, 2016_11_15 MACS run, distance 10000.
# NOTE(review): the column label is '1000' although the source key says 10000 —
# looks like a typo; confirm before relying on this label.
# NOTE(review): `priors` is subscripted as a mapping here; confirm it is not the imported module.
combined_df['1000'] = priors['2016_11_15_MACS_out_SL3033_SL3036_10000_Batf']['Batf']

In [168]:
# Batf prior for sample SL3033_SL3036, 2016_11_28 MACS run (labelled
# 'macs_pval_1e-10' in macs_dict), distance 10000.
# NOTE(review): `priors` is subscripted as a mapping here; confirm it is not the imported module.
combined_df['10000-pval-10'] = priors['2016_11_28_MACS_out_SL3033_SL3036_10000_Batf']['Batf']

In [169]:
# Batf prior for sample SL3033_SL3036, 2016_11_28 MACS run (labelled
# 'macs_pval_1e-10' in macs_dict), distance 5000.
# NOTE(review): `priors` is subscripted as a mapping here; confirm it is not the imported module.
combined_df['5000-pval-10'] = priors['2016_11_28_MACS_out_SL3033_SL3036_5000_Batf']['Batf']

In [170]:
# Batf prior for sample SL3037_SL3036, 2016_11_15 MACS run (labelled
# 'macs_pval_1e-5' in macs_dict), distance 5000.
# NOTE(review): `priors` is subscripted as a mapping here; confirm it is not the imported module.
combined_df['5000-pval-5-sample-SL3037'] = priors['2016_11_15_MACS_out_SL3037_SL3036_5000_Batf']['Batf']

In [208]:
# Element-wise sum of two sample columns: the SL3037_SL3036 column plus the
# 'Batf' column that combined_df was created with (the SL3033_SL3036 prior from
# the 2016_11_15 MACS run at distance 5000).
combined_df['5000-pval-5-combined'] = combined_df['5000-pval-5-sample-SL3037'] + combined_df['Batf']

In [210]:
# Largest combined value. Series.max() is vectorised and skips NaN, whereas the
# builtin max() iterates in Python and is order-dependent when NaN is present.
combined_df['5000-pval-5-combined'].max()

15

In [None]:
def plot(df, title, max_rows=2000, figsize=(10, 50)):
    """Draw a heatmap of the leading rows of a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to draw; only its first ``max_rows`` rows are shown.
    title : str
        Title placed above the heatmap.
    max_rows : int, optional
        Number of leading rows to include (default 2000, the value that was
        previously hard-coded).
    figsize : tuple, optional
        Figure size passed to matplotlib (default ``(10, 50)``, the value that
        was previously hard-coded).

    Returns
    -------
    None
        The figure is left for the notebook's inline backend to render.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # Annotations are off, so the former fmt="h" argument had no effect and is dropped.
    # Row tick labels are suppressed.
    sns.heatmap(df.head(max_rows), annot=False, yticklabels=False, ax=ax)
    ax.set_title(title)

In [None]:
# Plot every combined table whose key starts with the chosen prefix.
focus = 'tss_mm10_macs_pval_1e-5_5000'
matching_frames = [frame for name, frame in dataframes.items() if name.startswith(focus)]
# NOTE(review): pd.concat stacks the tables row-wise by default; if the intent is
# one column group per gene side by side, axis=1 would be needed — confirm.
plot(pd.concat(matching_frames), focus)

In [101]:
# Sanity check: the largest Stat3 prior value for this sample is expected to be 7.
# NOTE(review): `priors` is subscripted as a mapping here, but the import cell
# binds `priors` to the inferelator_ng.prior module — confirm where the mapping
# is defined.
max(priors['2016_11_15_MACS_out_SL1040_SL972']['Stat3'])

7

In [84]:
# Show the reference row(s) whose name column (label 3) matches this entry.
# NOTE(review): gene_df is not defined in any cell visible here — confirm its origin.
gene_df.loc[gene_df[3] == 'uc011whu.1_up_1_chr1_4858327_f']

Unnamed: 0,0,1,2,3,4,5
18,chr1,4858326,4858327,uc011whu.1_up_1_chr1_4858327_f,0,+


In [110]:
# Show the rows whose Stat3 prior value is above 6 for sample SL1040_SL972.
# NOTE(review): `priors` is subscripted as a mapping here; confirm it is not the imported module.
new_prior = priors['2016_11_15_MACS_out_SL1040_SL972']
new_prior.loc[new_prior['Stat3'] > 6]

Unnamed: 0,TF
uc057alc.1_up_1_chr8_35620774_f,7
uc009mmr.2_up_1_chr8_84662853_r,7
uc009nyw.2_up_1_chr8_126593437_r,7
uc011ybs.1_up_1_chr11_86584159_r,7


In [97]:
# Peak file from the 2016_11_22 performance test run.
# This rebinds the motif_bed name that the prior loop also assigns.
motif_bed = (
    '/Users/ndeveaux/Dropbox (Simons Foundation)/atac_parameter_search/ChipSeq/2016_11_22_performance_test_output/SL1040_SL972_peaks.bed'
)

In [12]:
# List every prior key computed so far (format: <ref>_<macs label>_<dist>_<gene>).
list(computed_priors)

['tss_mm10_macs_pval_1e-10_5000_Irf4',
 'tss_mm10_macs_pval_1e-5_5000_Batf',
 'genebodies_mm10_macs_pval_1e-5_5000_Etv6',
 'genebodies_mm10_macs_pval_1e-10_5000_Irf4',
 'tss_mm10_macs_pval_1e-10_5000_Maf',
 'genebodies_mm10_macs_pval_1e-5_10000_Irf4',
 'tss_mm10_macs_pval_1e-10_5000_Etv6',
 'tss_mm10_macs_pval_1e-5_5000_Ctcf',
 'genebodies_mm10_macs_pval_1e-10_10000_Stat3',
 'tss_mm10_macs_pval_1e-10_10000_Irf4',
 'genebodies_mm10_macs_pval_1e-5_10000_RORg',
 'genebodies_mm10_macs_pval_1e-10_5000_Maf',
 'genebodies_mm10_macs_pval_1e-5_5000_Stat3',
 'genebodies_mm10_macs_pval_1e-5_5000_Maf',
 'tss_mm10_macs_pval_1e-10_5000_Fosl2',
 'genebodies_mm10_macs_pval_1e-5_10000_Etv6',
 'genebodies_mm10_macs_pval_1e-5_10000_Stat3',
 'tss_mm10_macs_pval_1e-10_5000_Batf',
 'genebodies_mm10_macs_pval_1e-5_5000_Fosl2',
 'genebodies_mm10_macs_pval_1e-10_5000_Etv6',
 'genebodies_mm10_macs_pval_1e-5_5000_Batf',
 'genebodies_mm10_macs_pval_1e-10_5000_RORg',
 'genebodies_mm10_macs_pval_1e-10_10000_Nrf2',
