In [3]:
import inferelator_ng.prior as priors
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Generate ChipSeq priors from metadata and mm10 references
This notebook loops over different references (genebody and tss), different distances to references, samples, and genes per sample, to create priors. Priors are also combined across samples. 

In [4]:
base_dir='/Users/ndeveaux/Dropbox (Simons Foundation)/atac_parameter_search/ChipSeq/chipseq_priors/chipseq_mm10_th17_prior_gen_2017_01'

In [5]:
input_base_dir = os.path.join(base_dir, 'references')
output_base_dir = os.path.join(base_dir, 'output')

In [6]:
if not os.path.exists(output_base_dir):
    os.makedirs(output_base_dir)

In [7]:
macs_folders = ['2016_11_28_MACS_out', '2016_11_15_MACS_out']
references = ['tss_mm10.bed', 'genebodies_mm10.bed']
macs_dict = {'2016_11_28_MACS_out': 'macs_pval_1e-10', '2016_11_15_MACS_out': 'macs_pval_1e-5'}

In [8]:
metadata_file = os.path.join(input_base_dir, 'metadata_ChIP.txt')
metadata = pd.read_csv(metadata_file, sep = '\t')

In [9]:
dists=[5000, 10000]

In [9]:
computed_priors = {}

In [54]:
# compute priors
for row in metadata.iterrows():
    row_content = row[1]
    sample = ('_').join([row_content['sampleName'], row_content['compare2']])
    gene = row_content['gene']
    for ref in references:
        ref_bed = os.path.join(input_base_dir, ref)
        # get human readable name
        ref_name = ref.split('.')[0]
        ref_df = pd.read_csv(ref_bed, sep = '\t', header=None)
        for mac in macs_folders:
            mac_name = macs_dict[mac]
            for dist in dists:
                motif_bed = os.path.join(os.path.join(os.path.join(input_base_dir, mac), sample), '_'.join([sample, 'peaks.bed']))
                p = priors.Prior(motif_bed, ref_bed, regulators = [gene], targets = list(ref_df[3]), max_distance=dist, single_tf=True, mode='window')
                key = '_'.join([ref_name, mac_name, str(dist), gene])
                print key + ': ' + sample
                if key not in computed_priors.keys():
                    computed_priors[key] = {}
                if sample not in computed_priors[key].keys():
                    computed_priors[key][sample] = p.make_prior()

tss_mm10_macs_pval_1e-10_5000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-10_10000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-5_5000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-5_10000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-10_5000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-10_10000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-5_5000_Batf: SL3033_SL3036
genebodies_mm10_macs_pval_1e-5_10000_Batf: SL3033_SL3036
tss_mm10_macs_pval_1e-10_5000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-10_10000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-5_5000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-5_10000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-10_5000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-10_10000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-5_5000_Batf: SL3037_SL3036
genebodies_mm10_macs_pval_1e-5_10000_Batf: SL3037_SL3036
tss_mm10_macs_pval_1e-10_5000_Batf: SL10570_SL10564
tss_mm10_macs_pval_1e-10_10000_Batf: SL10570_SL10564
tss_mm10_macs_pval_1e-5_5000_Batf: SL10

In [55]:
# combine priors into dataframe, sum and save
dataframes = {}
for k in computed_priors:
    gene = k.split('_')[-1]
    result = pd.DataFrame([computed_priors[k][sample][gene].rename(sample) for sample in computed_priors[k]]).transpose()
    result['combined_' + gene] = result.sum(axis=1)
    output_filename = '.'.join([k, 'csv'])
    result.to_csv(os.path.join(output_base_dir, output_filename))
    dataframes[k] = result

In [73]:
dataframes = {}
for f in os.listdir(output_base_dir):
    if 'Nick' not in f:
        dataframes[f.rstrip('.csv')] = pd.read_csv(os.path.join(output_base_dir, f), index_col = 0)


In [74]:
kgXref = pd.read_csv(os.path.join(base_dir, 'kgXref.txt'), sep='\t', header=None, index_col=0)

In [75]:
# take the dataframes per sample and combined them to a per gene df
collapsed_on_genes_tfs = {}
for ref in references:
    ref_name = ref.split('.')[0]
    for mac in macs_folders:
        mac_name = macs_dict[mac]
        for dist in dists:
            new_key = '_'.join([ref_name, mac_name, str(dist)])
            print new_key
            columns = []
            for gene in set(metadata['gene']):
                key = '_'.join([ref_name, mac_name, str(dist), gene])
                columns.append(dataframes[key]['combined_' + gene].rename(gene))
            collapsed_on_genes_tfs[new_key] = pd.DataFrame(columns).transpose()


tss_mm10_macs_pval_1e-10_5000
tss_mm10_macs_pval_1e-10_10000
tss_mm10_macs_pval_1e-5_5000
tss_mm10_macs_pval_1e-5_10000
genebodies_mm10_macs_pval_1e-10_5000
genebodies_mm10_macs_pval_1e-10_10000
genebodies_mm10_macs_pval_1e-5_5000
genebodies_mm10_macs_pval_1e-5_10000


In [77]:
collapsed_on_genes_tfs['genebodies_mm10_macs_pval_1e-10_5000'].head()

Unnamed: 0,Batf,Stat3,Etv6,RORg,Irf4,Maf,Hif1a,Fosl2,Nrf2,Ctcf
uc007aet.1,0,0,0,0,0,0,0,0,0,0
uc007aeu.1,1,0,0,0,0,0,0,0,0,0
uc007aev.1,0,0,0,0,0,0,0,0,0,0
uc007aew.1,0,0,0,1,0,0,0,0,0,3
uc007aex.2,0,0,0,0,0,0,0,0,0,1


In [89]:
renamed_dfs

{'genebodies_mm10_macs_pval_1e-10_10000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'genebodies_mm10_macs_pval_1e-10_5000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'genebodies_mm10_macs_pval_1e-5_10000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'genebodies_mm10_macs_pval_1e-5_5000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'tss_mm10_macs_pval_1e-10_10000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'tss_mm10_macs_pval_1e-10_5000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'tss_mm10_macs_pval_1e-5_10000': Empty DataFrame
 Columns: [Batf, Stat3, Etv6, RORg, Irf4, Maf, Hif1a, Fosl2, Nrf2, Ctcf]
 Index: [], 'tss_mm10_macs_pval_1e-5_5000': 

In [86]:
# Visualization
df = renamed_dfs['genebodies_mm10_macs_pval_1e-10_5000']

In [87]:
def plot(df, title):
    -
    sns.heatmap(df.head(2000), annot=False, fmt="h", yticklabels=False)
    plt.title(title)

In [88]:
plot(df, 'genebodies_mm10_macs_pval_1e-10_5000')

ValueError: zero-size array to reduction operation minimum which has no identity

<matplotlib.figure.Figure at 0x129a7d810>

In [99]:
# Add kgXref names
renamed_dfs = {}
for key in collapsed_on_genes_tfs:
    df = collapsed_on_genes_tfs[key]
    new_names = {}
    print key
    for row in df.index:
        gene_name = str(kgXref.loc[row.split('_')[0], 4])
        # If there's a conflict, i.e. ucsc ids with the same gene symbol, only take the maximum value
        add_new_key = True
        # copying the dictionary to loop because of dynamic changes
        if gene_name in new_names.values():
            for k in dict(new_names):
                if new_names[k] == gene_name:
                    if sum(df.loc[k, :]) > sum(df.loc[row, :]):
                        new_names.pop(k, None)
                    else:
                        add_new_key = False
        if add_new_key:
            new_names[row] = gene_name
    renamed_dfs[key] = df.loc[new_names.keys(), :].rename(index=new_names, columns=str)
    output_filename = '.'.join([key, 'csv'])
    renamed_dfs[key].to_csv(os.path.join(output_base_dir, output_filename))

genebodies_mm10_macs_pval_1e-5_10000
tss_mm10_macs_pval_1e-10_10000
tss_mm10_macs_pval_1e-5_5000
tss_mm10_macs_pval_1e-5_10000
tss_mm10_macs_pval_1e-10_5000
genebodies_mm10_macs_pval_1e-10_5000
genebodies_mm10_macs_pval_1e-10_10000
genebodies_mm10_macs_pval_1e-5_5000


In [98]:
renamed_dfs['genebodies_mm10_macs_pval_1e-5_10000'].head

<bound method DataFrame.head of                Batf  Stat3  Etv6  RORg  Irf4  Maf  Hif1a  Fosl2  Nrf2  Ctcf
Olfr1532-ps1      0      0     0     0     0    0      0      0     0     1
Sis               0      0     0     0     0    0      0      0     0     0
Olfr1140          0      0     0     0     0    0      0      0     1     0
Gm3219            0      0     0     0     0    2      0      2     3     2
Sfta2             1      0     0     1     0    0      0      2     1     0
Vezt             22      0     0     3    12    7      2     13     1     7
Rnu6              2      0     0     0     0    0      0      2     0     0
Ict1             13      6     6    11    11    6      4      4     4     6
Afg3l2            7      0     1     0     3    0      4      5     1     3
DQ704580          0      0     0     0     0    0      0      0     0     0
5730409E04Rik     2      0     0     0     0    2      0      0     2     0
Tnfrsf13b         4      2     3     3     4    3      0

Below is some draft space from commands I ran when working on this:

In [101]:
# Sanity check: see if prior is what's expected, 7. 
max(priors['2016_11_15_MACS_out_SL1040_SL972']['Stat3'])

7

In [72]:
# save
for k in renamed_dfs:
    result = renamed_dfs[k]
    output_filename = '.'.join([k, 'csv'])
    result.to_csv(os.path.join(output_base_dir, output_filename))

In [84]:
gene_df[gene_df[3] == 'uc011whu.1_up_1_chr1_4858327_f']

Unnamed: 0,0,1,2,3,4,5
18,chr1,4858326,4858327,uc011whu.1_up_1_chr1_4858327_f,0,+


In [110]:
new_prior = priors['2016_11_15_MACS_out_SL1040_SL972']
new_prior[new_prior['Stat3'] > 6]

Unnamed: 0,TF
uc057alc.1_up_1_chr8_35620774_f,7
uc009mmr.2_up_1_chr8_84662853_r,7
uc009nyw.2_up_1_chr8_126593437_r,7
uc011ybs.1_up_1_chr11_86584159_r,7


In [97]:
motif_bed='/Users/ndeveaux/Dropbox (Simons Foundation)/atac_parameter_search/ChipSeq/2016_11_22_performance_test_output/SL1040_SL972_peaks.bed'