In [1]:
import statsmodels
import numpy as np
import torch
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas
import pyfaidx
import numpy
import pyBigWig
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [79]:
def one_hot_encode(sequence, alphabet=['A', 'C', 'G', 'T'], dtype='int8', 
    desc=None, verbose=False, **kwargs):
    """Converts a string or list of characters into a one-hot encoding.

    This function will take in either a string or a list and convert it into a
    one-hot encoding. If the input is a string, each character is assumed to be
    a different symbol, e.g. 'ACGT' is assumed to be a sequence of four 
    characters. If the input is a list, the elements can be any hashable
    symbols.

    Although this function will be used here primarily to convert nucleotide
    sequences into one-hot encoding with an alphabet of size 4, in principle
    this function can be used for any types of sequences.

    Parameters
    ----------
    sequence : str or list
        The sequence to convert to a one-hot encoding.

    alphabet : tuple or list
        A pre-defined alphabet where the ordering of the symbols is the same
        as the column index into the returned array, i.e., for the alphabet
        ['A', 'B'] the returned array will have a 1 in column 0 if the
        character was 'A'. Characters outside the alphabet are ignored and
        their row is left all-zero. The default list is only read, never
        mutated. Default is ['A', 'C', 'G', 'T'].

    dtype : str or numpy.dtype, optional
        The data type of the returned encoding. Default is int8.

    desc : str or None, optional
        The title to display in the progress bar.

    verbose : bool or str, optional
        Whether to display a progress bar. Any value other than the literal
        False enables the bar; its title always comes from `desc`. Default 
        is False.

    kwargs : arguments
        Arguments to be passed into tqdm when the progress bar is enabled.

    Returns
    -------
    ohe : numpy.ndarray
        A binary matrix of shape (sequence_length, alphabet_size) where
        sequence_length is the length of the input sequence and
        alphabet_size is len(alphabet). Callers that need the
        (alphabet_size, sequence_length) layout must transpose the result.
    """

    if isinstance(sequence, str):
        sequence = list(sequence)

    alphabet_lookup = {char: i for i, char in enumerate(alphabet)}

    ohe = numpy.zeros((len(sequence), len(alphabet)), dtype=dtype)

    # Only build the tqdm wrapper when a bar is actually requested. A disabled
    # bar yields exactly the same items, and this function may be called once
    # per locus, so skipping the wrapper avoids needless per-call overhead.
    positions = enumerate(sequence)
    if verbose is not False:
        positions = tqdm(positions, disable=False, desc=desc, **kwargs)

    for i, char in positions:
        idx = alphabet_lookup.get(char)
        if idx is not None:
            ohe[i, idx] = 1

    return ohe

def extract_loci(loci, sequences, signals=None, controls=None, chroms=None, 
    in_window=2114, out_window=1000, max_jitter=0, min_counts=None,
    max_counts=None, n_loci=None, verbose=False):
    """Extract sequences and signals at coordinates from a locus file.

    This function will take in genome-wide sequences, signals, and optionally
    controls, and extract the values of each at the coordinates specified in
    the locus file/s and return them as tensors.

    Signals and controls are both lists with the length of the list, n_s
    and n_c respectively, being the middle dimension of the returned
    tensors. Signals are cut with the output window and controls with the
    input window, each widened by max_jitter on both sides.

    The values for sequences, signals, and controls, can either be filepaths
    or dictionaries of numpy arrays or a mix of the two. When a filepath is 
    passed in it is loaded using pyfaidx or pyBigWig respectively. Files that
    this function opens itself are closed again before it returns; objects
    supplied by the caller are left untouched.

    Parameters
    ----------
    loci: str or pandas.DataFrame or list/tuple of such
        Either the path to a bed file or a pandas DataFrame object whose
        first three columns are the chromosome, the start, and the end, of 
        each locus to train on. Alternatively, a list or tuple of 
        strings/DataFrames where the intention is to train on the interleaved 
        concatenation, i.e., when you want to train on peaks and negatives.
        DataFrames passed in are copied, not modified.

    sequences: str or dictionary
        Either the path to a fasta file to read from or a dictionary where the
        keys are the unique set of chromosomes and the values are one-hot
        encoded sequences, indexed as (position, alphabet), as numpy arrays 
        or memory maps.

    signals: list of strs or list of dictionaries or None, optional
        A list of filepaths to bigwig files, where each filepath will be read
        using pyBigWig, or a list of dictionaries where the keys are the same
        set of unique chromosomes and the values are numpy arrays or memory
        maps. If None, no signal tensor is returned. Default is None.

    controls: list of strs or list of dictionaries or None, optional
        A list of filepaths to bigwig files, where each filepath will be read
        using pyBigWig, or a list of dictionaries where the keys are the same
        set of unique chromosomes and the values are numpy arrays or memory
        maps. If None, no control tensor is returned. Default is None. 

    chroms: list or None, optional
        A set of chromosomes to extract loci from. Loci in other chromosomes
        in the locus file are ignored. If None, all loci are used. Default is
        None.

    in_window: int, optional
        The input window size. Default is 2114.

    out_window: int, optional
        The output window size. Default is 1000.

    max_jitter: int, optional
        The maximum amount of jitter to add, in either direction, to the
        midpoints that are passed in. Default is 0.

    min_counts: float or None, optional
        The minimum number of counts, summed across the length of each example
        and across all tasks, needed to be kept (exclusive bound). Only
        applied when signals are given. If None, no minimum. Default is None.

    max_counts: float or None, optional
        The maximum number of counts, summed across the length of each example
        and across all tasks, needed to be kept (exclusive bound). Only
        applied when signals are given. If None, no maximum. Default is None.  

    n_loci: int or None, optional
        A cap on the number of loci to return. Note that this is not the
        number of loci that are considered. The difference is that some
        loci may be filtered out for various reasons, and those are not
        counted towards the total. If None, no cap. Default is None.

    verbose: bool, optional
        Whether to display a progress bar while loading. Default is False.

    Returns
    -------
    seqs: torch.tensor, shape=(n, 4, in_window+2*max_jitter)
        The extracted sequences in the same order as the loci in the locus
        file after optional filtering by chromosome.

    signals: torch.tensor, shape=(n, len(signals), out_window+2*max_jitter)
        The extracted signals where the first dimension is in the same order
        as loci in the locus file after optional filtering by chromosome and
        the second dimension is in the same order as the list of signal files.
        If no signal files are given, this is not returned.

    controls: torch.tensor, shape=(n, len(controls), in_window+2*max_jitter)
        The extracted controls where the first dimension is in the same order
        as loci in the locus file after optional filtering by chromosome and
        the second dimension is in the same order as the list of control files.
        If no control files are given, this is not returned.

    Notes
    -----
    If a locus file cannot be read or a signal bigwig cannot be opened, a
    message is printed and None is returned instead of the tensors.
    """

    seqs, signals_, controls_ = [], [], []
    in_width, out_width = in_window // 2, out_window // 2
    max_width = max(in_width, out_width)
    names = ['chrom', 'start', 'end']

    # Handles opened by this call. They are closed in the `finally` below so
    # that repeated calls do not accumulate open FASTA/bigWig files.
    opened = []

    try:
        # Load the sequences
        if isinstance(sequences, str):
            sequences = pyfaidx.Fasta(sequences)
            opened.append(sequences)

        if not isinstance(loci, (tuple, list)):
            loci = [loci]

        loci_dfs = []
        for i, df in enumerate(loci):
            if isinstance(df, str):
                try:
                    df = pandas.read_csv(df, sep='\t', usecols=[0, 1, 2], 
                        header=None, index_col=False, names=names)
                except Exception:
                    print("File Doesn't Exist!")
                    return
            else:
                # Work on a copy restricted to the three coordinate columns
                # so the caller's frame is never modified.
                df = df.iloc[:, :3].copy()
                df.columns = names

            # The interleaving key must be set for every input, DataFrames
            # included, because `set_index("idx")` below relies on it.
            df['idx'] = numpy.arange(len(df)) * len(loci) + i
            loci_dfs.append(df)

        loci = pandas.concat(loci_dfs).set_index("idx").sort_index().reset_index(drop=True)

        if chroms is not None:
            loci = loci[numpy.isin(loci['chrom'], chroms)]

        # Load the signal and optional control tracks if filenames are given
        if signals is not None:
            _signals = []
            for signal in signals:
                if isinstance(signal, str):
                    try:
                        signal = pyBigWig.open(signal)
                    except Exception:
                        print("Null File")
                        return
                    opened.append(signal)
                _signals.append(signal)

            signals = _signals

        if controls is not None:
            _controls = []
            for control in controls:
                if isinstance(control, str):
                    control = pyBigWig.open(control, "r")
                    opened.append(control)
                _controls.append(control)

            controls = _controls

        rows = loci.values
        if verbose:
            rows = tqdm(rows, desc="Loading Loci")

        loci_count = 0
        for chrom, start, end in rows:
            # Stop as soon as the cap is reached; nothing after it is kept.
            if n_loci is not None and loci_count == n_loci:
                break

            # Skip loci whose widest window would run off either chromosome end.
            if start - max_width - max_jitter < 0:
                continue

            if end + max_width + max_jitter >= len(sequences[chrom]):
                continue

            mid = start + (end - start) // 2

            # Signals use a window the size of the output
            start = mid - out_width - max_jitter
            end = mid + out_width + max_jitter

            # Extract the signal from each of the signal files
            if signals is not None:
                signals_.append([])
                for signal in signals:
                    if isinstance(signal, dict):
                        signal_ = signal[chrom][start:end]
                    else:
                        signal_ = signal.values(chrom, start, end, numpy=True)
                        signal_ = numpy.nan_to_num(signal_)

                    signals_[-1].append(signal_)

            # For the sequences and controls extract a window the size of the input
            start = mid - in_width - max_jitter
            end = mid + in_width + max_jitter

            # Extract the controls from each of the control files
            if controls is not None:
                controls_.append([])
                for control in controls:
                    if isinstance(control, dict):
                        control_ = control[chrom][start:end]
                    else:
                        control_ = control.values(chrom, start, end, numpy=True)
                        control_ = numpy.nan_to_num(control_)

                    controls_[-1].append(control_)

            # Extract the sequence; both branches yield (alphabet, position)
            if isinstance(sequences, dict):
                seq = sequences[chrom][start:end].T
            else:
                seq = one_hot_encode(sequences[chrom][start:end].seq.upper(), 
                    alphabet=['A', 'C', 'G', 'T']).T

            seqs.append(seq)
            loci_count += 1
    finally:
        for handle in opened:
            handle.close()

    seqs = torch.tensor(numpy.array(seqs), dtype=torch.float32)

    if signals is not None:
        signals_ = torch.tensor(numpy.array(signals_), dtype=torch.float32)

        # Count filters are strict inequalities on the per-locus total signal.
        idxs = torch.ones(signals_.shape[0], dtype=torch.bool)
        if max_counts is not None:
            idxs = (idxs) & (signals_.sum(dim=(1, 2)) < max_counts)
        if min_counts is not None:
            idxs = (idxs) & (signals_.sum(dim=(1, 2)) > min_counts)

        if controls is not None:
            controls_ = torch.tensor(numpy.array(controls_), dtype=torch.float32)
            return seqs[idxs], signals_[idxs], controls_[idxs]

        return seqs[idxs], signals_[idxs]
    else:
        if controls is not None:
            controls_ = torch.tensor(numpy.array(controls_), dtype=torch.float32)
            return seqs, controls_

        return seqs

def extract_signals(loci, sequences, signals=None, controls=None, chroms=None, 
    in_window=2114, out_window=1000, max_jitter=0, min_counts=None,
    max_counts=None, n_loci=None, verbose=False):
    """Extract only the signal tracks at coordinates from a locus file.

    This is the signal-only counterpart of `extract_loci`: it applies the
    same locus loading, chromosome filtering, and lower-bound check, but
    never reads sequences and never checks the upper chromosome bound.

    Parameters
    ----------
    loci: str or pandas.DataFrame or list/tuple of such
        Either the path to a bed file or a pandas DataFrame whose first three
        columns are the chromosome, the start, and the end of each locus, or
        a list/tuple of these to be interleaved. DataFrames passed in are
        copied, not modified.

    sequences: object
        Unused. Accepted so the call signature matches `extract_loci`.

    signals: list of strs or list of dictionaries or None, optional
        A list of filepaths to bigwig files, each read using pyBigWig, or a
        list of dictionaries mapping chromosome to a numpy array. Bigwig 
        files opened here are closed before returning. Default is None.

    controls: object, optional
        Unused. Control tracks were never part of the return value, so they
        are no longer opened or read. Default is None.

    chroms: list or None, optional
        Chromosomes to keep. If None, all loci are used. Default is None.

    in_window: int, optional
        The input window size. It only influences which loci are skipped for
        being too close to the chromosome start. Default is 2114.

    out_window: int, optional
        The width of the extracted signal window. Default is 1000.

    max_jitter: int, optional
        Extra positions added to each side of the window. Default is 0.

    min_counts: float or None, optional
        Keep only loci whose total signal is strictly greater than this.
        If None, no minimum. Default is None.

    max_counts: float or None, optional
        Keep only loci whose total signal is strictly less than this.
        If None, no maximum. Default is None.

    n_loci: int or None, optional
        A cap on the number of loci extracted, not counting skipped loci.
        If None, no cap. Default is None.

    verbose: bool, optional
        Whether to display a progress bar while loading. Default is False.

    Returns
    -------
    signals: torch.tensor, shape=(n, len(signals), out_window+2*max_jitter)
        The extracted signals, or None when `signals` is None, when a locus
        file cannot be read, or when a bigwig file cannot be opened (a
        message is printed in the latter two cases).

    Notes
    -----
    If reading an interval from a bigwig fails, the details are printed and 
    that track is filled with zeros for that locus. This keeps rows aligned 
    with the loci across calls instead of reusing a stale array from a 
    previous iteration.
    """

    signals_ = []
    in_width, out_width = in_window // 2, out_window // 2
    max_width = max(in_width, out_width)
    names = ['chrom', 'start', 'end']

    if not isinstance(loci, (tuple, list)):
        loci = [loci]

    loci_dfs = []
    for i, df in enumerate(loci):
        if isinstance(df, str):
            try:
                df = pandas.read_csv(df, sep='\t', usecols=[0, 1, 2], 
                    header=None, index_col=False, names=names)
            except Exception:
                print("File Doesn't Exist!")
                return
        else:
            # Work on a copy restricted to the three coordinate columns so
            # the caller's frame is never modified.
            df = df.iloc[:, :3].copy()
            df.columns = names

        # The interleaving key must be set for every input, DataFrames
        # included, because `set_index("idx")` below relies on it.
        df['idx'] = numpy.arange(len(df)) * len(loci) + i
        loci_dfs.append(df)

    loci = pandas.concat(loci_dfs).set_index("idx").sort_index().reset_index(drop=True)

    if chroms is not None:
        loci = loci[numpy.isin(loci['chrom'], chroms)]

    if signals is None:
        return None

    # Handles opened by this call; closed in the `finally` below.
    opened = []

    try:
        # Load the signal tracks if filenames are given
        tracks = []
        for signal in signals:
            if isinstance(signal, str):
                try:
                    signal = pyBigWig.open(signal)
                except Exception:
                    print("Null File")
                    return
                opened.append(signal)
            tracks.append(signal)

        rows = loci.values
        if verbose:
            rows = tqdm(rows, desc="Loading Loci")

        loci_count = 0
        for chrom, start, end in rows:
            # Stop as soon as the cap is reached; nothing after it is kept.
            if n_loci is not None and loci_count == n_loci:
                break

            if start - max_width - max_jitter < 0:
                continue

            mid = start + (end - start) // 2
            start = mid - out_width - max_jitter
            end = mid + out_width + max_jitter

            # Extract the signal from each of the signal files
            locus_signals = []
            for signal in tracks:
                if isinstance(signal, dict):
                    signal_ = signal[chrom][start:end]
                else:
                    try:
                        signal_ = signal.values(chrom, start, end, numpy=True)
                        signal_ = numpy.nan_to_num(signal_)
                    except Exception:
                        print("error with interval bounds")
                        print(signal)
                        print(chrom)
                        print(start)
                        print(end)
                        # Always assign a value for this track so a failed
                        # read cannot append an undefined or stale array.
                        signal_ = numpy.zeros(end - start, dtype='float32')

                locus_signals.append(signal_)

            signals_.append(locus_signals)
            loci_count += 1
    finally:
        for handle in opened:
            handle.close()

    signals_ = torch.tensor(numpy.array(signals_), dtype=torch.float32)

    idxs = torch.ones(signals_.shape[0], dtype=torch.bool)
    if max_counts is not None:
        idxs = (idxs) & (signals_.sum(dim=(1, 2)) < max_counts)
    if min_counts is not None:
        idxs = (idxs) & (signals_.sum(dim=(1, 2)) > min_counts)

    return signals_[idxs]
    
def getlinmodel(cell_type, histone_type):
    """Fit and print an OLS model of observed signal on one histone track.

    Reads the module-level `root`, `merged_peak_path`, `sequences`, and
    `histone_paths`. For every locus kept by `extract_signals`, the signal
    is summed over tracks and positions; the summed 5prime pos/neg tracks of
    `cell_type` are then regressed on the summed `histone_type` track.

    Parameters
    ----------
    cell_type: str
        Key used both in the observed-track paths and in `histone_paths`.

    histone_type: str
        Outer key into `histone_paths`.

    Returns
    -------
    None. The statsmodels summary is printed.
    """

    def per_locus_total(track_paths):
        # Sum over the track axis, then over positions -> one value per locus.
        extracted = extract_signals(merged_peak_path, sequences, signals=track_paths)
        return torch.sum(torch.sum(extracted, dim=1), dim=1).numpy()

    observed_tracks = [
        root + "data/procap/observed/{}/5prime.pos.bigWig".format(cell_type),
        root + "data/procap/observed/{}/5prime.neg.bigWig".format(cell_type),
    ]

    dat = pandas.DataFrame()
    dat['actual_levels'] = per_locus_total(observed_tracks)
    dat['histone_levels'] = per_locus_total([histone_paths[histone_type][cell_type]])

    fitted = smf.ols('actual_levels ~ histone_levels', data=dat).fit()
    print(fitted.summary())

def getlinmodel_mult(cell_type, histone_types):
    """Fit and print an OLS model of observed signal on several histone tracks.

    Reads the module-level `root`, `merged_peak_path`, `sequences`, and
    `histone_paths`. Each histone type becomes one regressor column named
    after the histone type; every column holds the per-locus signal summed
    over tracks and positions.

    Parameters
    ----------
    cell_type: str
        Key used both in the observed-track paths and in `histone_paths`.

    histone_types: list of str
        Outer keys into `histone_paths`; also used verbatim as formula terms.

    Returns
    -------
    None. The statsmodels summary is printed.
    """

    def per_locus_total(track_paths):
        # Sum over the track axis, then over positions -> one value per locus.
        extracted = extract_signals(merged_peak_path, sequences, signals=track_paths)
        return torch.sum(torch.sum(extracted, dim=1), dim=1).numpy()

    observed_tracks = [
        root + "data/procap/observed/{}/5prime.pos.bigWig".format(cell_type),
        root + "data/procap/observed/{}/5prime.neg.bigWig".format(cell_type),
    ]

    dat = pandas.DataFrame()
    dat['actual_levels'] = per_locus_total(observed_tracks)
    for mark in histone_types:
        dat[mark] = per_locus_total([histone_paths[mark][cell_type]])

    formula = 'actual_levels ~ ' + '+ '.join(histone_types)
    fitted = smf.ols(formula, data=dat).fit()
    print(fitted.summary())

def preloaded_getlinmodel_mult(cell_type, histone_types, histone_levels):
    """Fit and print an OLS model using already-extracted signal tensors.

    The response is the module-level `initiation_levels[cell_type]`. Each
    regressor is `histone_levels[cell_type][histone_type]` summed over its
    second and third axes, i.e. one value per locus.

    Parameters
    ----------
    cell_type: str
        Key into both `initiation_levels` and `histone_levels`.

    histone_types: list of str
        Inner keys into `histone_levels[cell_type]`; also used verbatim as
        the regressor names in the formula.

    histone_levels: dict
        Nested mapping cell type -> histone type -> 3-D torch tensor, 
        expected to be the output of `extract_signals` for that track.

    Returns
    -------
    None. The statsmodels summary is printed.
    """
    actual_levels = initiation_levels[cell_type]
    actual_levels = actual_levels.numpy()
    dat = pandas.DataFrame()
    dat['actual_levels'] = actual_levels
    for histone_type in histone_types:
        # Index with this function's own `cell_type` argument. The previous
        # `cell_type1` was not a name in this scope, so it either raised a
        # NameError or silently picked up an unrelated global.
        hlevels = torch.sum(torch.sum(histone_levels[cell_type][histone_type], dim=1), dim=1)
        hlevels = hlevels.numpy()
        dat[histone_type] = hlevels

    results = smf.ols('actual_levels ~ ' + '+ '.join(histone_types), data=dat).fit()
    print(results.summary())

def preloaded_getlinmodel_diff(cell_type1, cell_type2, histone_types, histone_levels):
    """Fit and print an OLS model of the squared difference between two cell types.

    The response is the element-wise squared difference of the module-level
    `initiation_levels` for the two cell types. For every histone type two
    regressors are added, one per cell type, each named
    `<histone_type><cell_type>` and holding the per-locus sum of the
    corresponding tensor in `histone_levels`.

    Parameters
    ----------
    cell_type1, cell_type2: str
        Keys into `initiation_levels` and `histone_levels`.

    histone_types: list of str
        Inner keys into `histone_levels[<cell type>]`.

    histone_levels: dict
        Nested mapping cell type -> histone type -> 3-D torch tensor.

    Returns
    -------
    None. The statsmodels summary is printed.
    """
    gap = initiation_levels[cell_type1] - initiation_levels[cell_type2]

    dat = pandas.DataFrame()
    dat['diff_levels'] = torch.square(gap).numpy()

    for mark in histone_types:
        # Same column order as the formula terms: cell type 1, then cell type 2.
        for cell in (cell_type1, cell_type2):
            summed = torch.sum(torch.sum(histone_levels[cell][mark], dim=1), dim=1)
            dat[mark + cell] = summed.numpy()

    # Joining on "<cell>+ " and appending "<cell>" suffixes every histone
    # name with the cell type, matching the column names created above.
    terms_cell1 = '{}+ '.format(cell_type1).join(histone_types) + cell_type1
    terms_cell2 = '{}+ '.format(cell_type2).join(histone_types) + cell_type2

    formula = 'diff_levels ~ ' + terms_cell1 + '+' + terms_cell2
    fitted = smf.ols(formula, data=dat).fit()
    print(fitted.summary())

In [3]:
# Base directory that every data and reference path is built from.
root = "/home/myin25/celltype_specificity/celltype_specificity/"

# Cell types and histone marks covered by the analysis.
cell_types = [
    'K562',
    'CACO2',
    'A673',
    'HUVEC',
]
histone_types = [
    'H3K9me3',
    'H3K4me1',
    'H3K27me3',
    'H3K27ac',
    'H3K36me3',
    'H3K4me3',
    'H3K9ac',
    'H3K79me2',
]

# Reference genome FASTA.
sequences = root + "refs/hg38.fasta"

In [74]:
# Base directory that every data and reference path is built from.
root = "/home/myin25/celltype_specificity/celltype_specificity/"

cell_types = ['K562', 'CACO2','A673', 'HUVEC']
histone_types = ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3', 'H3K9ac', 'H3K79me2']
sequences = root + "refs/hg38.fasta"
merged_peak_path = root + "data/procap/union_peaks_fold1_train.bed.gz"

# The three lookups below are derived from `cell_types` / `histone_types`
# rather than spelled out entry by entry, so adding a cell type or histone
# mark cannot leave one of them out of sync. Key order follows the lists.

# Histone folders: cell type -> directory holding that cell type's tracks
histonefolders = {cell_type: root + "data/" + cell_type for cell_type in cell_types}

# Paths to different histones: histone type -> cell type -> bigWig path
histone_paths = {
    histone_type: {
        cell_type: histonefolders[cell_type] + "/" + histone_type + "/foldchange.bigWig"
        for cell_type in cell_types
    }
    for histone_type in histone_types
}

# Observed 5prime tracks per cell type, negative strand first.
levels_actual_path = {
    cell_type: [
        root + "data/procap/observed/" + cell_type + "/5prime.neg.bigWig",
        root + "data/procap/observed/" + cell_type + "/5prime.pos.bigWig",
    ]
    for cell_type in cell_types
}


# Per-locus signal for every cell type: the observed tracks are extracted at
# the merged peaks, then summed over the track axis and over positions.
initiation_levels = {
    cell_type: torch.sum(
        torch.sum(
            extract_signals(
                merged_peak_path,
                sequences,
                signals=track_paths,
                controls=None,
                chroms=None,
                in_window=2114,
                out_window=1000,
                max_jitter=0,
            ),
            dim=1,
        ),
        dim=1,
    )
    for cell_type, track_paths in levels_actual_path.items()
}


In [9]:
# histone_peak_paths = {cell_type : {histone_type : root + "data/{}/{}/peaks.bigWig".format(cell_type, histone_type) for histone_type in histone_types} for cell_type in cell_types}

# Signal at the merged peaks for every (cell type, histone type) pair.
# NOTE: extract_signals returns None when a bigWig cannot be opened, so
# individual entries may be None rather than a tensor.
histone_levels = {
    cell_type: {
        histone_type: extract_signals(
            merged_peak_path,
            sequences,
            signals=[histone_paths[histone_type][cell_type]],
        )
        for histone_type in histone_types
    }
    for cell_type in cell_types
}



/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/K562/H3K9me3/foldchange.bigWig']
       chrom      start        end
0       chr5  174600055  174600190
1      chr15  101266186  101266192
2       chr5  178730466  178730886
3       chr8  141265328  141265430
4      chr11  122101677  122102099
...      ...        ...        ...
77104  chr10    8057680    8057768
77105   chrX   54495467   54495991
77106   chr5  141223214  141223515
77107   chr5   70586204   70586249
77108   chr5  141557700  141558018

[77109 rows x 3 columns]
[['chr5' 174600055 174600190]
 ['chr15' 101266186 101266192]
 ['chr5' 178730466 178730886]
 ...
 ['chr5' 141223214 141223515]
 ['chr5' 70586204 70586249]
 ['chr5' 141557700 141558018]]
/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/K562/H3K4me1/fol

/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K27me3/foldchange.bigWig']
       chrom      start        end
0       chr5  174600055  174600190
1      chr15  101266186  101266192
2       chr5  178730466  178730886
3       chr8  141265328  141265430
4      chr11  122101677  122102099
...      ...        ...        ...
77104  chr10    8057680    8057768
77105   chrX   54495467   54495991
77106   chr5  141223214  141223515
77107   chr5   70586204   70586249
77108   chr5  141557700  141558018

[77109 rows x 3 columns]
[['chr5' 174600055 174600190]
 ['chr15' 101266186 101266192]
 ['chr5' 178730466 178730886]
 ...
 ['chr5' 141223214 141223515]
 ['chr5' 70586204 70586249]
 ['chr5' 141557700 141558018]]
/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K27ac/

[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K27ac/foldchange.bigWig for reading
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K27ac/foldchange.bigWig for reading
[pyBwOpen] bw is NULL!


/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K4me3/foldchange.bigWig']
       chrom      start        end
0       chr5  174600055  174600190
1      chr15  101266186  101266192
2       chr5  178730466  178730886
3       chr8  141265328  141265430
4      chr11  122101677  122102099
...      ...        ...        ...
77104  chr10    8057680    8057768
77105   chrX   54495467   54495991
77106   chr5  141223214  141223515
77107   chr5   70586204   70586249
77108   chr5  141557700  141558018

[77109 rows x 3 columns]
[['chr5' 174600055 174600190]
 ['chr15' 101266186 101266192]
 ['chr5' 178730466 178730886]
 ...
 ['chr5' 141223214 141223515]
 ['chr5' 70586204 70586249]
 ['chr5' 141557700 141558018]]
/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K9ac/fo

[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K9ac/foldchange.bigWig for reading
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K9ac/foldchange.bigWig for reading
[pyBwOpen] bw is NULL!
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K79me2/foldchange.bigWig for reading
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/CACO2/H3K79me2/foldchange.bigWig for reading
[pyBwOpen] bw is NULL!


/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/A673/H3K4me1/foldchange.bigWig']
       chrom      start        end
0       chr5  174600055  174600190
1      chr15  101266186  101266192
2       chr5  178730466  178730886
3       chr8  141265328  141265430
4      chr11  122101677  122102099
...      ...        ...        ...
77104  chr10    8057680    8057768
77105   chrX   54495467   54495991
77106   chr5  141223214  141223515
77107   chr5   70586204   70586249
77108   chr5  141557700  141558018

[77109 rows x 3 columns]
[['chr5' 174600055 174600190]
 ['chr15' 101266186 101266192]
 ['chr5' 178730466 178730886]
 ...
 ['chr5' 141223214 141223515]
 ['chr5' 70586204 70586249]
 ['chr5' 141557700 141558018]]
/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/A673/H3K27me3/fo

[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/A673/H3K9ac/foldchange.bigWig for reading
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/A673/H3K9ac/foldchange.bigWig for reading
[pyBwOpen] bw is NULL!
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/A673/H3K79me2/foldchange.bigWig for reading
[urlOpen] Couldn't open /home/myin25/celltype_specificity/celltype_specificity/data/A673/H3K79me2/foldchange.bigWig for reading
[pyBwOpen] bw is NULL!


/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/HUVEC/H3K4me1/foldchange.bigWig']
       chrom      start        end
0       chr5  174600055  174600190
1      chr15  101266186  101266192
2       chr5  178730466  178730886
3       chr8  141265328  141265430
4      chr11  122101677  122102099
...      ...        ...        ...
77104  chr10    8057680    8057768
77105   chrX   54495467   54495991
77106   chr5  141223214  141223515
77107   chr5   70586204   70586249
77108   chr5  141557700  141558018

[77109 rows x 3 columns]
[['chr5' 174600055 174600190]
 ['chr15' 101266186 101266192]
 ['chr5' 178730466 178730886]
 ...
 ['chr5' 141223214 141223515]
 ['chr5' 70586204 70586249]
 ['chr5' 141557700 141558018]]
/home/myin25/celltype_specificity/celltype_specificity/data/procap/union_peaks_fold1_train.bed.gz
['/home/myin25/celltype_specificity/celltype_specificity/data/HUVEC/H3K27me3/

In [17]:
# Regress the K562 initiation levels on H3K9me3 alone and print the summary.
preloaded_getlinmodel_mult(
    cell_type="K562",
    histone_types=["H3K9me3"],
    histone_levels=histone_levels,
)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     1874.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:47:21   Log-Likelihood:            -5.9370e+05
No. Observations:               77109   AIC:                         1.187e+06
Df Residuals:                   77107   BIC:                         1.187e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    330.1134      3.811     86.612      0.0

In [18]:
preloaded_getlinmodel_mult("K562", ["H3K4me1"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     626.2
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          1.18e-137
Time:                        17:47:39   Log-Likelihood:            -5.9431e+05
No. Observations:               77109   AIC:                         1.189e+06
Df Residuals:                   77107   BIC:                         1.189e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    155.2234      2.332     66.557      0.0

In [19]:
preloaded_getlinmodel_mult("K562", ["H3K27me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     2040.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:54:51   Log-Likelihood:            -5.9361e+05
No. Observations:               77109   AIC:                         1.187e+06
Df Residuals:                   77107   BIC:                         1.187e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    248.1901      2.342    105.959      0.0

In [20]:
preloaded_getlinmodel_mult("K562", ["H3K27ac"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.279
Model:                            OLS   Adj. R-squared:                  0.279
Method:                 Least Squares   F-statistic:                 2.991e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:54:53   Log-Likelihood:            -5.8198e+05
No. Observations:               77109   AIC:                         1.164e+06
Df Residuals:                   77107   BIC:                         1.164e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -24.9074      2.059    -12.095      0.0

In [21]:
preloaded_getlinmodel_mult("K562", ["H3K36me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     650.1
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          8.21e-143
Time:                        17:54:59   Log-Likelihood:            -5.9430e+05
No. Observations:               77109   AIC:                         1.189e+06
Df Residuals:                   77107   BIC:                         1.189e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    240.8452      2.847     84.588      0.0

In [22]:
preloaded_getlinmodel_mult("K562", ["H3K4me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.278
Model:                            OLS   Adj. R-squared:                  0.278
Method:                 Least Squares   F-statistic:                 2.976e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:00   Log-Likelihood:            -5.8204e+05
No. Observations:               77109   AIC:                         1.164e+06
Df Residuals:                   77107   BIC:                         1.164e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     18.4378      1.923      9.590      0.0

In [23]:
preloaded_getlinmodel_mult("K562", ["H3K9ac"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.283
Model:                            OLS   Adj. R-squared:                  0.283
Method:                 Least Squares   F-statistic:                 3.039e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:02   Log-Likelihood:            -5.8181e+05
No. Observations:               77109   AIC:                         1.164e+06
Df Residuals:                   77107   BIC:                         1.164e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -16.5379      2.022     -8.177      0.0

In [24]:
preloaded_getlinmodel_mult("K562", ["H3K79me2"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     3841.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:04   Log-Likelihood:            -5.9275e+05
No. Observations:               77109   AIC:                         1.185e+06
Df Residuals:                   77107   BIC:                         1.186e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    129.8743      2.116     61.370      0.0

In [25]:
preloaded_getlinmodel_mult("CACO2", ["H3K9me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.034
Model:                            OLS   Adj. R-squared:                  0.034
Method:                 Least Squares   F-statistic:                     2733.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:09   Log-Likelihood:            -6.6757e+05
No. Observations:               77109   AIC:                         1.335e+06
Df Residuals:                   77107   BIC:                         1.335e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   1340.0090     16.454     81.441      0.0

In [26]:
preloaded_getlinmodel_mult("CACO2", ["H3K4me1"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.035
Model:                            OLS   Adj. R-squared:                  0.035
Method:                 Least Squares   F-statistic:                     2796.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:10   Log-Likelihood:            -6.6753e+05
No. Observations:               77109   AIC:                         1.335e+06
Df Residuals:                   77107   BIC:                         1.335e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    949.3455      9.529     99.623      0.0

In [27]:
preloaded_getlinmodel_mult("CACO2", ["H3K27me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     1719.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:11   Log-Likelihood:            -6.6806e+05
No. Observations:               77109   AIC:                         1.336e+06
Df Residuals:                   77107   BIC:                         1.336e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    813.7192      8.682     93.730      0.0

In [28]:
# "CACO2", "H3K27ac" does not exist (dne), so no model is fit for this pair.

In [29]:
preloaded_getlinmodel_mult("CACO2", ["H3K36me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     348.5
Date:                Tue, 27 Jun 2023   Prob (F-statistic):           1.35e-77
Time:                        17:55:14   Log-Likelihood:            -6.6873e+05
No. Observations:               77109   AIC:                         1.337e+06
Df Residuals:                   77107   BIC:                         1.337e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    653.6227      8.750     74.700      0.0

In [30]:
preloaded_getlinmodel_mult("CACO2", ["H3K4me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.248
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                 2.544e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:15   Log-Likelihood:            -6.5791e+05
No. Observations:               77109   AIC:                         1.316e+06
Df Residuals:                   77107   BIC:                         1.316e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    105.3067      5.133     20.515      0.0

In [31]:
# "CACO2", "H3K9ac" does not exist (dne), so no model is fit for this pair.

In [32]:
# "CACO2", "H3K79me2" does not exist (dne), so no model is fit for this pair.

In [33]:
preloaded_getlinmodel_mult("A673", ["H3K9me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     700.0
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          1.42e-153
Time:                        17:55:18   Log-Likelihood:            -6.7274e+05
No. Observations:               77109   AIC:                         1.345e+06
Df Residuals:                   77107   BIC:                         1.346e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    625.6430      8.786     71.206      0.0

In [34]:
preloaded_getlinmodel_mult("A673", ["H3K4me1"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     862.8
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          1.35e-188
Time:                        17:55:19   Log-Likelihood:            -6.7266e+05
No. Observations:               77109   AIC:                         1.345e+06
Df Residuals:                   77107   BIC:                         1.345e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    620.5623      8.116     76.466      0.0

In [35]:
preloaded_getlinmodel_mult("A673", ["H3K27me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     558.9
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          4.01e-123
Time:                        17:55:20   Log-Likelihood:            -6.7281e+05
No. Observations:               77109   AIC:                         1.346e+06
Df Residuals:                   77107   BIC:                         1.346e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    520.8063      6.329     82.287      0.0

In [36]:
preloaded_getlinmodel_mult("A673", ["H3K27ac"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.210
Model:                            OLS   Adj. R-squared:                  0.210
Method:                 Least Squares   F-statistic:                 2.054e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:22   Log-Likelihood:            -6.6399e+05
No. Observations:               77109   AIC:                         1.328e+06
Df Residuals:                   77107   BIC:                         1.328e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -98.4781      6.091    -16.169      0.0

In [37]:
preloaded_getlinmodel_mult("A673", ["H3K36me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     193.6
Date:                Tue, 27 Jun 2023   Prob (F-statistic):           5.78e-44
Time:                        17:55:24   Log-Likelihood:            -6.7299e+05
No. Observations:               77109   AIC:                         1.346e+06
Df Residuals:                   77107   BIC:                         1.346e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    499.4229      6.803     73.412      0.0

In [38]:
preloaded_getlinmodel_mult("A673", ["H3K4me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.175
Model:                            OLS   Adj. R-squared:                  0.175
Method:                 Least Squares   F-statistic:                 1.638e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:55:26   Log-Likelihood:            -6.6566e+05
No. Observations:               77109   AIC:                         1.331e+06
Df Residuals:                   77107   BIC:                         1.331e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     47.8531      5.777      8.283      0.0

In [None]:
# "A673", "H3K9ac" does not exist (dne), so no model is fit for this pair.

In [None]:
# "A673", "H3K79me2" does not exist (dne), so no model is fit for this pair.

In [39]:
preloaded_getlinmodel_mult("HUVEC", ["H3K9me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.028
Model:                            OLS   Adj. R-squared:                  0.028
Method:                 Least Squares   F-statistic:                     2215.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:57:45   Log-Likelihood:            -6.3715e+05
No. Observations:               77109   AIC:                         1.274e+06
Df Residuals:                   77107   BIC:                         1.274e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    529.2274      4.771    110.923      0.0

In [40]:
preloaded_getlinmodel_mult("HUVEC", ["H3K4me1"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     310.2
Date:                Tue, 27 Jun 2023   Prob (F-statistic):           2.71e-69
Time:                        17:57:45   Log-Likelihood:            -6.3808e+05
No. Observations:               77109   AIC:                         1.276e+06
Df Residuals:                   77107   BIC:                         1.276e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    425.4265      4.622     92.047      0.0

In [41]:
preloaded_getlinmodel_mult("HUVEC", ["H3K27me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.031
Model:                            OLS   Adj. R-squared:                  0.031
Method:                 Least Squares   F-statistic:                     2442.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:57:45   Log-Likelihood:            -6.3704e+05
No. Observations:               77109   AIC:                         1.274e+06
Df Residuals:                   77107   BIC:                         1.274e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    465.9563      3.886    119.906      0.0

In [42]:
preloaded_getlinmodel_mult("HUVEC", ["H3K27ac"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.129
Model:                            OLS   Adj. R-squared:                  0.129
Method:                 Least Squares   F-statistic:                 1.137e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:57:45   Log-Likelihood:            -6.3293e+05
No. Observations:               77109   AIC:                         1.266e+06
Df Residuals:                   77107   BIC:                         1.266e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    157.3123      3.773     41.692      0.0

In [43]:
preloaded_getlinmodel_mult("HUVEC", ["H3K36me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     1654.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:57:46   Log-Likelihood:            -6.3742e+05
No. Observations:               77109   AIC:                         1.275e+06
Df Residuals:                   77107   BIC:                         1.275e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    471.6169      4.202    112.230      0.0

In [44]:
preloaded_getlinmodel_mult("HUVEC", ["H3K4me3"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.284
Model:                            OLS   Adj. R-squared:                  0.284
Method:                 Least Squares   F-statistic:                 3.052e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:57:46   Log-Likelihood:            -6.2538e+05
No. Observations:               77109   AIC:                         1.251e+06
Df Residuals:                   77107   BIC:                         1.251e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     87.1002      3.324     26.205      0.0

In [45]:
preloaded_getlinmodel_mult("HUVEC", ["H3K9ac"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.295
Model:                            OLS   Adj. R-squared:                  0.295
Method:                 Least Squares   F-statistic:                 3.225e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        17:57:46   Log-Likelihood:            -6.2477e+05
No. Observations:               77109   AIC:                         1.250e+06
Df Residuals:                   77107   BIC:                         1.250e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     40.3000      3.415     11.799      0.0

In [46]:
preloaded_getlinmodel_mult("HUVEC", ["H3K79me2"], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.018
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     1398.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          2.85e-303
Time:                        17:57:47   Log-Likelihood:            -6.3754e+05
No. Observations:               77109   AIC:                         1.275e+06
Df Residuals:                   77107   BIC:                         1.275e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    307.1675      3.797     80.891      0.0

In [63]:
preloaded_getlinmodel_mult('K562', ['H3K27ac', 'H3K4me3', 'H3K9ac'], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.335
Model:                            OLS   Adj. R-squared:                  0.335
Method:                 Least Squares   F-statistic:                 1.293e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:09:01   Log-Likelihood:            -5.7891e+05
No. Observations:               77109   AIC:                         1.158e+06
Df Residuals:                   77105   BIC:                         1.158e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -48.9447      2.035    -24.049      0.0

In [64]:
preloaded_getlinmodel_mult('CACO2', ['H3K4me1', 'H3K4me3'], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.251
Model:                            OLS   Adj. R-squared:                  0.251
Method:                 Least Squares   F-statistic:                 1.289e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:09:05   Log-Likelihood:            -6.5778e+05
No. Observations:               77109   AIC:                         1.316e+06
Df Residuals:                   77106   BIC:                         1.316e+06
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    237.7359      9.661     24.607      0.0

In [66]:
preloaded_getlinmodel_mult('A673', ['H3K27ac', 'H3K4me3'], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.240
Model:                            OLS   Adj. R-squared:                  0.240
Method:                 Least Squares   F-statistic:                 1.215e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:09:16   Log-Likelihood:            -6.6253e+05
No. Observations:               77109   AIC:                         1.325e+06
Df Residuals:                   77106   BIC:                         1.325e+06
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -144.5097      6.036    -23.942      0.0

In [67]:
preloaded_getlinmodel_mult('HUVEC', ['H3K4me1', 'H3K4me3', 'H3K9ac'], histone_levels)

                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.353
Model:                            OLS   Adj. R-squared:                  0.353
Method:                 Least Squares   F-statistic:                 1.402e+04
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:09:20   Log-Likelihood:            -6.2145e+05
No. Observations:               77109   AIC:                         1.243e+06
Df Residuals:                   77105   BIC:                         1.243e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     85.2987      4.153     20.540      0.0

In [69]:
# Multi-predictor fit for 'K562' with eight marks (H3K9me3, H3K4me1, H3K27me3,
# H3K27ac, H3K36me3, H3K4me3, H3K9ac, H3K79me2), passing `histone_levels` as
# the data source. The captured output for this cell is an "OLS Regression
# Results" summary with dependent variable `actual_levels`, Df Model = 8 and
# R-squared = 0.370.
# NOTE(review): the helper is defined outside this section -- presumably
# 'K562' is a cell-line key and one model term is added per listed mark;
# confirm against the helper's definition.
preloaded_getlinmodel_mult('K562', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3', 'H3K9ac', 'H3K79me2'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.370
Model:                            OLS   Adj. R-squared:                  0.370
Method:                 Least Squares   F-statistic:                     5661.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:15:52   Log-Likelihood:            -5.7681e+05
No. Observations:               77109   AIC:                         1.154e+06
Df Residuals:                   77100   BIC:                         1.154e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     36.4299      4.236      8.601      0.0

In [71]:
# Multi-predictor fit for 'CACO2' with five marks (H3K9me3, H3K4me1, H3K27me3,
# H3K36me3, H3K4me3), passing `histone_levels` as the data source. The
# captured output for this cell is an "OLS Regression Results" summary with
# dependent variable `actual_levels`, Df Model = 5 and R-squared = 0.269.
# The trailing comma inside the list literal is harmless.
# NOTE(review): this list is shorter than the eight-mark lists used for other
# keys in this notebook -- presumably the remaining marks are not available
# for 'CACO2'; confirm against the contents of `histone_levels`.
preloaded_getlinmodel_mult('CACO2', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K36me3', 'H3K4me3',], histone_levels)


                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.269
Model:                            OLS   Adj. R-squared:                  0.269
Method:                 Least Squares   F-statistic:                     5678.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:16:45   Log-Likelihood:            -6.5682e+05
No. Observations:               77109   AIC:                         1.314e+06
Df Residuals:                   77103   BIC:                         1.314e+06
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    796.1437     17.476     45.555      0.0

In [72]:
# Multi-predictor fit for 'A673' with six marks (H3K9me3, H3K4me1, H3K27me3,
# H3K27ac, H3K36me3, H3K4me3), passing `histone_levels` as the data source.
# The captured output for this cell is an "OLS Regression Results" summary
# with dependent variable `actual_levels`, Df Model = 6 and R-squared = 0.264.
# NOTE(review): the helper is defined outside this section -- presumably
# 'A673' is a cell-line key and one model term is added per listed mark;
# confirm against the helper's definition.
preloaded_getlinmodel_mult('A673', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.264
Model:                            OLS   Adj. R-squared:                  0.264
Method:                 Least Squares   F-statistic:                     4605.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:17:16   Log-Likelihood:            -6.6128e+05
No. Observations:               77109   AIC:                         1.323e+06
Df Residuals:                   77102   BIC:                         1.323e+06
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    239.7871     11.869     20.204      0.0

In [73]:
# Multi-predictor fit for 'HUVEC' with eight marks (H3K9me3, H3K4me1,
# H3K27me3, H3K27ac, H3K36me3, H3K4me3, H3K9ac, H3K79me2), passing
# `histone_levels` as the data source. The captured output for this cell is an
# "OLS Regression Results" summary with dependent variable `actual_levels`,
# Df Model = 8 and R-squared = 0.386 (adjusted 0.385).
# NOTE(review): the helper is defined outside this section -- presumably
# 'HUVEC' is a cell-line key and one model term is added per listed mark;
# confirm against the helper's definition.
preloaded_getlinmodel_mult('HUVEC', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3', 'H3K9ac', 'H3K79me2'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:          actual_levels   R-squared:                       0.386
Model:                            OLS   Adj. R-squared:                  0.385
Method:                 Least Squares   F-statistic:                     6046.
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        18:17:22   Log-Likelihood:            -6.1946e+05
No. Observations:               77109   AIC:                         1.239e+06
Df Residuals:                   77100   BIC:                         1.239e+06
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    243.8641      6.544     37.263      0.0

In [83]:
# Difference model for the pair 'K562' / 'CACO2' with five marks (H3K9me3,
# H3K4me1, H3K27me3, H3K36me3, H3K4me3), passing `histone_levels` as the data
# source. The captured output for this cell is an "OLS Regression Results"
# summary with dependent variable `diff_levels`, Df Model = 10 and
# R-squared = 0.026.
# NOTE(review): Df Model is twice the number of listed marks -- presumably the
# helper adds one term per mark per key; the helper is defined outside this
# section, so confirm there. The recorded intercept is on the order of 1e6,
# so `diff_levels` presumably is not a plain difference on the scale of
# `actual_levels` -- verify how it is computed.
preloaded_getlinmodel_diff('K562', 'CACO2', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K36me3', 'H3K4me3'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:            diff_levels   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.026
Method:                 Least Squares   F-statistic:                     205.0
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        21:13:21   Log-Likelihood:            -1.3785e+06
No. Observations:               77109   AIC:                         2.757e+06
Df Residuals:                   77098   BIC:                         2.757e+06
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      2.633e+06   2.14e+05     12.298

In [84]:
# Difference model for the pair 'K562' / 'A673' with six marks (H3K9me3,
# H3K4me1, H3K27me3, H3K27ac, H3K36me3, H3K4me3), passing `histone_levels` as
# the data source. The captured output for this cell is an "OLS Regression
# Results" summary with dependent variable `diff_levels`, Df Model = 12 and
# R-squared = 0.016.
# NOTE(review): Df Model is twice the number of listed marks -- presumably the
# helper adds one term per mark per key; the helper is defined outside this
# section, so confirm there.
preloaded_getlinmodel_diff('K562', 'A673', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:            diff_levels   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                  0.016
Method:                 Least Squares   F-statistic:                     103.3
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          5.26e-256
Time:                        21:13:27   Log-Likelihood:            -1.4372e+06
No. Observations:               77109   AIC:                         2.874e+06
Df Residuals:                   77096   BIC:                         2.874e+06
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      1.05e+06   3.53e+05      2.975   

In [85]:
# Difference model for the pair 'K562' / 'HUVEC' with eight marks (H3K9me3,
# H3K4me1, H3K27me3, H3K27ac, H3K36me3, H3K4me3, H3K9ac, H3K79me2), passing
# `histone_levels` as the data source. The captured output for this cell is an
# "OLS Regression Results" summary with dependent variable `diff_levels`,
# Df Model = 16 and R-squared = 0.032.
# NOTE(review): Df Model is twice the number of listed marks -- presumably the
# helper adds one term per mark per key; the helper is defined outside this
# section, so confirm there.
preloaded_getlinmodel_diff('K562', 'HUVEC', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3', 'H3K9ac', 'H3K79me2'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:            diff_levels   R-squared:                       0.032
Model:                            OLS   Adj. R-squared:                  0.032
Method:                 Least Squares   F-statistic:                     161.0
Date:                Tue, 27 Jun 2023   Prob (F-statistic):               0.00
Time:                        21:14:31   Log-Likelihood:            -1.3184e+06
No. Observations:               77109   AIC:                         2.637e+06
Df Residuals:                   77092   BIC:                         2.637e+06
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      4.203e+05   7.37e+04      5.707

In [87]:
# Difference model for the pair 'CACO2' / 'A673' with five marks (H3K9me3,
# H3K4me1, H3K27me3, H3K36me3, H3K4me3), passing `histone_levels` as the data
# source. The captured output for this cell is an "OLS Regression Results"
# summary with dependent variable `diff_levels`, Df Model = 10 and
# R-squared = 0.007.
# NOTE(review): Df Model is twice the number of listed marks -- presumably the
# helper adds one term per mark per key; the helper is defined outside this
# section, so confirm there.
preloaded_getlinmodel_diff('CACO2', 'A673', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K36me3', 'H3K4me3'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:            diff_levels   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.007
Method:                 Least Squares   F-statistic:                     55.40
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          3.27e-112
Time:                        21:15:31   Log-Likelihood:            -1.4009e+06
No. Observations:               77109   AIC:                         2.802e+06
Df Residuals:                   77098   BIC:                         2.802e+06
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept       2.11e+06   2.91e+05      7.247

In [90]:
# Difference model for the pair 'CACO2' / 'HUVEC' with five marks (H3K9me3,
# H3K4me1, H3K27me3, H3K36me3, H3K4me3), passing `histone_levels` as the data
# source. The captured output for this cell is an "OLS Regression Results"
# summary with dependent variable `diff_levels`, Df Model = 10 and
# R-squared = 0.011.
# NOTE(review): Df Model is twice the number of listed marks -- presumably the
# helper adds one term per mark per key; the helper is defined outside this
# section, so confirm there.
preloaded_getlinmodel_diff('CACO2', 'HUVEC', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K36me3', 'H3K4me3'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:            diff_levels   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     86.79
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          5.43e-179
Time:                        21:16:10   Log-Likelihood:            -1.3598e+06
No. Observations:               77109   AIC:                         2.720e+06
Df Residuals:                   77098   BIC:                         2.720e+06
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      1.756e+06   1.68e+05     10.463

In [91]:
# Difference model for the pair 'A673' / 'HUVEC' with six marks (H3K9me3,
# H3K4me1, H3K27me3, H3K27ac, H3K36me3, H3K4me3), passing `histone_levels` as
# the data source. The captured output for this cell is an "OLS Regression
# Results" summary with dependent variable `diff_levels`, Df Model = 12 and
# R-squared = 0.011.
# NOTE(review): Df Model is twice the number of listed marks -- presumably the
# helper adds one term per mark per key; the helper is defined outside this
# section, so confirm there.
preloaded_getlinmodel_diff('A673', 'HUVEC', ['H3K9me3', 'H3K4me1', 'H3K27me3', 'H3K27ac', 'H3K36me3', 'H3K4me3'], histone_levels)


                            OLS Regression Results                            
Dep. Variable:            diff_levels   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.011
Method:                 Least Squares   F-statistic:                     73.19
Date:                Tue, 27 Jun 2023   Prob (F-statistic):          3.02e-179
Time:                        21:16:33   Log-Likelihood:            -1.3977e+06
No. Observations:               77109   AIC:                         2.795e+06
Df Residuals:                   77096   BIC:                         2.796e+06
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      8.908e+05   1.93e+05      4.627