In [1]:
%%bash

grep '>' ../data/svm/fasta/pos_set_filtered_500bp_test.mm10.fa | wc -l

9686


# GkmExplain

- Interpret model by running GkmExplain on:
    - posititve test set
    - chicken heart-only enhancer

In [None]:
%%bash

HERE="/project/MDL_Ibrahim/MP_all/paper"
FASTA=$HERE/data/svm/fasta
MODEL=$HERE/data/svm/lsgkm_wrbf_c10_g2.model.txt
OUT=$HERE/data/svm/scores

EXPLAIN=$HERE/code/dev/gkmexplain/lsgkm-svr/scripts/parallelize_gkmexplain.sh

$EXPLAIN $FASTA/pos_set_filtered_500bp_test.mm10.fa 58 29 $MODEL $OUT

In [22]:
! wc -l ../data/svm/scores/*

     6804 ../data/svm/scores/heart_union_enhancer_pos.galGal6.hypscore.txt
     9686 ../data/svm/scores/pos_set_filtered_500bp_test.mm10.hypscore.txt
      100 ../data/svm/scores/pos_test_100.hypscore.txt
    16590 total


# TFMoDisco

## Functions

In [None]:
import numpy as np
import os 

def get_basename(path):
    out = os.path.basename(os.path.splitext(path)[0])
    return out

def find_invalid_shapes(array_list, expected_shape):
    invalid_indices = []
    
    for idx, array in enumerate(array_list):
        if array.shape != expected_shape:
            invalid_indices.append(idx)
    
    return invalid_indices

In [None]:
## original function from 
# rewrite to adjust shape of array to (4, len) vs (len, 4) 

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    assert one_hot_axis==0 or one_hot_axis==1
    if (one_hot_axis==0):
        assert zeros_array.shape[1] == len(sequence)
    elif (one_hot_axis==1): 
        assert zeros_array.shape[0] == len(sequence)
    #will mutate zeros_array
    for (i,char) in enumerate(sequence):
        if (char=="A" or char=="a"):
            char_idx = 0
        elif (char=="C" or char=="c"):
            char_idx = 1
        elif (char=="G" or char=="g"):
            char_idx = 2
        elif (char=="T" or char=="t"):
            char_idx = 3
        elif (char=="N" or char=="n"):
            continue #leave that pos as all 0's
        else:
            raise RuntimeError("Unsupported character: "+str(char))
        if (one_hot_axis==0):
            zeros_array[char_idx,i] = 1
        elif (one_hot_axis==1):
            zeros_array[i,char_idx] = 1
            
def one_hot_encode_along_channel_axis(sequence):
    to_return = np.zeros((4, len(sequence)), dtype=np.int8)
    seq_to_one_hot_fill_in_array(zeros_array=to_return,
                                 sequence=sequence, one_hot_axis=0)
    return to_return

In [None]:
def normalize_scores(impscores, hyp_impscores, onehot_data):
  #normalize the hyp scores such that, at each position, hypothetical importance
  # scores that have the same sign as the original importance score all sum
  # up to the original importance score value. The rationale is that if
  # multiple different bases at a position could produce a similar score,
  # the specific identity of each individual base is less important.
  #Empirically, hypothetical scores like these appear to work better for
  # motif discovery. Using normalized importance scores derived by taking
  # the elementwise product of the normalized hypothetical scores and
  # the one-hot encoding also seems to reduce noise.
  normed_hyp_impscores = []
  normed_impscores = []
  for i in range(len(impscores)):
      imp_score_each_pos = np.sum(impscores[i],axis=-1)
      imp_score_sign_each_pos = np.sign(imp_score_each_pos)
      hyp_scores_same_sign_mask = (np.sign(hyp_impscores[i])
                                   *imp_score_sign_each_pos[:,None] > 0)
      hyp_scores_same_sign_imp_scores_sum = np.sum(
          hyp_impscores[i]*hyp_scores_same_sign_mask,axis=-1)
      ##!!! with repeats, some impscore = 0 after ohe element-wise multiplication
      ## --> solution: add pseudo count before calculate norm ratio
      hyp_scores_same_sign_imp_scores_sum[hyp_scores_same_sign_imp_scores_sum == 0] = 1e-10
      
      norm_ratio = imp_score_each_pos/hyp_scores_same_sign_imp_scores_sum
      norm_hyp = hyp_impscores[i]*norm_ratio[:,None]
      normed_hyp_impscores.append(norm_hyp)
      normed_impscores.append(norm_hyp*onehot_data[i])
  return normed_impscores, normed_hyp_impscores

In [None]:
def get_MoLite_inputs(score_file, fasta_file, out=None, out_npz=True):
    ##some lines from score files have issues, remove those
    idx_to_delete= [i for i,x in enumerate(open(score_file)) if len(x.rstrip().split("\t")) < 3]
    # read in seq
    seq=[x.rstrip() 
         for i,x in enumerate(open(fasta_file)) if i%2==1]
    # one-hot encode seq
    ohe=[np.array(one_hot_encode_along_channel_axis(x)) 
                for x in seq if ('N' not in x)] # shape (4,len(seq))
    # remove seq without score from ohe
    ohe = [ohe[i] for i in range(len(ohe)) if i not in idx_to_delete]
    
    ## parse calculated attribute score from lsgkm model and transpose to match array shape
    hypscore=[w[0].T for w in zip([
    np.array( [[float(z) for z in y.split(",")]
                for y in x.rstrip().split("\t")[2].split(";")])
    for x in open(score_file) if len(x.rstrip().split("\t")) >= 3],seq) if 'N' not in w[1]]
    
    # some scores are of the wrong shape, i.e. not all 500bp were scored, remove from seq & score
    score_idx_rm = find_invalid_shapes(hypscore, (4,500))
    
    ohe = [ohe[i] for i in range(len(ohe)) if i not in score_idx_rm]
    hypscore = [hypscore[i] for i in range(len(hypscore)) if i not in score_idx_rm]
    
    # compute importance score
    impscore=[x * y for x,y in zip(hypscore, ohe)]
    # sanity check
    assert (np.max([np.max(np.abs(z*y - x))for x,y,z in zip(impscore,ohe,hypscore)]))==0
    norm_impscore, norm_hypscore = normalize_scores(impscore, hypscore, ohe)
    #save as npz 
    if (out_npz):
        filename = get_basename(fasta_file)
        np.savez_compressed(os.path.join(out, filename + str('_hypscore')), arr_0=norm_hypscore)
        np.savez_compressed(os.path.join(out, filename + str('_seq')), arr_0=ohe)
    else:
        return norm_impscore, norm_hypscore

## Prepare inputs for TFMoDisco

In [11]:
get_MoLite_inputs(score_file = '../data/svm/scores/pos_set_filtered_500bp_test.mm10.hypscore.txt',
                  fasta_file = '../data/svm/fasta/pos_set_filtered_500bp_test.mm10.fa',
                  out = '../data/svm/npz')

In [13]:
get_MoLite_inputs(score_file = '../data/svm/scores/heart_union_enhancer_pos.galGal6.hypscore.txt',
                  fasta_file = '../data/svm/fasta/heart_union_enhancer_pos.galGal6.fa',
                  out = '../data/svm/npz')

double check shape of inputs

In [12]:
score=np.load('../data/svm/npz/pos_set_filtered_500bp_test.mm10_hypscore.npz', allow_pickle=True)
ohe=np.load('../data/svm/npz/pos_set_filtered_500bp_test.mm10_seq.npz', allow_pickle=True)

for key in score.files:
    print(f"The shape of the attribution scores is {score[key].shape}")

for key in ohe.files:
    print(f"The shape of the one-hot encoding is {ohe[key].shape}")

The shape of the attribution scores is (9626, 4, 500)
The shape of the one-hot encoding is (9626, 4, 500)


## Run MoDisco

In [17]:
%%bash

HERE=$(realpath ..)
NPZ=$HERE/data/svm/npz
MODIR=$HERE/data/svm/modisco
JASPAR=$HERE/_data/references/motifs/filtered_JASPAR_300TFs.meme

modisco motifs \
-s $NPZ/pos_set_filtered_500bp_test.mm10_seq.npz \
-a $NPZ/pos_set_filtered_500bp_test.mm10_hypscore.npz \
-n 1000 \
-w 500 \
-o $MODIR/pos_test_mm10_max1000seqlet_results.h5

modisco report \
-i $MODIR/pos_test_mm10_max1000seqlet_results.h5 \
-o $MODIR/pos_test_mm10_max1000seqlet \
-m $JASPAR

modisco motifs \
-s $NPZ/pos_set_filtered_500bp_test.mm10_seq.npz \
-a $NPZ/pos_set_filtered_500bp_test.mm10_hypscore.npz \
-n 500 \
-w 500 \
-o $MODIR/pos_test_mm10_max500seqlet_results.h5

modisco report \
-i $MODIR/pos_test_mm10_max500seqlet_results.h5 \
-o $MODIR/pos_test_mm10_max500seqlet \
-m $JASPAR

modisco motifs \
-s $NPZ/pos_set_filtered_500bp_test.mm10_seq.npz \
-a $NPZ/pos_set_filtered_500bp_test.mm10_hypscore.npz \
-n 2000 \
-w 500 \
-o $MODIR/pos_test_mm10_max2000seqlet_results.h5

modisco report \
-i $MODIR/pos_test_mm10_max2000seqlet_results.h5 \
-o $MODIR/pos_test_mm10_max2000seqlet \
# -m $JASPAR

modisco motifs \
-s $NPZ/heart_union_enhancer_pos.galGal6_seq.npz \
-a $NPZ/heart_union_enhancer_pos.galGal6_hypscore.npz \
-n 2000 \
-w 500 \
-o $MODIR/heart_union_enhancer_galGal6_max2k_results.h5

modisco report \
-i $MODIR/heart_union_enhancer_galGal6_max2k_results.h5 \
-o $MODIR/heart_union_enhancer_galGal6_max2k \
-m $JASPAR

modisco motifs \
-s $NPZ/heart_union_enhancer_pos.galGal6_seq.npz \
-a $NPZ/heart_union_enhancer_pos.galGal6_hypscore.npz \
-n 3000 \
-w 500 \
-o $MODIR/heart_union_enhancer_galGal6_max3k_results.h5

modisco report \
-i $MODIR/heart_union_enhancer_galGal6_max3k_results.h5 \
-o $MODIR/heart_union_enhancer_galGal6_max3k \
-m $JASPAR

## Merge Mo output

In [20]:
%matplotlib inline
import numpy as np
import h5py as h5
import modiscolite
from modiscolite.aggregator import SimilarPatternsCollapser
from modiscolite.core import TrackSet, Seqlet, SeqletSet
from matplotlib import pyplot as plt
from modisco.visualization import viz_sequence
from modiscolite.io import save_hdf5
import os

In [22]:
#provide the list of modisco_results files to be merged, along with their
# corresponding one-hot encodings and hypothetical importance scores.

merge_list = [
    ("../data/svm/modisco/heart_union_enhancer_galGal6_max3k_results.h5",
     "../data/svm/npz/heart_union_enhancer_pos.galGal6_seq.npz", 
     "../data/svm/npz/heart_union_enhancer_pos.galGal6_hypscore.npz"),
    ("../data/svm/modisco/pos_test_mm10_max2000seqlet_results.h5", 
     "../data/svm/npz/pos_set_filtered_500bp_test.mm10_seq.npz", 
     "../data/svm/npz/pos_set_filtered_500bp_test.mm10_hypscore.npz")
]

#hyperparameters for merging - these are set
# to the defaults in tfmodisco-lite
min_overlap = 0.7
prob_and_pertrack_sim_merge_thresholds = [(0.8,0.8), (0.5, 0.85), (0.2, 0.9)]
prob_and_pertrack_sim_dealbreaker_thresholds = [(0.4, 0.75), (0.2,0.8), (0.1, 0.85), (0.0,0.9)]
min_frac = 0.2 # also called frac_support_to_trim_to
min_num = 30 # also called min_num_to_trim_to
flank_to_add = 5 # also called initial_flank_to_add
window_size = 20 # also called trim_to_window_size
max_seqlets_subsample = 300 # also called merging_max_seqlets_subsample

pattern_group = 'pos_patterns' #pos_patterns or neg_patterns

union_onehot = []
union_hypscores = []
union_patterncoords = []

exampleidx_offset = 0 #incremented after each modisco results file
for (results_file, onehot_file, hypscores_file) in merge_list:

  print("On file "+results_file)

  results_fh = h5.File(results_file)
  onehot = (np.load(onehot_file)["arr_0"]).transpose(0,2,1)
  hypscores = (np.load(hypscores_file)["arr_0"]).transpose(0,2,1)

  allpattern_exampleidxs = []
  #first, iterate through the patterns and get all the example indices
  # (Note: "example_idx" refers to the index of the sequence that contained
  #  the seqlet)
  for pattern_name in results_fh[pattern_group].keys():
    seqlets_grp = results_fh[pattern_group][pattern_name]['seqlets']
    allpattern_exampleidxs.extend(np.array(seqlets_grp['example_idx']))
    
  #figure out the subset of indices that actually have seqlets, sort it.
  surviving_indices = sorted(list(set(allpattern_exampleidxs)))
  print(str(len(surviving_indices))+" indices had seqlets out of "
        +str(len(onehot)))
  #add the scores for the subset of sequences that have scores to the
  # 'union' list.
  for idx in surviving_indices:
    union_onehot.append(onehot[idx])
    union_hypscores.append(hypscores[idx])
  
  #create an index remapping based on the subset of surviving indices
  # (we will add exampleidx_offset later)
  idx_remapping = dict(zip(surviving_indices,
                           np.arange(len(surviving_indices))))
  
  #Now iterate through the patterns again and prep the seqlet coordinates,
  # remapping the example indices as needed.
  #We also add in exampleidx_offset to account for all the previous seqeuences
  # that have already been added to the 'union' lists
  for pattern_name in results_fh[pattern_group].keys():
    seqlets_grp = results_fh[pattern_group][pattern_name]['seqlets']
    pattern_exampleidxs = np.array(seqlets_grp['example_idx'])
    #remap the example idxs
    pattern_remapped_exampleidxs = np.array([
        (exampleidx_offset+idx_remapping[idx]) for idx in pattern_exampleidxs])
    pattern_start = np.array(seqlets_grp['start'])
    pattern_end = np.array(seqlets_grp['end'])
    pattern_isrevcomp = np.array(seqlets_grp['is_revcomp'])
    union_patterncoords.append((pattern_remapped_exampleidxs,
                                pattern_start, pattern_end, pattern_isrevcomp))
  #increment exampleidx_offset
  exampleidx_offset = (exampleidx_offset + len(surviving_indices))
  results_fh.close()

#create the trackset object
union_onehot = np.array(union_onehot)
union_hypscores = np.array(union_hypscores)
track_set = TrackSet(one_hot=union_onehot,
                     contrib_scores=union_onehot*union_hypscores,
                     hypothetical_contribs=union_hypscores)

#Create pattern objects using the new track_set and modified coordinates
print("Patterns to be merged:")
all_patterns = []
for (example_idxs, starts, ends, isrevcomps) in union_patterncoords:
  #tfmlite reuses the same object for representing seqlet
  # coordinates as well as seqlets
  seqlet_coords = [Seqlet(example_idx, start, end, isrevcomp) for
                   (example_idx, start, end, isrevcomp) in zip(
                   example_idxs, starts, ends, isrevcomps)]
  seqlets = track_set.create_seqlets(seqlet_coords)
  pattern = SeqletSet(seqlets) #SeqletSet in tfm lite = AggregatedSeqlet in tfm
  all_patterns.append(pattern)
  print("numseqlets:",len(pattern.seqlets))
  #viz_sequence.plot_weights(pattern.contrib_scores)

#bg_freq is used for identifying the region within a merged motif of
# highest information content and then expanding around that region.
bg_freq = np.mean(union_onehot, axis=(0, 1))

#Do the merging
merged_patterns, pattern_merge_hierarchy = SimilarPatternsCollapser(
    patterns=all_patterns,
    track_set=track_set,
    min_overlap=min_overlap,
    prob_and_pertrack_sim_merge_thresholds=prob_and_pertrack_sim_merge_thresholds,
	  prob_and_pertrack_sim_dealbreaker_thresholds=prob_and_pertrack_sim_dealbreaker_thresholds,
	  min_frac=min_frac,
    min_num=min_num,
    flank_to_add=flank_to_add,
    window_size=window_size,
    bg_freq=bg_freq,
	  max_seqlets_subsample=max_seqlets_subsample)

#print("Merged patterns:")

# for pattern in merged_patterns:
#   print("numseqlets:",len(pattern.seqlets))
#   viz_sequence.plot_weights(pattern.contrib_scores)

save_hdf5('../data/svm/modisco/merged_mmus-pos-test_ggal-heart-enh_results.h5', 
          pos_patterns= merged_patterns, neg_patterns=None)

On file ../data/svm/modisco/heart_union_enhancer_galGal6_max3k_results.h5
1271 indices had seqlets out of 15935
On file ../data/svm/modisco/pos_test_mm10_max2000seqlet_results.h5
846 indices had seqlets out of 9626
Patterns to be merged:
numseqlets: 596
numseqlets: 231
numseqlets: 95
numseqlets: 84
numseqlets: 77
numseqlets: 41
numseqlets: 27
numseqlets: 24
numseqlets: 231
numseqlets: 179
numseqlets: 167
numseqlets: 166
numseqlets: 165
numseqlets: 146
numseqlets: 145
numseqlets: 113
numseqlets: 234
numseqlets: 212
numseqlets: 56
numseqlets: 53
numseqlets: 53
numseqlets: 49
numseqlets: 48
numseqlets: 43
numseqlets: 40
numseqlets: 33
numseqlets: 24
numseqlets: 150
numseqlets: 136
numseqlets: 117
numseqlets: 98
numseqlets: 86
numseqlets: 65
numseqlets: 65
numseqlets: 57


## Generate MEME from Mo output

In [23]:
%%bash

HERE=$(realpath ..)

MODIR=$HERE/data/svm/modisco
JASPAR=$HERE/_data/references/motifs/filtered_JASPAR_300TFs.meme

modisco meme \
-i $MODIR/merged_mmus-pos-test_ggal-heart-enh_results.h5 \
-t PFM \
-o $MODIR/merged_mmus-pos-test_ggal-heart-enh.PFM.meme

modisco meme \
-i $MODIR/heart_union_enhancer_galGal6_max3k_results.h5 \
-t PFM \
-o $MODIR/heart_union_enhancer_galGal6_max3k.PFM.meme

modisco meme \
-i $MODIR/pos_test_mm10_max2000seqlet_results.h5 \
-t PFM \
-o $MODIR/pos_test_mm10_max2000seqlet.PFM.meme

In [1]:
%%bash

HERE=$(realpath ..)

MODIR=$HERE/data/svm/modisco
JASPAR=$HERE/_data/references/motifs/filtered_JASPAR_300TFs.meme

modisco meme \
-i $MODIR/merged_mmus-pos-test_ggal-heart-enh_results.h5 \
-t CWM-PFM \
-o $MODIR/merged_mmus-pos-test_ggal-heart-enh.CWM-PFM.meme

modisco meme \
-i $MODIR/heart_union_enhancer_galGal6_max3k_results.h5 \
-t CWM-PFM \
-o $MODIR/heart_union_enhancer_galGal6_max3k.CWM-PFM.meme

modisco meme \
-i $MODIR/pos_test_mm10_max2000seqlet_results.h5 \
-t CWM-PFM \
-o $MODIR/pos_test_mm10_max2000seqlet.CWM-PFM.meme

In [2]:
%%bash

HERE=$(realpath ..)

MODIR=$HERE/data/svm/modisco
JASPAR=$HERE/_data/references/motifs/filtered_JASPAR_300TFs.meme

modisco report \
-i $MODIR/merged_mmus-pos-test_ggal-heart-enh_results.h5 \
-o $MODIR/merged_mmus-pos-test_ggal-heart-enh \
-m $JASPAR