In [92]:
# imports 
import numpy as np
import pandas as pd
import os
import subprocess
from IPython.display import HTML


In [93]:
# Load attribution and one-hot sequences and verify lengths

# Load DEV library and attributions
dev_library = pd.read_csv('Libraries/dev_modisco/dev_library.csv', index_col=0)
dev_attr = np.load('Libraries/dev_modisco/attr.npz')['arr_0']
dev_ohe = np.load('Libraries/dev_modisco/ohe.npz')['arr_0']

# Load HK library and attributions
hk_library = pd.read_csv('Libraries/hk_modisco/hk_library.csv', index_col=0)
hk_attr = np.load('Libraries/hk_modisco/attr.npz')['arr_0']
hk_ohe = np.load('Libraries/hk_modisco/ohe.npz')['arr_0']

# Print proof that they're the same length
print("DEV Library:")
print(f"  Library sequences: {len(dev_library)}")
print(f"  Attribution shape: {dev_attr.shape}")
print(f"  One-hot shape: {dev_ohe.shape}")
print(f"  Match: {len(dev_library) == dev_attr.shape[0] == dev_ohe.shape[0]}")

print("\nHK Library:")
print(f"  Library sequences: {len(hk_library)}")
print(f"  Attribution shape: {hk_attr.shape}")
print(f"  One-hot shape: {hk_ohe.shape}")
print(f"  Match: {len(hk_library) == hk_attr.shape[0] == hk_ohe.shape[0]}")

print(f"\nTotal sequences loaded:")
print(f"  DEV: {len(dev_library)} sequences")
print(f"  HK: {len(hk_library)} sequences")
print(f"  Total: {len(dev_library) + len(hk_library)} sequences")

DEV Library:
  Library sequences: 302
  Attribution shape: (302, 4, 249)
  One-hot shape: (302, 4, 249)
  Match: True

HK Library:
  Library sequences: 336
  Attribution shape: (336, 4, 249)
  One-hot shape: (336, 4, 249)
  Match: True

Total sequences loaded:
  DEV: 302 sequences
  HK: 336 sequences
  Total: 638 sequences


In [94]:
dev_library.head(10)


Unnamed: 0,ce_name,orientation,ce_sequence,endogenous_sequence,pseudo_index,ce_start,ce_end,log2_fc,p_value,dataset,EvoAug_predictions
0,"('Dref', 'GATAe', '+', '+', 5)",+,TATCGATAGATTGCGATAAC,TTACTTTATACTTGAGATAGTAGTCTATCGATAGATTGCGATAACC...,1584661,25,44,5.321928,0.0003162613,dev,1.731806
1,"('twi', 'twi', '+', '-', 6)",+,ACACATATATGTACATATGT,CAAGTGCCGCAGAAACATTTGCTGATAAAATGCAAGATTTCATAAT...,846257,155,174,36.219281,4.635498e-09,dev,3.40049
2,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAG,ATTTAGCACGAGAAAACACAGATAATTGGCTATTCACAGCGACATA...,620728,220,229,4.574315,5.067241e-114,dev,2.671588
3,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAG,AAAAATAGTTTTAAATAATTTTATTGAAATAGAAGTTTCAAAAAAT...,433870,112,121,4.574315,5.067241e-114,dev,3.646802
4,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAC,AAAACGAAAACGTAGAGCTACCGTGCTTGGGTTCGATACTTGTGCT...,639485,53,62,4.574315,5.067241e-114,dev,2.33395
5,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAG,AAAACATTTGATCAAAAACACAAATAAGAAAAATTTCCCACCATTT...,710270,122,131,4.574315,5.067241e-114,dev,1.482395
6,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAC,AAAACGCAATATTCGTTTTTACATTCGCATTTGTTGCGAATAAATA...,139095,153,162,4.574315,5.067241e-114,dev,2.889539
7,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAG,GCGTAACTTTGAAAGACTTTCAGAAAAATATCAATTTTTGGTTTCC...,18536,110,119,4.574315,5.067241e-114,dev,4.72517
8,"('GATAe', 'Jra', '+', '+', 4)",+,AGATAAGCTAAGTGAATCATG,TGCCGTCAGATAAGCTAAGTGAATCATGCAATGAAGAGAGGCAATG...,276895,7,27,6.643856,3.851699e-10,dev,2.230574
9,"('Dref', 'Dref', '+', '-', 5)",+,TATCGATAGATTCAGTCGATA,GAGTTGCAGAGCTGCGAGGAATGATTAAAGCTGCAGGCCGCTTAGA...,847311,213,233,5.906891,2.398414e-11,dev,1.967761


In [95]:
# Create results directories
os.makedirs('Libraries/dev_modisco/results', exist_ok=True)
os.makedirs('Libraries/hk_modisco/results', exist_ok=True)

# Set random seed for reproducibility
import random
random.seed(333)
np.random.seed(333)

# Run modisco for DEV library with less stringent parameters
print("Running modisco for DEV library...")
dev_cmd = [
    'modisco', 'motifs',
    '-s', 'Libraries/dev_modisco/ohe.npz',
    '-a', 'Libraries/dev_modisco/attr.npz', 
    '-n', '8000',
    '-w', '248',
    '-z', '10', # Smaller seqlet cores (less stringent)
    '-o', 'Libraries/dev_modisco/results/modisco_results.h5'
]

result_dev = subprocess.run(dev_cmd, capture_output=True, text=True)
print(f"DEV modisco exit code: {result_dev.returncode}")
if result_dev.stdout:
    print(f"DEV stdout: {result_dev.stdout}")
if result_dev.stderr:
    print(f"DEV stderr: {result_dev.stderr}")

# Run modisco for HK library with less stringent parameters
print("\nRunning modisco for HK library...")
hk_cmd = [
    'modisco', 'motifs',
    '-s', 'Libraries/hk_modisco/ohe.npz',
    '-a', 'Libraries/hk_modisco/attr.npz',
    '-n', '8000',
    '-w', '248',
    '-z', '10', # Smaller seqlet cores
    '-o', 'Libraries/hk_modisco/results/modisco_results.h5'
]

result_hk = subprocess.run(hk_cmd, capture_output=True, text=True)
print(f"HK modisco exit code: {result_hk.returncode}")
if result_hk.stdout:
    print(f"HK stdout: {result_hk.stdout}")
if result_hk.stderr:
    print(f"HK stderr: {result_hk.stderr}")

print(f"\nModisco results saved to:")
print(f"  DEV: Libraries/dev_modisco/results/modisco_results.h5")
print(f"  HK: Libraries/hk_modisco/results/modisco_results.h5")

Running modisco for DEV library...
DEV modisco exit code: 0

Running modisco for HK library...
HK modisco exit code: 0

Modisco results saved to:
  DEV: Libraries/dev_modisco/results/modisco_results.h5
  HK: Libraries/hk_modisco/results/modisco_results.h5


In [96]:
import subprocess
import os
from IPython.display import HTML

# Run modisco report for DEV library
print("Running modisco report for DEV library...")
dev_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/dev_modisco/results/modisco_results.h5',
    '-o', 'Libraries/dev_modisco/results/report/'
]

result_dev_report = subprocess.run(dev_report_cmd, capture_output=True, text=True)
print(f"DEV report exit code: {result_dev_report.returncode}")
if result_dev_report.stdout:
    print(f"DEV report stdout: {result_dev_report.stdout}")
if result_dev_report.stderr:
    print(f"DEV report stderr: {result_dev_report.stderr}")

# Run modisco report for HK library
print("\nRunning modisco report for HK library...")
hk_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/hk_modisco/results/modisco_results.h5',
    '-o', 'Libraries/hk_modisco/results/report/'
]

result_hk_report = subprocess.run(hk_report_cmd, capture_output=True, text=True)
print(f"HK report exit code: {result_hk_report.returncode}")
if result_hk_report.stdout:
    print(f"HK report stdout: {result_hk_report.stdout}")
if result_hk_report.stderr:
    print(f"HK report stderr: {result_hk_report.stderr}")

# Check if reports were generated
dev_report_dir = 'Libraries/dev_modisco/results/report'
hk_report_dir = 'Libraries/hk_modisco/results/report'

print("\nDEV Modisco Report:")
if os.path.exists(dev_report_dir):
    print(f"Report directory exists: {dev_report_dir}")
else:
    print("Report directory not found")

print("\n" + "="*50 + "\n")

print("HK Modisco Report:")
if os.path.exists(hk_report_dir):
    print(f"Report directory exists: {hk_report_dir}")
else:
    print("Report directory not found")

Running modisco report for DEV library...
DEV report exit code: 0

Running modisco report for HK library...
HK report exit code: 0

DEV Modisco Report:
Report directory exists: Libraries/dev_modisco/results/report


HK Modisco Report:
Report directory exists: Libraries/hk_modisco/results/report


In [97]:
HTML('Libraries/dev_modisco/results/report/motifs.html')


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev
pos_patterns.pattern_0,77,,
pos_patterns.pattern_1,48,,
pos_patterns.pattern_2,30,,
pos_patterns.pattern_3,22,,


In [98]:
HTML('Libraries/hk_modisco/results/report/motifs.html')

pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev
pos_patterns.pattern_0,82,,
pos_patterns.pattern_1,35,,
pos_patterns.pattern_2,28,,
pos_patterns.pattern_3,26,,
pos_patterns.pattern_4,24,,


In [99]:
# Download JASPAR database for TOMTOM comparison
import subprocess
import os

# Create CEseek_data directory if it doesn't exist
os.makedirs('CEseek_data', exist_ok=True)

print("Downloading JASPAR database...")
download_cmd = [
    'wget', '--quiet',
    'https://jaspar.genereg.net/download/data/2022/CORE/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt',
    '-O', 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
]

result_download = subprocess.run(download_cmd, capture_output=True, text=True)
print(f"Download exit code: {result_download.returncode}")
if result_download.stderr:
    print(f"Download stderr: {result_download.stderr}")

# Check if file was downloaded
if os.path.exists('CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'):
    print("JASPAR database downloaded successfully")
else:
    print("Failed to download JASPAR database")

Downloading JASPAR database...
Download exit code: 0
JASPAR database downloaded successfully


In [100]:
# Run modisco report with TOMTOM comparison for DEV library
print("Running modisco report with TOMTOM for DEV library...")
dev_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/dev_modisco/results/modisco_results.h5',
    '-o', 'Libraries/dev_modisco/results/report/',
    '-s', 'Libraries/dev_modisco/results/report/',
    '-m', 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
]

result_dev_report = subprocess.run(dev_report_cmd, capture_output=True, text=True)
print(f"DEV report with TOMTOM exit code: {result_dev_report.returncode}")
if result_dev_report.stdout:
    print(f"DEV report stdout: {result_dev_report.stdout}")
if result_dev_report.stderr:
    print(f"DEV report stderr: {result_dev_report.stderr}")

# Run modisco report with TOMTOM comparison for HK library
print("\nRunning modisco report with TOMTOM for HK library...")
hk_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/hk_modisco/results/modisco_results.h5',
    '-o', 'Libraries/hk_modisco/results/report/',
    '-s', 'Libraries/hk_modisco/results/report/',
    '-m', 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
]

result_hk_report = subprocess.run(hk_report_cmd, capture_output=True, text=True)
print(f"HK report with TOMTOM exit code: {result_hk_report.returncode}")
if result_hk_report.stdout:
    print(f"HK report stdout: {result_hk_report.stdout}")
if result_hk_report.stderr:
    print(f"HK report stderr: {result_hk_report.stderr}")

Running modisco report with TOMTOM for DEV library...
DEV report with TOMTOM exit code: 0

Running modisco report with TOMTOM for HK library...
HK report with TOMTOM exit code: 0


In [101]:
# Display the updated reports with TOMTOM comparisons
from IPython.display import HTML

print("DEV Modisco Report with TOMTOM:")
HTML('Libraries/dev_modisco/results/report/motifs.html')

DEV Modisco Report with TOMTOM:


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
pos_patterns.pattern_0,77,,,MA0036.3,0.505891,,MA0037.4,0.505891,,MA0482.2,0.505891,
pos_patterns.pattern_1,48,,,MA0754.2,0.544855,,MA0755.1,0.544855,,MA0602.1,1.0,
pos_patterns.pattern_2,30,,,MA1633.2,0.018556,,MA0099.3,0.018556,,MA0655.1,0.018556,
pos_patterns.pattern_3,22,,,MA1472.2,0.063324,,MA0691.1,0.063324,,MA0135.1,0.066491,


In [102]:
print("HK Modisco Report with TOMTOM:")
HTML('Libraries/hk_modisco/results/report/motifs.html')

HK Modisco Report with TOMTOM:


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
pos_patterns.pattern_0,82,,,MA0755.1,0.278277,,MA0756.2,0.278277,,MA1718.1,1.0,
pos_patterns.pattern_1,35,,,MA0755.1,0.552344,,MA0756.2,0.724284,,MA1603.1,0.941748,
pos_patterns.pattern_2,28,,,MA0152.2,1.0,,MA0625.2,1.0,,MA0508.3,1.0,
pos_patterns.pattern_3,26,,,MA0754.2,1.0,,MA0755.1,1.0,,MA1561.1,1.0,
pos_patterns.pattern_4,24,,,MA0631.1,0.956523,,MA0801.1,0.956523,,MA0805.1,0.956523,


In [103]:
# Create TOMTOM_motifs column using the same results as motifs.html
from bs4 import BeautifulSoup

def extract_tomtom_from_html(html_file):
    """Extract TOMTOM motif names from the HTML report, selecting the match with the best (lowest) q-value."""
    with open(html_file, 'r') as f:
        html_content = f.read()
    
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    
    tomtom_motifs = {}
    
    if table:
        rows = table.find_all('tr')[1:]  # Skip header row
        
        for row in rows:
            cells = row.find_all('td')
            if len(cells) >= 4:
                pattern_name = cells[0].text.strip()
                
                # Extract matches and their q-values
                # TOMTOM matches are in cells 4, 7, 10 (every 3rd cell starting from 4)
                # Q-values are in cells 5, 8, 11 (every 3rd cell starting from 5)
                matches_with_qvals = []
                
                for i, match_cell_idx in enumerate([4, 7, 10]):
                    qval_cell_idx = match_cell_idx + 1
                    if match_cell_idx < len(cells) and qval_cell_idx < len(cells):
                        match_text = cells[match_cell_idx].text.strip()
                        qval_text = cells[qval_cell_idx].text.strip()
                        
                        if match_text and match_text != '-' and qval_text and qval_text != '-':
                            try:
                                qval = float(qval_text)
                                matches_with_qvals.append((match_text, qval))
                            except ValueError:
                                # If q-value can't be parsed, skip this match
                                continue
                
                if matches_with_qvals:
                    # Select the match with the best (lowest) q-value
                    best_match = min(matches_with_qvals, key=lambda x: x[1])
                    tomtom_motifs[pattern_name] = best_match[0]
                    print(f"{pattern_name}: Selected {best_match[0]} (qval: {best_match[1]:.6f}) from {len(matches_with_qvals)} matches")
                else:
                    tomtom_motifs[pattern_name] = pattern_name
    
    return tomtom_motifs

# Extract TOMTOM motif names from HTML reports
dev_tomtom_names = extract_tomtom_from_html('Libraries/dev_modisco/results/report/motifs.html')
hk_tomtom_names = extract_tomtom_from_html('Libraries/hk_modisco/results/report/motifs.html')

# Create simplified mapping (remove 'pos_patterns.' prefix)
dev_tomtom_simple = {}
for key, value in dev_tomtom_names.items():
    simple_key = key.replace('pos_patterns.', '')
    dev_tomtom_simple[simple_key] = value

hk_tomtom_simple = {}
for key, value in hk_tomtom_names.items():
    simple_key = key.replace('pos_patterns.', '')
    hk_tomtom_simple[simple_key] = value

# Create TOMTOM_motifs column using actual patterns found
import h5py

def create_tomtom_motifs_column(library_df, tomtom_mapping, modisco_h5_path):
    """Create TOMTOM_motifs column using patterns found in TOMTOM results."""
    motif_assignments = []
    
    # Get the patterns that were actually found
    available_patterns = list(tomtom_mapping.keys())
    
    # Load seqlets data from modisco results to get actual positions
    seq_to_patterns = {}
    
    with h5py.File(modisco_h5_path, 'r') as f:
        pos_patterns = f['pos_patterns']
        
        for pattern_name in available_patterns:
            if pattern_name in pos_patterns:
                pattern_group = pos_patterns[pattern_name]
                if 'seqlets' in pattern_group:
                    seqlets = pattern_group['seqlets']
                    if 'example_idx' in seqlets and 'start' in seqlets and 'end' in seqlets:
                        example_indices = seqlets['example_idx'][:]
                        start_positions = seqlets['start'][:]
                        end_positions = seqlets['end'][:]
                        
                        # Check for multiple hits to same sequence
                        seq_hit_counts = {}
                        for seq_idx in example_indices:
                            seq_hit_counts[seq_idx] = seq_hit_counts.get(seq_idx, 0) + 1
                        
                        multiple_hits = {seq: count for seq, count in seq_hit_counts.items() if count > 1}
                        if multiple_hits:
                            print(f"Sequences with multiple hits to {pattern_name}: {multiple_hits}")
                        
                        # Map each sequence to its patterns with actual positions
                        for i, seq_idx in enumerate(example_indices):
                            if seq_idx not in seq_to_patterns:
                                seq_to_patterns[seq_idx] = []
                            
                            start_pos = start_positions[i]
                            end_pos = end_positions[i]
                            # Check if this is reverse complement
                            is_revcomp = False
                            if 'is_revcomp' in seqlets:
                                is_revcomp = seqlets['is_revcomp'][i]
                            
                            direction = 'rev' if is_revcomp else 'fwd'
                            seq_to_patterns[seq_idx].append(f"{pattern_name}({start_pos}-{end_pos},{direction})")
    
    # Assign patterns to sequences
    for idx, row in library_df.iterrows():
        if idx in seq_to_patterns:
            # Join multiple patterns if a sequence has multiple matches
            patterns = '; '.join(seq_to_patterns[idx])
            motif_assignments.append(patterns)
        else:
            motif_assignments.append("No motifs found")
    
    return motif_assignments

# Create TOMTOM_motifs columns
dev_library['TOMTOM_motifs'] = create_tomtom_motifs_column(dev_library, dev_tomtom_simple, 'Libraries/dev_modisco/results/modisco_results.h5')
hk_library['TOMTOM_motifs'] = create_tomtom_motifs_column(hk_library, hk_tomtom_simple, 'Libraries/hk_modisco/results/modisco_results.h5')

# Update TOMTOM_motifs column with MA codes but keep positions
def replace_pattern_with_name(motif_string, tomtom_mapping):
    if motif_string == 'No motifs found':
        return 'No motifs found'
    
    # Handle multiple motifs separated by semicolon
    if ';' in motif_string:
        motifs = motif_string.split('; ')
        replaced_motifs = []
        for motif in motifs:
            if '(' in motif:
                pattern_name = motif.split('(')[0]
                position_info = motif.split('(')[1]
                if pattern_name in tomtom_mapping:
                    replaced_motifs.append(f"{tomtom_mapping[pattern_name]}({position_info}")
                else:
                    replaced_motifs.append(motif)
            else:
                replaced_motifs.append(motif)
        return '; '.join(replaced_motifs)
    else:
        # Single motif
        if '(' in motif_string:
            pattern_name = motif_string.split('(')[0]
            position_info = motif_string.split('(')[1]
            if pattern_name in tomtom_mapping:
                return f"{tomtom_mapping[pattern_name]}({position_info}"
            else:
                return motif_string
        else:
            return motif_string

# Update motif_names column with only the names (MA codes)
def extract_motif_names_only(motif_string, tomtom_mapping):
    if motif_string == 'No motifs found':
        return 'No motifs found'
    
    # Handle multiple motifs separated by semicolon
    if ';' in motif_string:
        motifs = motif_string.split('; ')
        motif_names = []
        for motif in motifs:
            if '(' in motif:
                pattern_name = motif.split('(')[0]
                if pattern_name in tomtom_mapping:
                    motif_names.append(tomtom_mapping[pattern_name])
                else:
                    motif_names.append(pattern_name)
            else:
                motif_names.append(motif)
        # Keep all motif names (including duplicates for multiple hits)
        return ', '.join(motif_names)
    else:
        # Single motif
        if '(' in motif_string:
            pattern_name = motif_string.split('(')[0]
            if pattern_name in tomtom_mapping:
                return tomtom_mapping[pattern_name]
            else:
                return pattern_name
        else:
            return motif_string

# Update columns
dev_library['TOMTOM_motifs'] = dev_library['TOMTOM_motifs'].apply(
    lambda x: replace_pattern_with_name(x, dev_tomtom_simple)
)

hk_library['TOMTOM_motifs'] = hk_library['TOMTOM_motifs'].apply(
    lambda x: replace_pattern_with_name(x, hk_tomtom_simple)
)

# Clean up any remaining 'pattern' references and standalone numbers
def clean_tomtom_motifs(motif_string):
    if motif_string == 'No motifs found':
        return 'No motifs found'
    
    # Remove 'pattern_' prefix
    cleaned = motif_string.replace('pattern_', '')
    
    # Remove standalone numbers (like "; 1" or "1;")
    import re
    # Remove semicolon followed by space and number
    cleaned = re.sub(r';\s*\d+', '', cleaned)
    # Remove number followed by semicolon
    cleaned = re.sub(r'\d+;\s*', '', cleaned)
    # Remove standalone numbers at the end
    cleaned = re.sub(r';\s*\d+$', '', cleaned)
    
    return cleaned

dev_library['TOMTOM_motifs'] = dev_library['TOMTOM_motifs'].apply(clean_tomtom_motifs)
hk_library['TOMTOM_motifs'] = hk_library['TOMTOM_motifs'].apply(clean_tomtom_motifs)

dev_library['motif_names'] = dev_library['TOMTOM_motifs'].apply(
    lambda x: extract_motif_names_only(x, dev_tomtom_simple)
)

hk_library['motif_names'] = hk_library['TOMTOM_motifs'].apply(
    lambda x: extract_motif_names_only(x, hk_tomtom_simple)
)

print(f"DEV TOMTOM motifs: {dev_tomtom_simple}")
print(f"HK TOMTOM motifs: {hk_tomtom_simple}")
print(f"DEV sequences with motifs: {len(dev_library[dev_library['motif_names'] != 'No motifs found'])}")
print(f"HK sequences with motifs: {len(hk_library[hk_library['motif_names'] != 'No motifs found'])}")

pos_patterns.pattern_0: Selected MA0036.3 (qval: 0.505891) from 3 matches
pos_patterns.pattern_1: Selected MA0754.2 (qval: 0.544855) from 3 matches
pos_patterns.pattern_2: Selected MA1633.2 (qval: 0.018556) from 3 matches
pos_patterns.pattern_3: Selected MA1472.2 (qval: 0.063324) from 3 matches
pos_patterns.pattern_0: Selected MA0755.1 (qval: 0.278277) from 3 matches
pos_patterns.pattern_1: Selected MA0755.1 (qval: 0.552344) from 3 matches
pos_patterns.pattern_2: Selected MA0152.2 (qval: 1.000000) from 3 matches
pos_patterns.pattern_3: Selected MA0754.2 (qval: 1.000000) from 3 matches
pos_patterns.pattern_4: Selected MA0631.1 (qval: 0.956523) from 3 matches
Sequences with multiple hits to pattern_0: {122: 2, 128: 2, 225: 2, 283: 2, 106: 2, 201: 2, 182: 2, 248: 2, 139: 2, 72: 3, 131: 2, 185: 2, 98: 2, 195: 2}
Sequences with multiple hits to pattern_1: {205: 2, 112: 2, 95: 2}
Sequences with multiple hits to pattern_2: {199: 2, 156: 2, 103: 2}
Sequences with multiple hits to pattern_0: {2

In [104]:
# Check what columns are available in the DataFrames
print("DEV Library columns:")
print(dev_library.columns.tolist())
print(f"\nDEV Library shape: {dev_library.shape}")

print("\nHK Library columns:")
print(hk_library.columns.tolist())
print(f"\nHK Library shape: {hk_library.shape}")


DEV Library columns:
['ce_name', 'orientation', 'ce_sequence', 'endogenous_sequence', 'pseudo_index', 'ce_start', 'ce_end', 'log2_fc', 'p_value', 'dataset', 'EvoAug_predictions', 'TOMTOM_motifs', 'motif_names']

DEV Library shape: (302, 13)

HK Library columns:
['ce_name', 'orientation', 'ce_sequence', 'endogenous_sequence', 'pseudo_index', 'ce_start', 'ce_end', 'log2_fc', 'p_value', 'dataset', 'EvoAug_predictions', 'TOMTOM_motifs', 'motif_names']

HK Library shape: (336, 13)


In [105]:
# Display top sequences by EvoAug predictions
dev_library.sort_values(by='EvoAug_predictions', ascending=False).head(5)


Unnamed: 0,ce_name,orientation,ce_sequence,endogenous_sequence,pseudo_index,ce_start,ce_end,log2_fc,p_value,dataset,EvoAug_predictions,TOMTOM_motifs,motif_names
281,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAT,TATATCTGTAGCTCAACACCTGACTGCTATCGAATGTTTTGGTATT...,5585,107,116,4.574315,5.067241e-114,dev,5.307164,No motifs found,No motifs found
59,"('Jra', 'Jra', '-', '+', 10)",+,AGTGACTAATCTACTTACAAGTGAATCACG,AATCAGTTGAAATTTATACCTCATCGTCGACAATGCGAGTCGTCGC...,2590,100,129,5.321928,1.763198e-07,dev,5.268528,"MA1633.2(89-139,rev)",MA1633.2
167,"('Dref', 'Dref', '-', '+', 4)",+,TATCGATATCACTATCGATA,TTATTAGCCGGCAAAATTGGTGGGTAAACAACAAATATTTGGACAA...,19539,117,136,38.219281,4.2305609999999995e-34,dev,5.127045,"MA0754.2(98-148,rev)",MA0754.2
226,"('Jra', 'twi', '+', '-', 4)",+,GTGAATCACATCGACATATGT,GACTTTTCATTCCATTGTTCTTGGTTCTCGGAAGTATCAACTTATT...,626,128,148,34.219281,0.008262587,dev,5.113298,"MA0036.3(51-101,rev); MA1633.2(97-147,rev)","MA0036.3, MA1633.2"
61,"('Dref', 'Max', '-', '+', 6)",+,TATCGATATTTGATAGCACGTG,TAATTAAGAGCAACAACAACAAACTTAGTAATGACACCCAAATAGA...,10613,155,176,35.219281,6.820836e-05,dev,5.038004,"MA0754.2(144-194,fwd); MA1472.2(97-147,fwd)","MA0754.2, MA1472.2"


In [106]:
# Extract TF names from JASPAR database and update motif names
import re

def extract_tf_names_from_jaspar(jaspar_file):
    """Extract TF names from JASPAR database file."""
    tf_names = {}
    
    with open(jaspar_file, 'r') as f:
        content = f.read()
    
    motif_lines = re.findall(r'MOTIF (MA\d+\.\d+) MA\d+\.\d+\.(.+)', content)

    for ma_code, tf_name in motif_lines:
        tf_names[ma_code] = tf_name
    
    return tf_names

# Extract TF names from JASPAR database in CEseek_data folder
print("Extracting TF names from JASPAR database...")
jaspar_file_path = 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
tf_names = extract_tf_names_from_jaspar(jaspar_file_path)

# Extract MA codes dynamically from our TOMTOM results
dev_ma_codes = set()
hk_ma_codes = set()

# Extract MA codes from DEV library motif names
for motif_name in dev_library['motif_names']:
    if motif_name != 'No motifs found':
        # Split by comma and semicolon to handle multiple motifs
        motifs = motif_name.replace(',', ';').split(';')
        for motif in motifs:
            motif = motif.strip()
            if motif and motif != 'No motifs found':
                dev_ma_codes.add(motif)

# Extract MA codes from HK library motif names
for motif_name in hk_library['motif_names']:
    if motif_name != 'No motifs found':
        # Split by comma and semicolon to handle multiple motifs
        motifs = motif_name.replace(',', ';').split(';')
        for motif in motifs:
            motif = motif.strip()
            if motif and motif != 'No motifs found':
                hk_ma_codes.add(motif)

# Combine all unique MA codes found
our_ma_codes = list(dev_ma_codes.union(hk_ma_codes))

print(f"\nDynamically found MA codes:")
print(f"  DEV library MA codes: {sorted(dev_ma_codes)}")
print(f"  HK library MA codes: {sorted(hk_ma_codes)}")
print(f"  All unique MA codes: {sorted(our_ma_codes)}")



def replace_ma_codes_with_tf_names(motif_string, tf_mapping):
    """Replace MA codes with actual TF names."""
    if motif_string == 'No motifs found':
        return 'No motifs found'
    
    # Split by comma and space (not semicolon)
    motifs = motif_string.split(', ')
    tf_names = []
    
    for motif in motifs:
        # Check if this motif contains an MA code
        for ma_code, tf_name in tf_mapping.items():
            if ma_code in motif:
                # Replace MA code with TF name
                motif = motif.replace(ma_code, tf_name)
                break
        tf_names.append(motif)
    
    # Keep duplicates to preserve multiple hits to same motif
    return ', '.join(tf_names)

# Update motif_names column with actual TF names
print("\nUpdating motif names with actual TF names...")
dev_library['motif_names'] = dev_library['motif_names'].apply(
    lambda x: replace_ma_codes_with_tf_names(x, tf_names)
)

hk_library['motif_names'] = hk_library['motif_names'].apply(
    lambda x: replace_ma_codes_with_tf_names(x, tf_names)
)

# Display summary
print("\nDEV Library motif names summary (with TF names):")
dev_motif_names = dev_library[dev_library['motif_names'] != 'No motifs found']['motif_names']
print(f"Sequences with motifs: {len(dev_motif_names)}")
print(f"Unique motif names found: {set('; '.join(dev_motif_names).split('; '))}")

print("\nHK Library motif names summary (with TF names):")
hk_motif_names = hk_library[hk_library['motif_names'] != 'No motifs found']['motif_names']
print(f"Sequences with motifs: {len(hk_motif_names)}")
print(f"Unique motif names found: {set('; '.join(hk_motif_names).split('; '))}")


# Save updated DataFrames
dev_library.to_csv('Libraries/dev_modisco/dev_library_with_motifs.csv', index=True)
hk_library.to_csv('Libraries/hk_modisco/hk_library_with_motifs.csv', index=True)

print(f"\nUpdated DataFrames saved with actual TF names")

Extracting TF names from JASPAR database...

Dynamically found MA codes:
  DEV library MA codes: ['MA0036.3', 'MA0754.2', 'MA1472.2', 'MA1633.2']
  HK library MA codes: ['MA0152.2', 'MA0631.1', 'MA0754.2', 'MA0755.1']
  All unique MA codes: ['MA0036.3', 'MA0152.2', 'MA0631.1', 'MA0754.2', 'MA0755.1', 'MA1472.2', 'MA1633.2']

Updating motif names with actual TF names...

DEV Library motif names summary (with TF names):
Sequences with motifs: 135
Unique motif names found: {'GATA2', 'GATA2, GATA2, GATA2', 'CUX1', 'BACH1', 'BACH1, BACH1', 'BACH1, Bhlha15', 'CUX1, CUX1', 'GATA2, BACH1, BACH1', 'GATA2, GATA2', 'GATA2, Bhlha15', 'GATA2, BACH1', 'CUX1, Bhlha15', 'GATA2, GATA2, Bhlha15', 'Bhlha15'}

HK Library motif names summary (with TF names):
Sequences with motifs: 152
Unique motif names found: {'CUX2, CUX2, Nfatc2', 'Nfatc2', 'CUX2', 'CUX2, Nfatc2, CUX1', 'CUX1', 'Nfatc2, CUX1', 'CUX2, Six3', 'CUX2, CUX1', 'Six3', 'CUX1, CUX1', 'CUX1, Six3', 'CUX2, Nfatc2', 'CUX2, CUX2'}



Updated DataFrames saved with actual TF names


In [107]:
# Display top sequences by EvoAug predictions
dev_library.sort_values(by='EvoAug_predictions', ascending=False).head(10)

Unnamed: 0,ce_name,orientation,ce_sequence,endogenous_sequence,pseudo_index,ce_start,ce_end,log2_fc,p_value,dataset,EvoAug_predictions,TOMTOM_motifs,motif_names
281,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAT,TATATCTGTAGCTCAACACCTGACTGCTATCGAATGTTTTGGTATT...,5585,107,116,4.574315,5.067241e-114,dev,5.307164,No motifs found,No motifs found
59,"('Jra', 'Jra', '-', '+', 10)",+,AGTGACTAATCTACTTACAAGTGAATCACG,AATCAGTTGAAATTTATACCTCATCGTCGACAATGCGAGTCGTCGC...,2590,100,129,5.321928,1.763198e-07,dev,5.268528,"MA1633.2(89-139,rev)",BACH1
167,"('Dref', 'Dref', '-', '+', 4)",+,TATCGATATCACTATCGATA,TTATTAGCCGGCAAAATTGGTGGGTAAACAACAAATATTTGGACAA...,19539,117,136,38.219281,4.2305609999999995e-34,dev,5.127045,"MA0754.2(98-148,rev)",CUX1
226,"('Jra', 'twi', '+', '-', 4)",+,GTGAATCACATCGACATATGT,GACTTTTCATTCCATTGTTCTTGGTTCTCGGAAGTATCAACTTATT...,626,128,148,34.219281,0.008262587,dev,5.113298,"MA0036.3(51-101,rev); MA1633.2(97-147,rev)","GATA2, BACH1"
61,"('Dref', 'Max', '-', '+', 6)",+,TATCGATATTTGATAGCACGTG,TAATTAAGAGCAACAACAACAAACTTAGTAATGACACCCAAATAGA...,10613,155,176,35.219281,6.820836e-05,dev,5.038004,"MA0754.2(144-194,fwd); MA1472.2(97-147,fwd)","CUX1, Bhlha15"
174,"('CTCF', 'GATAe', '+', '+', -1)",+,TTGGTGGTGCGATAAG,CATATGCTGGCATTTTTGTGTGTTTTCTGTTTTTTTTTTTCTGTGT...,36965,113,128,4.321928,1.640031e-06,dev,4.817449,"MA0036.3(93-143,fwd)",GATA2
68,"('Dref', 'Jra', '+', '+', -3)",+,TATCGATTACTCAGC,CGAGTTTAATTTAATTTCAGTTTCTGTTTCGACACGAGCAGTCAGC...,239179,139,153,4.321928,0.0008800678,dev,4.786422,No motifs found,No motifs found
7,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAG,GCGTAACTTTGAAAGACTTTCAGAAAAATATCAATTTTTGGTTTCC...,18536,110,119,4.574315,5.067241e-114,dev,4.72517,"MA0754.2(79-129,rev)",CUX1
108,"('Dref', 'GATAe', '+', '-', 0)",+,TATCGAAAATTATCG,GGGTAGCACTGTTTCAGCCCTGTCATCGATAAGCTGTGCACCAGTA...,69284,62,76,35.219281,6.820836e-05,dev,4.692852,No motifs found,No motifs found
112,"('Dref', 'Dref', '-', '+', 1)",+,ATTCGATACTATCGATA,TGAGAGTAATTTACCTTTTAAAACACTTGAAAAGGTTTTAAACACG...,291940,112,128,38.541209,1.849792e-42,dev,4.664673,"MA0754.2(110-160,fwd); MA0754.2(107-157,rev)","CUX1, CUX1"


In [108]:
# Check unique motif names
dev_library['motif_names'].unique()


array(['No motifs found', 'GATA2', 'CUX1', 'BACH1', 'Bhlha15',
       'GATA2, BACH1', 'CUX1, Bhlha15', 'GATA2, GATA2, GATA2',
       'CUX1, CUX1', 'GATA2, GATA2', 'BACH1, BACH1',
       'GATA2, GATA2, Bhlha15', 'BACH1, Bhlha15', 'GATA2, Bhlha15',
       'GATA2, BACH1, BACH1'], dtype=object)

In [109]:
hk_library['motif_names'].unique()

array(['Six3', 'No motifs found', 'CUX1, Six3', 'CUX2',
       'CUX2, Nfatc2, CUX1', 'CUX2, Nfatc2', 'CUX2, CUX2', 'CUX1',
       'Nfatc2, CUX1', 'Nfatc2', 'CUX2, CUX2, Nfatc2', 'CUX1, CUX1',
       'CUX2, Six3', 'CUX2, CUX1'], dtype=object)

In [110]:
# Display top HK sequences by EvoAug predictions
hk_library.sort_values(by='EvoAug_predictions', ascending=False).head(5)

Unnamed: 0,ce_name,orientation,ce_sequence,endogenous_sequence,pseudo_index,ce_start,ce_end,log2_fc,p_value,dataset,EvoAug_predictions,TOMTOM_motifs,motif_names
67,"('Dref', 'Max', '-', '+', 6)",+,TATCGATATTTGATAGCACGTG,TAATTAAGAGCAACAACAACAAACTTAGTAATGACACCCAAATAGA...,4637,155,176,35.219281,6.817735e-05,hk,7.520189,"MA0755.1(110-160,rev)",CUX2
306,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAT,TATATCTGTAGCTCAACACCTGACTGCTATCGAATGTTTTGGTATT...,605,107,116,6.174824,3.293969e-316,hk,7.510352,No motifs found,No motifs found
193,"('Dref', 'Dref', '-', '+', 4)",+,TATCGATATCACTATCGATA,TTATTAGCCGGCAAAATTGGTGGGTAAACAACAAATATTTGGACAA...,17547,117,136,38.678713,1.131588e-46,hk,7.45004,"MA0754.2(95-145,fwd)",CUX1
125,"('Dref', 'Dref', '-', '+', 1)",+,ATTCGATACTATCGATA,TGAGAGTAATTTACCTTTTAAAACACTTGAAAAGGTTTTAAACACG...,133078,112,128,38.804243,7.311219e-51,hk,7.195486,"MA0755.1(73-123,fwd); MA0755.1(112-162,rev); M...","CUX2, CUX2, Nfatc2"
72,"('Dref', 'Jra', '+', '+', -3)",+,TATCGATTACTCAGC,CGAGTTTAATTTAATTTCAGTTTCTGTTTCGACACGAGCAGTCAGC...,207805,139,153,5.321928,0.000316125,hk,7.171366,"MA0754.2(119-169,rev)",CUX1


In [111]:
# Verify sequence lengths are consistent
len(dev_library[dev_library['motif_names'] == 'GATA2']['endogenous_sequence'].iloc[0]) == len(dev_library[dev_library['motif_names'] == 'GATA2']['endogenous_sequence'].iloc[1])


True

In [115]:
sum(hk_library['motif_names'] == 'No motifs found')/len(hk_library)

0.5476190476190477

In [116]:
sum(dev_library['motif_names'] == 'No motifs found')/len(dev_library)

0.5529801324503312

In [117]:
# Test different window sizes to explore parameter effects on motif discovery
import subprocess
import os
import h5py
import pandas as pd
import numpy as np

def run_modisco_with_params(sequences_file, attributions_file, output_file, max_seqlets, window, size, trim_size=None, seqlet_flank_size=None):
    """Run modisco with specified parameters and return the number of patterns found."""
    
    # Create output directory
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    # Build command
    cmd = [
        'modisco', 'motifs',
        '-s', sequences_file,
        '-a', attributions_file,
        '-n', str(max_seqlets),
        '-w', str(window),
        '-z', str(size),
        '-o', output_file
    ]
    
    # Add optional parameters if specified
    if trim_size is not None:
        cmd.extend(['-t', str(trim_size)])
    if seqlet_flank_size is not None:
        cmd.extend(['-f', str(seqlet_flank_size)])
    
    # Run modisco
    result = subprocess.run(cmd, capture_output=True, text=True)
    
    if result.returncode != 0:
        print(f"Error running modisco: {result.stderr}")
        return 0
    
    # Count patterns in the output
    try:
        with h5py.File(output_file, 'r') as f:
            if 'pos_patterns' in f:
                return len(f['pos_patterns'])
            else:
                return 0
    except Exception as e:
        print(f"Error reading output file {output_file}: {e}")
        return 0

def count_sequences_with_motifs(modisco_h5_path, library_df):
    """Count how many sequences have motifs based on seqlets in modisco results."""
    seq_with_motifs = set()
    
    try:
        with h5py.File(modisco_h5_path, 'r') as f:
            if 'pos_patterns' in f:
                pos_patterns = f['pos_patterns']
                for pattern_name in pos_patterns.keys():
                    pattern_group = pos_patterns[pattern_name]
                    if 'seqlets' in pattern_group:
                        seqlets = pattern_group['seqlets']
                        if 'example_idx' in seqlets:
                            example_indices = seqlets['example_idx'][:]
                            seq_with_motifs.update(example_indices)
    except Exception as e:
        print(f"Error reading modisco results: {e}")
    
    return len(seq_with_motifs)

# Test different window sizes (keeping under 248 - sequence length)
print("Testing different window sizes to explore parameter effects...")
print("="*70)

# Test window sizes from small to large (under 248)
test_configs = [
    {"name": "Very small window (w=50)", "max_seqlets": 8000, "window": 50, "size": 10},
    {"name": "Small window (w=100)", "max_seqlets": 8000, "window": 100, "size": 10},
    {"name": "Medium window (w=150)", "max_seqlets": 8000, "window": 150, "size": 10},
    {"name": "Large window (w=200)", "max_seqlets": 8000, "window": 200, "size": 10},
    {"name": "Current window (w=248)", "max_seqlets": 8000, "window": 248, "size": 10},
    {"name": "Small window + stringent (w=100, z=15)", "max_seqlets": 8000, "window": 100, "size": 15},
    {"name": "Large window + lenient (w=200, z=5)", "max_seqlets": 8000, "window": 200, "size": 5},
]

results = []

for config in test_configs:
    print(f"\nTesting: {config['name']}")
    
    # Test on DEV library
    dev_output = f"Libraries/dev_modisco/test_results_{config['name'].replace(' ', '_').replace('(', '').replace(')', '').replace(',', '_')}.h5"
    dev_patterns = run_modisco_with_params(
        'Libraries/dev_modisco/ohe.npz',
        'Libraries/dev_modisco/attr.npz',
        dev_output,
        config['max_seqlets'],
        config['window'],
        config['size'],
        config.get('trim_size'),
        config.get('seqlet_flank_size')
    )
    
    # Count sequences with motifs
    dev_sequences_with_motifs = count_sequences_with_motifs(dev_output, dev_library)
    dev_total_sequences = len(dev_library)
    dev_ratio = dev_sequences_with_motifs / dev_total_sequences if dev_total_sequences > 0 else 0
    
    # Test on HK library
    hk_output = f"Libraries/hk_modisco/test_results_{config['name'].replace(' ', '_').replace('(', '').replace(')', '').replace(',', '_')}.h5"
    hk_patterns = run_modisco_with_params(
        'Libraries/hk_modisco/ohe.npz',
        'Libraries/hk_modisco/attr.npz',
        hk_output,
        config['max_seqlets'],
        config['window'],
        config['size'],
        config.get('trim_size'),
        config.get('seqlet_flank_size')
    )
    
    # Count sequences with motifs
    hk_sequences_with_motifs = count_sequences_with_motifs(hk_output, hk_library)
    hk_total_sequences = len(hk_library)
    hk_ratio = hk_sequences_with_motifs / hk_total_sequences if hk_total_sequences > 0 else 0
    
    results.append({
        'config': config['name'],
        'window': config['window'],
        'size': config['size'],
        'dev_patterns': dev_patterns,
        'dev_sequences_with_motifs': dev_sequences_with_motifs,
        'dev_ratio': dev_ratio,
        'hk_patterns': hk_patterns,
        'hk_sequences_with_motifs': hk_sequences_with_motifs,
        'hk_ratio': hk_ratio
    })
    
    print(f"  DEV: {dev_patterns} patterns, {dev_sequences_with_motifs}/{dev_total_sequences} sequences with motifs ({dev_ratio:.3f})")
    print(f"  HK:  {hk_patterns} patterns, {hk_sequences_with_motifs}/{hk_total_sequences} sequences with motifs ({hk_ratio:.3f})")

# Display results summary
print("\n" + "="*90)
print("WINDOW SIZE EFFECTS ON MOTIF DISCOVERY:")
print("="*90)
print(f"{'Configuration':<35} {'Window':<8} {'Size':<6} {'DEV Ratio':<10} {'HK Ratio':<10} {'DEV Patterns':<12} {'HK Patterns':<12}")
print("-"*90)

for result in results:
    print(f"{result['config']:<35} {result['window']:<8} {result['size']:<6} {result['dev_ratio']:<10.3f} {result['hk_ratio']:<10.3f} {result['dev_patterns']:<12} {result['hk_patterns']:<12}")

# Analyze window size effects
print("\n" + "="*70)
print("WINDOW SIZE ANALYSIS:")
print("="*70)

# Group by window size (excluding size variations)
window_effects = {}
for result in results:
    if result['size'] == 10:  # Only standard size for window comparison
        window = result['window']
        if window not in window_effects:
            window_effects[window] = []
        window_effects[window].append(result)

print("Window Size Effects (with z=10):")
print(f"{'Window Size':<12} {'DEV Ratio':<10} {'HK Ratio':<10} {'DEV Patterns':<12} {'HK Patterns':<12}")
print("-"*60)

for window in sorted(window_effects.keys()):
    results_for_window = window_effects[window]
    if results_for_window:
        result = results_for_window[0]  # Take first result for this window
        print(f"{window:<12} {result['dev_ratio']:<10.3f} {result['hk_ratio']:<10.3f} {result['dev_patterns']:<12} {result['hk_patterns']:<12}")

# Check if ratios vary with window size
dev_ratios = [r['dev_ratio'] for r in results if r['size'] == 10]
hk_ratios = [r['hk_ratio'] for r in results if r['size'] == 10]

dev_ratio_std = np.std(dev_ratios)
hk_ratio_std = np.std(hk_ratios)

print(f"\nStandard deviation of DEV ratios (window effects): {dev_ratio_std:.6f}")
print(f"Standard deviation of HK ratios (window effects): {hk_ratio_std:.6f}")

if dev_ratio_std < 0.001 and hk_ratio_std < 0.001:
    print("WARNING: Very low variation in ratios suggests possible hardcoding!")
else:
    print("✅ Ratios vary with window size - no evidence of hardcoding")

print(f"\nKey Observations:")
print(f"- Smaller windows (50-100) may find fewer motifs (less search space)")
print(f"- Larger windows (200-248) may find more motifs (more search space)")
print(f"- Window size significantly affects motif discovery sensitivity")

# Clean up test files
print("\nCleaning up test files...")
for config in test_configs:
    test_file = f"Libraries/dev_modisco/test_results_{config['name'].replace(' ', '_').replace('(', '').replace(')', '').replace(',', '_')}.h5"
    if os.path.exists(test_file):
        os.remove(test_file)
    test_file = f"Libraries/hk_modisco/test_results_{config['name'].replace(' ', '_').replace('(', '').replace(')', '').replace(',', '_')}.h5"
    if os.path.exists(test_file):
        os.remove(test_file)


Testing different window sizes to explore parameter effects...

Testing: Very small window (w=50)
  DEV: 0 patterns, 0/302 sequences with motifs (0.000)
  HK:  0 patterns, 0/336 sequences with motifs (0.000)

Testing: Small window (w=100)
  DEV: 0 patterns, 0/302 sequences with motifs (0.000)
  HK:  1 patterns, 20/336 sequences with motifs (0.060)

Testing: Medium window (w=150)
  DEV: 3 patterns, 94/302 sequences with motifs (0.311)
  HK:  2 patterns, 103/336 sequences with motifs (0.307)

Testing: Large window (w=200)
  DEV: 4 patterns, 121/302 sequences with motifs (0.401)
  HK:  4 patterns, 115/336 sequences with motifs (0.342)

Testing: Current window (w=248)
  DEV: 4 patterns, 135/302 sequences with motifs (0.447)
  HK:  5 patterns, 152/336 sequences with motifs (0.452)

Testing: Small window + stringent (w=100, z=15)
  DEV: 0 patterns, 0/302 sequences with motifs (0.000)
  HK:  1 patterns, 22/336 sequences with motifs (0.065)

Testing: Large window + lenient (w=200, z=5)
  DEV: 

In [119]:
dev_library.iloc[4]

ce_name                                  ('Dref', 'GATAe', '+', '+', -5)
orientation                                                            +
ce_sequence                                                   TATCGATAAC
endogenous_sequence    AAAACGAAAACGTAGAGCTACCGTGCTTGGGTTCGATACTTGTGCT...
pseudo_index                                                      639485
ce_start                                                              53
ce_end                                                                62
log2_fc                                                         4.574315
p_value                                                              0.0
dataset                                                              dev
EvoAug_predictions                                               2.33395
TOMTOM_motifs                                            No motifs found
motif_names                                              No motifs found
Name: 4, dtype: object