In [128]:
# imports 
import numpy as np
import pandas as pd
import os
import subprocess
from IPython.display import HTML


In [None]:
# Load attribution and one-hot sequences and verify lengths

def _read_npz_array(npz_path, key='arr_0'):
    """Return one array from an .npz archive and close the archive afterwards.

    `np.load` on an .npz file returns an `NpzFile` that keeps the underlying
    zip file open until it is closed. Indexing it reads the array into memory,
    so the archive can be closed before the array is returned.

    Parameters
    ----------
    npz_path : str
        Path to the .npz archive.
    key : str, optional
        Name of the array inside the archive (default 'arr_0').

    Returns
    -------
    numpy.ndarray
        The array stored under `key`.
    """
    with np.load(npz_path) as archive:
        return archive[key]


# Load DEV library and attributions
dev_library = pd.read_csv('Libraries/dev_modisco/dev_library.csv', index_col=0)
dev_attr = _read_npz_array('Libraries/dev_modisco/attr.npz')
dev_ohe = _read_npz_array('Libraries/dev_modisco/ohe.npz')

# Load HK library and attributions
hk_library = pd.read_csv('Libraries/hk_modisco/hk_library.csv', index_col=0)
hk_attr = _read_npz_array('Libraries/hk_modisco/attr.npz')
hk_ohe = _read_npz_array('Libraries/hk_modisco/ohe.npz')

# Print proof that they're the same length: the number of library rows is
# compared against the first axis of both arrays.
for heading, library, attr, ohe in (
    ("DEV Library:", dev_library, dev_attr, dev_ohe),
    ("\nHK Library:", hk_library, hk_attr, hk_ohe),
):
    print(heading)
    print(f"  Library sequences: {len(library)}")
    print(f"  Attribution shape: {attr.shape}")
    print(f"  One-hot shape: {ohe.shape}")
    print(f"  Match: {len(library) == attr.shape[0] == ohe.shape[0]}")

print("\nTotal sequences loaded:")
print(f"  DEV: {len(dev_library)} sequences")
print(f"  HK: {len(hk_library)} sequences")
print(f"  Total: {len(dev_library) + len(hk_library)} sequences")

DEV Library:
  Library sequences: 302
  Attribution shape: (302, 4, 249)
  One-hot shape: (302, 4, 249)
  Match: True

HK Library:
  Library sequences: 336
  Attribution shape: (336, 4, 249)
  One-hot shape: (336, 4, 249)
  Match: True

Total sequences loaded:
  DEV: 302 sequences
  HK: 336 sequences
  Total: 638 sequences


: 

: 

In [None]:
# Create results directories
os.makedirs('Libraries/dev_modisco/results', exist_ok=True)
os.makedirs('Libraries/hk_modisco/results', exist_ok=True)

# Both runs pass `-w 248`. The arrays loaded earlier in this notebook are 249
# positions long, so the window is one position short of the sequence length.
# NOTE(review): the previous comments here said "window size 249" and
# "same as sequence length" -- confirm that 248 is the intended value.
dev_cmd = [
    'modisco', 'motifs',
    '-s', 'Libraries/dev_modisco/ohe.npz',
    '-a', 'Libraries/dev_modisco/attr.npz',
    '-n', '2000',
    '-w', '248',
    '-o', 'Libraries/dev_modisco/results/modisco_results.h5'
]
hk_cmd = [
    'modisco', 'motifs',
    '-s', 'Libraries/hk_modisco/ohe.npz',
    '-a', 'Libraries/hk_modisco/attr.npz',
    '-n', '2000',
    '-w', '248',
    '-o', 'Libraries/hk_modisco/results/modisco_results.h5'
]

# Run modisco once per library, echoing the exit code and any captured output
modisco_runs = {}
for label, banner, cmd in (
    ("DEV", "Running modisco for DEV library...", dev_cmd),
    ("HK", "\nRunning modisco for HK library...", hk_cmd),
):
    print(banner)
    completed = subprocess.run(cmd, capture_output=True, text=True)
    print(f"{label} modisco exit code: {completed.returncode}")
    if completed.stdout:
        print(f"{label} stdout: {completed.stdout}")
    if completed.stderr:
        print(f"{label} stderr: {completed.stderr}")
    modisco_runs[label] = completed

result_dev = modisco_runs["DEV"]
result_hk = modisco_runs["HK"]

# Only announce an output file for a run that exited cleanly; a failed run is
# reported as such instead of being listed as "saved".
print("\nModisco results saved to:")
for label, cmd in (("DEV", dev_cmd), ("HK", hk_cmd)):
    exit_code = modisco_runs[label].returncode
    if exit_code == 0:
        # The output path is the value that follows the '-o' flag.
        print(f"  {label}: {cmd[cmd.index('-o') + 1]}")
    else:
        print(f"  {label}: FAILED (exit code {exit_code}); see stderr above")

Running modisco for DEV library...
DEV modisco exit code: 0

Running modisco for HK library...
HK modisco exit code: 0

Modisco results saved to:
  DEV: Libraries/dev_modisco/results/modisco_results.h5
  HK: Libraries/hk_modisco/results/modisco_results.h5


: 

: 

In [None]:
import subprocess
import os
from IPython.display import HTML

# `modisco report` command line for each library
dev_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/dev_modisco/results/modisco_results.h5',
    '-o', 'Libraries/dev_modisco/results/report/',
]
hk_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/hk_modisco/results/modisco_results.h5',
    '-o', 'Libraries/hk_modisco/results/report/',
]

# Run both reports, echoing the exit code and any captured output
report_runs = {}
for tag, banner, command in (
    ("DEV", "Running modisco report for DEV library...", dev_report_cmd),
    ("HK", "\nRunning modisco report for HK library...", hk_report_cmd),
):
    print(banner)
    finished = subprocess.run(command, capture_output=True, text=True)
    print(f"{tag} report exit code: {finished.returncode}")
    if finished.stdout:
        print(f"{tag} report stdout: {finished.stdout}")
    if finished.stderr:
        print(f"{tag} report stderr: {finished.stderr}")
    report_runs[tag] = finished

result_dev_report = report_runs["DEV"]
result_hk_report = report_runs["HK"]

# Check if reports were generated
dev_report_dir = 'Libraries/dev_modisco/results/report'
hk_report_dir = 'Libraries/hk_modisco/results/report'

for position, (heading, report_dir) in enumerate((
    ("\nDEV Modisco Report:", dev_report_dir),
    ("HK Modisco Report:", hk_report_dir),
)):
    if position:
        # Visual separator between the two listings
        print("\n" + "=" * 50 + "\n")
    print(heading)
    if not os.path.exists(report_dir):
        print("Report directory not found")
        continue
    print(f"Report directory exists: {report_dir}")
    print("Files in report directory:")
    for entry in os.listdir(report_dir):
        print(f"  - {entry}")

Running modisco report for DEV library...
DEV report exit code: 0

Running modisco report for HK library...
HK report exit code: 0

DEV Modisco Report:
Report directory exists: Libraries/dev_modisco/results/report
Files in report directory:
  - motifs.html
  - trimmed_logos


HK Modisco Report:
Report directory exists: Libraries/hk_modisco/results/report
Files in report directory:
  - motifs.html
  - trimmed_logos


: 

: 

In [None]:
# Render the DEV report. `filename=` makes IPython read the file explicitly.
# A bare string is the `data` argument and is only treated as a path when the
# file exists, so a missing report would display the path text instead of
# raising an error.
HTML(filename='Libraries/dev_modisco/results/report/motifs.html')


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev
pos_patterns.pattern_0,47,,
pos_patterns.pattern_1,47,,
pos_patterns.pattern_2,44,,


: 

: 

In [None]:
# Render the HK report. `filename=` makes IPython read the file explicitly.
# A bare string is the `data` argument and is only treated as a path when the
# file exists, so a missing report would display the path text instead of
# raising an error.
HTML(filename='Libraries/hk_modisco/results/report/motifs.html')

pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev
pos_patterns.pattern_0,96,,


: 

: 

In [None]:
# Download JASPAR database for TOMTOM comparison
import subprocess
import os

# Create CEseek_data directory if it doesn't exist
os.makedirs('CEseek_data', exist_ok=True)

jaspar_meme_path = 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
# wget creates/truncates its -O target as soon as it starts, even when the
# transfer then fails. Downloading to a scratch file means that
#   (a) an empty leftover file is not reported as a successful download, and
#   (b) a previously downloaded database is not clobbered by a failed retry.
jaspar_partial_path = jaspar_meme_path + '.part'

print("Downloading JASPAR database...")
download_cmd = [
    'wget', '--quiet',
    'https://jaspar.genereg.net/download/data/2022/CORE/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt',
    '-O', jaspar_partial_path
]

result_download = subprocess.run(download_cmd, capture_output=True, text=True)
print(f"Download exit code: {result_download.returncode}")
if result_download.stderr:
    print(f"Download stderr: {result_download.stderr}")

# Success needs a clean exit AND a non-empty payload, not just an existing file
download_ok = (
    result_download.returncode == 0
    and os.path.isfile(jaspar_partial_path)
    and os.path.getsize(jaspar_partial_path) > 0
)
if download_ok:
    # Move the finished download into its final location
    os.replace(jaspar_partial_path, jaspar_meme_path)
    print("JASPAR database downloaded successfully")
else:
    # Discard the partial/empty scratch file
    if os.path.exists(jaspar_partial_path):
        os.remove(jaspar_partial_path)
    print("Failed to download JASPAR database")

Downloading JASPAR database...
Download exit code: 0
JASPAR database downloaded successfully


: 

: 

In [None]:
# `modisco report` command lines that also pass the JASPAR MEME file (-m),
# one per library
dev_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/dev_modisco/results/modisco_results.h5',
    '-o', 'Libraries/dev_modisco/results/report/',
    '-s', 'Libraries/dev_modisco/results/report/',
    '-m', 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt',
]
hk_report_cmd = [
    'modisco', 'report',
    '-i', 'Libraries/hk_modisco/results/modisco_results.h5',
    '-o', 'Libraries/hk_modisco/results/report/',
    '-s', 'Libraries/hk_modisco/results/report/',
    '-m', 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt',
]

# Run both reports, echoing the exit code and any captured output
tomtom_runs = {}
for tag, banner, command in (
    ("DEV", "Running modisco report with TOMTOM for DEV library...", dev_report_cmd),
    ("HK", "\nRunning modisco report with TOMTOM for HK library...", hk_report_cmd),
):
    print(banner)
    finished = subprocess.run(command, capture_output=True, text=True)
    print(f"{tag} report with TOMTOM exit code: {finished.returncode}")
    if finished.stdout:
        print(f"{tag} report stdout: {finished.stdout}")
    if finished.stderr:
        print(f"{tag} report stderr: {finished.stderr}")
    tomtom_runs[tag] = finished

result_dev_report = tomtom_runs["DEV"]
result_hk_report = tomtom_runs["HK"]

Running modisco report with TOMTOM for DEV library...
DEV report with TOMTOM exit code: 0

Running modisco report with TOMTOM for HK library...
HK report with TOMTOM exit code: 0


: 

: 

In [None]:
# Display the updated reports with TOMTOM comparisons
from IPython.display import HTML

print("DEV Modisco Report with TOMTOM:")
# `filename=` reads the report from disk explicitly. A bare string is the `data`
# argument and is only treated as a path when the file exists.
HTML(filename='Libraries/dev_modisco/results/report/motifs.html')

DEV Modisco Report with TOMTOM:


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
pos_patterns.pattern_0,47,,,MA0754.2,1.0,,MA0755.1,1.0,,MA0660.1,1.0,
pos_patterns.pattern_1,47,,,MA1141.1,0.000273,,MA1142.1,0.000273,,MA1988.1,0.000273,
pos_patterns.pattern_2,44,,,MA0036.3,0.325289,,MA0037.4,0.325289,,MA0482.2,0.325289,


: 

: 

In [None]:
print("HK Modisco Report with TOMTOM:")
# `filename=` reads the report from disk explicitly. A bare string is the `data`
# argument and is only treated as a path when the file exists.
HTML(filename='Libraries/hk_modisco/results/report/motifs.html')

HK Modisco Report with TOMTOM:


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
pos_patterns.pattern_0,96,,,MA0755.1,0.260965,,MA0756.2,0.283614,,MA1718.1,1.0,


: 

: 

In [None]:
import re
import pandas as pd
from bs4 import BeautifulSoup

def extract_tomtom_from_html(html_file, max_qval=None):
    """Extract the best TOMTOM match for each pattern in a modisco HTML report.

    The first <table> in the file is read row by row (the header row is
    skipped). Cell 0 holds the pattern name. The match identifiers are read
    from cells 4, 7 and 10; in the reports rendered in this notebook each match
    is followed by its q-value cell and its logo cell.

    Parameters
    ----------
    html_file : str
        Path to the `motifs.html` report.
    max_qval : float, optional
        When given, a match is only accepted if the q-value in the cell right
        after it parses as a number that is <= `max_qval`. The default (None)
        accepts the first non-empty match regardless of its q-value.

    Returns
    -------
    dict
        Pattern name -> identifier of the first accepted match. Patterns
        without an accepted match map to their own name. The dict is empty if
        the file contains no table.
    """
    with open(html_file, 'r') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')

    tomtom_motifs = {}
    if table is None:
        return tomtom_motifs

    for row in table.find_all('tr')[1:]:  # Skip header row
        cells = row.find_all('td')
        if len(cells) < 4:
            continue

        pattern_name = cells[0].text.strip()
        best_match = None

        for i in (4, 7, 10):  # Cells holding the TOMTOM match identifiers
            if i >= len(cells):
                break
            match_text = cells[i].text.strip()
            if not match_text or match_text == '-':
                continue
            if max_qval is not None:
                # The q-value sits in the cell right after the match; a match
                # whose q-value is missing or unparsable cannot be verified and
                # is skipped.
                try:
                    qval = float(cells[i + 1].text.strip())
                except (IndexError, ValueError):
                    continue
                if qval > max_qval:
                    continue
            best_match = match_text
            break

        # Fall back to the pattern's own name when nothing was accepted
        tomtom_motifs[pattern_name] = best_match if best_match is not None else pattern_name

    return tomtom_motifs

# Read the best TOMTOM match per pattern out of each library's HTML report
print("Extracting TOMTOM motif names from HTML reports...")
dev_tomtom_motifs = extract_tomtom_from_html('Libraries/dev_modisco/results/report/motifs.html')
hk_tomtom_motifs = extract_tomtom_from_html('Libraries/hk_modisco/results/report/motifs.html')

print(f"DEV TOMTOM motifs: {dev_tomtom_motifs}")
print(f"HK TOMTOM motifs: {hk_tomtom_motifs}")

# Same mappings keyed by the short pattern name ('pos_patterns.' removed)
dev_tomtom_simple = {
    full_name.replace('pos_patterns.', ''): match
    for full_name, match in dev_tomtom_motifs.items()
}
hk_tomtom_simple = {
    full_name.replace('pos_patterns.', ''): match
    for full_name, match in hk_tomtom_motifs.items()
}

print(f"DEV TOMTOM simple: {dev_tomtom_simple}")
print(f"HK TOMTOM simple: {hk_tomtom_simple}")

def replace_with_tomtom_names(motif_string, tomtom_mapping):
    """Replace pattern names with TOMTOM motif names.

    Parameters
    ----------
    motif_string : str
        Either the sentinel 'No motifs found' or '; '-separated entries such
        as 'pattern_1(104-154,fwd)'. The pattern name is everything before the
        first '('.
    tomtom_mapping : dict
        Pattern name -> TOMTOM motif name. Names missing from the mapping are
        kept unchanged.

    Returns
    -------
    str
        The sentinel unchanged, or the de-duplicated motif names joined with
        '; ' in order of first appearance. Keeping first-appearance order
        (instead of iterating a set) makes the result identical across rows
        and kernel restarts.
    """
    if motif_string == 'No motifs found':
        return 'No motifs found'

    # Extract the pattern name (everything before the first parenthesis)
    pattern_names = (entry.split('(')[0] for entry in motif_string.split('; '))

    # Replace with TOMTOM name if available, otherwise keep the pattern name
    motif_names = [tomtom_mapping.get(name, name) for name in pattern_names]

    # dict.fromkeys removes duplicates while preserving insertion order
    return '; '.join(dict.fromkeys(motif_names))

# Translate each sequence's pattern hits into TOMTOM motif names
dev_library['motif_names'] = dev_library['TOMTOM_motifs'].apply(
    replace_with_tomtom_names, args=(dev_tomtom_simple,)
)
hk_library['motif_names'] = hk_library['TOMTOM_motifs'].apply(
    replace_with_tomtom_names, args=(hk_tomtom_simple,)
)

# Motif names of the rows that carry at least one motif
dev_motif_names = dev_library.loc[dev_library['motif_names'] != 'No motifs found', 'motif_names']
hk_motif_names = hk_library.loc[hk_library['motif_names'] != 'No motifs found', 'motif_names']
motif_names_by_library = (("DEV", dev_motif_names), ("HK", hk_motif_names))

# Display summary
for label, names in motif_names_by_library:
    print(f"\n{label} Library motif names summary:")
    print(f"Sequences with motifs: {len(names)}")
    print(f"Unique motif names found: {set('; '.join(names).split('; '))}")

# Show some examples (first three annotated rows; nothing is printed below the
# heading when a library has no annotated rows)
for label, names in motif_names_by_library:
    print(f"\nExample {label} sequences with TOMTOM motif names:")
    for idx, motif_name in names.head(3).items():
        print(f"  Sequence {idx}: {motif_name}")

# Save updated DataFrames
for library, csv_path in (
    (dev_library, 'Libraries/dev_modisco/dev_library_with_motifs.csv'),
    (hk_library, 'Libraries/hk_modisco/hk_library_with_motifs.csv'),
):
    library.to_csv(csv_path, index=True)

print("\nUpdated DataFrames saved with TOMTOM motif names")

Extracting TOMTOM motif names from HTML reports...
DEV TOMTOM motifs: {'pos_patterns.pattern_0': 'MA0754.2', 'pos_patterns.pattern_1': 'MA1141.1', 'pos_patterns.pattern_2': 'MA0036.3'}
HK TOMTOM motifs: {'pos_patterns.pattern_0': 'MA0755.1'}
DEV TOMTOM simple: {'pattern_0': 'MA0754.2', 'pattern_1': 'MA1141.1', 'pattern_2': 'MA0036.3'}
HK TOMTOM simple: {'pattern_0': 'MA0755.1'}

DEV Library motif names summary:
Sequences with motifs: 117
Unique motif names found: {'MA0036.3', 'MA0754.2', 'MA1141.1'}

HK Library motif names summary:
Sequences with motifs: 90
Unique motif names found: {'MA0755.1'}

Example DEV sequences with TOMTOM motif names:
  Sequence 208: MA0036.3
  Sequence 135: MA1141.1
  Sequence 275: MA1141.1

Example HK sequences with TOMTOM motif names:
  Sequence 3: MA0755.1
  Sequence 5: MA0755.1
  Sequence 6: MA0755.1

Updated DataFrames saved with TOMTOM motif names


: 

: 

In [None]:
# Five DEV rows with the highest EvoAug prediction
dev_library.sort_values('EvoAug_predictions', ascending=False).iloc[:5]


Unnamed: 0,ce_name,orientation,ce_sequence,endogenous_sequence,pseudo_index,log2_fc,p_value,dataset,EvoAug_predictions,TOMTOM_motifs,motif_names
281,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAT,TATATCTGTAGCTCAACACCTGACTGCTATCGAATGTTTTGGTATTTATTTTTATCGAAGTATTTTGCCGAAATTTGTGATTTGCCAGCCCTATTTCAATGTAAATATATCGATAATCGCCCGATTTTACTGAAAAATATTTCGATAAGAGCATCGATATTTCAAACAAATATCGACCCATTGACATGCGTCAGGTTTGTCCATCCCTAACAAAATCGTAGTACGAAATATCGAAAACGAAACACATCA,5585,4.574315,5.067241e-114,dev,5.307164,No motifs found,No motifs found
59,"('Jra', 'Jra', '-', '+', 10)",+,AGTGACTAATCTACTTACAAGTGAATCACG,AATCAGTTGAAATTTATACCTCATCGTCGACAATGCGAGTCGTCGCGTTCCCATTGTTTACAAAATGTACCATAAAGATTGCGAGCGAGAGAATTTGATTAGTGACTAATCTACTTACAAGTGAATCACGCGATCCAAAAATATTCATACAGAGCATTTTATCATTCGATTCGCTTTGCACCGAAACTCGGGACATTTCCACTGGGTGGGCAATCAATCAAGACCCATTTCGCCTAAACCTAAACATAT,2590,5.321928,1.763198e-07,dev,5.268528,"pattern_1(104-154,fwd); pattern_1(76-126,rev)",MA1141.1
167,"('Dref', 'Dref', '-', '+', 4)",+,TATCGATATCACTATCGATA,TTATTAGCCGGCAAAATTGGTGGGTAAACAACAAATATTTGGACAAAAGCGCAGACAAGAACTGATAAGAGGATCAGGGATGACGAATTTCAAATGGTTGCGCCAGAGATGACAGGGTATCGATATCACTATCGATAGACTTATCGAAGACGGCTTAGCACTTATACCTACTCTGCAGCCCTGGTCGGCAGAAAAACAATAATCAGCTGATGCCATCGCTTATTTCCAACAATTTGCACCGCAAAAGTG,19539,38.219281,4.2305609999999995e-34,dev,5.127045,"pattern_0(104-154,rev)",MA0754.2
226,"('Jra', 'twi', '+', '-', 4)",+,GTGAATCACATCGACATATGT,GACTTTTCATTCCATTGTTCTTGGTTCTCGGAAGTATCAACTTATTGCCGAACTCGTGATTTAGTCTTATCGTTGTTTTTTTCTGGTTTTAGATATACATATGTTTTCTTTTATGCTAACTTCTGTCAGTGAATCACATCGACATATGTATTTGCATTGATATCTTGCGGATTTCCGAATGCCAGTAACGACGATTCACTGTACTTGGTTGGCGGAGCGCAATCTTTGTTCGAGAGAGTTCGTTTGCCA,626,34.219281,0.008262587,dev,5.113298,"pattern_1(112-162,fwd)",MA1141.1
61,"('Dref', 'Max', '-', '+', 6)",+,TATCGATATTTGATAGCACGTG,TAATTAAGAGCAACAACAACAAACTTAGTAATGACACCCAAATAGAGATGAGCGATAGTTAATCACCTGTTATCAGCGCACAGCCAACGCTCATCCCTAAGCCTGCTGAGCAGCTGTTTCGACAATCGCTATCGATATATCTTGTTTAAACTTGCTATCGATATTTGATAGCACGTGTTGGGTTGTTCGCAATTTACGAATTGTATCTTTGTTAGTTATCCGCAATTTACATTTAAAACGAAGTAGATT,10613,35.219281,6.820836e-05,dev,5.038004,"pattern_0(100-150,fwd); pattern_0(142-192,rev)",MA0754.2


: 

: 

In [None]:
import re

def extract_tf_names_from_jaspar(jaspar_file):
    """Extract TF names from the MOTIF lines of a JASPAR MEME-format file.

    Two MOTIF line layouts are recognised:

    * ``MOTIF MA0754.2 MA0754.2.CUX1`` (the name is prefixed with the ID)
    * ``MOTIF MA0004.1 Arnt``          (bare name)

    Only lines that start with ``MOTIF`` followed by an ``MA<digits>.<digits>``
    identifier are considered; all other lines are ignored.

    Parameters
    ----------
    jaspar_file : str
        Path to the MEME-format motif file.

    Returns
    -------
    dict
        Matrix ID (e.g. 'MA0754.2') -> TF name (e.g. 'CUX1'), with surrounding
        whitespace removed from the name. If an ID occurs more than once, the
        last occurrence wins.
    """
    # The optional "<id>." prefix in front of the name covers both layouts.
    motif_line = re.compile(r'^MOTIF\s+(MA\d+\.\d+)\s+(?:MA\d+\.\d+\.)?(\S.*?)\s*$')

    tf_names = {}
    with open(jaspar_file, 'r') as f:
        # Stream the file line by line; only MOTIF header lines matter here.
        for line in f:
            hit = motif_line.match(line)
            if hit:
                tf_names[hit.group(1)] = hit.group(2)

    return tf_names

# Build the MA code -> TF name table from the JASPAR file in CEseek_data
print("Extracting TF names from JASPAR database...")
jaspar_file_path = 'CEseek_data/JASPAR2022_CORE_vertebrates_non-redundant_pfms_meme.txt'
tf_names = extract_tf_names_from_jaspar(jaspar_file_path)

# Report the TF name for each MA code used in this notebook
our_ma_codes = ['MA0754.2', 'MA1141.1', 'MA0036.3', 'MA0755.1']
print("\nTF names for our MA codes:")
for code in our_ma_codes:
    print(f"  {code}: {tf_names.get(code, 'Not found')}")

def replace_ma_codes_with_tf_names(motif_string, tf_mapping):
    """Replace MA codes with actual TF names.

    Parameters
    ----------
    motif_string : str
        Either the sentinel 'No motifs found' or '; '-separated entries that
        are, or contain, an MA code.
    tf_mapping : dict
        MA code -> TF name.

    Returns
    -------
    str
        The sentinel unchanged, or the de-duplicated entries joined with '; '
        in order of first appearance, with MA codes replaced by TF names.
        Entries that contain no known code are kept as they are.

    Notes
    -----
    * An entry that is exactly a known code is resolved by direct lookup.
    * Otherwise the longest known code contained in the entry is replaced, so
      'MA0036.10' is never rewritten through the shorter 'MA0036.1'.
    * First-appearance order (instead of iterating a set) makes the result
      identical across rows and kernel restarts.
    """
    if motif_string == 'No motifs found':
        return 'No motifs found'

    resolved = []
    for motif in motif_string.split('; '):
        if motif in tf_mapping:
            # Common case: the entry is exactly an MA code
            motif = tf_mapping[motif]
        else:
            # The entry merely contains a code (e.g. with an annotation suffix)
            embedded = [ma_code for ma_code in tf_mapping if ma_code in motif]
            if embedded:
                ma_code = max(embedded, key=len)
                motif = motif.replace(ma_code, tf_mapping[ma_code])
        resolved.append(motif)

    # dict.fromkeys removes duplicates while preserving insertion order
    return '; '.join(dict.fromkeys(resolved))

# Swap the MA codes in motif_names for the corresponding TF names
print("\nUpdating motif names with actual TF names...")
dev_library['motif_names'] = dev_library['motif_names'].apply(
    replace_ma_codes_with_tf_names, args=(tf_names,)
)
hk_library['motif_names'] = hk_library['motif_names'].apply(
    replace_ma_codes_with_tf_names, args=(tf_names,)
)

# Motif names of the rows that carry at least one motif
dev_motif_names = dev_library.loc[dev_library['motif_names'] != 'No motifs found', 'motif_names']
hk_motif_names = hk_library.loc[hk_library['motif_names'] != 'No motifs found', 'motif_names']
motif_names_by_library = (("DEV", dev_motif_names), ("HK", hk_motif_names))

# Display summary
for label, names in motif_names_by_library:
    print(f"\n{label} Library motif names summary (with TF names):")
    print(f"Sequences with motifs: {len(names)}")
    print(f"Unique motif names found: {set('; '.join(names).split('; '))}")

# Show some examples (first three annotated rows; nothing is printed below the
# heading when a library has no annotated rows)
for label, names in motif_names_by_library:
    print(f"\nExample {label} sequences with TF names:")
    for idx, motif_name in names.head(3).items():
        print(f"  Sequence {idx}: {motif_name}")

# Save updated DataFrames
for library, csv_path in (
    (dev_library, 'Libraries/dev_modisco/dev_library_with_motifs.csv'),
    (hk_library, 'Libraries/hk_modisco/hk_library_with_motifs.csv'),
):
    library.to_csv(csv_path, index=True)

print("\nUpdated DataFrames saved with actual TF names")

Extracting TF names from JASPAR database...

TF names for our MA codes:
  MA0754.2: CUX1
  MA1141.1: FOS::JUND
  MA0036.3: GATA2
  MA0755.1: CUX2

Updating motif names with actual TF names...

DEV Library motif names summary (with TF names):
Sequences with motifs: 117
Unique motif names found: {'CUX1', 'GATA2', 'FOS::JUND'}

HK Library motif names summary (with TF names):
Sequences with motifs: 90
Unique motif names found: {'CUX2'}

Example DEV sequences with TF names:
  Sequence 208: GATA2
  Sequence 135: FOS::JUND
  Sequence 275: FOS::JUND

Example HK sequences with TF names:
  Sequence 3: CUX2
  Sequence 5: CUX2
  Sequence 6: CUX2

Updated DataFrames saved with actual TF names


: 

: 

In [None]:
# Five DEV rows with the highest EvoAug prediction, now showing TF names
dev_library.sort_values('EvoAug_predictions', ascending=False).iloc[:5]

Unnamed: 0,ce_name,orientation,ce_sequence,endogenous_sequence,pseudo_index,log2_fc,p_value,dataset,EvoAug_predictions,TOMTOM_motifs,motif_names
281,"('Dref', 'GATAe', '+', '+', -5)",+,TATCGATAAT,TATATCTGTAGCTCAACACCTGACTGCTATCGAATGTTTTGGTATTTATTTTTATCGAAGTATTTTGCCGAAATTTGTGATTTGCCAGCCCTATTTCAATGTAAATATATCGATAATCGCCCGATTTTACTGAAAAATATTTCGATAAGAGCATCGATATTTCAAACAAATATCGACCCATTGACATGCGTCAGGTTTGTCCATCCCTAACAAAATCGTAGTACGAAATATCGAAAACGAAACACATCA,5585,4.574315,5.067241e-114,dev,5.307164,No motifs found,No motifs found
59,"('Jra', 'Jra', '-', '+', 10)",+,AGTGACTAATCTACTTACAAGTGAATCACG,AATCAGTTGAAATTTATACCTCATCGTCGACAATGCGAGTCGTCGCGTTCCCATTGTTTACAAAATGTACCATAAAGATTGCGAGCGAGAGAATTTGATTAGTGACTAATCTACTTACAAGTGAATCACGCGATCCAAAAATATTCATACAGAGCATTTTATCATTCGATTCGCTTTGCACCGAAACTCGGGACATTTCCACTGGGTGGGCAATCAATCAAGACCCATTTCGCCTAAACCTAAACATAT,2590,5.321928,1.763198e-07,dev,5.268528,"pattern_1(104-154,fwd); pattern_1(76-126,rev)",FOS::JUND
167,"('Dref', 'Dref', '-', '+', 4)",+,TATCGATATCACTATCGATA,TTATTAGCCGGCAAAATTGGTGGGTAAACAACAAATATTTGGACAAAAGCGCAGACAAGAACTGATAAGAGGATCAGGGATGACGAATTTCAAATGGTTGCGCCAGAGATGACAGGGTATCGATATCACTATCGATAGACTTATCGAAGACGGCTTAGCACTTATACCTACTCTGCAGCCCTGGTCGGCAGAAAAACAATAATCAGCTGATGCCATCGCTTATTTCCAACAATTTGCACCGCAAAAGTG,19539,38.219281,4.2305609999999995e-34,dev,5.127045,"pattern_0(104-154,rev)",CUX1
226,"('Jra', 'twi', '+', '-', 4)",+,GTGAATCACATCGACATATGT,GACTTTTCATTCCATTGTTCTTGGTTCTCGGAAGTATCAACTTATTGCCGAACTCGTGATTTAGTCTTATCGTTGTTTTTTTCTGGTTTTAGATATACATATGTTTTCTTTTATGCTAACTTCTGTCAGTGAATCACATCGACATATGTATTTGCATTGATATCTTGCGGATTTCCGAATGCCAGTAACGACGATTCACTGTACTTGGTTGGCGGAGCGCAATCTTTGTTCGAGAGAGTTCGTTTGCCA,626,34.219281,0.008262587,dev,5.113298,"pattern_1(112-162,fwd)",FOS::JUND
61,"('Dref', 'Max', '-', '+', 6)",+,TATCGATATTTGATAGCACGTG,TAATTAAGAGCAACAACAACAAACTTAGTAATGACACCCAAATAGAGATGAGCGATAGTTAATCACCTGTTATCAGCGCACAGCCAACGCTCATCCCTAAGCCTGCTGAGCAGCTGTTTCGACAATCGCTATCGATATATCTTGTTTAAACTTGCTATCGATATTTGATAGCACGTGTTGGGTTGTTCGCAATTTACGAATTGTATCTTTGTTAGTTATCCGCAATTTACATTTAAAACGAAGTAGATT,10613,35.219281,6.820836e-05,dev,5.038004,"pattern_0(100-150,fwd); pattern_0(142-192,rev)",CUX1


: 

: 

In [None]:
# Distinct motif-name combinations present in the DEV library
pd.unique(dev_library['motif_names'])


array(['No motifs found', 'GATA2', 'FOS::JUND', 'CUX1; GATA2', 'CUX1',
       'GATA2; FOS::JUND'], dtype=object)

: 

: 

In [None]:
# Do the first two GATA2-only DEV sequences have the same length?
gata2_sequences = dev_library.loc[dev_library['motif_names'] == 'GATA2', 'endogenous_sequence']
len(gata2_sequences.iloc[0]) == len(gata2_sequences.iloc[1])


True

: 

: 