# Load SELEX Matricies and Run MOODS from Python

We're going to load the SELEX matricies from Regulation into Python, and output them as nicely formatted JSON. Then we'll select the first 50 matricies, and use this to scan some enhancer sequences (also loaded into Python, not using the file system).

In [1]:
import os
import itertools
import csv
import random
import json
from multiprocessing import Pool
from pathlib import Path
from tfbs.pfm_reader import pfm_reader
from tfbs.moods import PWM, Scanner, iter_fasta


In [2]:
SELEX = Path("/home/malcolm/Data/Regulation/PWMs/SELEX")

# build an iterator that makes everything look like one big file
def load_pfms(d: Path):
    for pfm_file in d.iterdir():
        with open(pfm_file) as f:
            yield [pfm_file.stem, *f.readlines()]


pfm_iter = pfm_reader(itertools.chain.from_iterable(load_pfms(SELEX / "matrices")))

# read in SELEX PFMs and index by file name
SELEX_pfms = []

for pfm in pfm_iter:
    pfm["id"] = pfm["info"][0]
    SELEX_pfms.append(pfm)

# unfortunately it does not seem trivial to tell python not to split the PFMs by line
print(json.dumps(SELEX_pfms[0], indent=4))


{
    "info": [
        "HOXD12_HOXA3_AAB_TCTGTG40NAGA_NTAATKRSNMRTAAAN_m1_c3"
    ],
    "PFM": [
        [
            201,
            219,
            1439,
            1439,
            166,
            57,
            455,
            408,
            18,
            887,
            1105,
            17,
            1439,
            1439,
            1439,
            567
        ],
        [
            622,
            65,
            67,
            70,
            150,
            183,
            44,
            630,
            388,
            552,
            41,
            0,
            132,
            12,
            188,
            412
        ],
        [
            200,
            39,
            199,
            56,
            493,
            609,
            985,
            809,
            20,
            252,
            334,
            21,
            0,
            1,
            48,
            246
        ],
        [
            416,
            

Now we can select the PWMs we want to scan, in this case 50 random PWMs from the SELEX data:

In [3]:
chosen_pfms = random.choices(SELEX_pfms, k=50)


In [4]:

PWMs = [PWM(p["PFM"], p["id"], threshold=("pvalue", 0.0001)) for p in chosen_pfms]

Build the Scanner object:

In [5]:
s = Scanner(PWMs)

Read in the fasta sequences:

In [6]:
seqs = iter_fasta("fasta/ENCFF503GCK_chr11.fa")

def scan(fa: tuple[str, str]): return s.scan(fa)

with Pool(16) as p:
    results = list(p.map(scan, seqs))


Now we have a list of hits for each sequence:

In [7]:
results[:10]

[{'header': 'chr11:87214:87280:ID11.100666', 'hits': []},
 {'header': 'chr11:87240:87307:ID11.100675', 'hits': []},
 {'header': 'chr11:108018:108077:ID11.10073',
  'hits': [Hit(TF='FLI1_V_TCTAAT20NCG_NTCGTAAATGCN_m1_c2_elife2014', start=21, end=33, score=6.434171621506307, strand='+')]},
 {'header': 'chr11:108060:108095:ID11.10078',
  'hits': [Hit(TF='MYBL1_MAX_AX_TCCTTG40NGGT_YAACGGNNNNNNNNNNNCACGTG_m1_c2', start=9, end=32, score=6.118261961082351, strand='+')]},
 {'header': 'chr11:110913:111017:ID11.10083', 'hits': []},
 {'header': 'chr11:128422:128507:ID11.100883', 'hits': []},
 {'header': 'chr11:130594:130654:ID11.100888', 'hits': []},
 {'header': 'chr11:131180:131340:ID11.100893',
  'hits': [Hit(TF='TFAP2C_DLX3_AY_TTGCAT40NTGG_NSCCNNNRGGCANNNNNNTAATKR_m1_c3', start=88, end=112, score=7.865389803174972, strand='-')]},
 {'header': 'chr11:132586:132706:ID11.100898',
  'hits': [Hit(TF='POU1F1_AI_TACACC40NCAC_NWTATGCWAATKAG_m1_c3_Cell2013', start=20, end=34, score=7.406128538100299, st

We can output or store this however we want, in this case we can convert to chromosomal co-ordinates in a BED-like file:

NB I have no idea if the MOODS output is 0 or 1 based, we will have to test this.

In [8]:
header = ["chr", "start", "end", "name", "score", "strand", "PWM"]

print("\t".join(header))

for r in results[:50]:
    chrom, start, end, name = r['header'].split(':')
    for h in r['hits']:
        h_start = int(start) + h.start
        h_end = int(start) + h.end
        print("\t".join(map(str, [chrom, h_start, h_end, name, h.score, h.strand, h.TF])))

chr	start	end	name	score	strand	PWM
chr11	108039	108051	ID11.10073	6.434171621506307	+	FLI1_V_TCTAAT20NCG_NTCGTAAATGCN_m1_c2_elife2014
chr11	108069	108092	ID11.10078	6.118261961082351	+	MYBL1_MAX_AX_TCCTTG40NGGT_YAACGGNNNNNNNNNNNCACGTG_m1_c2
chr11	131268	131292	ID11.100893	7.865389803174972	-	TFAP2C_DLX3_AY_TTGCAT40NTGG_NSCCNNNRGGCANNNNNNTAATKR_m1_c3
chr11	132606	132620	ID11.100898	7.406128538100299	+	POU1F1_AI_TACACC40NCAC_NWTATGCWAATKAG_m1_c3_Cell2013
chr11	132669	132682	ID11.100898	8.426606045916373	-	TEAD4_HOXB13_AY_TCTATG40NTAG_CYAATAAAATGYN_m1_c3
chr11	132612	132628	ID11.100898	9.084878718667925	-	POU2F1_HOXB13_AX_TACATT40NTAC_NRMATATACCAATAAA_m2_c2
chr11	146846	146861	ID11.10107	7.347840094955376	-	FOXJ2_HOXB13_AAF_TACGCA40NAAT_NTTTATNRNTMAACA_m1_c2
chr11	175109	175137	ID11.101262	6.348688502676113	+	TEAD4_RFX5_AY_TCGGTT40NTGT_RCATTCNNNNNNNNNNNNNNNNGCAACN_m1_c2
chr11	175965	175984	ID11.101269	3.9136364221884934	+	ESRRA_AI_TGACGA30NGCA_SAAGGTCANNTSAAGGTCA_m2_c4_Cell2013
chr11	179