# Reading in various PFM datasets

The closest thing to a standard we seem to have is to write PFMs to a single file, where each matrix is a header line followed by 4 lines giving the `[ACGT]` counts for bases 1..N, with fields separated either by commas or tabs. The `pfm_reader` function expects a file (or set of lines) in this form. The header is split out into its component parts (which change depending on the source file), and the count lines are split, the number fields extracted, and the result turned into a matrix (technically `list[list[int]]`).

The 'id' field I've created here is to match the IDs used in the ENCODE Footprinting Paper [Vierstra et al., Nature 2020]; it's very ad hoc and definitely not suitable for production use. It would also be possible to pull in metadata while we read in the PFMs and output that straight into the JSON; `ENSG` in particular would be very useful, as would filtering out non-human TFs.

In [1]:
from pathlib import Path
from tfbs.pfm_reader import pfm_reader, PFM

In [2]:
# Base directory under the user's home; the data files opened in the cells
# below are all resolved relative to it (pwms / "data/...").
pwms = Path.home() / "Projects" / "Enhancers" / "pwms"

In [3]:
# Build one PFM record for every matrix that pfm_reader yields from the
# JASPAR 2018 file.
jaspar_pfms = []

with open(pwms / "data/JASPAR2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt") as jaspar:
    for info, pfm in pfm_reader(jaspar):
        # The first two header fields are used as the JASPAR ID and the
        # TF symbol respectively (e.g. 'MA0006.1' and 'Ahr::Arnt').
        jaspar_id, symbol = info[0], info[1]

        jaspar_pfms.append(
            PFM(
                # Record ID is "<symbol>_<jaspar id>".
                id=symbol + "_" + jaspar_id,
                metadata={
                    'jaspar_id': jaspar_id,
                    'symbol': symbol,
                    'source': "JASPAR2018",
                },
                PFM=pfm,
            )
        )

# Display the second record as a plain dict as a quick sanity check.
jaspar_pfms[1].dict()

{'id': 'Ahr::Arnt_MA0006.1',
 'PFM': [[3, 0, 0, 0, 0, 0],
  [8, 0, 23, 0, 0, 0],
  [2, 23, 0, 23, 0, 24],
  [11, 1, 1, 1, 24, 0]],
 'metadata': {'jaspar_id': 'MA0006.1',
  'symbol': 'Ahr::Arnt',
  'source': 'JASPAR2018'}}

In [4]:
# Build one PFM record for every matrix that pfm_reader yields from the
# HOCOMOCO v11 file.
hoco_pfms = []

with open(pwms / "data/HOCOMOCOv11_core_HUMAN_mono_jaspar_format.txt") as hoco:
    # Only the first header field is used; it becomes the record ID
    # unchanged. The metadata carries nothing but the source tag.
    hoco_pfms.extend(
        PFM(id=info[0], metadata={'source': "HOCOMOCOv11"}, PFM=pfm)
        for info, pfm in pfm_reader(hoco)
    )

# Display the second record as a plain dict as a quick sanity check.
hoco_pfms[1].dict()

{'id': 'AIRE_HUMAN.H11MO.0.C',
 'PFM': [[16, 8, 6, 2, 0, 13, 16, 15, 14, 21, 16, 9, 0, 0, 9, 3, 18, 17],
  [11, 8, 6, 0, 0, 2, 4, 6, 3, 4, 2, 6, 1, 0, 4, 8, 1, 11],
  [5, 6, 8, 36, 33, 1, 1, 6, 6, 6, 3, 3, 36, 40, 10, 8, 7, 4],
  [9, 19, 21, 3, 8, 25, 20, 14, 18, 10, 20, 23, 4, 1, 18, 22, 15, 9]],
 'metadata': {'source': 'HOCOMOCOv11'}}

In [5]:
# Accumulator for the PFM records built from the Taipale file.
taipale_pfms = []

# Number of IDs handed out so far for each "<symbol>_<tf_class>" stub.
tf_counter = {}


def id_reg(symbol, tf_class):
    """Return the next sequential ID for a symbol / TF-class pair.

    The ID has the form ``<symbol>_<tf_class>_<n>``, where ``n`` starts at 1
    and increases by one every time the same pair is registered again. The
    running count is kept in the module-level ``tf_counter`` dict, which this
    function mutates.
    """
    key = "_".join((symbol, tf_class))
    tf_counter[key] = tf_counter.get(key, 0) + 1
    return f"{key}_{tf_counter[key]}"

with open(pwms / "data/taipale_pwms.csv") as taipale:
    # The first 17 lines of the CSV are header material, not matrices, so
    # they are dropped before handing the remaining lines to pfm_reader.
    all_lines = taipale.readlines()
    lines = all_lines[17:]

    for info, pfm in pfm_reader(lines, delimiter=","):
        # The first two header fields are used as the TF symbol and its class.
        symbol, tf_class = info[0], info[1]

        record = PFM(
            # id_reg numbers repeated symbol/class pairs: ..._1, ..._2, ...
            id=id_reg(symbol, tf_class),
            metadata={
                'symbol': symbol,
                'tf_class': tf_class,
                'source': "Taipale 2013",
            },
            PFM=pfm,
        )
        taipale_pfms.append(record)

# Display the second record as a plain dict as a quick sanity check.
taipale_pfms[1].dict()

{'id': 'CTCF_C2H2_1',
 'PFM': [[5423,
   2600,
   0,
   64,
   641,
   46,
   5823,
   87,
   0,
   37,
   10566,
   90,
   183,
   2,
   406,
   2660,
   4213],
  [546,
   371,
   8052,
   0,
   9591,
   10107,
   5201,
   7091,
   8872,
   49,
   290,
   1616,
   5157,
   0,
   114,
   4936,
   3590],
  [1733,
   11366,
   0,
   12480,
   8,
   0,
   207,
   0,
   1,
   23,
   1625,
   9828,
   0,
   8472,
   7463,
   143,
   2456],
  [4208,
   632,
   505,
   86,
   46,
   0,
   241,
   3888,
   0,
   14151,
   1355,
   764,
   7952,
   121,
   2221,
   9766,
   1302]],
 'metadata': {'symbol': 'CTCF', 'tf_class': 'C2H2', 'source': 'Taipale 2013'}}