# Reading in various PFM datasets

The closest thing to a standard we seem to have is to write PFMs to a single file, where each PFM is a header line followed by 4 lines giving the `[ACGT]` counts for positions 1..N, with fields separated by either commas or tabs. The `pfm_reader` function expects a file (or set of lines) in this form. The header is split into its component parts (which vary depending on the source file), and each count line is split so that its numeric fields can be extracted and turned into a matrix (technically a `list[list[int]]`).

The 'id' field I've created here is intended to match the IDs used in the ENCODE Footprinting Paper [Vierstra et al., Nature 2020]; it is very ad hoc and definitely not suitable for production use. It would also be possible to pull in metadata while we read in the PFMs and output that straight into the JSON; `ENSG` in particular would be very useful, as would filtering out non-human TFs.

In [1]:
import json
import collections
from tfbs.pfm_reader import pfm_reader


In [2]:
# Placeholder for the combined PFM collection. It is a list (not a dict) so
# that its type matches the concatenated list it is rebound to in the final cell.
all_pfms = []

In [3]:
# Collect every PFM from the JASPAR 2018 vertebrate file, tagging each record
# with its accession, TF symbol, a combined id and the source name.
jaspar_pfms = []

with open("data/JASPAR2018_CORE_vertebrates_non-redundant_pfms_jaspar.txt") as jaspar:
    for pfm in pfm_reader(jaspar):
        # Header fields: info[0] is the JASPAR accession, info[1] the symbol.
        pfm.update(jaspar_id=pfm['info'][0], symbol=pfm['info'][1])
        # The id is "<symbol>_<accession>", e.g. "Arnt_MA0004.1".
        pfm.update(id=f"{pfm['symbol']}_{pfm['jaspar_id']}", source="JASPAR2018")
        jaspar_pfms.append(pfm)

# Show the first record as a sanity check.
print(json.dumps(jaspar_pfms[0:1], indent=4))

[
    {
        "info": [
            "MA0004.1",
            "Arnt"
        ],
        "PFM": [
            [
                4,
                19,
                0,
                0,
                0,
                0
            ],
            [
                16,
                0,
                20,
                0,
                0,
                0
            ],
            [
                0,
                1,
                0,
                20,
                0,
                20
            ],
            [
                0,
                0,
                0,
                0,
                20,
                0
            ]
        ],
        "jaspar_id": "MA0004.1",
        "symbol": "Arnt",
        "id": "Arnt_MA0004.1",
        "source": "JASPAR2018"
    }
]


In [4]:
# Collect every PFM from the HOCOMOCO v11 human core file. The single header
# field (e.g. "AHR_HUMAN.H11MO.0.B") is used directly as the record id.
hoco_pfms = []

with open("data/HOCOMOCOv11_core_HUMAN_mono_jaspar_format.txt") as hoco:
    for pfm in pfm_reader(hoco):
        pfm.update(id=pfm['info'][0], source="HOCOMOCOv11")
        hoco_pfms.append(pfm)

# Show the first record as a sanity check.
print(json.dumps(hoco_pfms[0:1], indent=4))

[
    {
        "info": [
            "AHR_HUMAN.H11MO.0.B"
        ],
        "PFM": [
            [
                41,
                11,
                22,
                3,
                1,
                3,
                0,
                0,
                43
            ],
            [
                18,
                12,
                44,
                1,
                150,
                1,
                3,
                0,
                67
            ],
            [
                56,
                35,
                21,
                146,
                1,
                149,
                1,
                154,
                16
            ],
            [
                39,
                96,
                67,
                4,
                2,
                1,
                150,
                0,
                28
            ]
        ],
        "id": "AHR_HUMAN.H11MO.0.B",
        "source": "HOCOMOCOv11"
    }
]


In [5]:
# Collect every PFM from the Taipale CSV, tagging each record with its TF
# symbol, TF class, a unique id and the source name.
taipale_pfms = []

# Number of leading header lines in the Taipale CSV to skip before the first
# PFM record.
TAIPALE_HEADER_LINES = 17

# Only the raw lines are needed, so the file is closed before parsing starts.
with open("data/taipale_pwms.csv") as taipale:
    lines = taipale.readlines()[TAIPALE_HEADER_LINES:]

# The same symbol/class pair can appear more than once, so a running count per
# pair is appended to keep the generated ids distinct.
tf_counter = collections.Counter()
for pfm in pfm_reader(lines, delimiter=","):
    pfm['symbol'] = pfm['info'][0]
    pfm['tf_class'] = pfm['info'][1]
    # Named `base_id` rather than `id` so the builtin id() is not shadowed
    # for the rest of the notebook.
    base_id = f"{pfm['symbol']}_{pfm['tf_class']}"
    tf_counter[base_id] += 1
    pfm['id'] = f"{base_id}_{tf_counter[base_id]}"
    pfm['source'] = "Taipale 2013"
    taipale_pfms.append(pfm)

# Show the first record as a sanity check.
print(json.dumps(taipale_pfms[:1], indent=4))

[
    {
        "info": [
            "BCL6B",
            "C2H2",
            "DBD",
            "TGCGGG20NGA",
            "AC",
            "TGCTTTCTAGGAATTMM",
            "2",
            "4",
            "monomeric",
            "",
            "yes",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            ""
        ],
        "PFM": [
            [
                0,
                3,
                0,
                0,
                0,
                0,
                0,
                0,
                280,
                0,
                0,
                289,
                290,
                0,
                0,
                113,
                183
            ],
            [
                19,
                0,
                367,
                0,
                0,
                0,
               

In [6]:
# Merge the three sources into one list, in JASPAR, HOCOMOCO, Taipale order,
# and write the combined collection out as indented JSON.
all_pfms = [*jaspar_pfms, *hoco_pfms, *taipale_pfms]

with open("json/all_pfms.json", 'w') as f:
    f.write(json.dumps(all_pfms, indent=4))