In [25]:
# Build `araport`: AGI -> (meta, location, sequence) for every FASTA record.

# Each line of the headers file holds a number one greater than the index of a
# FASTA header line in `lines` below, hence the "- 1".
with open("output/araport11-headers.txt") as f:
    araport_header_ls = [int(line) - 1 for line in f]

# Location table: rows are split on whitespace and only rows with exactly two
# fields are kept, as key -> value pairs (the keys are looked up by AGI below).
with open("datasets/tair-protein-locations.csv") as f:
    araport_locations = dict(
        kv for kv in (tuple(line.split()) for line in f) if len(kv) == 2
    )

with open("datasets/araport11.fasta") as f:
    lines = f.readlines()

araport = {}
for i, header_l in enumerate(araport_header_ls):
    # A record's sequence runs up to the next header or, for the last record,
    # to the end of the file. The end bound must be the number of FASTA lines,
    # not the number of records parsed so far, otherwise the last sequence is
    # cut off.
    if i + 1 < len(araport_header_ls):
        next_header_l = araport_header_ls[i + 1]
    else:
        next_header_l = len(lines)

    # Header layout handled here: ">" + AGI + " " + optional "| " + free text.
    header = lines[header_l].removeprefix(">").replace("\n", "")
    agi, _, meta = header.partition(" ")
    meta = meta.removeprefix("| ")

    # Sequence lines are concatenated with their newlines removed.
    sequence = "".join(lines[header_l + 1:next_header_l]).replace("\n", "")

    # Raises KeyError when the AGI has no entry in the location table.
    location = araport_locations[agi]
    araport[agi] = (meta, location, sequence)

In [18]:
# Write every parsed protein as one tab-separated row:
# AGI, location, meta wrapped in double quotes, sequence.
# Rows are joined with "\n"; no trailing newline is written.
with open("datasets/araport11.csv", "w+") as f:
    lines = [
        "\t".join((agi, location, '"' + meta + '"', sequence))
        for agi, (meta, location, sequence) in araport.items()
    ]
    content = "\n".join(lines)
    f.write(content)

In [67]:
# helper functions

from bisect import bisect
from datetime import datetime
import os

def identify_protein(match_line, araport):
    """Return the `araport` entry of the record that contains `match_line`.

    The record is located through the module-level `araport_header_ls`: the
    last header line number that is <= `match_line` is selected and then used
    as the key into `araport`.

    Args:
        match_line: line number of a match, on the same numbering base as the
            values in `araport_header_ls`.
            NOTE(review): `araport_header_ls` is built 0-based; confirm that
            callers do not pass 1-based line numbers.
        araport: container indexable by header line number.
            NOTE(review): the AGI-keyed dict of the same name built while
            loading the FASTA file would not work here; confirm what callers
            pass.

    Returns:
        Whatever `araport` stores under the selected header line number.

    Raises:
        IndexError: if `araport_header_ls` is empty or `match_line` lies
            before the first header. Without this check the index -1 would
            silently select the last record.
    """
    # bisect assumes `araport_header_ls` is sorted in ascending order.
    header_idx = bisect(araport_header_ls, match_line) - 1
    if header_idx < 0:
        raise IndexError(
            f"line {match_line} does not belong to any record: "
            "it precedes the first header"
        )
    return araport[araport_header_ls[header_idx]]

def parse_agi_descr(header):
    """Split a FASTA header line into `(agi, description)`.

    One leading '>' and one trailing newline are dropped if present. The rest
    is cut at the first ' | '; without that separator the description is the
    empty string.
    """
    text = header[1:] if header.startswith('>') else header
    if text.endswith('\n'):
        text = text[:-1]

    agi, _sep, descr = text.partition(' | ')
    return agi, descr

def group_agis(agis_with_meta):
    """Group AGI identifiers by the part before their first '.'.

    Args:
        agis_with_meta: mapping of full identifier ("<core>.<suffix>") to its
            metadata.

    Returns:
        dict mapping each core to a list of `(suffix, meta)` tuples. Cores and
        their entries appear in the iteration order of the input. An
        identifier without a '.' gets the empty string as its suffix; for one
        with several dots, everything after the first dot is the suffix.
    """
    grouped = {}

    # Iterate the argument itself (not a global) so the function is self-contained.
    for agi, meta in agis_with_meta.items():
        # partition always yields exactly three parts, so identifiers with
        # zero or several dots cannot break the unpacking.
        agi_core, _, agi_suffix = agi.partition('.')
        grouped.setdefault(agi_core, []).append((agi_suffix, meta))

    return grouped

def save_proteins(proteins, prefix, dir_path):
    """Write `proteins` to a new timestamped text file inside `dir_path`.

    Args:
        proteins: mapping AGI -> (meta, location, match); the location is not
            written.
        prefix: leading part of the file name.
        dir_path: directory that receives the file.

    The file is named "<prefix>-<YYYY-MM-DDTHH-MM>.txt" and holds one
    tab-separated row (AGI, match, meta) per protein, joined with "\n" and
    without a trailing newline.

    Raises:
        FileExistsError: if the target file already exists, because it is
            opened in exclusive-creation mode ("x").
    """
    rows = []
    for agi, (meta, _location, match) in proteins.items():
        rows.append("\t".join((agi, match, meta)))
    body = "\n".join(rows)

    stamp = datetime.now().strftime("%Y-%m-%dT%H-%M")
    file_name = prefix + "-" + stamp + ".txt"
    target = os.path.join(dir_path, file_name)

    with open(target, "x") as out:
        out.write(body)

def filter_proteins(re, searched_location):
    """Select proteins from the module-level `araport` by motif and location.

    Args:
        re: object with a `search(sequence)` method (the caller in this
            notebook passes compiled patterns). Inside this function the name
            shadows the `re` module.
        searched_location: location value a protein must equal to be kept.

    Returns:
        dict AGI -> (meta, location, matched_text) for every protein whose
        sequence has a match and whose location equals `searched_location`.
        `matched_text` is the first match found (`group(0)`); entries keep the
        iteration order of `araport`.
    """
    selected = {}
    for agi, (meta, location, sequence) in araport.items():
        found = re.search(sequence)
        if found and location == searched_location:
            selected[agi] = (meta, location, found.group(0))

    return selected

In [168]:
# the location dataset: report how many entries the location table holds

print(
    "the number proteins in the location dataset:",
    len(araport_locations),
    sep="\n",
)

the number proteins in the location dataset:
48359


# KA-box

> Recently, by exploiting a random peptide display library, Xu et al.
identified a novel PCNA binding motif, **`K-A-(A/L/I)-(A/L/Q)-x-x-(L/V)`**,
termed the **KA-box**.

The proteins were identified by running

```
rg 'KA[ALI][ALQ]..[LV]' <araport11 dataset> -n | rg '\d+' -o > <output file>
```

In [148]:
# Number of distinct 7-letter strings matching KA[ALI][ALQ]..[LV]:
# one factor per motif position (K, A, 3 choices, 3 choices, two wildcards, 2 choices).
# NOTE(review): each wildcard is counted as 26 (presumably the letters A-Z) —
# confirm this is intended rather than the 20 standard amino acids.
1*1*3*3*26*26*2

12168

# APIM

> [...] **APIM** was defined as **`[KR]-[FYW]-[LIVA]-[LIVA]-[KR]`**.

The proteins were identified by running

```
rg '[KR][FYW][LIVA][LIVA][KR]' <araport11 dataset> -n | rg '\d+' -o > <output file>
```

In [147]:
# Number of distinct 5-letter strings matching [KR][FYW][LIVA][LIVA][KR]:
# one factor per motif position (2, 3, 4, 4 and 2 choices).
2*3*4*4*2

192

# PIP-box

> This conserved PCNA-binding motif was termed the PCNA interaction
protein box (**PIP-box**) (26). An alignment of these binding motifs shows
that it consists of the sequence **`Q-x-x-(h)-x-x-(a)-(a)`** (where “h”
represents residues with moderately hydrophobic side chains, e.g., L,
I, M; “a” represents residues with highly hydrophobic, aromatic side
chains, e.g., F, Y; and “x” is any residue) (25, 26).

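> jeszcze raz przeszukać bazę pod kątem PIP-box i dodatkowo w pozycji h dodać (E oraz D)
>
> *(English: search the database once more for the PIP-box, additionally allowing E and D at position h.)*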


The proteins were identified by running

~~`rg 'Q..[LIM]..[FY][FY]' <araport11 dataset> -n | rg '\d+' -o > <output file>`~~ (superseded by the command below)

```
rg 'Q..[LIMED]..[FY][FY]' <araport11 dataset> -n | rg '\d+' -o > <output file>
```

In [156]:
# Number of distinct 8-letter strings matching Q..[LIMED]..[FY][FY]:
# one factor per motif position (Q, two wildcards, 5 choices, two wildcards, 2 and 2 choices).
# NOTE(review): each wildcard is counted as 26 (presumably the letters A-Z) —
# confirm this is intended rather than the 20 standard amino acids.
1*26*26*5*26*26*2*2

9139520

In [68]:
import re

# Compiled motif patterns; the expressions are the ones used in the rg
# commands quoted in the motif sections of this notebook.
kabox_re = re.compile(r"KA[ALI][ALQ]..[LV]")
apim_re = re.compile(r"[KR][FYW][LIVA][LIVA][KR]")
pipbox_re = re.compile(r"Q..[LIMED]..[FY][FY]")

# (output-file prefix, pattern) pairs, processed in this order.
prefixes_with_res = [
    ("kabox", kabox_re),
    ("apim", apim_re),
    ("pipbox", pipbox_re),
]

# For every motif and every location of interest, write the matching
# proteins to "output/<motif>-<location in lower case>-<timestamp>.txt".
for motif, motif_re in prefixes_with_res:
    for location in ("Nucleus", "Unknown"):
        prefix = f"{motif}-{location.lower()}"

        proteins = filter_proteins(motif_re, location)
        save_proteins(proteins, prefix, "output")