#### Prerequesites:
* Define schema (make figure) 
* Define structure (make figure)
* Define schema class in python (src/models/model_schemata.py)
* Convert schema + structure into model template (models/templates/p_model.pl)

#### Procedure:
* Load data
* Adjust model script for lfi accordingly
* Create evidence file (if I don't need different evidence files, move to builmodel notebook)
* LFI

Multiple instances, 1 pst, X enzymes, one sample per instance, multiple enzymes per instance, test how many enzymes can have in one instances, how computing time depends on N p::f for 60 samples

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import random

from src.models import model_schemata as schema
from src.models import build_model as build
from src.visualization import visualize as viz

In [3]:
# os.chdir('/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_pi/')
os.chdir('/home/mhuebner/Desktop/bezzlab/research/projects/phospho_pi/')

#### Training/Testing

In [4]:
# reading csv from files into dict
data = {}
data['e_ksea'] = pd.read_csv('data/processed/ebdt_data/run7/e_ksea.csv')
data['p_fc'] = pd.read_csv('data/processed/ebdt_data/run7/p_fc.csv')
data['es_interaction'] = pd.read_csv('data/processed/ebdt_data/run7/es_interaction_top7enz.csv')
data['e_ksea']

Unnamed: 0,enzyme,sample,tc,value,prob,p_dec,p_inc
0,AAK1,AC220,16,dec,0.855122,0.855122,0.001000
1,AAK1,AT13148,16,dec,0.857701,0.857701,0.001000
2,AAK1,AZ20,16,dec,0.915060,0.915060,0.001000
3,AAK1,AZD1480,16,inc,0.532053,0.001000,0.532053
4,AAK1,AZD3759,16,inc,0.508320,0.001000,0.508320
...,...,...,...,...,...,...,...
28543,ZAK,Torin,42,dec,0.950913,0.950913,0.001000
28544,ZAK,Trametinib,42,dec,0.694789,0.694789,0.001000
28545,ZAK,U73122,42,inc,0.605670,0.001000,0.605670
28546,ZAK,Ulixertinib,42,dec,0.793373,0.793373,0.001000


Splitting data into training/testing

In [5]:
# getting sample names (union of all samples in all datasets)
samples = list(set(data['e_ksea']['sample']).union(set(data['p_fc']['sample'])))
samples.sort()
# sample x% of samples randomly without replacement with seed
random.seed(612)
train = random.sample(samples, int(len(samples)*1))
print(train)
# filter data
training_data = {}
training_data['e_ksea'] = data['e_ksea'][data['e_ksea']['sample'].isin(train)].reset_index(drop=True)
training_data['p_fc'] = data['p_fc'][data['p_fc']['sample'].isin(train)].reset_index(drop=True)

['TAK715', 'TGX221', 'CHIR99021', 'CAL101', 'Ripasudil', 'KN93', 'LY2584702', 'LY2090314', 'Ku0063794', 'SP600125', 'Ribociclib', 'U73122', 'AZD5438', 'GDC0994', 'Edelfosine', 'PF3758309', 'Amuvatinib', 'AT13148', 'AZD3759', 'Go6976', 'JNJ', 'NU7441', 'GSK2334470', 'Ipatasertib', 'AZD5363', 'LY2835219', 'Torin', 'Bosutinib', 'H89', 'DNAPK', 'GDC0941', 'FRAX486', 'CX4945', 'HS173', 'Linsitinib', 'Dabrafenib', 'AZD1480', 'BX912', 'AZ20', 'TBCA', 'Selumetinib', 'GF109203X', 'AZD6482', 'AC220', 'PH797804', 'AZD8055', 'Trametinib', 'JNK', 'GSK690693', 'KD025', 'Vemurafenib', 'PF4708671', 'Tofacitinib', 'MK2206', 'GO6983', 'Dasatinib', 'AZD6738', 'PIK294', 'PD153035', 'KN62', 'Ulixertinib']


In [6]:
import multiprocessing
import concurrent.futures

# Define a function to process a single sample
def process_sample(sample, phosphosites, interactions, predicates, evidence_dict):
    evidence_dict[sample] = {}
    count = -1

    # Precompute data
    evid_generator_e = build.ProblogStatementGenerator(predicates['e_ksea'])
    evid_generator_p = build.ProblogStatementGenerator(predicates['p_fc'])

    for p in phosphosites:
        count += 1
        # Print count if divisible by 1000 to track progress
        if count % 100 == 0:
            print(f"{sample}: {count}")

        enzymes = interactions['enzyme'][interactions['substrate'] == p].tolist()

        evidence_e = []
        for e in enzymes:
            evidence_e.extend(evid_generator_e.generate_facts(build.EvidenceTemplate, select=[sample, e]))

        evidence_p = evid_generator_p.generate_facts(build.EvidenceTemplate, select=[sample, p])
        evidence_dict[sample][p] = evidence_e + evidence_p

    return evidence_dict

if __name__ == "__main__":
    num_processes = multiprocessing.cpu_count()  # Use all available CPU cores

    samples = list(set(training_data['e_ksea']['sample']) | set(training_data['p_fc']['sample']))
    phosphosites = list(set(training_data['p_fc']['phosphosite']))
    evidence_dict = {}

    # Create a ThreadPoolExecutor with the desired number of threads
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_processes) as executor:
        futures = []

        for sample in samples:
            # Precompute data
            # Mapping data to Problog predicates
            predicates = {}
            predicates['e_ksea'] = schema.EKseaPredicate()
            predicates['e_ksea'].add_data(training_data['e_ksea'][training_data['e_ksea']['sample'] == sample], 'enzyme', 'sample', 'value')
            predicates['p_fc'] = schema.PFoldChangePredicate()
            predicates['p_fc'].add_data(training_data['p_fc'][training_data['p_fc']['sample'] == sample], 'phosphosite', 'sample', 'value')
            future = executor.submit(process_sample, sample, phosphosites, data['es_interaction'], predicates, evidence_dict)
            futures.append(future)

        # Wait for all tasks to complete and retrieve their results
        for future in concurrent.futures.as_completed(futures):
            evidence_dict = future.result()

    # Now, evidence_dict contains the results of parallel processing using ThreadPoolExecutor

Selumetinib: 0
LY2090314: 0
Amuvatinib: 0
Amuvatinib: 100
LY2090314: 100
LY2090314: 200
Tofacitinib: 0
KN62: 0
GDC0994: 0
CHIR99021: 0
KN93: 0
SP600125: 0
Selumetinib: 100
AZD5438: 0
HS173: 0
PH797804: 0
AT13148: 0
GSK2334470: 0
Ribociclib: 0
Amuvatinib: 200
AZD6738: 0
Ku0063794: 0
LY2090314: 300KN62: 100

Tofacitinib: 100
GDC0994: 100
AC220: 0
DNAPK: 0
CHIR99021: 100
KN93: 100
SP600125: 100
GSK690693: 0
AZD5438: 100
Selumetinib: 200
HS173: 100
JNJ: 0
AT13148: 100
Ribociclib: 100
CX4945: 0
GSK2334470: 100
Amuvatinib: 300
Torin: 0
KN62: 200
AZD6738: 100
PH797804: 100
PIK294: 0
Tofacitinib: 200
SP600125: 200
GDC0994: 200
Ku0063794: 100
AC220: 100
CHIR99021: 200
LY2090314: 400
KN93: 200
HS173: 200
GSK690693: 100
AT13148: 200
Selumetinib: 300
AZD5438: 200
DNAPK: 100
CX4945: 100
Ribociclib: 200
JNJ: 100
GSK2334470: 200
Amuvatinib: 400
Torin: 100
PIK294: 100
GDC0994: 300
KN62: 300
Tofacitinib: 300
Ku0063794: 200
SP600125: 300
PH797804: 200
CHIR99021: 300
AZD6738: 200
LY2090314: 500
AC220: 20

In [8]:
# write evidence_dict to text file: loop over samples and separate by line of hyphens
with open('models/ebdt_data/p_model/evidence_run7.pl', 'w') as f:
    for s in evidence_dict:
        for p in evidence_dict[s]:
            for e in evidence_dict[s][p]:
                f.write(e + '\n')
            f.write('--------------------' + '\n')

LFI

In [14]:
DIR = 'models/ebdt_data/p_model/'
max_iter = 10
# learning from interpretation
cmd = f'problog lfi {DIR}p_model_lfi_run7.pl {DIR}evidence_run7.pl -O {DIR}p_model_run7_10i.pl --logger {DIR}log_run7-10i.txt -k ddnnf -v -n {max_iter}'
os.system(cmd)

In [9]:
DIR = 'models/ebdt_data/p_model/'
# learning from interpretation
cmd = f'problog ground {DIR}p_model_lfi_run7.pl -o {DIR}p_model_run7_ground.pl --format pl'
os.system(cmd)

(True, 't(0.33,"CDK10","BX912")::e_activity("CDK10","BX912",inc); t(0.33,"CDK10","BX912")::e_activity("CDK10","BX912",dec).\nt(0.33,"CDK13","BX912")::e_activity("CDK13","BX912",inc); t(0.33,"CDK13","BX912")::e_activity("CDK13","BX912",dec).\nt(0.33,"CDK16","BX912")::e_activity("CDK16","BX912",inc); t(0.33,"CDK16","BX912")::e_activity("CDK16","BX912",dec).\nt(0.33,"CDK18","BX912")::e_activity("CDK18","BX912",inc); t(0.33,"CDK18","BX912")::e_activity("CDK18","BX912",dec).\nt(0.33,"CDK20","BX912")::e_activity("CDK20","BX912",inc); t(0.33,"CDK20","BX912")::e_activity("CDK20","BX912",dec).\nt(0.33,"CDK8","BX912")::e_activity("CDK8","BX912",inc); t(0.33,"CDK8","BX912")::e_activity("CDK8","BX912",dec).\nt(0.33,"CDK9","BX912")::e_activity("CDK9","BX912",inc); t(0.33,"CDK9","BX912")::e_activity("CDK9","BX912",dec).\nt(0.33,"CTDSPL","BX912")::e_activity("CTDSPL","BX912",inc); t(0.33,"CTDSPL","BX912")::e_activity("CTDSPL","BX912",dec).\n0.001::occ_dec("RB1(S807)","BX912").\n0.001::occ_dec("RB1(S8

256