#### Prerequisites:
* Define schema (make figure) 
* Define structure (make figure)
* Define schema class in python (src/models/model_schemata.py)
* Convert schema + structure into model template (models/templates/p_model.pl)

#### Procedure:
* Load data
* Adjust model script for lfi accordingly
* Create evidence file (if I don't need different evidence files, move this step to the build-model notebook)
* LFI

Multiple instances: 1 pst and X enzymes, with one sample per instance and multiple enzymes per instance. Test how many enzymes one instance can have, and how computing time depends on the number (N) of `p::f` facts for 60 samples.

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all modules before each cell runs,
# so edits to the project's src.* modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import pickle
import random

from src.models import model_schemata as schema
from src.models import build_model as build
from src.models import parameter_learning as lfi
from src.models import inference as inf
from src.visualization import visualize as viz


In [3]:
# Switch to the project root so the relative 'data/...' and 'models/...' paths
# used in the cells below resolve.
# Set the PHOSPHO_PI_ROOT environment variable to run the notebook on another machine
# (e.g. '/home/mhuebner/Desktop/bezzlab/research/projects/phospho_pi/');
# the original local checkout is kept as the default.
PROJECT_ROOT = os.environ.get(
    'PHOSPHO_PI_ROOT',
    '/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_pi/',
)
os.chdir(PROJECT_ROOT)

#### Inference (I)

In [4]:
# Read the two processed CSV tables into a dict keyed by dataset name
data = {
    'e_ksea': pd.read_csv('data/processed/ebdt_data/sub_network_e/e_ksea.csv'),
    'p_fc': pd.read_csv('data/processed/ebdt_data/sub_network_e/p_fc.csv'),
}

In [5]:
# Sample names: sorted union over both datasets, so the seeded draw below is reproducible
samples = sorted(set(data['e_ksea']['sample']) | set(data['p_fc']['sample']))
# Seeded 80/20 split: draw the training samples without replacement, the rest is held out
random.seed(612)
train = random.sample(samples, int(len(samples)*0.8))
test = [name for name in samples if name not in train]
print(test)
# Restrict each dataset to the held-out samples
testing_data = {
    key: data[key][data[key]['sample'].isin(test)].reset_index(drop=True)
    for key in ('e_ksea', 'p_fc')
}

['AZD6738', 'Dasatinib', 'GO6983', 'GSK690693', 'KD025', 'KN62', 'MK2206', 'PD153035', 'PF4708671', 'PIK294', 'Tofacitinib', 'Ulixertinib', 'Vemurafenib']


In [6]:
# Path to the ProbLog model file (.pl) that is passed to inference below
model_inference = 'models/ebdt_data/sub_network/e_model/e_model_exp1_testing_100i.pl'

#### Inferring p_occupancy

Adding queries

In [7]:
# Distinct samples and enzymes present in the held-out e_ksea table
e_ksea_test = testing_data['e_ksea']
sample_ids = e_ksea_test['sample'].unique().tolist()
enzymes = e_ksea_test['enzyme'].unique().tolist()
# Build the e_activity query statements for these samples and enzymes
queries = inf.generate_queries(
    schema.EActivityPredicate,
    sample_ids=sample_ids,
    enzymes=enzymes,
)
len(queries)

78

In [8]:
# Preview the first ten generated query statements
queries[:10]

['query(e_activity("ABL1", "AZD6738", _)).',
 'query(e_activity("FYN", "AZD6738", _)).',
 'query(e_activity("HIPK2", "AZD6738", _)).',
 'query(e_activity("PTK2", "AZD6738", _)).',
 'query(e_activity("PTPRG", "AZD6738", _)).',
 'query(e_activity("SRC", "AZD6738", _)).',
 'query(e_activity("ABL1", "Dasatinib", _)).',
 'query(e_activity("FYN", "Dasatinib", _)).',
 'query(e_activity("HIPK2", "Dasatinib", _)).',
 'query(e_activity("PTK2", "Dasatinib", _)).']

Adding evidence (phosphosite and relevant enzymes)

In [9]:
# Load the p_regulates table; its 'protein' and 'phosphosite' columns are used below
# to find the phosphosites belonging to each enzyme.
data['p_regulates'] = pd.read_csv('data/processed/ebdt_data/sub_network_e/p_regulates.csv')

In [10]:
# Map the held-out tables onto their Problog predicate objects
predicates = {
    'p_fc': schema.PFoldChangePredicate(),
    'e_ksea': schema.EKseaPredicate(),
}
# Attach the data, naming the columns that hold the entity, the sample and the value
predicates['p_fc'].add_data(testing_data['p_fc'], 'phosphosite', 'sample', 'value')
predicates['e_ksea'].add_data(testing_data['e_ksea'], 'enzyme', 'sample', 'value')

In [11]:
# The phosphosites listed for an enzyme in p_regulates do not depend on the sample,
# so filter the table once per enzyme instead of once per (sample, enzyme) pair.
p_regulates = data['p_regulates']
phosphosites_by_enzyme = {
    enzyme: p_regulates['phosphosite'][p_regulates['protein'] == enzyme].tolist()
    for enzyme in enzymes
}

# evidence_dict[sample][enzyme] -> list of evidence statements for that pair:
# the p_fc facts of the enzyme's phosphosites followed by the enzyme's e_ksea fact.
evidence_dict = {}
for s in sample_ids:
    evidence_dict[s] = {}
    for e in enzymes:
        phosphosites = phosphosites_by_enzyme[e]
        # Generators are re-created per pair, as in the original cell
        # (NOTE(review): could be hoisted if ProblogStatementGenerator is stateless — confirm)
        evid_generator = build.ProblogStatementGenerator(predicates['p_fc'])
        evidence_p = []
        for p in phosphosites:
            evidence_p.extend(evid_generator.generate_facts(build.EvidenceTemplate, select=[s, p]))
        evid_generator = build.ProblogStatementGenerator(predicates['e_ksea'])
        evidence_e = evid_generator.generate_facts(build.EvidenceTemplate, select=[s, e])
        evidence_dict[s][e] = evidence_p + evidence_e

In [12]:
# Flatten the nested {sample: {enzyme: statements}} dict into one list of
# per-(sample, enzyme) statement lists, keeping the dict's insertion order
evidence_list = [
    statements
    for per_enzyme in evidence_dict.values()
    for statements in per_enzyme.values()
]
evidence_list[:10]

[['evidence(p_fc("ABL1(S569)", "AZD6738", dec)).',
  'evidence(p_fc("ABL1(S718)", "AZD6738", dec)).',
  'evidence(p_fc("ABL1(T735)", "AZD6738", inc)).',
  'evidence(e_ksea("ABL1", "AZD6738", dec)).'],
 ['evidence(e_ksea("FYN", "AZD6738", inc)).'],
 ['evidence(p_fc("HIPK2(Y361)", "AZD6738", dec)).',
  'evidence(e_ksea("HIPK2", "AZD6738", inc)).'],
 ['evidence(p_fc("PTK2(S29)", "AZD6738", inc)).',
  'evidence(p_fc("PTK2(S722)", "AZD6738", dec)).',
  'evidence(p_fc("PTK2(S910)", "AZD6738", dec)).',
  'evidence(p_fc("PTK2(S843)", "AZD6738", inc)).',
  'evidence(e_ksea("PTK2", "AZD6738", inc)).'],
 ['evidence(p_fc("PTPRG(S995)", "AZD6738", dec)).',
  'evidence(e_ksea("PTPRG", "AZD6738", inc)).'],
 ['evidence(p_fc("SRC(S17)", "AZD6738", dec)).',
  'evidence(p_fc("SRC(S75)", "AZD6738", inc)).',
  'evidence(e_ksea("SRC", "AZD6738", dec)).'],
 ['evidence(p_fc("ABL1(S569)", "Dasatinib", dec)).',
  'evidence(p_fc("ABL1(S718)", "Dasatinib", dec)).',
  'evidence(p_fc("ABL1(T735)", "Dasatinib", inc)

In [13]:
# File that receives the inference output (passed to problog via -o, see the log below)
inference_output = 'models/ebdt_data/sub_network/e_model/inference_log_1.txt'
# Run inference for the queries with the evidence lists built above
# (presumably queries[i] is paired with evidence_list[i] — verify against inf.infer_marginf)
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output, evidence=evidence_list)
# Persist the results; the with-block guarantees the file is flushed and closed
with open('models/ebdt_data/sub_network/e_model/testing_exp1_100i.pkl', 'wb') as pickle_file:
    pickle.dump(inf_data_dict, pickle_file)

['query(e_activity("ABL1", "AZD6738", _)).']
1/78
Inferring data (['query(e_activity("ABL1", "AZD6738", _)).'])...
problog models/ebdt_data/sub_network/e_model/e_model_exp1_testing_100i.pl -o models/ebdt_data/sub_network/e_model/inference_log_1.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0404s
[INFO] Cycle breaking: 0.0028s
[INFO] Clark's completion: 0.0002s
[INFO] DSharp compilation: 0.0383s
[INFO] Total time: 1.7761s
Finished inference (['query(e_activity("ABL1", "AZD6738", _)).'])...
['query(e_activity("FYN", "AZD6738", _)).']
2/78
Inferring data (['query(e_activity("FYN", "AZD6738", _)).'])...
problog models/ebdt_data/sub_network/e_model/e_model_exp1_testing_100i.pl -o models/ebdt_data/sub_network/e_model/inference_log_1.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0090s
[INFO] Cycle breaking: 0.0003s
[INFO] Clark's completion: 0.0001s
[INFO] DSharp compilation: 0.0117s
[INF