#### Prerequisites:
* Define schema (make figure) 
* Define structure (make figure)
* Define schema class in python (src/models/model_schemata.py)
* Convert schema + structure into model template (models/templates/p_model.pl)

#### Procedure:
* Load data
* Adjust model script for lfi accordingly
* Create evidence file (if I don't need different evidence files, move this step to the build-model notebook)
* LFI

Multiple instances, 1 pst, X enzymes: one sample per instance, multiple enzymes per instance. Test how many enzymes one instance can have, and how computing time depends on the number of `p::f` facts for 60 samples.

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import pickle

from src.models import model_schemata as schema
from src.models import build_model as build
from src.models import parameter_learning as lfi
from src.models import inference as inf
from src.visualization import visualize as viz


In [4]:
# Work from the project root so that the relative 'data/...' and 'models/...'
# paths used in this notebook resolve.
# The root can be overridden with the PHOSPHO_PI_ROOT environment variable; the
# default is the location used when the notebook was last run. A previously used
# alternative was:
# '/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_pi/'
PROJECT_ROOT = os.environ.get(
    'PHOSPHO_PI_ROOT',
    '/home/mhuebner/Desktop/bezzlab/research/projects/phospho_pi/',
)
os.chdir(PROJECT_ROOT)

#### Inference (I)

In [5]:
# Load the processed sub-network CSV tables into a dict keyed by table name.
data = {
    'e_ksea': pd.read_csv('data/processed/ebdt_data/sub_network_p/e_ksea.csv'),
    'p_fc': pd.read_csv('data/processed/ebdt_data/sub_network_p/p_fc.csv'),
}

In [6]:
# Path of the model file (.pl) that is passed to inf.infer_marginf for the
# inference runs in this notebook.
model_inference = 'models/ebdt_data/sub_network/p_model/p_model_exp5_inference_200i.pl'

Adding queries

In [7]:
# Generate the p_occupancy queries from the unique sample ids and phosphosites
# found in the p_fc table, then show how many queries were produced.
p_fc_table = data['p_fc']
sample_ids = p_fc_table['sample'].unique().tolist()
phosphosites = p_fc_table['phosphosite'].unique().tolist()
queries = inf.generate_queries(
    schema.POccupancyPredicate,
    sample_ids=sample_ids,
    phosphosites=phosphosites,
)
len(queries)

549

In [8]:
# Preview the first ten generated query statements.
queries[:10]

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).',
 'query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).',
 'query(p_occupancy("ATF1(S198)", "AZD5438", _)).',
 'query(p_occupancy("BCLAF1(Y284)", "AZD5438", _)).',
 'query(p_occupancy("HIPK2(Y361)", "AZD5438", _)).',
 'query(p_occupancy("PTTG1IP(Y174)", "AZD5438", _)).',
 'query(p_occupancy("PXN(Y118)", "AZD5438", _)).',
 'query(p_occupancy("PXN(Y88)", "AZD5438", _)).',
 'query(p_occupancy("RBM39(Y95)", "AZD5438", _)).',
 'query(p_occupancy("ABI1(Y213)", "FRAX486", _)).']

In [25]:
# Run inf.infer_marginf over every query without passing any evidence.
inference_output = 'models/ebdt_data/sub_network/p_model/inference_log_5.txt'
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output)
# Persist the predictions. The `with` block closes the file as soon as the dump
# finishes; the previous bare open() call never closed its handle explicitly.
with open('models/ebdt_data/sub_network/p_model/predictions_marginf_noev_exp5_200i.pkl', 'wb') as predictions_file:
    pickle.dump(inf_data_dict, predictions_file)

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).']
1/549
Inferring data (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0102s
[INFO] Cycle breaking: 0.0010s
[INFO] Clark's completion: 0.0001s
[INFO] DSharp compilation: 0.0182s
[INFO] Total time: 1.7056s
Finished inference (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).']
2/549
Inferring data (['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0107s
[INFO] Cycle breaking: 0.0009s
[INFO] Clark's completion

Adding evidence (phosphosite only)

In [9]:
# Map the p_fc table onto its ProbLog predicate class
# (columns: 'phosphosite', 'sample', 'value').
p_fc_predicate = schema.PFoldChangePredicate()
p_fc_predicate.add_data(data['p_fc'], 'phosphosite', 'sample', 'value')
predicates = {'p_fc': p_fc_predicate}

In [43]:
# Collect evidence statements for every registered predicate into one flat list
# (select='all'), then preview the first ten.
evidence = []
for predicate_name in predicates:
    evid_generator = build.ProblogStatementGenerator(predicates[predicate_name])
    evidence.extend(evid_generator.generate_facts(build.EvidenceTemplate, select='all'))
evidence[:10]

['evidence(p_fc("ABI1(Y213)", "AZD5438", inc)).',
 'evidence(p_fc("ANXA2P2(Y24)", "AZD5438", dec)).',
 'evidence(p_fc("ATF1(S198)", "AZD5438", dec)).',
 'evidence(p_fc("BCLAF1(Y284)", "AZD5438", inc)).',
 'evidence(p_fc("HIPK2(Y361)", "AZD5438", dec)).',
 'evidence(p_fc("PTTG1IP(Y174)", "AZD5438", dec)).',
 'evidence(p_fc("PXN(Y118)", "AZD5438", dec)).',
 'evidence(p_fc("PXN(Y88)", "AZD5438", inc)).',
 'evidence(p_fc("RBM39(Y95)", "AZD5438", dec)).',
 'evidence(p_fc("ABI1(Y213)", "FRAX486", dec)).']

In [44]:
# Run inf.infer_marginf over every query, passing the flat `evidence` list.
inference_output = 'models/ebdt_data/sub_network/p_model/inference_log_5.txt'
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output, evidence=evidence)
# Persist the predictions. The `with` block closes the file as soon as the dump
# finishes; the previous bare open() call never closed its handle explicitly.
with open('models/ebdt_data/sub_network/p_model/predictions_marginf_pev_exp5_200i.pkl', 'wb') as predictions_file:
    pickle.dump(inf_data_dict, predictions_file)

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).']
1/549
Inferring data (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0168s
[INFO] Cycle breaking: 0.0019s
[INFO] Clark's completion: 0.0001s
[INFO] DSharp compilation: 0.0171s
[INFO] Total time: 1.7383s
Finished inference (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).']
2/549
Inferring data (['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0149s
[INFO] Cycle breaking: 0.0017s
[INFO] Clark's completion

Adding evidence (phosphosite and all enzymes)

In [10]:
# Map the p_fc and e_ksea tables onto their ProbLog predicate classes.
p_fc_predicate = schema.PFoldChangePredicate()
p_fc_predicate.add_data(data['p_fc'], 'phosphosite', 'sample', 'value')
e_ksea_predicate = schema.EKseaPredicate()
e_ksea_predicate.add_data(data['e_ksea'], 'enzyme', 'sample', 'value')
predicates = {'p_fc': p_fc_predicate, 'e_ksea': e_ksea_predicate}

In [27]:
# Build evidence_dict[sample][phosphosite]: the e_ksea evidence selected for the
# sample, followed by the p_fc evidence selected for that (sample, phosphosite).
#
# The per-sample enzyme evidence is kept in its own name (`enzyme_evidence`).
# It used to be stored in `evidence`, which silently overwrote the flat evidence
# list consumed by the "phosphosite only" inference cell whenever cells were
# re-run out of order.
evidence_dict = {}
for s in sample_ids:
    evidence_dict[s] = {}
    e_ksea_generator = build.ProblogStatementGenerator(predicates['e_ksea'])
    enzyme_evidence = [] + e_ksea_generator.generate_facts(build.EvidenceTemplate, select=[s])
    for p in phosphosites:
        p_fc_generator = build.ProblogStatementGenerator(predicates['p_fc'])
        site_evidence = p_fc_generator.generate_facts(build.EvidenceTemplate, select=[s, p])
        # Concatenation creates a new list, so each entry is independent.
        evidence_dict[s][p] = enzyme_evidence + site_evidence

In [28]:
# Flatten evidence_dict into a list of evidence lists, iterating samples first
# and phosphosites second (dict insertion order), then preview the first ten.
evidence_list = [
    evidence_dict[sample_key][site_key]
    for sample_key in evidence_dict.keys()
    for site_key in evidence_dict[sample_key]
]
evidence_list[:10]

[['evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(e_ksea("FYN", "AZD5438", inc)).',
  'evidence(e_ksea("HIPK2", "AZD5438", dec)).',
  'evidence(e_ksea("PTK2", "AZD5438", dec)).',
  'evidence(e_ksea("PTK6", "AZD5438", inc)).',
  'evidence(e_ksea("PTPRG", "AZD5438", inc)).',
  'evidence(e_ksea("PTPRR", "AZD5438", inc)).',
  'evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(p_fc("ABI1(Y213)", "AZD5438", inc)).'],
 ['evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(e_ksea("FYN", "AZD5438", inc)).',
  'evidence(e_ksea("HIPK2", "AZD5438", dec)).',
  'evidence(e_ksea("PTK2", "AZD5438", dec)).',
  'evidence(e_ksea("PTK6", "AZD5438", inc)).',
  'evidence(e_ksea("PTPRG", "AZD5438", inc)).',
  'evidence(e_ksea("PTPRR", "AZD5438", inc)).',
  'evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(p_fc("ANXA2P2(Y24)", "AZD5438", dec)).'],
 ['evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(e_ksea("FYN", "AZD5438", inc)).',
  'evidence(e_ksea("HIPK2", "AZD5438", dec)).',

In [29]:
# Run inf.infer_marginf over every query, passing `evidence_list`
# (one evidence list per entry of the flattened evidence_dict).
inference_output = 'models/ebdt_data/sub_network/p_model/inference_log_5.txt'
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output, evidence=evidence_list)
# Persist the predictions. The `with` block closes the file as soon as the dump
# finishes; the previous bare open() call never closed its handle explicitly.
with open('models/ebdt_data/sub_network/p_model/predictions_marginf_allepev_exp5_200i.pkl', 'wb') as predictions_file:
    pickle.dump(inf_data_dict, predictions_file)

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).']
1/549
Inferring data (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0001s
[INFO] Grounding: 0.0529s
[INFO] Cycle breaking: 0.0057s
[INFO] Clark's completion: 0.0005s
[INFO] DSharp compilation: 0.0311s
[INFO] Total time: 1.6793s
Finished inference (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).']
2/549
Inferring data (['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0001s
[INFO] Grounding: 0.0381s
[INFO] Cycle breaking: 0.0031s
[INFO] Clark's completion

Adding evidence (phosphosite and relevant enzymes)

In [11]:
# Load the es_interaction table; the following cells filter it by 'phosphosite'
# to obtain the 'enzyme' values associated with each phosphosite.
data['es_interaction'] = pd.read_csv('data/processed/ebdt_data/sub_network_p/es_interaction.csv')

In [38]:
# Build evidence_dict[sample][phosphosite]: e_ksea evidence for the enzymes that
# data['es_interaction'] lists for the phosphosite, followed by the p_fc evidence
# selected for that (sample, phosphosite).
interactions = data['es_interaction']
evidence_dict = {}
for s in sample_ids:
    per_site = evidence_dict[s] = {}
    for p in phosphosites:
        linked = interactions['phosphosite'] == p
        enzymes = interactions['enzyme'][linked].tolist()
        evid_generator = build.ProblogStatementGenerator(predicates['e_ksea'])
        evidence_e = [
            fact
            for e in enzymes
            for fact in evid_generator.generate_facts(build.EvidenceTemplate, select=[s, e])
        ]
        evid_generator = build.ProblogStatementGenerator(predicates['p_fc'])
        evidence_p = evid_generator.generate_facts(build.EvidenceTemplate, select=[s, p])
        per_site[p] = evidence_e + evidence_p

In [39]:
# Flatten evidence_dict into a list of evidence lists, iterating samples first
# and phosphosites second (dict insertion order), then preview the first ten.
evidence_list = [
    evidence_dict[sample_key][site_key]
    for sample_key in evidence_dict.keys()
    for site_key in evidence_dict[sample_key]
]
evidence_list[:10]

[['evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(p_fc("ABI1(Y213)", "AZD5438", inc)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(p_fc("ANXA2P2(Y24)", "AZD5438", dec)).'],
 ['evidence(e_ksea("HIPK2", "AZD5438", dec)).',
  'evidence(p_fc("ATF1(S198)", "AZD5438", dec)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(p_fc("BCLAF1(Y284)", "AZD5438", inc)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(e_ksea("HIPK2", "AZD5438", dec)).',
  'evidence(p_fc("HIPK2(Y361)", "AZD5438", dec)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(p_fc("PTTG1IP(Y174)", "AZD5438", dec)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(e_ksea("PTK2", "AZD5438", dec)).',
  'evidence(e_ksea("PTPRG", "AZD5438", inc)).',
  'evidence(e_ksea("FYN", "AZD5438", inc)).',
  'evidence(e_ksea("PTK6", "AZD5438", inc)).',
  'evidence(p_fc("PXN(Y118)", "AZD5438", dec)).'],
 ['evidence(e_ksea("S

In [33]:
# Run inf.infer_marginf over every query, passing `evidence_list`
# (one evidence list per entry of the flattened evidence_dict).
inference_output = 'models/ebdt_data/sub_network/p_model/inference_log_5.txt'
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output, evidence=evidence_list)
# Persist the predictions. The `with` block closes the file as soon as the dump
# finishes; the previous bare open() call never closed its handle explicitly.
with open('models/ebdt_data/sub_network/p_model/predictions_marginf_relepev_exp5_200i.pkl', 'wb') as predictions_file:
    pickle.dump(inf_data_dict, predictions_file)

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).']
1/549
Inferring data (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0170s
[INFO] Cycle breaking: 0.0020s
[INFO] Clark's completion: 0.0001s
[INFO] DSharp compilation: 0.0182s
[INFO] Total time: 1.6506s
Finished inference (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).']
2/549
Inferring data (['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0182s
[INFO] Cycle breaking: 0.0020s
[INFO] Clark's completion

Adding evidence (relevant enzymes only)

In [40]:
# Build evidence_dict[sample][phosphosite]: only the e_ksea evidence for the
# enzymes that data['es_interaction'] lists for the phosphosite (no p_fc evidence).
interactions = data['es_interaction']
evidence_dict = {}
for s in sample_ids:
    per_site = evidence_dict[s] = {}
    for p in phosphosites:
        linked = interactions['phosphosite'] == p
        enzymes = interactions['enzyme'][linked].tolist()
        evid_generator = build.ProblogStatementGenerator(predicates['e_ksea'])
        per_site[p] = [
            fact
            for e in enzymes
            for fact in evid_generator.generate_facts(build.EvidenceTemplate, select=[s, e])
        ]

In [41]:
# Flatten evidence_dict into a list of evidence lists, iterating samples first
# and phosphosites second (dict insertion order), then preview the first ten.
evidence_list = [
    evidence_dict[sample_key][site_key]
    for sample_key in evidence_dict.keys()
    for site_key in evidence_dict[sample_key]
]
evidence_list[:10]

[['evidence(e_ksea("ABL1", "AZD5438", inc)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).'],
 ['evidence(e_ksea("HIPK2", "AZD5438", dec)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(e_ksea("HIPK2", "AZD5438", dec)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(e_ksea("PTK2", "AZD5438", dec)).',
  'evidence(e_ksea("PTPRG", "AZD5438", inc)).',
  'evidence(e_ksea("FYN", "AZD5438", inc)).',
  'evidence(e_ksea("PTK6", "AZD5438", inc)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(e_ksea("PTPRR", "AZD5438", inc)).'],
 ['evidence(e_ksea("ABL1", "AZD5438", inc)).'],
 ['evidence(e_ksea("ABL1", "FRAX486", dec)).']]

In [36]:
# Run inf.infer_marginf over every query, passing `evidence_list`
# (one evidence list per entry of the flattened evidence_dict).
inference_output = 'models/ebdt_data/sub_network/p_model/inference_log_5.txt'
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output, evidence=evidence_list)
# Persist the predictions. The `with` block closes the file as soon as the dump
# finishes; the previous bare open() call never closed its handle explicitly.
with open('models/ebdt_data/sub_network/p_model/predictions_marginf_releev_exp5_200i.pkl', 'wb') as predictions_file:
    pickle.dump(inf_data_dict, predictions_file)

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).']
1/549
Inferring data (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0122s
[INFO] Cycle breaking: 0.0012s
[INFO] Clark's completion: 0.0001s
[INFO] DSharp compilation: 0.0218s
[INFO] Total time: 1.7004s
Finished inference (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).']
2/549
Inferring data (['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_3000i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0115s
[INFO] Cycle breaking: 0.0014s
[INFO] Clark's completion

Adding evidence (all phosphosite and relevant enzymes)

In [12]:
# Build evidence_dict[sample][phosphosite]: e_ksea evidence for the enzymes that
# data['es_interaction'] lists for the phosphosite, followed by the p_fc evidence
# selected for the whole sample (select=[s], not restricted to one phosphosite).
interactions = data['es_interaction']
evidence_dict = {}
for s in sample_ids:
    per_site = evidence_dict[s] = {}
    for p in phosphosites:
        linked = interactions['phosphosite'] == p
        enzymes = interactions['enzyme'][linked].tolist()
        evid_generator = build.ProblogStatementGenerator(predicates['e_ksea'])
        evidence_e = [
            fact
            for e in enzymes
            for fact in evid_generator.generate_facts(build.EvidenceTemplate, select=[s, e])
        ]
        evid_generator = build.ProblogStatementGenerator(predicates['p_fc'])
        evidence_p = evid_generator.generate_facts(build.EvidenceTemplate, select=[s])
        per_site[p] = evidence_e + evidence_p

In [13]:
# Flatten evidence_dict into a list of evidence lists, iterating samples first
# and phosphosites second (dict insertion order), then preview the first ten.
evidence_list = [
    evidence_dict[sample_key][site_key]
    for sample_key in evidence_dict.keys()
    for site_key in evidence_dict[sample_key]
]
evidence_list[:10]

[['evidence(e_ksea("ABL1", "AZD5438", inc)).',
  'evidence(p_fc("ABI1(Y213)", "AZD5438", inc)).',
  'evidence(p_fc("ANXA2P2(Y24)", "AZD5438", dec)).',
  'evidence(p_fc("ATF1(S198)", "AZD5438", dec)).',
  'evidence(p_fc("BCLAF1(Y284)", "AZD5438", inc)).',
  'evidence(p_fc("HIPK2(Y361)", "AZD5438", dec)).',
  'evidence(p_fc("PTTG1IP(Y174)", "AZD5438", dec)).',
  'evidence(p_fc("PXN(Y118)", "AZD5438", dec)).',
  'evidence(p_fc("PXN(Y88)", "AZD5438", inc)).',
  'evidence(p_fc("RBM39(Y95)", "AZD5438", dec)).'],
 ['evidence(e_ksea("SRC", "AZD5438", inc)).',
  'evidence(p_fc("ABI1(Y213)", "AZD5438", inc)).',
  'evidence(p_fc("ANXA2P2(Y24)", "AZD5438", dec)).',
  'evidence(p_fc("ATF1(S198)", "AZD5438", dec)).',
  'evidence(p_fc("BCLAF1(Y284)", "AZD5438", inc)).',
  'evidence(p_fc("HIPK2(Y361)", "AZD5438", dec)).',
  'evidence(p_fc("PTTG1IP(Y174)", "AZD5438", dec)).',
  'evidence(p_fc("PXN(Y118)", "AZD5438", dec)).',
  'evidence(p_fc("PXN(Y88)", "AZD5438", inc)).',
  'evidence(p_fc("RBM39(Y95)"

In [21]:
# Run inf.infer_marginf over every query, passing `evidence_list`
# (one evidence list per entry of the flattened evidence_dict).
inference_output = 'models/ebdt_data/sub_network/p_model/inference_log_5.txt'
inf_data_dict = inf.infer_marginf(model_inference, queries, inference_output, evidence=evidence_list)
# Persist the predictions. The `with` block closes the file as soon as the dump
# finishes; the previous bare open() call never closed its handle explicitly.
with open('models/ebdt_data/sub_network/p_model/predictions_marginf_releallpev_exp5_200i.pkl', 'wb') as predictions_file:
    pickle.dump(inf_data_dict, predictions_file)

['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).']
1/549
Inferring data (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_200i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0452s
[INFO] Cycle breaking: 0.0040s
[INFO] Clark's completion: 0.0004s
[INFO] DSharp compilation: 0.4487s
[INFO] Total time: 1.4265s
Finished inference (['query(p_occupancy("ABI1(Y213)", "AZD5438", _)).'])...
['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).']
2/549
Inferring data (['query(p_occupancy("ANXA2P2(Y24)", "AZD5438", _)).'])...
problog models/ebdt_data/sub_network/p_model/p_model_exp5_inference_200i.pl -o models/ebdt_data/sub_network/p_model/inference_log_5.txt -k ddnnf -v
[INFO] Output level: INFO
[INFO] Propagating evidence: 0.0000s
[INFO] Grounding: 0.0463s
[INFO] Cycle breaking: 0.0039s
[INFO] Clark's completion: 