#### Prerequisites:
* Define schema (make figure) 
* Define structure (make figure)
* Define schema class in python (src/models/model_schemata.py)
* Convert schema + structure into model template (models/templates/p_model.pl)

#### Procedure:
* Load data
* Adjust model script for lfi accordingly
* Create evidence file (if I don't need different evidence files, move to build-model notebook)
* LFI

Multiple instances: 1 pst, X enzymes, one sample per instance, multiple enzymes per instance. Test how many enzymes one instance can have, and how computing time depends on N p::f for 60 samples.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import pickle

from src.models import model_schemata as schema
from src.models import build_model as build
from src.models import parameter_learning as lfi
from src.models import inference as inf
from src.visualization import visualize as viz


In [3]:
# Switch to the project root so that the relative data/model paths used in the
# cells below resolve. The location is machine-specific: set the environment
# variable PHOSPHO_PI_ROOT to override it; otherwise the original default is used.
# Alternative location used on another machine:
# os.chdir('/home/mhuebner/Desktop/bezzlab/research/projects/phospho_pi/')
os.chdir(os.environ.get(
    'PHOSPHO_PI_ROOT',
    '/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_pi/',
))

#### MPE (I)

In [4]:
# Load the processed CSV tables into a dict keyed by table name.
data = {
    'e_ksea': pd.read_csv('data/processed/ebdt_data/sub_network_n/e_ksea.csv'),
    'p_fc': pd.read_csv('data/processed/ebdt_data/sub_network_n/p_fc.csv'),
}

In [5]:
# Path of the ProbLog model program that is handed to the MPE inference call below
model_inference = 'models/ebdt_data/sub_network/n_model/n_model_exp1_mpe.pl'

Adding queries

In [6]:
# Unique samples and enzymes present in the KSEA table define the query grid.
ksea_table = data['e_ksea']
sample_ids = ksea_table['sample'].unique().tolist()
enzymes = ksea_table['enzyme'].unique().tolist()

# One e_activity query statement per generated (enzyme, sample) combination.
queries = inf.generate_queries(
    schema.EActivityPredicate,
    sample_ids=sample_ids,
    enzymes=enzymes,
)
len(queries)

488

In [7]:
# Group the generated query statements per sample: query_list holds one
# sub-list for each entry of sample_ids, in the same order.
# The sample id is matched as a complete quoted argument ("<id>") instead of a
# bare substring, so an id that happens to be contained in another sample id or
# in an enzyme name cannot pull in unrelated queries.
query_list = [
    [q for q in queries if f'"{s}"' in q]
    for s in sample_ids
]
query_list[:2]

[['query(e_activity("ABL1", "AC220", _)).',
  'query(e_activity("FYN", "AC220", _)).',
  'query(e_activity("HIPK2", "AC220", _)).',
  'query(e_activity("PTK2", "AC220", _)).',
  'query(e_activity("PTK6", "AC220", _)).',
  'query(e_activity("PTPRG", "AC220", _)).',
  'query(e_activity("PTPRR", "AC220", _)).',
  'query(e_activity("SRC", "AC220", _)).'],
 ['query(e_activity("ABL1", "AT13148", _)).',
  'query(e_activity("FYN", "AT13148", _)).',
  'query(e_activity("HIPK2", "AT13148", _)).',
  'query(e_activity("PTK2", "AT13148", _)).',
  'query(e_activity("PTK6", "AT13148", _)).',
  'query(e_activity("PTPRG", "AT13148", _)).',
  'query(e_activity("PTPRR", "AT13148", _)).',
  'query(e_activity("SRC", "AT13148", _)).']]

Adding evidence (all phosphosites and relevant enzymes)

In [8]:
# Wrap each data table in its ProbLog predicate schema object.
p_fc_predicate = schema.PFoldChangePredicate()
p_fc_predicate.add_data(data['p_fc'], 'phosphosite', 'sample', 'value')

e_ksea_predicate = schema.EKseaPredicate()
e_ksea_predicate.add_data(data['e_ksea'], 'enzyme', 'sample', 'value')

# Lookup of predicate objects by table name.
predicates = {
    'p_fc': p_fc_predicate,
    'e_ksea': e_ksea_predicate,
}

In [9]:
# For every sample, generate the evidence statements from the phosphosite
# fold-change predicate (p_fc), selecting only that sample.
# Enzyme (e_ksea) evidence is currently left out; only p_fc evidence is stored.
evidence_dict = {
    sample: build.ProblogStatementGenerator(predicates['p_fc']).generate_facts(
        build.EvidenceTemplate, select=[sample]
    )
    for sample in sample_ids
}

In [10]:
# Flatten evidence_dict into a list of per-sample evidence lists,
# keeping the dict's insertion order.
evidence_list = list(evidence_dict.values())
evidence_list[:2]

[['evidence(p_fc("ABI1(Y213)", "AC220", dec)).',
  'evidence(p_fc("ABL1(S569)", "AC220", inc)).',
  'evidence(p_fc("ABL1(S718)", "AC220", inc)).',
  'evidence(p_fc("ABL1(T735)", "AC220", inc)).',
  'evidence(p_fc("ANXA2P2(Y24)", "AC220", dec)).',
  'evidence(p_fc("ATF1(S198)", "AC220", dec)).',
  'evidence(p_fc("BCLAF1(Y284)", "AC220", inc)).',
  'evidence(p_fc("HIPK2(Y361)", "AC220", inc)).',
  'evidence(p_fc("PTK2(S29)", "AC220", dec)).',
  'evidence(p_fc("PTK2(S722)", "AC220", dec)).',
  'evidence(p_fc("PTK2(S910)", "AC220", inc)).',
  'evidence(p_fc("PTPRG(S995)", "AC220", inc)).',
  'evidence(p_fc("PTTG1IP(Y174)", "AC220", inc)).',
  'evidence(p_fc("PXN(Y118)", "AC220", dec)).',
  'evidence(p_fc("PXN(Y88)", "AC220", inc)).',
  'evidence(p_fc("RBM39(Y95)", "AC220", inc)).',
  'evidence(p_fc("SRC(S17)", "AC220", dec)).',
  'evidence(p_fc("SRC(S75)", "AC220", dec)).',
  'evidence(p_fc("PTK2(S843)", "AC220", dec)).'],
 ['evidence(p_fc("ABI1(Y213)", "AT13148", dec)).',
  'evidence(p_fc

In [11]:
# Restrict this run to a single query: the first query of the first sample,
# wrapped in a list.
query = [query_list[0][0]]
query

['query(e_activity("ABL1", "AC220", _)).']

In [12]:
# Run MPE inference for the selected query on the model program; the
# inference output/log file is written to inference_output.
inference_output = 'models/ebdt_data/sub_network/n_model/inference_log.txt'
# NOTE(review): evidence=[] passes no evidence statements, so the evidence
# prepared in evidence_list is not used by this call — confirm this is intended.
inf_data_dict = inf.infer_mpe(model_inference, query, inference_output, evidence=[])

# Persist the predictions. The context manager guarantees the file handle is
# flushed and closed even if pickling fails.
with open('models/ebdt_data/sub_network/n_model/predictions_mpe_allpev_exp5.pkl', 'wb') as pkl_file:
    pickle.dump(inf_data_dict, pkl_file)

['query(e_activity("ABL1", "AC220", _)).']
Inferring data (['query(e_activity("ABL1", "AC220", _)).'])...
problog mpe models/ebdt_data/sub_network/n_model/n_model_exp1_mpe.pl -o models/ebdt_data/sub_network/n_model/inference_log.txt -v
[INFO] Output level: INFO
[INFO] Ground program size: 155
[INFO] Clark's completion: 0.0007s
[INFO] CNF size: 485
[INFO] Transform input: 0.0021s


In [11]:
# Display the inferred e_activity table from the inference results.
# NOTE(review): this cell's execution count (11) is lower than that of the
# inference cell (12), so the output shown may stem from an earlier run —
# re-run the notebook top to bottom to confirm.
inf_data_dict['e_activity']

Unnamed: 0,enzyme,sample,value
0,ABL1,AC220,inc
1,ABL1,AT13148,inc
2,ABL1,AZ20,inc
3,ABL1,AZD1480,inc
4,ABL1,AZD3759,dec
...,...,...,...
483,SRC,Torin,inc
484,SRC,Trametinib,inc
485,SRC,U73122,inc
486,SRC,Ulixertinib,inc
