#### Prerequisites:
* Define schema (make figure) 
* Define structure (make figure)
* Define schema class in python (src/models/model_schemata.py)
* Convert schema + structure into model template (models/templates/p_model.pl)

#### Experiment settings:
* Choose network (skeleton)
* Create new folder (models/synthetic_data/toy_network_mini/p_model/)
* Move model template to folder
* Define sampling strategy (how many sampling steps? sampling or inference?)

#### Procedure:
* Add entities and relationships to model script
* Adjust model script for sampling accordingly
* Sample probabilistic attributes (save)
* Adjust model script for learning remaining cpds with mpe
* Add sampled attributes to model script as facts
* Learn remaining cpds with mpe (save)
* Sample probabilities for uncertain evidence (save)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Importing libraries
import pandas as pd
import os
import pickle

from src.models import build_model as build
from src.models import model_schemata as schema
from src.models import inference as inf

from src.data import prep_data

In [4]:
# Move to the project root so that the relative 'models/...' and 'data/...'
# paths used in the cells below resolve. The location is machine-specific:
# set the PHOSPHO_PI_ROOT environment variable to override the default, e.g.
# '/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_pi/'
project_root = os.environ.get(
    'PHOSPHO_PI_ROOT',
    '/home/mhuebner/Desktop/bezzlab/research/projects/phospho_pi/',
)
os.chdir(project_root)

Building skeleton

In [5]:
# Skeleton: the entities and the enzyme-substrate edges between them.
# Every enzyme acts on the single phosphosite 'p0'.
enzymes = ['e1', 'e2', 'e3']
phosphosites = ['p0']
es_interaction = pd.DataFrame(
    [(enzyme, 'p0') for enzyme in enzymes],
    columns=['enzyme', 'substrate'],
)
# Fixed attributes: the function assigned to each enzyme, in enzyme order
e_function = pd.DataFrame(
    list(zip(enzymes, ['kinase', 'kinase', 'phosphatase'])),
    columns=['enzyme', 'function'],
)

In [7]:
# Wrap each skeleton table in its Problog predicate schema, keyed by predicate name
predicates = {
    'enzyme': schema.EnzymePredicate(enzyme_list=enzymes),
    'phosphosite': schema.PhosphositePredicate(phosphosite_list=phosphosites),
    'es_interaction': schema.ESInteractionPredicate(
        dataframe=es_interaction, enzyme_col='enzyme', phosphosite_col='substrate'),
    'e_function': schema.EFunctionPredicate(
        dataframe=e_function, enzyme_col='enzyme', function_col='function'),
}

# Adding entities, relationships, and fixed attributes to the template model
model_skeleton = 'models/synthetic_data/toy_network_mini/p_model/p_model_skeleton.pl'

for predicate, predicate_schema in predicates.items():
    # Generate the Problog facts for this predicate ...
    fact_generator = build.ProblogStatementGenerator(predicate_schema)
    problog_facts = fact_generator.generate_facts(build.FactTemplate)
    # ... and insert them into the Problog file at the '%% <predicate name>' location
    build.insert_statements(
        model=model_skeleton,
        statements=problog_facts,
        location='%% {}'.format(predicate),
    )

Sampling

In [6]:
# Sampling probabilistic attributes
import subprocess

model_sampling = 'models/synthetic_data/toy_network_mini/p_model/data_generation/p_model_sampling.pl'
sampling_output = 'data/processed/synthetic_data/toy_network_mini/p_model/sampled_data.txt'

# Run the ProbLog CLI with an explicit argument list (no shell parsing).
# check=True raises CalledProcessError on a non-zero exit status, so a failed
# run cannot silently fall through to parsing a stale or missing output file.
# -N 100 requests 100 samples; -s 612 is presumably the random seed
# (confirm against `problog sample --help`).
subprocess.run(
    ['problog', 'sample', model_sampling,
     '-N', '100', '-s', '612', '-o', sampling_output],
    check=True,
)

# Parse the sampled facts into a dataframe of enzyme activities and display it
e_activity = build.sampling_file_to_df(sampling_output, schema.EActivityPredicate, build.FactTemplate)
e_activity

(True, <generator object sample at 0x7f8b494575e0>)


Unnamed: 0,enzyme,sample,value
0,e1,s1,inc
1,e1,s10,dec
2,e1,s100,inc
3,e1,s11,base
4,e1,s12,dec
...,...,...,...
295,e3,s95,inc
296,e3,s96,inc
297,e3,s97,inc
298,e3,s98,dec


Inference (MPE)

In [9]:
# Problog model used for the MPE inference step: the sampled e_activity facts
# are inserted into it and the p_occupancy queries are run against it.
model_inference = 'models/synthetic_data/toy_network_mini/p_model/data_generation/p_model_inference.pl'

In [16]:
# Wrap the sampled enzyme activities in their Problog predicate schema
activity_predicate = schema.EActivityPredicate()
predicates['e_activity'] = activity_predicate
activity_predicate.add_data(
    e_activity,
    enzyme_col='enzyme',
    sample_col='sample',
    value_col='value',
)

# Adding the sampled e_activity facts to the inference model
fact_generator = build.ProblogStatementGenerator(activity_predicate)
problog_facts = fact_generator.generate_facts(build.FactTemplate)  # generate Problog facts
build.insert_statements(  # insert into Problog file
    model=model_inference,
    statements=problog_facts,
    location='%% {}'.format('e_activity'),
)

In [10]:
# Build the p_occupancy queries for the inference step, one per sample id
inference_output = 'data/processed/synthetic_data/toy_network_mini/p_model/inferred_data.txt'
first_sample, last_sample = 1, 100
sample_ids = [f's{i}' for i in range(first_sample, last_sample + 1)]  # 's1' .. 's100'
queries = inf.generate_queries(sample_ids, schema.POccupancyPredicate)
# Show the first ten queries as a sanity check
queries[:10]

['query(p_occupancy(_, s1, _)).',
 'query(p_occupancy(_, s2, _)).',
 'query(p_occupancy(_, s3, _)).',
 'query(p_occupancy(_, s4, _)).',
 'query(p_occupancy(_, s5, _)).',
 'query(p_occupancy(_, s6, _)).',
 'query(p_occupancy(_, s7, _)).',
 'query(p_occupancy(_, s8, _)).',
 'query(p_occupancy(_, s9, _)).',
 'query(p_occupancy(_, s10, _)).']

In [None]:
# Run MPE inference for all queries against the inference model
inf_data_dict = inf.infer_mpe(model_inference, queries, inference_output)

# Persist the result dictionary so it can be reloaded without re-running inference
inferred_pickle = 'data/processed/synthetic_data/toy_network_mini/p_model/inferred_data_dict.pkl'
with open(inferred_pickle, 'wb') as pickle_file:
    pickle.dump(inf_data_dict, pickle_file)

Sampling probabilities

In [42]:
# Reload the pickled inference results written by the inference cell
inferred_pickle = 'data/processed/synthetic_data/toy_network_mini/p_model/inferred_data_dict.pkl'
with open(inferred_pickle, 'rb') as pickle_file:
    inf_data_dict = pickle.load(pickle_file)


In [44]:
# Parameters handed to the probability sampler
params = {
    'mean': 0.9,
    'lower_bound': 0.5,
    'upper_bound': 1.0,
    'std_dev': 0.2,
}

# Calculate the 'p_dec', 'p_base', and 'p_inc' columns from the 'value' column
# of the inferred p_occupancy table and of the sampled e_activity table
value_col = 'value'
p_fc = prep_data.sample_ad_probabilities(inf_data_dict['p_occupancy'], value_col, params)
e_ksea = prep_data.sample_ad_probabilities(e_activity, value_col, params)
# Column-wise minima as a quick check of the result
p_fc.min()

phosphosite          p0
sample               s1
value              base
prob             0.5048
p_dec             0.001
p_base         0.011075
p_inc             0.001
dtype: object

In [45]:
# Save es_interaction, e_function, e_ksea, p_fc, e_activity and p_occupancy to CSV.
# The output directory is named once, and each table is written to
# '<output_dir>/<table name>.csv' without its index, in the order listed.
output_dir = 'data/processed/synthetic_data/toy_network_mini/p_model'
tables = {
    'es_interaction': es_interaction,
    'e_function': e_function,
    'e_ksea': e_ksea,
    'p_fc': p_fc,
    'e_activity': e_activity,
    'p_occupancy': inf_data_dict['p_occupancy'],
}
for table_name, table in tables.items():
    table.to_csv(f'{output_dir}/{table_name}.csv', index=False)