In [1]:
# Automatically reload imported modules before each cell is executed, so that
# edits to the project's `src` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Libraries
import pandas as pd
import os
import random

from src.data import synthesise_data
from src.models import build_model, train_model

In [3]:
# Project root that all relative paths in this notebook are resolved against.
# Set the PHOSPHO_PI_ROOT environment variable to run the notebook on another
# machine without editing this cell; the original location is the default.
PROJECT_ROOT = os.environ.get(
    'PHOSPHO_PI_ROOT',
    '/Users/magdalena/OneDrive - Queen Mary, University of London/bezzlab/research/projects/phospho_pi/',
)
os.chdir(PROJECT_ROOT)
# os.chdir('/home/mhuebner/Desktop/bezzlab/research/projects/phospho_pi/')

### Setting up model architecture

In [4]:
# ---------------------------------------------------------------------------
# Prior knowledge
# ---------------------------------------------------------------------------
# Enzymes and phosphosites in the model
enzymes = ['e1', 'e2', 'e3', 'e4']
phosphosites = ['p41', 'p51', 'p52', 'p61']

# Enzyme classes, one [enzyme, class] pair per enzyme
enz_classes = [
    ['e1', 'kinase'],
    ['e2', 'kinase'],
    ['e3', 'phosphatase'],
    ['e4', 'kinase'],
]

# Enzyme-substrate interactions, one [enzyme, phosphosite] pair per row
interactions = [
    ['e1', 'p41'],
    ['e2', 'p41'],
    ['e3', 'p41'],
    ['e4', 'p51'],
    ['e4', 'p52'],
    ['e4', 'p61'],
]

# ---------------------------------------------------------------------------
# Model and file locations (relative to the working directory)
# ---------------------------------------------------------------------------
MODEL = 'modelA'
PATH_TO_DATA = f'data/processed/synthetic_data/{MODEL}/'

# Sampling: template program and the model generated from it
sampling_template = f'models/templates/{MODEL}_sampling.pl'
sampling_model = f'models/synthetic_data/{MODEL}_sampling.pl'

# Inference: template program and the model generated from it
inference_template = f'models/templates/{MODEL}_inference.pl'
inference_model = f'models/synthetic_data/{MODEL}_inference.pl'

In [5]:
# Convert every prior-knowledge table into facts, keyed by predicate name.
# The key order (enzyme, phosphosite, enz_class, interaction) is preserved.
pk_dict = {
    predicate: build_model.df_to_facts(pd.DataFrame(rows), predicate)
    for predicate, rows in (
        ('enzyme', enzymes),
        ('phosphosite', phosphosites),
        ('enz_class', enz_classes),
        ('interaction', interactions),
    )
}

# Save the facts to files under PATH_TO_DATA
build_model.write_predicate_files(pk_dict, PATH_TO_DATA, suffix='')

# Add the prior knowledge to the sampling and inference templates,
# producing the corresponding model files
build_model.add_pk(sampling_template, sampling_model, pk_dict)
build_model.add_pk(inference_template, inference_model, pk_dict)

### Synthesising data (through sampling + inference)

In [6]:
# Settings
# What inference is run from; 'facts' is faster and more reliable
infer_from = 'facts'
# Number of samples for each synthetic experiment
experiments = [25, 50, 100, 250, 500, 1000]

Ideal data

In [None]:
# Ideal data: synthesise one dataset per sample size
for n_samples in experiments:
    data = synthesise_data.main(
        MODEL,
        n_samples,
        sampling_model=sampling_model,
        inference_model=inference_model,
        targets=['fold_change', 'occupancy'],
        infer_from=infer_from,
        suffix=f'_{n_samples}',
    )

Noisy data

In [7]:
# TODO: REDO!
# Settings for noisy data.
# Planned: four noisy datasets (occupancy always the same; noise in either
# activity (which enzymes?) or fold change or both; if both, either
# independent or correlated):
#   - 1 interaction wrong (mutate 80 of interaction)
#   - random interactions (mutate 80 of all interactions)
#   - data for enzyme/pst missing completely ('missing data')
#
# Each setting is [predicates, sample seeds, value seeds], with one seed of
# each kind per predicate.
#m1 = [['activity', 'occupancy', 'fold_change'], [0, 1, 2], [0, 1, 2]]
#m2 = [['activity'], [0], [0]]
m3 = [
    ['fold_change'],  # predicates
    [0],              # sample seeds
    [0],              # value seeds
]
#m4 = [['occupancy'], [0], [0]]
#m5 = [['activity', 'occupancy'], [0, 0], [0, 0]]
#m6 = [['fold_change', 'occupancy'], [0, 0], [0, 0]]
#m7 = [['activity', 'fold_change'], [0, 0], [0, 0]]

# Noise levels in percent (divided by 100 where they are used)
# frac = [0.1, 1, 5, 10, 20]
frac = [20]

# Active settings, keyed by the name used in the output paths
# mutate = {'m1': m1, 'm2': m2, 'm3': m3, 'm4': m4, 'm5': m5, 'm6': m6, 'm7': m7}
mutate = dict(m3=m3)

In [None]:
for n_samples in experiments:
    # Load the previously synthesised dataset for this sample size
    PATH_TO_DATA = f'data/processed/synthetic_data/{MODEL}/n{n_samples}/'
    data = synthesise_data.get_data(PATH_TO_DATA)
    # Noisy data: one dataset per (noise level, mutation setting)
    for f in frac:
        for key, m in mutate.items():
            out_dir = f'{PATH_TO_DATA}mutate/{key}/mutate_{key}_{f}/'
            # Work on a copy so every setting starts from the loaded data
            # (presumably a dict of predicate -> facts; verify against get_data)
            data_mut = data.copy()
            for idx, predicate in enumerate(m[0]):
                data_mut[predicate] = synthesise_data.mutate_facts(
                    data_mut[predicate],
                    frac=f/100,
                    sample_seed=m[1][idx],
                    value_seed=m[2][idx],
                )
            # Write the noisy facts, then infer occupancy from them
            build_model.write_predicate_files(data_mut, path=out_dir)
            train_model.infer_occupancy(MODEL, out_dir, pk_dict)


In [None]:
# Noisy data (enzyme e2 only)
# m1 and m2 only exist as commented-out lines in the noisy-data settings cell,
# so they are defined here (with those same values); without this the cell
# raises a NameError on a fresh kernel.
# Each setting is [predicates, sample seeds, value seeds].
m1 = [['activity', 'occupancy', 'fold_change'], [0, 1, 2], [0, 1, 2]]
m2 = [['activity'], [0], [0]]
m8 = [['activity', 'occupancy', 'fold_change'], [0, 1, 1], [0, 1, 1]]
# NOTE: frac, mutate and PATH_TO_DATA are rebound here, so any cell executed
# after this one sees these values rather than the earlier settings.
frac = [1, 20, 80]
mutate = {'m1': m1, 'm2': m2, 'm8': m8}
PATH_TO_DATA = f'data/processed/synthetic_data/{MODEL}/n1000/'
data = synthesise_data.get_data(PATH_TO_DATA)
# Noisy data: one dataset per (noise level, mutation setting), restricted to e2
for f in frac:
    for key, m in mutate.items():
        # Work on a copy so every setting starts from the loaded data
        data_mut = data.copy()
        for idx, predicate in enumerate(m[0]):
            data_mut[predicate] = synthesise_data.mutate_facts(
                data_mut[predicate],
                frac=f/100,
                sample_seed=m[1][idx],
                value_seed=m[2][idx],
                enzymes=['e2'],
            )
        build_model.write_predicate_files(data_mut, path=f'{PATH_TO_DATA}/mutate_e2_{key}_{f}/')

Incomplete data

In [None]:
# NOTE(review): `frac` and `mutate` are not set in this cell; it uses whatever
# values the most recently executed settings cell left behind - confirm that
# these are the intended ones.
for n_samples in experiments:
    # Load the previously synthesised dataset for this sample size
    PATH_TO_DATA = f'data/processed/synthetic_data/{MODEL}/n{n_samples}/'
    data = synthesise_data.get_data(PATH_TO_DATA)
    # Incomplete data: one dataset per (fraction, setting)
    for f in frac:
        for key, m in mutate.items():
            out_dir = f'{PATH_TO_DATA}/missing_{key}_{f}/'
            # Work on a copy so every setting starts from the loaded data
            data_miss = data.copy()
            for idx, predicate in enumerate(m[0]):
                data_miss[predicate] = synthesise_data.remove_facts(
                    data_miss[predicate],
                    frac=f/100,
                    sample_seed=m[1][idx],
                )
            build_model.write_predicate_files(data_miss, path=out_dir)

Probabilistic data - random

In [None]:
# Prefix every activity and fold_change fact of the m1/20 noisy dataset with a
# random probability (written as '<p>::<fact>') and save it as a new dataset.
# NOTE(review): the input is read from '<n1000>/mutate_m1_20/', whereas the
# noisy-data loop writes to '<n...>/mutate/<key>/mutate_<key>_<f>/' - confirm
# that this directory exists.
PATH_TO_DATA = f'data/processed/synthetic_data/{MODEL}/n1000/'
data = synthesise_data.get_data(f'{PATH_TO_DATA}/mutate_m1_20/')
# Dedicated, seeded generator: re-running the cell writes the same
# probabilities and the global `random` state is left untouched.
rng = random.Random(0)
for predicate, facts in data.items():
    if predicate in ['activity', 'fold_change']:
        for idx, fact in enumerate(facts):
            p = rng.uniform(0.5, 1)
            data[predicate][idx] = f'{p}::{fact}'
build_model.write_predicate_files(data, path=f'{PATH_TO_DATA}/mutate_m1_20_prand/')

Probabilistic data - selected

In [None]:
# Prefix every activity and fold_change fact of the m1/20 noisy dataset with a
# probability (written as '<p>::<fact>'): facts that are absent from the
# unmutated dataset get a lower probability than the remaining facts.
# NOTE(review): the input is read from '<n1000>/mutate_m1_20/', whereas the
# noisy-data loop writes to '<n...>/mutate/<key>/mutate_<key>_<f>/' - confirm
# that this directory exists.
PATH_TO_DATA = f'data/processed/synthetic_data/{MODEL}/n1000/'
data = build_model.dict_to_list(synthesise_data.get_data(f'{PATH_TO_DATA}/mutate_m1_20/'))
true_data = build_model.dict_to_list(synthesise_data.get_data(PATH_TO_DATA))
# Facts present in the noisy dataset but not in the unmutated one
err_facts = set(data) - set(true_data)
# Reload the noisy dataset in its per-predicate form for editing
data = synthesise_data.get_data(f'{PATH_TO_DATA}/mutate_m1_20/')
# Dedicated, seeded generator: re-running the cell writes the same
# probabilities and the global `random` state is left untouched.
rng = random.Random(0)
for predicate, facts in data.items():
    if predicate in ['activity', 'fold_change']:
        for idx, fact in enumerate(facts):
            if fact in err_facts:
                p = rng.uniform(0.5, 0.75)  # needs to be above 0.5
            else:
                p = rng.uniform(0.76, 1)
            data[predicate][idx] = f'{p}::{fact}'
build_model.write_predicate_files(data, path=f'{PATH_TO_DATA}/mutate_m1_20_pspec/')

In [None]:
# TODO: add a function to control which enzyme is noisier
# TODO: maybe add something to make sure the same samples are mutated (instead of relying on df order)