# Dataset stats

Here, we examine a simulated dataset of ca. 181000 circuits and analyse its basic characteristics.

## Background

A large set of randomly generated RNA circuits was first created as a starting point, and the binding energies of all RNAs were determined by the RNA simulator. This list of circuits was then filtered for circuits with a minimum of 2 interacting species and at least 1 self-interacting species. From these, the first 1000 circuits were chosen as reference circuits. Each was then initialised with a set of mutated versions of itself, where each species was mutated at one of [1, 2, 3, 5, 10, 15] locations on its strand, with 10 variations for each number of mutations. A reference circuit would thereby have a total of 3 (number of species) * 10 (variations) * 6 (number of mutations within a sequence) = 180 mutated subcircuits.

Once all the circuits had been generated, their function had to be determined. This involved first finding the steady state of the unbound and bound RNA species, then simulating the behavior of the circuit upon being perturbed by a step function. In this case, the step signal was an instantaneous 2x increase in the species 'RNA_0'. In a lab experiment, there are many different ways of implementing this, each with a different delay, so the instantaneous increase represents the most extreme case.

*Here we have an example of 2 reference steady-state circuits and their mutations, plotted in a grid of 2 rows and 4 columns.*

In [None]:
import numpy as np
import jax
import jax.numpy as jnp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from functools import partial
import os
import sys
import umap
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler


# When no package context is set, put the parent directory on the import
# path -- presumably so that the project-local imports that follow
# (`src`, `tests_local`, `explanations`) can be resolved.
if __package__ is None:

    # Absolute path of the directory one level above the working directory.
    module_path = os.path.abspath(os.path.join('..'))
    sys.path.append(module_path)

    # Use the parent directory's base name as the package name; setting it
    # also keeps this branch from running again on a re-execution.
    __package__ = os.path.basename(module_path)


from src.utils.misc.numerical import count_monotonic_group_lengths, find_monotonic_group_idxs, is_within_range
from src.utils.misc.string_handling import string_to_tuple_list
from src.utils.results.analytics.naming import get_true_interaction_cols
from src.utils.misc.type_handling import flatten_listlike, get_first_elements
from src.utils.misc.string_handling import prettify_keys_for_label
from src.utils.results.analytics.naming import get_analytics_types_all, get_true_names_analytics, get_true_interaction_cols
from tests_local.shared import CONFIG
from explanations.data_enhancing import enhance_data


In [None]:
# Paths to the two result sets used in this notebook.
og_dir = '../data/generate_seqs_flexible/2023_04_17_205800/'
fn_og = os.path.join(og_dir, 'circuit_stats.csv')
mut_dir = '../data/ensemble_mutation_effect_analysis/2023_05_09_220322/mutation_effect_on_interactions_signal'
# NOTE(review): this assignment originally had no right-hand side, which is a
# SyntaxError that prevents the whole cell from running. The file name below
# is an assumption -- TODO confirm it against the contents of `mut_dir`.
fn_mut = os.path.join(mut_dir, 'tabulated_mutation_info.csv')

# Start with the table stored under `og_dir`.
data = pd.read_csv(fn_og)

In [None]:
# Reshape the wide table into long form: one row per (circuit name,
# interaction column), with one value column per interaction quantity.
# The 'idx' labels come from the 'energies' melt.
dfm = data.melt(['name'], get_true_interaction_cols(data, 'energies'), value_name='energies', var_name='idx')
for quantity in ('binding_rates_dissociation', 'eqconstants'):
    melted = data.melt(['name'], get_true_interaction_cols(data, quantity), value_name=quantity, var_name='idx')
    dfm[quantity] = melted[quantity]
del melted

# One histogram per quantity; counts are always on a log axis, and only the
# equilibrium constants also use a log x-axis.
plt.figure(figsize=(18,5))
panels = (
    ('energies', False),
    ('binding_rates_dissociation', False),
    ('eqconstants', True),
)
for position, (quantity, log_x) in enumerate(panels, start=1):
    ax = plt.subplot(1, 3, position)
    sns.histplot(dfm, x=quantity, log_scale=[log_x, True], element='step')

plt.suptitle('Interaction distributions of\nsource dataset')

In [None]:
# `pd.read_csv` requires a path or buffer; the bare call raised a TypeError.
# NOTE(review): `fn_mut` is the only other dataset path named in this
# notebook, so it is presumably the intended input -- confirm.
data = pd.read_csv(fn_mut)

In [None]:
# Reshape the reloaded table into long form: one row per (circuit name,
# interaction column), with one value column per interaction quantity.
# The 'idx' labels come from the 'energies' melt.
dfm = data.melt(['name'], get_true_interaction_cols(data, 'energies'), value_name='energies', var_name='idx')
for quantity in ('binding_rates_dissociation', 'eqconstants'):
    dfm[quantity] = data.melt(
        ['name'], get_true_interaction_cols(data, quantity),
        value_name=quantity, var_name='idx')[quantity]

# One histogram per quantity; counts are always on a log axis, and only the
# equilibrium constants also use a log x-axis.
plt.figure(figsize=(18,5))
for position, (quantity, log_x) in enumerate(
        [('energies', False), ('binding_rates_dissociation', False), ('eqconstants', True)],
        start=1):
    ax = plt.subplot(1, 3, position)
    sns.histplot(dfm, x=quantity, log_scale=[log_x, True], element='step')

# The title was copy-pasted from the first figure and still said
# 'source dataset', although `data` has been reloaded in between.
# NOTE(review): wording assumes the reloaded table is the mutation dataset.
plt.suptitle('Interaction distributions of\nmutation dataset')

In [None]:
def load_fake_circuit(circ_row):
    """Assemble an interactions config and hand it to ``construct_circuit_from_cfg``.

    NOTE(review): this function looks unfinished. ``circ_row`` is not used
    yet, and ``config``, ``eqconstants``, ``eqconstant_to_rates``, ``top_dir``,
    ``circuit_name``, ``some_circuit`` and ``construct_circuit_from_cfg`` are
    neither defined nor imported in the visible part of the notebook --
    confirm they exist before calling this.

    Args:
        circ_row: Currently unused; presumably one row of the circuit table
            -- TODO confirm.

    Returns:
        Whatever ``construct_circuit_from_cfg`` returns for the assembled
        configuration.
    """

    # Looked up once and reused in the interactions config below.
    k_a = config['molecular_params']['association_binding_rate' + '_per_molecule']

    def symmetrical_matrix_length(flattened_length):
        """Return the n solving n * (n + 1) / 2 == flattened_length, truncated to int."""
        return int((-1 + np.sqrt(1 + 8 * flattened_length)) / 2)

    def flatten_to_matrix(flattened_vector):
        """Expand a flattened upper triangle into a full symmetric matrix.

        The vector is read row by row, diagonal included; the result is a
        list of lists. This helper is not called yet within this function.
        """
        length = symmetrical_matrix_length(len(flattened_vector))
        matrix = [[0] * length for _ in range(length)]

        index = 0
        for i in range(length):
            for j in range(i, length):
                # Mirror each value across the diagonal.
                matrix[i][j] = flattened_vector[index]
                matrix[j][i] = flattened_vector[index]
                index += 1

        return matrix

    # NOTE(review): the result is not used below -- the config points at a CSV
    # path instead. Kept because the call itself may be intended.
    binding_rates_dissociation = eqconstant_to_rates(eqconstants)[1]

    # NOTE(review): not used yet.
    labels = ['RNA_0', 'RNA_1', 'RNA_2']
    interactions = {
        'binding_rates_association': k_a,
        'binding_rates_dissociation': os.path.join('explanations', 'binding_rates_dissociation.csv'),
        'eqconstants': os.path.join('explanations', 'eqconstants.csv'),
        'energies': os.path.join(top_dir, 'energies', circuit_name + '_energies.csv'),
        'binding_sites': os.path.join(top_dir, 'binding_sites', circuit_name + '_binding_sites.csv'),
    }
    return construct_circuit_from_cfg({
        'data_path': some_circuit,
        'interactions': interactions
    }, config)
