# Random Data Simulation and Fitting Using Neural Network

### _Simulation of random data:_

Import necessary packages:

In [None]:
from metabolabpytools import isotopomerAnalysis
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import copy

Function to generate random percentages summing to 100%:

In [None]:
def generate_isotopomer_percentages():
    """Draw a random 8-component isotopomer distribution that sums to 100.

    The first entry is the unlabelled percentage, sampled uniformly from
    [20, 80). The remaining seven entries share what is left of the 100%,
    in proportion to seven independent uniform(0, 1) draws.

    Returns:
        list: ``[unlabelled, p1, ..., p7]`` percentages.
    """
    unlabelled = np.random.uniform(20, 80)
    weights = np.random.rand(7)
    # Normalise the weights, then scale them to the non-unlabelled share.
    labelled = (weights / weights.sum()) * (100 - unlabelled)
    return [unlabelled, *labelled]

Create an isotopomerAnalysis object:

In [None]:
# Analysis object; every simulation and fitting call in the cells below goes through it.
ia = isotopomerAnalysis.IsotopomerAnalysis()

Define metabolite parameters:

In [None]:
# Labelling patterns to simulate: eight patterns, each a list of three 0/1 flags.
# NOTE(review): presumably 1 marks a 13C-labelled carbon position of lactate — confirm against metabolabpytools.
isotopomers = [
    [0, 0, 0],  # Unlabelled; its percentage comes first in generate_isotopomer_percentages()
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
    [1, 1, 0],
    [1, 0, 1],
    [0, 1, 1],
    [1, 1, 1]
]

print(f"Initial isotopomers: {isotopomers}")
#random_percentages = generate_isotopomer_percentages()
# print(random_percentages)
# NOTE(review): presumably one flag per position selecting which ones provide HSQC data — confirm.
hsqc = [0,1,1]
metabolite = 'L-LacticAcid'
num_samples = 10  # Number of simulated samples; used as the range of exp_index in the simulation loop


In [None]:
# Set up `metabolite` on the analysis object for `num_samples` samples;
# the per-sample calls further down address them by exp_index.
ia.init_metabolite_multiple_samples(metabolite, hsqc, num_samples=num_samples)

In [None]:
# Inspect the analysis object's state; in notebook order this runs before any percentages are assigned.
print(f"Metabolites: {ia.metabolites}")
print(f"Initial Isotopomer Percentages: {ia.isotopomer_percentages}")

Initialise and set isotopomers for multiple samples:

In [None]:
# Simulate `num_samples` samples: each gets its own random isotopomer
# distribution, from which HSQC and GC-MS data are simulated on `ia`.
generated_percentages = []
for exp_index in range(num_samples):
    # Draw a fresh distribution for this sample and keep it for later comparison.
    random_percentages = generate_isotopomer_percentages()
    generated_percentages.append(random_percentages)

    # Hand the library a private copy so that nothing it does to the list can
    # leak back into the shared `isotopomers` template used by later cells.
    isotopomers_copy = copy.deepcopy(isotopomers)

    # Report (but do not abort on) an empty isotopomer list.
    if not isotopomers_copy:
        print(f"Error: Isotopomers is empty for exp_index {exp_index}")

    ia.set_fit_isotopomers_simple(metabolite=metabolite, isotopomers=isotopomers_copy, percentages=random_percentages, exp_index=exp_index)
    ia.sim_hsqc_data(metabolite=metabolite, exp_index=exp_index, isotopomers=isotopomers_copy, percentages=random_percentages)
    ia.sim_gcms_data(metabolite, exp_index)

Initialise isotopomerAnalysis object with selected data:

In [None]:
#ia.init_metabolite(metabolite, hsqc)
#ia.set_fit_isotopomers(metabolite=metabolite, isotopomers=isotopomers, percentages=random_percentages)

In [None]:
#print(f'Isotopomers : {ia.fit_isotopomers[metabolite]}\nIsotopomer %: {ia.isotopomer_percentages[metabolite]}')

Simulate HSQC data:

In [None]:
#ia.sim_hsqc_data(metabolite=metabolite, exp_index=exp_index, isotopomers=isotopomers, percentages=random_percentages)

In [None]:
# print(f'HSQC Multiplets: {ia.exp_multiplets[metabolite][exp_index]}')
#print(f'Multiplet percentages: {ia.exp_multiplet_percentages[metabolite][exp_index]}')

Simulate GC-MS data:

In [None]:
#ia.sim_gcms_data(metabolite, exp_index)

In [None]:
#ia.exp_gcms[metabolite][exp_index]

### _Fitting to simulated data using basic neural network:_

Use only HSQC multiplets and GC-MS data for analysis:

In [None]:
# Choose the data sources for the fit: HSQC multiplets and GC-MS enabled, 1D NMR disabled.
ia.use_hsqc_multiplet_data = True
ia.use_gcms_data = True
ia.use_nmr1d_data = False

Generate multiple samples:

In [None]:
# num_samples = 10  # Example number of samples
# samples = ia.generate_simulated_samples(num_samples)

Call the new neural network fit method:

In [None]:
# Run the neural-network fit on the simulated samples.
# NOTE(review): the 1000-sample cell later in this notebook also passes
# percentages=generated_percentages to fit_data_nn — confirm whether it is required here too.
ia.fit_data_nn(metabolite=metabolite, fit_isotopomers=isotopomers, num_samples=num_samples)

In [None]:
# Collect the per-sample data into one table and display it.
# NOTE(review): the name and the later .to_csv call suggest a pandas DataFrame — confirm.
sample_data_df = ia.gather_sample_data(metabolite=metabolite, num_samples=num_samples)
print(sample_data_df)

In [None]:
#print(f'Fitted Isotoponers : {ia.fitted_isotopomers[metabolite][exp_index]}\nFitted Isotopomer %: {ia.fitted_isotopomer_percentages[metabolite][exp_index]}')  

Export data and analysis to Excel spreadsheet:

In [None]:
#ia.export_data('fittedIsotopomerAnalysis.xlsx')

In [None]:
# Write the gathered table to sample_data.csv in the working directory (index=False: no index column).
sample_data_df.to_csv('sample_data.csv', index=False)

## Addressing Overfitting: 

To prevent overfitting in my neural network model for predicting isotopomer distributions, several strategies will be implemented:

- First, I'll use a validation set to monitor the model's performance during training, ensuring it maintains its ability to generalize to unseen data. This involves splitting the data into training and validation sets and using early stopping to halt training when the validation loss stops improving, which helps avoid overfitting by preventing the model from learning noise in the training data. 
 
- Additionally, I'll employ dropout layers within the neural network architecture. Dropout randomly deactivates a fraction of neurons during each training step, which forces the network to learn more robust features and reduces reliance on any specific neurons. 

- Regularization techniques, such as L2 regularization, will be used to penalize large weights, discouraging the model from becoming too complex. 

- Finally, I will ensure that the model is trained with an adequate amount of data and, if necessary, use data augmentation techniques to artificially increase the diversity of the training dataset, enhancing the model's ability to generalize.

In [5]:
import numpy as np
import pandas as pd
from metabolabpytools import isotopomerAnalysis

def generate_isotopomer_percentages(presence_probability=0.5):
    """Draw a random, sparse 8-component isotopomer distribution summing to 100.

    The first entry is the unlabelled percentage, sampled uniformly from
    [20, 80). Each of the seven labelled isotopomers is then independently
    switched on with probability ``presence_probability``; if none is
    selected, one is picked at random so the remainder always has somewhere
    to go. The remaining percentage is split among the selected isotopomers
    in proportion to independent uniform(0, 1) draws; unselected ones are 0.0.

    Args:
        presence_probability: Probability (0..1, not a percentage) that any
            given labelled isotopomer is present. Defaults to 0.5, i.e. 50%.

    Returns:
        list[float]: ``[unlabelled, p1, ..., p7]`` percentages.
    """
    unlabelled_percentage = np.random.uniform(20, 80)
    remaining_percentage = 100 - unlabelled_percentage

    # Independent presence flag for each of the 7 labelled isotopomers.
    isotopomer_presence = np.random.rand(7) < presence_probability
    if not isotopomer_presence.any():
        # Guarantee at least one labelled isotopomer to carry the remainder.
        isotopomer_presence[np.random.randint(0, 7)] = True

    # Split the remainder among the present isotopomers only.
    random_values = np.random.rand(int(isotopomer_presence.sum()))
    labelled_percentages = np.zeros(7)
    # Mask assignment fills the selected slots in ascending index order.
    labelled_percentages[isotopomer_presence] = (
        random_values / random_values.sum()
    ) * remaining_percentage

    return [unlabelled_percentage] + labelled_percentages.tolist()




# Start from a fresh analysis object for this larger run.
ia = isotopomerAnalysis.IsotopomerAnalysis()

# Ensure isotopomers is correctly initialized
# Eight labelling patterns, each a list of three 0/1 flags.
isotopomers = [
    [0, 0, 0],  # Unlabelled
    [1, 0, 0],
    [0, 1, 0],
    [0, 0, 1],
    [1, 1, 0],
    [1, 0, 1],
    [0, 1, 1],
    [1, 1, 1]
]

num_samples = 1000  # Number of simulated samples generated and passed to the fit
# NOTE(review): presumably one flag per position selecting which ones provide HSQC data — confirm.
hsqc = [0, 1, 1]
metabolite = 'L-LacticAcid'

# Initialize the metabolite with multiple samples
ia.init_metabolite_multiple_samples(metabolite, hsqc, num_samples=num_samples)

# Initialize and set isotopomers for multiple samples
# generated_percentages keeps the distribution drawn for every sample, in exp_index order.
generated_percentages = []
for exp_index in range(num_samples):
    random_percentages = generate_isotopomer_percentages()  # Generate new random percentages for each sample
    generated_percentages.append(random_percentages)  # Store generated percentages for comparison
    
    # Assign the drawn distribution to this sample, then simulate its HSQC and GC-MS data.
    ia.set_fit_isotopomers_simple(metabolite=metabolite, isotopomers=isotopomers, percentages=random_percentages, exp_index=exp_index)
    ia.sim_hsqc_data(metabolite=metabolite, exp_index=exp_index, isotopomers=isotopomers, percentages=random_percentages)
    ia.sim_gcms_data(metabolite, exp_index)

# Choose the data sources for the fit: HSQC multiplets and GC-MS enabled, 1D NMR disabled.
ia.use_hsqc_multiplet_data = True
ia.use_gcms_data = True
ia.use_nmr1d_data = False


# Fit data using neural network with multiple samples
# The drawn distributions are passed along via `percentages`.
# NOTE(review): presumably they serve as training targets — confirm in metabolabpytools.
ia.fit_data_nn(metabolite=metabolite, fit_isotopomers=isotopomers, percentages = generated_percentages, num_samples=num_samples)

Epoch 1/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 1s 7ms/step - loss: 248.3428 - val_loss: 103.2250 - learning_rate: 0.0010
Epoch 2/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 86.9796 - val_loss: 58.0863 - learning_rate: 0.0010
Epoch 3/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 43.4353 - val_loss: 33.9058 - learning_rate: 0.0010
Epoch 4/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 28.8883 - val_loss: 25.3719 - learning_rate: 0.0010
Epoch 5/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 22.2654 - val_loss: 24.6139 - learning_rate: 0.0010
Epoch 6/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 20.2340 - val_loss: 21.4206 - learning_rate: 0.0010
Epoch 7/100
29/29 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 19.1493 - val_loss: 20.2910 - learn

In [None]:
# # #Gather data for all samples and export to a CSV file
# sample_data_df = ia.gather_sample_data(metabolite=metabolite, num_samples=num_samples)

# # # Add generated percentages to the DataFrame for comparison
# sample_data_df['Generated Isotopomer Percentages'] = generated_percentages

# # #print(sample_data_df)

# # # Export to CSV
# sample_data_df.to_csv('sample_data.csv', index=False)

Next step: add hyperparameter tuning for increased accuracy:

In [None]:
# Fit the same simulated samples with the fit_data_nn_new variant of the neural-network fit.
# NOTE(review): per the heading above this cell, it presumably adds hyperparameter tuning — confirm in metabolabpytools.
ia.fit_data_nn_new(metabolite=metabolite, fit_isotopomers=isotopomers, percentages = generated_percentages, num_samples=num_samples)