# Random Data Simulation and Fitting Using Neural Network

### _Simulation of random data:_

Import necessary packages:

In [None]:
from metabolabpytools import isotopomerAnalysis
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import copy

Function to generate random percentages summing to 100%:

In [None]:
def generate_isotopomer_percentages(num_isotopomers=8, unlabelled_range=(20, 80)):
    """Draw a random isotopomer distribution whose entries sum to 100 %.

    The first entry is the unlabelled isotopomer, drawn uniformly from
    ``unlabelled_range``. Whatever is left of the 100 % is split at random
    between the remaining ``num_isotopomers - 1`` isotopomers.

    Args:
        num_isotopomers: Total number of isotopomers including the
            unlabelled one. The default of 8 reproduces the original
            behaviour (one unlabelled + seven labelled entries).
        unlabelled_range: ``(low, high)`` bounds in percent for the
            unlabelled fraction. Defaults to 20-80 %.

    Returns:
        A list of ``num_isotopomers`` plain Python floats, unlabelled
        percentage first.

    Raises:
        ValueError: If ``num_isotopomers`` is below 2, or the bounds do not
            satisfy ``0 <= low <= high <= 100``.
    """
    if num_isotopomers < 2:
        raise ValueError("num_isotopomers must be at least 2")
    low, high = unlabelled_range
    if not 0 <= low <= high <= 100:
        raise ValueError("unlabelled_range must satisfy 0 <= low <= high <= 100")

    # Draw order (uniform first, then the weights) is kept as in the original
    # so that a seeded run still produces the same numbers.
    unlabelled_percentage = float(np.random.uniform(low, high))
    remaining_percentage = 100 - unlabelled_percentage

    # Random weights for the labelled isotopomers, rescaled to fill the rest.
    random_values = np.random.rand(num_isotopomers - 1)
    random_percentages = (random_values / random_values.sum()) * remaining_percentage

    # tolist() yields plain floats, so the result prints and serialises
    # cleanly instead of as a mix of float and np.float64.
    return [unlabelled_percentage] + random_percentages.tolist()

Create an isotopomerAnalysis object:

In [None]:
# Single IsotopomerAnalysis instance used by every following cell:
# initialisation, simulation and fitting all go through `ia`.
ia = isotopomerAnalysis.IsotopomerAnalysis()

Define metabolite parameters:

In [None]:
# Experiment settings shared by the cells below.
metabolite = 'L-LacticAcid'
num_samples = 10
# Per-position HSQC flags handed to init_metabolite_multiple_samples.
# NOTE(review): presumably 1 marks a carbon observed in the HSQC - confirm.
hsqc = [0, 1, 1]

# All eight labelling patterns over three positions, grouped by the number
# of labelled positions (order matters: index 0 is the unlabelled pattern).
isotopomers = [
    # unlabelled
    [0, 0, 0],
    # one labelled position
    [1, 0, 0], [0, 1, 0], [0, 0, 1],
    # two labelled positions
    [1, 1, 0], [1, 0, 1], [0, 1, 1],
    # fully labelled
    [1, 1, 1],
]

print(f"Initial isotopomers: {isotopomers}")


In [None]:
# Set up `metabolite` on `ia` for `num_samples` samples using the `hsqc` flags
# defined above. NOTE(review): presumably this has to run before any
# per-sample call that takes an exp_index - confirm in metabolabpytools.
ia.init_metabolite_multiple_samples(metabolite, hsqc, num_samples=num_samples)

In [None]:
# Sanity check: dump the metabolites and isotopomer percentages held by `ia`
# right after initialisation, before any simulation has been run.
print(f"Metabolites: {ia.metabolites}")
print(f"Initial Isotopomer Percentages: {ia.isotopomer_percentages}")

Initialise and set isotopomers for multiple samples:

In [None]:
# The isotopomer list does not change between iterations, so validate it once
# up front and fail loudly rather than printing a message and simulating
# from bad input anyway.
if not isotopomers:
    raise ValueError("isotopomers must contain at least one labelling pattern")

generated_percentages = []
for exp_index in range(num_samples):
    # Fresh random distribution for this sample, kept for later comparison
    # against the fitted values.
    random_percentages = generate_isotopomer_percentages()
    generated_percentages.append(random_percentages)

    # Each library call gets its own deep copy, so nothing done inside
    # metabolabpytools can modify the shared `isotopomers` list that later
    # iterations and the fitting cell rely on. (Previously a copy was made
    # here but the shared list was still the one passed in.)
    ia.set_fit_isotopomers_simple(
        metabolite=metabolite,
        isotopomers=copy.deepcopy(isotopomers),
        percentages=random_percentages,
        exp_index=exp_index,
    )
    ia.sim_hsqc_data(
        metabolite=metabolite,
        exp_index=exp_index,
        isotopomers=copy.deepcopy(isotopomers),
        percentages=random_percentages,
    )
    ia.sim_gcms_data(metabolite, exp_index)

Initialise isotopomerAnalysis object with selected data:

In [None]:
#ia.init_metabolite(metabolite, hsqc)
#ia.set_fit_isotopomers(metabolite=metabolite, isotopomers=isotopomers, percentages=random_percentages)

In [None]:
#print(f'Isotopomers : {ia.fit_isotopomers[metabolite]}\nIsotopomer %: {ia.isotopomer_percentages[metabolite]}')

Simulate HSQC data:

In [None]:
#ia.sim_hsqc_data(metabolite=metabolite, exp_index=exp_index, isotopomers=isotopomers, percentages=random_percentages)

In [None]:
# print(f'HSQC Multiplets: {ia.exp_multiplets[metabolite][exp_index]}')
#print(f'Multiplet percentages: {ia.exp_multiplet_percentages[metabolite][exp_index]}')

Simulate GC-MS data:

In [None]:
#ia.sim_gcms_data(metabolite, exp_index)

In [None]:
#ia.exp_gcms[metabolite][exp_index]

### _Fitting to simulated data using basic neural network:_

Use only HSQC multiplets and GC-MS data for analysis:

In [None]:
# Choose which data sets the fit uses: HSQC multiplets and GC-MS are switched
# on, 1D-NMR is switched off.
ia.use_hsqc_multiplet_data = True
ia.use_gcms_data = True
ia.use_nmr1d_data = False

Generate multiple samples:

In [None]:
# num_samples = 10  # Example number of samples
# samples = ia.generate_simulated_samples(num_samples)

Call the new neural network fit method:

In [None]:
# Run the neural-network fit for `metabolite` across `num_samples` samples,
# fitting the labelling patterns listed in `isotopomers`.
ia.fit_data_nn(metabolite=metabolite, fit_isotopomers=isotopomers, num_samples=num_samples)

In [None]:
# Collect the per-sample data for `metabolite` from `ia` and print it.
# The result is used as a DataFrame further down (`.to_csv` is called on it).
sample_data_df = ia.gather_sample_data(metabolite=metabolite, num_samples=num_samples)
print(sample_data_df)

In [None]:
#print(f'Fitted Isotopomers : {ia.fitted_isotopomers[metabolite][exp_index]}\nFitted Isotopomer %: {ia.fitted_isotopomer_percentages[metabolite][exp_index]}')

Export data and analysis to Excel spreadsheet:

In [None]:
#ia.export_data('fittedIsotopomerAnalysis.xlsx')

In [None]:
# Save the gathered table as sample_data.csv in the current working
# directory; index=False leaves the row index out of the file.
sample_data_df.to_csv('sample_data.csv', index=False)

## Addressing Overfitting: 

To prevent overfitting in my neural network model for predicting isotopomer distributions, several strategies will be implemented:

- First, I'll use a validation set to monitor the model's performance during training, ensuring it maintains its ability to generalize to unseen data. This involves splitting the data into training and validation sets and using early stopping to halt training when the validation loss stops improving, which helps avoid overfitting by preventing the model from learning noise in the training data. 
 
- Additionally, I'll employ dropout layers within the neural network architecture. Dropout randomly deactivates a fraction of neurons during each training step, which forces the network to learn more robust features and reduces reliance on any specific neurons. 

- Regularization techniques, such as L2 regularization, will be used to penalize large weights, discouraging the model from becoming too complex. 

- Finally, I will ensure that the model is trained with an adequate amount of data and, if necessary, use data augmentation techniques to artificially increase the diversity of the training dataset, enhancing the model's ability to generalize.

In [1]:
import numpy as np
import pandas as pd
from metabolabpytools import isotopomerAnalysis

def generate_isotopomer_percentages():
    """Return eight random isotopomer percentages that add up to 100.

    Element 0 is the unlabelled fraction, drawn uniformly between 20 and 80.
    The rest of the 100 % is shared out at random among the other seven
    isotopomers.
    """
    unlabelled = np.random.uniform(20, 80)

    # Seven random weights, rescaled so that together they fill what is left.
    weights = np.random.rand(7)
    labelled = weights / weights.sum() * (100 - unlabelled)

    return [unlabelled, *labelled]


# Experiment settings.
metabolite = 'L-LacticAcid'
hsqc = [0, 1, 1]
num_samples = 10

# The eight labelling patterns over three positions, grouped by how many
# positions carry a label; index 0 is the unlabelled pattern.
isotopomers = [
    [0, 0, 0],
    [1, 0, 0], [0, 1, 0], [0, 0, 1],
    [1, 1, 0], [1, 0, 1], [0, 1, 1],
    [1, 1, 1],
]

# Create the analysis object and set the metabolite up for all samples.
ia = isotopomerAnalysis.IsotopomerAnalysis()
ia.init_metabolite_multiple_samples(metabolite, hsqc, num_samples=num_samples)

# Simulate every sample: draw a fresh random isotopomer distribution, store it
# on `ia`, then simulate HSQC and GC-MS data for that sample index.
# NOTE(review): the three `ia` calls are presumably order-dependent (the
# simulations likely rely on what set_fit_isotopomers_simple stored) - confirm
# before reordering.
generated_percentages = []
for exp_index in range(num_samples):
    random_percentages = generate_isotopomer_percentages()  # New random distribution (sums to 100 %) for this sample
    generated_percentages.append(random_percentages)  # Kept so the generated values can be compared with the fit later
    
    ia.set_fit_isotopomers_simple(metabolite=metabolite, isotopomers=isotopomers, percentages=random_percentages, exp_index=exp_index)
    ia.sim_hsqc_data(metabolite=metabolite, exp_index=exp_index, isotopomers=isotopomers, percentages=random_percentages)
    ia.sim_gcms_data(metabolite, exp_index)

# Fit on HSQC multiplets and GC-MS only; 1D-NMR data is switched off.
ia.use_hsqc_multiplet_data = True
ia.use_gcms_data = True
ia.use_nmr1d_data = False


# Fit all samples with the neural network, passing the generated
# distributions along as `percentages`.
ia.fit_data_nn(metabolite=metabolite, fit_isotopomers=isotopomers, percentages = generated_percentages, num_samples=num_samples)

# Gather the data for all samples into one table.
sample_data_df = ia.gather_sample_data(metabolite=metabolite, num_samples=num_samples)

# Add the generated percentages as an extra column for comparison.
# Each cell holds one sample's full list of percentages.
# NOTE(review): assumes the table has exactly one row per sample - confirm.
sample_data_df['Generated Isotopomer Percentages'] = generated_percentages

# print(sample_data_df)

# Export to CSV; index=False leaves the row index out of the file.
sample_data_df.to_csv('sample_data.csv', index=False)


[34.71518938 23.49510282 32.83615479  8.95355301 72.23621919 27.76378081
 43.82383582 27.82837892 26.59110463  1.75668064]
[43.82383581514689, 9.736902990534956, 7.6081695577876065, 10.483306368228131, 5.226679665023729, 13.993239242939524, 7.371185724399198, 1.7566806359399556]
[34.8439234  28.36658095 22.33669637 14.45279928 52.68229598 47.31770402
 34.08793132 30.82456821 29.71591076  5.3715897 ]
[34.08793132458349, 11.394297464468407, 13.644047660121286, 5.7862230886706705, 11.016878250143177, 10.020922493328191, 8.678110017600627, 5.371589701084163]
[50.59485178  3.7483589  44.58484058  1.07194875 64.69889685 35.30110315
 37.98489778 43.68184764 18.21996593  0.11328865]
[37.98489777806584, 10.229547266445172, 14.45153577473458, 19.000764599442135, 0.8225664525226569, 4.800033892017754, 12.597365584936556, 0.11328865183530643]
[57.42098724 20.9464387   8.8515065  12.78106755 75.77005719 24.22994281
 72.80724578 19.31770622  6.01553725  1.85951075]
[72.80724578091598, 3.592439095341

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 962ms/step - loss: 597.9880 - val_loss: 722.0712
Epoch 2/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - loss: 550.1485 - val_loss: 665.6041
Epoch 3/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 50ms/step - loss: 505.8438 - val_loss: 616.0464
Epoch 4/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - loss: 465.0555 - val_loss: 572.0484
Epoch 5/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 50ms/step - loss: 427.6199 - val_loss: 530.5849
Epoch 6/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 54ms/step - loss: 393.0804 - val_loss: 492.4539
Epoch 7/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 50ms/step - loss: 361.1422 - val_loss: 457.4865
Epoch 8/50
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 82ms/step - loss: 331.6827 - val_loss: 425.3449
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━