In [1]:
from metabolabpytools import isotopomerAnalysis


if __name__ == "__main__":
    # HSQC vectors to simulate (0/1 entries, one entry per carbon). The length
    # of each vector is used below as the carbon count of the metabolite.
    hsqc_vectors = [
        # 3 carbons
        [1, 1, 1],
        [0, 1, 1],
        [0, 0, 1],
        # 4 carbons
        [0, 1, 1, 0],
        [1, 1, 1, 0],
        [0, 1, 1, 1],
        [1, 1, 1, 1],
        # 5 carbons
        [0, 1, 1, 1, 0],
        [1, 1, 1, 1, 0],
        [0, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        # 6 carbons
        [1, 1, 1, 1, 1, 1],
    ]

    # One analysis object is shared by every simulation run in this cell.
    ia = isotopomerAnalysis.IsotopomerAnalysis()

    for vector in hsqc_vectors:
        num_carbons = len(vector)

        # The name depends only on the carbon count, so vectors of equal
        # length are registered under the same metabolite name.
        metabolite = f'Metabolite_{num_carbons}C'
        ia.init_metabolite(metabolite, vector)

        # Simulate 1000 samples and report the shape of each returned array.
        hsqc_data, gcms_data, isotopomer_distributions = ia.simulate_data(
            num_samples=1000,
            num_carbons=num_carbons,
            metabolite=metabolite,
        )
        print(f'HSQC Data ({num_carbons} carbons):', hsqc_data.shape)
        print(f'GC-MS Data ({num_carbons} carbons):', gcms_data.shape)
        print(f'Isotopomer Distributions ({num_carbons} carbons):', isotopomer_distributions.shape)


HSQC Data (3 carbons): (1000, 8)
GC-MS Data (3 carbons): (1000, 4)
Isotopomer Distributions (3 carbons): (1000, 8)
HSQC Data (3 carbons): (1000, 8)
GC-MS Data (3 carbons): (1000, 4)
Isotopomer Distributions (3 carbons): (1000, 8)
HSQC Data (3 carbons): (1000, 8)
GC-MS Data (3 carbons): (1000, 4)
Isotopomer Distributions (3 carbons): (1000, 8)
HSQC Data (4 carbons): (1000, 12)
GC-MS Data (4 carbons): (1000, 5)
Isotopomer Distributions (4 carbons): (1000, 16)
HSQC Data (4 carbons): (1000, 12)
GC-MS Data (4 carbons): (1000, 5)
Isotopomer Distributions (4 carbons): (1000, 16)
HSQC Data (4 carbons): (1000, 12)
GC-MS Data (4 carbons): (1000, 5)
Isotopomer Distributions (4 carbons): (1000, 16)
HSQC Data (4 carbons): (1000, 12)
GC-MS Data (4 carbons): (1000, 5)
Isotopomer Distributions (4 carbons): (1000, 16)
HSQC Data (5 carbons): (1000, 16)
GC-MS Data (5 carbons): (1000, 6)
Isotopomer Distributions (5 carbons): (1000, 32)
HSQC Data (5 carbons): (1000, 16)
GC-MS Data (5 carbons): (1000, 6)
Is

In [1]:
from metabolabpytools import isotopomerAnalysis
import re

def sanitize_sheet_name(name):
    """Return ``name`` with sheet-name-unsafe characters replaced by ``_``.

    Every colon, backslash, forward slash, asterisk, question mark and square
    bracket is replaced one-for-one by an underscore; all other characters
    (including spaces and commas) are kept, so the length never changes.

    Parameters
    ----------
    name : str
        Candidate sheet / metabolite name.

    Returns
    -------
    str
        The sanitized name.
    """
    forbidden_chars = re.compile(r'[:\\/*?\[\]]')
    return forbidden_chars.sub('_', name)

# HSQC vectors to simulate; the vector length is taken as the carbon count.
hsqc_vectors = [
    # 3 carbons
    [1, 1, 1],
    [0, 1, 1],
    [0, 0, 1],
    # 4 carbons
    [0, 1, 1, 0],
    [1, 1, 1, 0],
    [0, 1, 1, 1],
    [1, 1, 1, 1],
    # 5 carbons
    [0, 1, 1, 1, 0],
    [1, 1, 1, 1, 0],
    [0, 1, 1, 1, 1],
    [1, 1, 1, 1, 1],
    # 6 carbons
    [1, 1, 1, 1, 1, 1],
]

# Run one simulation per HSQC vector, each on a freshly created analysis object.
for hsqc_vector in hsqc_vectors:
    num_carbons = len(hsqc_vector)

    # Embedding the vector's repr makes the label unique per vector; the
    # brackets it contains are then replaced by sanitize_sheet_name().
    metabolite = f'{num_carbons}C_HSQC_{hsqc_vector}'
    sanitized_metabolite = sanitize_sheet_name(metabolite)

    ia = isotopomerAnalysis.IsotopomerAnalysis()
    ia.init_metabolite(sanitized_metabolite, hsqc_vector)

    # Simulate 1000 samples under the sanitized name.
    hsqc_data, gcms_data, isotopomer_distributions = ia.simulate_and_save_data(
        num_samples=1000, num_carbons=num_carbons, metabolite=sanitized_metabolite
    )

    # Report which run finished and the shape of each returned array.
    print(f'Simulated data for {sanitized_metabolite}')
    print(f'HSQC Data Shape: {hsqc_data.shape}')
    print(f'GC-MS Data Shape: {gcms_data.shape}')
    print(f'Isotopomer Distributions Shape: {isotopomer_distributions.shape}')



Simulated data for 3C_HSQC__1, 1, 1_
HSQC Data Shape: (1000, 8)
GC-MS Data Shape: (1000, 4)
Isotopomer Distributions Shape: (1000, 8)
Simulated data for 3C_HSQC__0, 1, 1_
HSQC Data Shape: (1000, 8)
GC-MS Data Shape: (1000, 4)
Isotopomer Distributions Shape: (1000, 8)
Simulated data for 3C_HSQC__0, 0, 1_
HSQC Data Shape: (1000, 8)
GC-MS Data Shape: (1000, 4)
Isotopomer Distributions Shape: (1000, 8)
Simulated data for 4C_HSQC__0, 1, 1, 0_
HSQC Data Shape: (1000, 12)
GC-MS Data Shape: (1000, 5)
Isotopomer Distributions Shape: (1000, 16)
Simulated data for 4C_HSQC__1, 1, 1, 0_
HSQC Data Shape: (1000, 12)
GC-MS Data Shape: (1000, 5)
Isotopomer Distributions Shape: (1000, 16)
Simulated data for 4C_HSQC__0, 1, 1, 1_
HSQC Data Shape: (1000, 12)
GC-MS Data Shape: (1000, 5)
Isotopomer Distributions Shape: (1000, 16)
Simulated data for 4C_HSQC__1, 1, 1, 1_
HSQC Data Shape: (1000, 12)
GC-MS Data Shape: (1000, 5)
Isotopomer Distributions Shape: (1000, 16)
Simulated data for 5C_HSQC__0, 1, 1, 1, 0_

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
import joblib

def flatten_hsqc_multiplets(hsqc_multiplet_percentages):
    """Flatten exactly one level of ``list`` nesting.

    Elements that are ``list`` instances are spliced into the result in order;
    any other element (scalars, tuples, ...) is appended unchanged. Lists found
    inside an inner list are not flattened further, and no numeric conversion
    is applied to the elements.

    Parameters
    ----------
    hsqc_multiplet_percentages : iterable
        Sequence mixing plain values and lists of values.

    Returns
    -------
    list
        A new list holding the flattened elements.
    """
    flattened = []
    for entry in hsqc_multiplet_percentages:
        if isinstance(entry, list):
            flattened.extend(entry)
        else:
            flattened.append(entry)
    return flattened

def add_noise_to_hsqc(hsqc_data, noise_level=0.03):
    """Return ``hsqc_data`` perturbed by multiplicative Gaussian noise.

    Each element is scaled by ``1 + e`` where ``e`` is drawn independently
    from a normal distribution with mean 0 and standard deviation
    ``noise_level``. Draws come from NumPy's global random state, so results
    are reproducible only if that state is seeded by the caller.

    Parameters
    ----------
    hsqc_data : numpy.ndarray
        Array to perturb; its ``shape`` determines the number of draws.
    noise_level : float, optional
        Standard deviation of the relative error (default 0.03).

    Returns
    -------
    numpy.ndarray
        New array of the same shape as ``hsqc_data``.
    """
    relative_error = np.random.normal(loc=0, scale=noise_level, size=hsqc_data.shape)
    return hsqc_data * (1 + relative_error)

def add_noise_to_gcms(gcms_data, noise_level=0.075):
    """Return ``gcms_data`` perturbed by multiplicative Gaussian noise.

    Each element is scaled by ``1 + e`` where ``e`` is drawn independently
    from a normal distribution with mean 0 and standard deviation
    ``noise_level``. Draws come from NumPy's global random state.

    Parameters
    ----------
    gcms_data : numpy.ndarray
        Array to perturb; its ``shape`` determines the number of draws.
    noise_level : float, optional
        Standard deviation of the relative error (default 0.075).

    Returns
    -------
    numpy.ndarray
        New array of the same shape as ``gcms_data``.
    """
    relative_error = np.random.normal(loc=0, scale=noise_level, size=gcms_data.shape)
    return gcms_data * (1 + relative_error)

def prepare_data(hsqc_data, gcms_data, isotopomer_distributions, test_size=0.2):
    """Build a noisy feature matrix and split it into train and test parts.

    The HSQC block and the GC-MS block are each perturbed with their helper's
    default noise level (HSQC first, then GC-MS), placed side by side with
    HSQC columns first, and split together with the targets using a fixed
    ``random_state`` of 42. Noise is added before the split, so both the
    training and the test features are noisy.

    Parameters
    ----------
    hsqc_data, gcms_data : numpy.ndarray
        Feature blocks with the same number of rows.
    isotopomer_distributions : array-like
        Targets, one row per sample.
    test_size : float, optional
        Forwarded to ``train_test_split`` (default 0.2).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    # HSQC noise is drawn before GC-MS noise; keep this order so the random
    # stream is consumed the same way on every run.
    features = np.hstack((add_noise_to_hsqc(hsqc_data), add_noise_to_gcms(gcms_data)))
    targets = isotopomer_distributions

    train_features, test_features, train_targets, test_targets = train_test_split(
        features, targets, test_size=test_size, random_state=42
    )
    return train_features, test_features, train_targets, test_targets

def build_and_train_model(X_train, y_train, X_val, y_val, input_dim, epochs=100, batch_size=32):
    """Build and fit a small fully connected regression network.

    Architecture: ``input_dim`` inputs -> Dense(64, relu) -> Dense(64, relu)
    -> Dense(``y_train.shape[1]``, linear). The model is compiled with the
    Adam optimizer (default settings) and mean-absolute-error loss.

    Training stops early when ``val_loss`` has not improved for 10 epochs, and
    the weights of the best epoch are restored.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Training features and targets; ``y_train`` must be 2-D because its
        second dimension sets the width of the output layer.
    X_val, y_val : numpy.ndarray
        Data used to compute ``val_loss`` for early stopping.
    input_dim : int
        Number of input features.
    epochs : int, optional
        Maximum number of epochs (default 100).
    batch_size : int, optional
        Mini-batch size (default 32).

    Returns
    -------
    keras.models.Sequential
        The fitted model.
    """
    # Function-scope import so the block stays self-contained.
    from keras.layers import Input

    model = Sequential()
    # Declare the input with an explicit Input layer instead of passing
    # `input_dim` to the first Dense layer; the latter is deprecated for
    # Sequential models and triggers a UserWarning in recent Keras releases.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(y_train.shape[1], activation='linear'))

    model.compile(optimizer=Adam(), loss='mean_absolute_error')

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=[early_stopping],
    )

    return model

def monte_carlo_simulation(model, X_test, y_test, num_simulations=1000, num_hsqc_features=None):
    """Estimate prediction spread by repeatedly re-noising the test features.

    For each simulation the HSQC columns and the GC-MS columns of ``X_test``
    are perturbed with their respective noise helpers (HSQC first, then
    GC-MS), re-joined, and passed to ``model.predict``. The mean and standard
    deviation of the predictions over all simulations are returned.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns an array.
    X_test : numpy.ndarray
        2-D feature matrix with the HSQC columns first, GC-MS columns after.
    y_test : array-like
        Not used; kept so existing callers keep working.
    num_simulations : int, optional
        Number of noisy replicates (default 1000).
    num_hsqc_features : int or None, optional
        Number of leading columns of ``X_test`` that hold HSQC data. ``None``
        (default) keeps the historical behaviour of splitting at the column
        midpoint, which is only correct when both blocks are equally wide.
        Pass the real HSQC width whenever the blocks differ in size.

    Returns
    -------
    tuple of numpy.ndarray
        ``(mean_preds, std_preds)``, each shaped like a single prediction.

    Raises
    ------
    ValueError
        If ``num_hsqc_features`` is outside ``0..X_test.shape[1]``.
    """
    total_features = X_test.shape[1]
    if num_hsqc_features is None:
        # Backward-compatible default: assume equally wide blocks.
        num_hsqc_features = total_features // 2
    elif not 0 <= num_hsqc_features <= total_features:
        raise ValueError(
            f'num_hsqc_features must be between 0 and {total_features}, got {num_hsqc_features}'
        )

    # The column split does not change between simulations, so slice once.
    hsqc_block = X_test[:, :num_hsqc_features]
    gcms_block = X_test[:, num_hsqc_features:]

    predictions = []
    for _ in range(num_simulations):
        noisy_X_test = np.hstack((add_noise_to_hsqc(hsqc_block), add_noise_to_gcms(gcms_block)))
        predictions.append(model.predict(noisy_X_test))

    # Stack to (num_simulations, ...) and reduce over the simulation axis.
    predictions = np.array(predictions)
    mean_preds = np.mean(predictions, axis=0)
    std_preds = np.std(predictions, axis=0)

    return mean_preds, std_preds

def save_model(model, filename='best_model.pkl'):
    """Serialize ``model`` to ``filename`` using ``joblib.dump``.

    Parameters
    ----------
    model : object
        Object to persist.
    filename : str, optional
        Destination path (default ``'best_model.pkl'``).

    Returns
    -------
    None
    """
    # NOTE(review): joblib pickles the object; whether a given Keras model
    # round-trips through pickle depends on the Keras version - confirm.
    joblib.dump(value=model, filename=filename)

def generate_results_sheet(X_test, y_test, model, mean_preds, std_preds, filename='results.xlsx', num_hsqc_features=None):
    """Write per-sample predictions and errors to an Excel workbook.

    One row is written per test sample to a sheet titled ``'Results'``:
    sample number (1-based), true distribution, HSQC features, GC-MS features,
    predicted distribution (from a fresh ``model.predict(X_test)``), the mean
    absolute error between true and predicted distribution, and the
    per-output standard deviation taken from ``std_preds``.

    Vector-valued entries are stored as their ``str(list)`` text, because a
    worksheet cell cannot hold a Python list.

    Parameters
    ----------
    X_test : numpy.ndarray
        2-D feature matrix with the HSQC columns first, GC-MS columns after.
    y_test : numpy.ndarray
        True distributions, one row per sample.
    model : object
        Anything exposing ``predict(X)`` that returns one row per sample.
    mean_preds : numpy.ndarray
        Not used; kept so existing callers keep working.
    std_preds : numpy.ndarray
        Per-sample, per-output standard deviations.
    filename : str, optional
        Output path (default ``'results.xlsx'``).
    num_hsqc_features : int or None, optional
        Number of leading columns of ``X_test`` that hold HSQC data. ``None``
        (default) keeps the historical behaviour of splitting at the column
        midpoint, which is only correct when both blocks are equally wide.

    Raises
    ------
    ValueError
        If ``num_hsqc_features`` is outside ``0..X_test.shape[1]``.
    """
    from openpyxl import Workbook

    total_features = X_test.shape[1]
    if num_hsqc_features is None:
        # Backward-compatible default: assume equally wide blocks.
        num_hsqc_features = total_features // 2
    elif not 0 <= num_hsqc_features <= total_features:
        raise ValueError(
            f'num_hsqc_features must be between 0 and {total_features}, got {num_hsqc_features}'
        )

    predictions = model.predict(X_test)

    wb = Workbook()
    sheet = wb.active
    sheet.title = 'Results'
    sheet.append(["Sample", "Real Isotopomer Distribution", "HSQC %", "GC-MS %", "Predicted Isotopomer Distribution", "MAE", "Std Dev"])

    for i in range(X_test.shape[0]):
        real_isotopomer_dist = np.asarray(y_test[i])
        predicted_isotopomer_dist = np.asarray(predictions[i])

        # Subtract as arrays: the previous list - list subtraction raised
        # TypeError before any row could be written.
        mae = float(np.mean(np.abs(real_isotopomer_dist - predicted_isotopomer_dist)))

        sheet.append([
            i + 1,
            str(real_isotopomer_dist.tolist()),
            str(X_test[i][:num_hsqc_features].tolist()),
            str(X_test[i][num_hsqc_features:].tolist()),
            str(predicted_isotopomer_dist.tolist()),
            mae,
            str(np.asarray(std_preds[i]).tolist()),
        ])

    wb.save(filename)

# Driver for this cell: load one simulated workbook, train a model on it,
# estimate prediction spread by Monte Carlo, and write the model and a results
# sheet to disk.

# Load the simulated data
file_path = 'data_sim/3C_HSQC__0, 0, 1__simulation_results.xlsx'
df = pd.read_excel(file_path)

# Extract and flatten HSQC multiplet percentages.
# SECURITY NOTE(review): eval() executes whatever text the cells contain, so
# this must only be run on trusted workbooks. If the cells hold plain list
# literals, ast.literal_eval would parse them without executing code.
X_hsqc = df['HSQC Multiplet %'].apply(eval).apply(flatten_hsqc_multiplets).tolist()
X_gcms = df['GC-MS %'].apply(eval).tolist()
# X_hsqc was already flattened one level above; the second
# flatten_hsqc_multiplets() call here only has an effect if a row is still
# nested after that first pass. Each row is the HSQC values followed by the
# GC-MS values.
X = np.array([flatten_hsqc_multiplets(hsqc) + gcms for hsqc, gcms in zip(X_hsqc, X_gcms)])  # Ensure homogeneous shape

# Targets: one parsed distribution per row (same eval() caveat as above).
y = df['Isotopomer Distribution'].apply(eval).tolist()
y = np.array(y)

# Prepare data: columns [0, len(X_hsqc[0])) are passed as the HSQC block and
# the remaining columns as the GC-MS block.
X_train, X_test, y_train, y_test = prepare_data(X[:, :len(X_hsqc[0])], X[:, len(X_hsqc[0]):], y, test_size=0.2)

# Build and train the model.
# NOTE(review): the test split is passed as the validation data used for early
# stopping, so metrics later computed on X_test are not from unseen data.
input_dim = X_train.shape[1]
model = build_and_train_model(X_train, y_train, X_test, y_test, input_dim)

# Perform Monte Carlo simulation.
# NOTE(review): called without an explicit HSQC width, this helper separates
# HSQC from GC-MS columns at the midpoint of X_test, which matches
# len(X_hsqc[0]) only when both blocks are equally wide.
mean_preds, std_preds = monte_carlo_simulation(model, X_test, y_test)

# Save the model
save_model(model, filename='best_model.pkl')

# Generate results sheet (same midpoint-split caveat as the Monte Carlo step).
generate_results_sheet(X_test, y_test, model, mean_preds, std_preds, filename='results.xlsx')



Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 18.0457 - val_loss: 9.1166
Epoch 2/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 8.0989 - val_loss: 6.4029
Epoch 3/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 5.9783 - val_loss: 5.2247
Epoch 4/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 4.8689 - val_loss: 4.2256
Epoch 5/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8776 - val_loss: 3.3787
Epoch 6/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.1762 - val_loss: 2.9285
Epoch 7/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 2.8456 - val_loss: 2.7306
Epoch 8/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.6816 - val_loss: 2.4677
Epoch 9/100
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37