- This notebook computes the latent-space representation of the amphiphile mixture dataset using CheMeleon.

In [1]:
import sys

# Make the project's source folders importable from this notebook.
# The entries are appended in this order: models, src, repository root.
sys.path.extend(["../src/models", "../src", "../"])

In [2]:
import os
import chemprop
from chemprop import data, featurizers, models, nn
import torch

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [13]:
from chemeleon import chemeleon_latent_space
from chemeleon import build_mixture_latent_features
from utils import binarize_last_column

In [6]:
# Get the data

In [7]:
# Load the raw input table.
file_path='../../data/df_input_update_ori_20240812.csv'
df_input_update_ori = pd.read_csv(file_path)

# Keep only the first eight columns: seven concentration columns followed by
# the target column (renamed to 'num_vesicles' below).
df_input = df_input_update_ori.iloc[:, 0:8].copy()

# log1p-transform the seven concentration columns. np.log1p is vectorized, so
# it is applied to the whole slice at once instead of element-wise through the
# deprecated DataFrame.applymap. Assigning by column label replaces the columns
# outright, which avoids dtype-upcast warnings from in-place .iloc assignment.
conc_columns = df_input.columns[:7]
df_input[conc_columns] = np.log1p(df_input[conc_columns])

# Compound metadata; the 'Name' and 'SMILES' columns are used below.
info_path = '../../data/Info.csv'
df_info = pd.read_csv(info_path)

# Strip the unit suffix from the concentration headers and name the target.
df = df_input.copy()
df.columns = [col.replace('_Concentration (mM)', '') for col in df.columns[:-1]] + ['num_vesicles']

# Maps the shortened column headers to the compound names used in Info.csv.
column_to_name = {
    'decanoic acid': 'Decanoic acid',
    'decanoate': 'Decanoate',
    'decylamine': 'Decylamine',
    'decyl trimethylamine': 'Decyltrimethyl ammonium bromid',
    'decylsulfate': 'Decyl sodium sulfate',
    'decanol': 'Decanol',
    'monocaprin': 'Glycerol monodecanoate'
}

# Get SMILES mapping from df_info
name_to_smiles = dict(zip(df_info['Name'], df_info['SMILES']))

# Fail fast when a compound has no SMILES entry: an empty SMILES string would
# otherwise be written into every row of that component without any warning.
missing_names = sorted(
    column_to_name[col]
    for col in df.columns[:-1]
    if column_to_name[col] not in name_to_smiles
)
if missing_names:
    raise KeyError(f"No SMILES found in {info_path} for: {missing_names}")

# Construct new dataframe: one (SMILES, concentration) column pair per
# component, with the SMILES repeated on every row.
new_data = {}

for col in df.columns[:-1]:  # Skip num_vesicles
    chem_name = column_to_name[col]
    smiles = name_to_smiles[chem_name]
    new_data[f"{col}_SMILES"] = [smiles] * len(df)
    new_data[f"{col}_Concentration"] = df[col]

# Add num_vesicles
new_data['num_vesicles'] = df['num_vesicles']

df_structured = pd.DataFrame(new_data)

# Persist the structured table next to the notebook.
df_structured.to_csv('df_structured_20250702.csv', index=False)

  df_input.iloc[:,0:7] = df_input.iloc[:,0:7].applymap(lambda x: np.log1p(x))


In [9]:
# Target header layout: a (SMILES, concentration) pair for each of the seven
# components, followed by the label column.
component_pairs = [
    ('smi1', 'conc1'),
    ('smi2', 'conc2'),
    ('smi3', 'conc3'),
    ('smi4', 'conc4'),
    ('smi5', 'conc5'),
    ('smi6', 'conc6'),
    ('smi7', 'conc7'),
]
new_column_names = [name for pair in component_pairs for name in pair]
new_column_names.append('vesicles_formation')

# Replace the headers positionally, then show the resulting shape.
df_structured.columns = new_column_names
df_structured.shape

(336, 15)

In [12]:
# Derive the classification table from the renamed frame.
# binarize_last_column is imported from the project's utils module; presumably
# it converts the final column (vesicles_formation) into a binary label.
# TODO confirm against utils.
df_structured_classify = binarize_last_column(df_structured)
# Preview the first rows as the cell output.
df_structured_classify.head()

Unnamed: 0,smi1,conc1,smi2,conc2,smi3,conc3,smi4,conc4,smi5,conc5,smi6,conc6,smi7,conc7,vesicles_formation
0,CCCCCCCCCC(=O)O,0.615186,CCCCCCCCCC(=O)[O-],0.955511,CCCCCCCCCCN,1.231101,CCCCCCCCCC[N+](C)(C)C.[Br-],1.552868,CCCCCCCCCCOS(=O)(=O)[O-].[Na+],1.677097,CCCCCCCCCCO,0.441476,CCCCCCCCCC(=O)OCC(CO)O,0.579418,1
1,CCCCCCCCCC(=O)O,1.48727,CCCCCCCCCC(=O)[O-],0.644482,CCCCCCCCCCN,0.175633,CCCCCCCCCC[N+](C)(C)C.[Br-],1.056053,CCCCCCCCCCOS(=O)(=O)[O-].[Na+],1.216395,CCCCCCCCCCO,0.207827,CCCCCCCCCC(=O)OCC(CO)O,0.107059,1
2,CCCCCCCCCC(=O)O,1.163151,CCCCCCCCCC(=O)[O-],0.70557,CCCCCCCCCCN,1.294727,CCCCCCCCCC[N+](C)(C)C.[Br-],1.787584,CCCCCCCCCCOS(=O)(=O)[O-].[Na+],1.658228,CCCCCCCCCCO,0.159138,CCCCCCCCCC(=O)OCC(CO)O,0.202941,1
3,CCCCCCCCCC(=O)O,1.531476,CCCCCCCCCC(=O)[O-],1.787584,CCCCCCCCCCN,0.381855,CCCCCCCCCC[N+](C)(C)C.[Br-],0.07139,CCCCCCCCCCOS(=O)(=O)[O-].[Na+],1.475907,CCCCCCCCCCO,0.589175,CCCCCCCCCC(=O)OCC(CO)O,0.113329,1
4,CCCCCCCCCC(=O)O,1.170933,CCCCCCCCCC(=O)[O-],1.441019,CCCCCCCCCCN,1.677097,CCCCCCCCCC[N+](C)(C)C.[Br-],0.885832,CCCCCCCCCCOS(=O)(=O)[O-].[Na+],1.011601,CCCCCCCCCCO,0.750236,CCCCCCCCCC(=O)OCC(CO)O,0.300105,1


In [14]:
# Column groups handed to the feature builder: SMILES columns and their
# matching concentration columns, in component order.
smiles_columns = ['smi1', 'smi2', 'smi3', 'smi4', 'smi5', 'smi6', 'smi7']
concentration_columns = ['conc1', 'conc2', 'conc3', 'conc4', 'conc5', 'conc6', 'conc7']

# Build the mixture-level latent features and extract the target column.
x_latent, y = build_mixture_latent_features(
    df=df_structured_classify,
    smi_cols=smiles_columns,
    conc_cols=concentration_columns,
    target_col='vesicles_formation',
    latent_fn=chemeleon_latent_space,
)

In [15]:
# Cell output: shape of the latent feature matrix (rows, feature width).
x_latent.shape

(336, 2048)

In [16]:
# Join the latent features and the target column-wise, by row position.
# pd.concat(axis=1) aligns on index labels, so a target whose index is not
# 0..n-1 would be silently misaligned and padded with NaN rows. Resetting the
# index makes the pairing positional.
# NOTE(review): assumes y is a pandas Series/DataFrame in the same row order
# as x_latent — confirm in build_mixture_latent_features.
latent_features = pd.DataFrame(x_latent)
assert len(y) == len(latent_features), "feature/target row counts differ"
df_total = pd.concat([latent_features, y.reset_index(drop=True)], axis=1)
df_total.shape

(336, 2049)

In [17]:
# Write the combined feature/target table to CSV without the index column.
# NOTE(review): the file name spells 'chemleon' (not 'chemeleon'); left
# unchanged because other code may read this exact path — confirm before renaming.
df_total.to_csv('df_total_amphiphiles_chemleon_20250702.csv', index=False)