In [None]:
import os
os.chdir("..")
os.getcwd()

In [2]:
import pandas as pd
data_path = "notebooks/ma_2021"
mol_df = pd.read_csv(os.path.join(data_path,"molecules.csv"))
mol_df.head()

Unnamed: 0,CID,MolecularWeight,IsomericSMILES,IUPACName,name
0,179,88.11,CC(C(=O)C)O,3-hydroxybutan-2-one,acetoin
1,240,106.12,C1=CC=C(C=C1)C=O,benzaldehyde,benzaldehyde
2,261,72.11,CCCC=O,butanal,butyraldehyde
3,454,128.21,CCCCCCCC=O,octanal,octanal
4,650,86.09,CC(=O)C(=O)C,"butane-2,3-dione","2,3-butanedione"


In [3]:
import analysis.fingerprint
mfpgen = analysis.fingerprint.make_mfpgen()
mol_df["MFP"] = mol_df["IsomericSMILES"].map(lambda smiles: analysis.fingerprint.smiles_to_embed(mfpgen,smiles))
mol_df["MFP"].head()

0    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: MFP, dtype: object

In [4]:
cid_to_embed = mol_df[["CID","MFP"]].set_index("CID").to_dict()["MFP"]
len(cid_to_embed), next(iter(cid_to_embed.items()))

(72, (179, array([0, 1, 0, ..., 0, 0, 0], dtype=uint8)))

In [5]:
blends_df = pd.read_csv(os.path.join(data_path,"behavior.csv"))
blends_df.head()

Unnamed: 0,Stimulus A,Stimulus B,Subject,Rep,IA,IAmix,IB,IBmix,IAB,PA,PB,PAB
0,179,31249,2,1,1.6,0.0,5.6,7.0,6.8,6.3,5.6,5.5
1,179,31249,3,1,5.0,5.3,4.9,5.1,6.3,6.8,3.6,4.2
2,179,31249,5,1,5.2,4.7,5.4,3.8,4.9,3.7,5.3,3.8
3,179,31249,7,1,4.1,0.0,8.0,10.0,7.0,4.0,6.0,6.0
4,179,31249,14,1,5.2,5.7,5.8,4.1,5.4,5.1,5.1,6.2


In [6]:
import numpy as np

# Make MFP emeddings for each molecule
blends_df["Embed A"] = blends_df["Stimulus A"].map(cid_to_embed)
blends_df["Embed B"] = blends_df["Stimulus B"].map(cid_to_embed)

# Combine the pair embeddings through concatenation
blends_df["Embed"] = blends_df.apply(lambda row: np.concatenate([row["Embed A"],row["Embed B"]]).tolist(), axis=1)

# Stack the prediction targets
value_cols = ["IA", "IAmix", "IB", "IBmix", "IAB", "PA", "PB", "PAB"]
blends_df["Values"] = blends_df.apply(lambda row: np.stack(row[value_cols]).tolist(), axis=1)

blends_df.head()

Unnamed: 0,Stimulus A,Stimulus B,Subject,Rep,IA,IAmix,IB,IBmix,IAB,PA,PB,PAB,Embed A,Embed B,Embed,Values
0,179,31249,2,1,1.6,0.0,5.6,7.0,6.8,6.3,5.6,5.5,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1.6, 0.0, 5.6, 7.0, 6.8, 6.3, 5.6, 5.5]"
1,179,31249,3,1,5.0,5.3,4.9,5.1,6.3,6.8,3.6,4.2,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.0, 5.3, 4.9, 5.1, 6.3, 6.8, 3.6, 4.2]"
2,179,31249,5,1,5.2,4.7,5.4,3.8,4.9,3.7,5.3,3.8,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.2, 4.7, 5.4, 3.8, 4.9, 3.7, 5.3, 3.8]"
3,179,31249,7,1,4.1,0.0,8.0,10.0,7.0,4.0,6.0,6.0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[4.1, 0.0, 8.0, 10.0, 7.0, 4.0, 6.0, 6.0]"
4,179,31249,14,1,5.2,5.7,5.8,4.1,5.4,5.1,5.1,6.2,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.2, 5.7, 5.8, 4.1, 5.4, 5.1, 5.1, 6.2]"


In [7]:
import scipy.stats as stats
import sklearn
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import numpy as np

n_splits = 5

# Convert "Embed" and "Values" to matrices
X = np.vstack(blends_df["Embed"].to_numpy())  # Convert list/array-like embeddings to a matrix
y = np.vstack(blends_df["Values"].to_numpy())

# Initialize k-fold cross-validation
kfold = sklearn.model_selection.KFold(n_splits=n_splits, shuffle=False)

# Initialize a dictionary to store MSEs for each value
mse_dict = {col: [] for col in value_cols}

for train_index, test_index in tqdm(kfold.split(X), total=n_splits):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Train a single Ridge regression model for the entire vector
    model = RidgeCV()
    model.fit(X_train, y_train)
    
    # Predict the entire vector
    y_pred = model.predict(X_test)
    
    # Calculate and store MSE for each value (column)
    for i, col in enumerate(value_cols):
        mse = mean_squared_error(y_test[:, i], y_pred[:, i])
        mse_dict[col].append(mse)

# Report MSE for each fold and value with confidence interval
for col, mse_list in mse_dict.items():
    print(f"\n{col} MSEs:")
    for i, mse in enumerate(mse_list, 1):
        print(f"  Fold {i}: MSE = {mse:.4f}")
    
    # Calculate mean and 95% confidence interval
    average_mse = np.mean(mse_list)
    std_error = stats.sem(mse_list)  # Standard error of the mean
    confidence_interval = stats.t.interval(
        0.95, len(mse_list) - 1, loc=average_mse, scale=std_error
    )
    
    print(f"  Average MSE for {col}: {average_mse:.4f}")
    print(f"  95% Confidence Interval for {col}: ({confidence_interval[0]:.4f}, {confidence_interval[1]:.4f})")

  0%|          | 0/5 [00:00<?, ?it/s]


IA MSEs:
  Fold 1: MSE = 3.8072
  Fold 2: MSE = 3.0555
  Fold 3: MSE = 3.1461
  Fold 4: MSE = 3.0070
  Fold 5: MSE = 2.9546
  Average MSE for IA: 3.1941
  95% Confidence Interval for IA: (2.7596, 3.6286)

IAmix MSEs:
  Fold 1: MSE = 6.2578
  Fold 2: MSE = 6.1202
  Fold 3: MSE = 6.2383
  Fold 4: MSE = 6.1912
  Fold 5: MSE = 6.4516
  Average MSE for IAmix: 6.2518
  95% Confidence Interval for IAmix: (6.0983, 6.4053)

IB MSEs:
  Fold 1: MSE = 3.0138
  Fold 2: MSE = 2.9201
  Fold 3: MSE = 2.8173
  Fold 4: MSE = 3.3820
  Fold 5: MSE = 2.7863
  Average MSE for IB: 2.9839
  95% Confidence Interval for IB: (2.6860, 3.2818)

IBmix MSEs:
  Fold 1: MSE = 5.6801
  Fold 2: MSE = 5.5752
  Fold 3: MSE = 5.5974
  Fold 4: MSE = 5.7124
  Fold 5: MSE = 5.8671
  Average MSE for IBmix: 5.6864
  95% Confidence Interval for IBmix: (5.5427, 5.8302)

IAB MSEs:
  Fold 1: MSE = 2.3162
  Fold 2: MSE = 1.9287
  Fold 3: MSE = 2.0656
  Fold 4: MSE = 2.2820
  Fold 5: MSE = 2.0929
  Average MSE for IAB: 2.1371
  95% 