In [5]:
import sys

import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
import os
from utils import results_to_df, evaluate_dataset
# Add your desired directory to PYTHONPATH
sys.path.append("../../lsm/")

from train_property import Finetune_SSModel
from msdatasets import MSDataset

from rdkit.Chem import Descriptors

# Name -> function lookup built from RDKit's descriptor list. The names double
# as column labels for the ground-truth / prediction tables written further down.
all_descriptors = dict(Descriptors.descList)
feature_names = list(all_descriptors)

path = "" # path to data here
norm_values = pd.read_csv(f'{path}/datasets/processed_data/norm_values.csv')
# First and second row of norm_values.csv as plain arrays.
# NOTE(review): presumably the per-feature minima and maxima, going by the names — confirm.
mini, maxi = (np.array(norm_values.iloc[row_idx, :]) for row_idx in (0, 1))

In [6]:
# All three test datasets receive the same norm_values.csv as `train_minmax_path`.
minmax_csv = f'{path}/datasets/processed_data/norm_values.csv'

# NOTE(review): `test_set_disjoint` reads test.zarr while `test_set_overlap` reads
# disjoint_test.zarr — the names and files look swapped; confirm which pairing is intended.
test_set_disjoint = MSDataset(f'{path}/datasets/test/test.zarr', mode='property', train_minmax_path=minmax_csv)
test_set_overlap = MSDataset(f'{path}/datasets/test/disjoint_test.zarr', mode='property', train_minmax_path=minmax_csv)
test_set_casmi = MSDataset(f'{path}/datasets/test/casmi.zarr', mode='property', train_minmax_path=minmax_csv)

# Evaluation loaders: fixed order (no shuffling), batches of 64, loaded in the main process.
eval_loader_kwargs = dict(batch_size=64, shuffle=False, num_workers=0)
loader1 = DataLoader(test_set_disjoint, **eval_loader_kwargs)
loader2 = DataLoader(test_set_overlap, **eval_loader_kwargs)
loader3 = DataLoader(test_set_casmi, **eval_loader_kwargs)
            


Data is (12274, 5) dimensions
Data is (1000, 5) dimensions
Data is (464, 5) dimensions


### Load Model

In [98]:
model_name = ''

# Requires a CUDA device: the checkpoint is mapped onto cuda:0, the model is
# moved to the GPU and then switched to evaluation mode.
checkpoint_file = f'{path}/trained_models/{model_name}_best.ckpt'
model = Finetune_SSModel.load_from_checkpoint(checkpoint_file, map_location='cuda:0')
model = model.cuda().eval()

# Per-model results directory; later cells append a dataset subfolder to results_path.
results_path = f'../../results/{model_name}/'
os.makedirs(f'../../results/{model_name}', exist_ok=True)

### Evaluate on "unknown" Dataset

In [None]:
# Run the model over loader1 and turn the returned metrics into summary tables.
(r2, mae, smape, GT_feats, Y_feats,
 wmape, f1, accuracy, precision, recall) = evaluate_dataset(loader1, model)
df, df_ms2, df_categorical = results_to_df(
    feature_names, mae, r2, norm_values, smape, wmape, f1, accuracy, precision, recall
)

# Label the raw ground-truth / predicted values with the descriptor names.
GT_feats = pd.DataFrame(GT_feats, columns=feature_names)
Y_feats = pd.DataFrame(Y_feats, columns=feature_names)

# Everything for this split goes into the "unknown_data" subdirectory of results_path.
unknown_dir = f'{results_path}unknown_data'
os.makedirs(unknown_dir, exist_ok=True)
for csv_name, table in (
    ('GT_feats.csv', GT_feats),
    ('Y_feats.csv', Y_feats),
    ('results.csv', df),
    ('results_ms2.csv', df_ms2),
    ('results_categorical.csv', df_categorical),
):
    table.to_csv(f'{unknown_dir}/{csv_name}')

### Evaluate on "known" dataset

In [100]:
# Run the model over loader2 and turn the returned metrics into summary tables.
(r2, mae, smape, GT_feats, Y_feats,
 wmape, f1, accuracy, precision, recall) = evaluate_dataset(loader2, model)
df, df_ms2, df_categorical = results_to_df(
    feature_names, mae, r2, norm_values, smape, wmape, f1, accuracy, precision, recall
)

# Label the raw ground-truth / predicted values with the descriptor names.
GT_feats = pd.DataFrame(GT_feats, columns=feature_names)
Y_feats = pd.DataFrame(Y_feats, columns=feature_names)

# Everything for this split goes into the "known_data" subdirectory of results_path.
known_dir = f'{results_path}known_data'
os.makedirs(known_dir, exist_ok=True)
for csv_name, table in (
    ('GT_feats.csv', GT_feats),
    ('Y_feats.csv', Y_feats),
    ('results.csv', df),
    ('results_ms2.csv', df_ms2),
    ('results_categorical.csv', df_categorical),
):
    table.to_csv(f'{known_dir}/{csv_name}')

100%|██████████| 16/16 [00:00<00:00, 20.10it/s]


r2: -16931016.186098903
mae: 8.249730707663247
smape: 0.057887665927410126
wmape: 34567.78515625
accuracy: 0.01160287081339713
precision: 0.01160287081339713
recall: 0.01160287081339713
f1: 0.011602867918663249


### Evaluate on CASMI Dataset

In [101]:
# Run the model over loader3 (the CASMI set) and turn the returned metrics into summary tables.
(r2, mae, smape, GT_feats, Y_feats,
 wmape, f1, accuracy, precision, recall) = evaluate_dataset(loader3, model)
df, df_ms2, df_categorical = results_to_df(
    feature_names, mae, r2, norm_values, smape, wmape, f1, accuracy, precision, recall
)

# Label the raw ground-truth / predicted values with the descriptor names.
GT_feats = pd.DataFrame(GT_feats, columns=feature_names)
Y_feats = pd.DataFrame(Y_feats, columns=feature_names)

# Everything for this split goes into the "casmi_data" subdirectory of results_path.
casmi_dir = f'{results_path}casmi_data'
os.makedirs(casmi_dir, exist_ok=True)
for csv_name, table in (
    ('GT_feats.csv', GT_feats),
    ('Y_feats.csv', Y_feats),
    ('results.csv', df),
    ('results_ms2.csv', df_ms2),
    ('results_categorical.csv', df_categorical),
):
    table.to_csv(f'{casmi_dir}/{csv_name}')


100%|██████████| 8/8 [00:00<00:00, 21.91it/s]


r2: -16936397.7608993
mae: 9.165955031469208
smape: 0.06174078956246376
wmape: 41773.7421875
accuracy: 0.008393829401088928
precision: 0.008393829401088928
recall: 0.008393829401088928
f1: 0.008393826841282783


### Evaluate on modified cosine

#### Load data

In [17]:
# Inputs for the modified-cosine evaluation, all read from the processed-data folder.
# df_test comes from 'generate_mod_cosine_scores.py' (preprocess_ns2 folder).
# NOTE: read_pickle executes pickled code — only load files produced by this project.
processed_dir = f'{path}/datasets/processed_data'
df_test = pd.read_pickle(f'{processed_dir}/test_modified_cosine_results.pkl')
df_gt_test = pd.read_pickle(f'{processed_dir}/test_df.pkl')
df_train = pd.read_pickle(f'{processed_dir}/train_df.pkl')

# From here on results_path points at the cosine-similarity results folder.
results_path = '../../results/cosine_similarity/'
os.makedirs(results_path, exist_ok=True)

In [None]:
# For each value in df_test['Train_SMILES'], the last 209 columns of the matching
# df_train row are used as the property predictions for that test entry.
def get_last_209_predictions(smiles, train_df=None):
    """Return the last 209 columns of the first training row whose 'smiles' matches.

    Parameters
    ----------
    smiles : str
        SMILES string to look up in the 'smiles' column.
    train_df : pandas.DataFrame, optional
        Table to search. Defaults to the notebook-level ``df_train``.

    Returns
    -------
    pandas.Series
        The matched row's last 209 values, labelled with their column names.
        When no row matches, a Series of NaN carrying the same labels, so that
        hits and misses line up column-wise when the results are stacked.

    Raises
    ------
    KeyError
        If the table has no 'smiles' column (previously hidden by a bare except).
    """
    if train_df is None:
        train_df = df_train
    matches = train_df[train_df['smiles'] == smiles]
    if matches.empty:
        # Labelled placeholder instead of a bare ndarray: keeps the column labels
        # consistent with the rows that did match.
        return pd.Series(np.nan, index=train_df.columns[-209:])
    return matches.iloc[0, -209:]
# Number of trailing property columns shared by the train / test tables.
n_props = 209

# Use the properties of the matched training molecule as the "prediction" for
# each test entry and append them as the last columns of df_test.
feats = df_test['Train_SMILES'].apply(get_last_209_predictions)
df_test = pd.concat([df_test, feats], axis=1)

# NOTE(review): assumes df_test and df_gt_test list the same entries in the same
# row order — confirm against the preprocessing script.
cos_feats = df_test.iloc[:, -n_props:]
gt_feats = df_gt_test.iloc[:, -n_props:]

epsilon = 1e-8  # added to denominators so the ratios below stay finite
Y_feats = cos_feats.values
GT_feats = gt_feats.values
abs_err = np.abs(GT_feats - Y_feats)

# Column-wise R^2
numerator = np.sum((GT_feats - Y_feats) ** 2, axis=0)
denominator = np.sum((GT_feats - np.mean(GT_feats, axis=0)) ** 2, axis=0) + epsilon
r2 = 1 - (numerator / denominator)

# Column-wise error metrics
mae = np.mean(abs_err, axis=0)
smape = np.mean(abs_err / (np.abs(GT_feats) + epsilon), axis=0)
wmape = np.sum(abs_err, axis=0) / np.sum(np.abs(GT_feats) + epsilon, axis=0)

# Fraction of entries predicted within 1% of the target. The tolerance uses
# |GT| so negative targets can count as hits, and `<=` so an exact match on a
# zero target counts too (the signed, strict comparison rejected both).
within_tolerance = abs_err <= 0.01 * np.abs(GT_feats)
accuracy = np.mean(within_tolerance, axis=0)
# NOTE(review): precision and recall are the same hit-rate as accuracy here,
# not class-based precision/recall; kept so results_to_df gets all its inputs.
precision = accuracy.copy()
recall = accuracy.copy()
f1 = 2 * (precision * recall) / (precision + recall + epsilon)

df, df_ms2, df_categorical = results_to_df(feature_names, mae, r2, norm_values, smape, wmape, f1, accuracy, precision, recall)

# Write results to the results path.
GT_feats = pd.DataFrame(GT_feats, columns=feature_names)
Y_feats = pd.DataFrame(Y_feats, columns=feature_names)
# Create the "unknown_data" subdirectory first; without it every to_csv below
# fails on a fresh results tree.
unknown_dir = os.path.join(results_path, 'unknown_data')
os.makedirs(unknown_dir, exist_ok=True)
GT_feats.to_csv(os.path.join(unknown_dir, 'GT_feats.csv'))
Y_feats.to_csv(os.path.join(unknown_dir, 'Y_feats.csv'))
df.to_csv(os.path.join(unknown_dir, 'results.csv'))
df_ms2.to_csv(os.path.join(unknown_dir, 'results_ms2.csv'))
df_categorical.to_csv(os.path.join(unknown_dir, 'results_categorical.csv'))
df.loc['smape'].mean()