In [1]:
%%capture
import sys, os
import torch, math, os
import sys
sys.path.append("..")

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive

    drive.mount('/content/drive', force_remount=False)
else:
    print("Not running in Colab.")

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/../.." + relativePath

print(f"Root project folder is at {resolve_path_gdrive('.')}")

CHECKPOINTS_FOLDER_BASE = "/checkpoints/stn_r3_f849_tlast29/stack=False-scaleY=True"
CHECKPOINTS_FOLDER = resolve_path_gdrive(CHECKPOINTS_FOLDER_BASE) #f'd:/temp{CHECKPOINTS_FOLDER_BASE}'
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.set_float32_matmul_precision("medium")  # to make lightning happy

In [2]:

import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch import nn
import torch.nn.functional as F

# Load the data
df = pd.read_csv('few_merged_data_f849_tlast29_reordered_byR2.csv')
X = df.iloc[:, :849]  # First 849 columns are features
y = df.iloc[:, 849:]  # Last 29 columns are targets
from pl.model_impl import *

In [3]:
import joblib
from sklearn.preprocessing import StandardScaler

# Define the path to the pickle file
pickle_file_path = f'{CHECKPOINTS_FOLDER}/scaler_X.pkl'

# Load the StandardScaler from the pickle file
with open(pickle_file_path, 'rb') as file:
    scaler = joblib.load(file)

# Now you can use the scaler
# Example: scaler.transform(data)

# Standardize features
X_scaled = scaler.transform(X)
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [16]:
from pathlib import Path

def eval_model(X_data, target_num):
    X_scaled = scaler.transform(X_data)
    X_tensor = torch.tensor(X_scaled, dtype=torch.float32)

    checkpoint_path=f'{CHECKPOINTS_FOLDER}/{target_num}'
    pathlist = Path(checkpoint_path).glob('**/*.ckpt')
    for path in pathlist:
        # because path is object not string
        model = SingleTargetNet.load_from_checkpoint(str(path))
        model.eval()
        model.cpu()
        with torch.no_grad():
                y_pred = model(X_tensor)
        return y_pred.detach().numpy()
        
# Load models and make predictions
predictions = []
for target_index in range(df.shape[1] - 849):
    print(f"Predicting target {target_index + 1}...")
    predictions.append(eval_model(X_tensor, target_index+1))
    
# Stack predictions into array
predictions = np.hstack(predictions)

# Create a DataFrame for predictions
#predictions_df = pd.DataFrame(predictions, columns=[f'Prediction_{i}' for i in range(predictions.shape[1])])

# Save predictions to a CSV file
#predictions_df.to_csv('predictions.csv', index=False)

print(f"Predictions shape: {predictions.shape}")

Predicting target 1...
Predicting target 2...
Predicting target 3...
Predicting target 4...
Predicting target 5...
Predicting target 6...
Predicting target 7...
Predicting target 8...
Predicting target 9...
Predicting target 10...
Predicting target 11...
Predicting target 12...
Predicting target 13...
Predicting target 14...
Predicting target 15...
Predicting target 16...
Predicting target 17...
Predicting target 18...
Predicting target 19...
Predicting target 20...
Predicting target 21...
Predicting target 22...
Predicting target 23...
Predicting target 24...
Predicting target 25...
Predicting target 26...
Predicting target 27...
Predicting target 28...
Predicting target 29...
Predictions shape: (8, 29)


In [5]:
predictions

array([[ 0.38041747,  0.37435395,  0.38301158,  0.32179022,  0.40278733,
         0.35184574, -1.5712007 , -1.3984663 ,  1.5064232 ,  2.4844203 ,
         1.7189146 ,  2.7789712 ,  1.8699226 ,  1.3264762 , -1.1213297 ,
        -1.8684686 ,  1.3493798 ,  1.7378752 , -0.16221514, -1.998511  ,
        -1.7195978 , -0.73782766,  1.1627886 , -1.9192525 ,  2.836595  ,
         0.8197746 ,  0.3535876 , -0.40010872, -0.70075166],
       [ 0.25580525,  0.2529648 ,  0.2503891 ,  0.28139862,  0.30164066,
         0.28221887, -0.6387827 , -0.69388795,  0.50233024,  0.11119656,
        -0.03374834,  2.4412274 ,  2.7798343 ,  0.27203688, -0.5255129 ,
        -0.8286061 ,  0.28920034,  2.1548796 ,  0.62466604, -1.1175022 ,
        -1.1320281 ,  0.4210174 , -0.75317067, -1.1652302 ,  0.01175366,
         1.3295193 , -0.41899648,  0.39197096,  1.3291682 ],
       [ 0.81967187,  0.8394152 ,  0.8660329 ,  0.8326798 ,  0.8342287 ,
         0.82582533, -1.8965763 , -2.0378969 ,  1.4113448 ,  0.43807143,
  

In [8]:
import pandas as pd
from mordred import Calculator, descriptors
from rdkit import Chem
import numpy as np

def generate_mordred_descriptors(smiles_list, columns=None):
    """
    Generate Mordred descriptors for a list of SMILES strings.
    
    Parameters:
    -----------
    smiles_list : list
        List of SMILES strings to calculate descriptors for
    columns : list, optional
        List of specific columns to include
    
    Returns:
    --------
    pandas.DataFrame
        DataFrame containing molecular descriptors for each SMILES string
    """
    # Create a calculator with all available descriptors
    calc = Calculator(descriptors)
    
    # If columns are specified, filter the descriptors
    if columns:
        # Filter descriptors to match the specified columns
        filtered_descriptors = [desc for desc in calc.descriptors if str(desc) in columns]
        calc = Calculator(filtered_descriptors)
    
    # Prepare results
    results = []
    
    # Calculate descriptors for each SMILES string
    for smiles in smiles_list:
        # Convert SMILES to RDKit molecule
        mol = Chem.MolFromSmiles(smiles)
        
        if mol is not None:
            # Calculate descriptors
            try:
                desc_values = calc(mol)
                # Convert to dictionary, adding SMILES as first column
                desc_dict = {'SMILES': smiles, **dict(desc_values)}
                results.append(desc_dict)
            except Exception as e:
                print(f"Error calculating descriptors for {smiles}: {e}")
        else:
            print(f"Invalid SMILES string: {smiles}")
    
    # Convert to DataFrame
    df = pd.DataFrame(results)
    
    # Ensure all specified columns are present, fill with NaN if missing
    # if columns:
    #     for col in columns:
    #         if col not in df.columns:
    #             df[col] = np.nan
        
    #     # Reorder columns to match the original specification
    #     df = df[['SMILES'] + [col for col in columns if col != 'SMILES']]
    
    return df

def read_descriptors(file_path):
    """
    Read the descriptors from a file.
    
    Parameters:
    -----------
    file_path : str
        Path to the file containing column names
    
    Returns:
    --------
    list
        List of column names
    """
    with open(file_path, 'r') as f:
        # Read the first line and split by tab
        columns = f.readline().strip().split('\t')
    return columns

def main():
    # Example usage
    smiles_list = [
        'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
        'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
        'CC(C)(C)NCC(O)C1=CC(=C(C=C1)O)CO'  # Salbutamol
    ]
    
    # Generate descriptors
    descriptors_df = generate_mordred_descriptors(smiles_list)
    
    # Read the descriptors from descriptors.txt
    descriptors_file_path = 'descriptors.txt'
    descriptors = read_descriptors(descriptors_file_path)
    
    # Filter the columns to keep only those present in descriptors.txt
    filtered_descriptors_df = descriptors_df#[descriptors]
    
    # Print first few rows and basic info
    print(filtered_descriptors_df)
    print("\nTotal descriptors calculated:", len(filtered_descriptors_df.columns) - 1)  # -1 for SMILES column

#if __name__ == '__main__':
#    main()
# Example usage
smiles_list = [
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
    'CN1C=NC2=C1C(=O)N(C(=O)N2C)C',  # Caffeine
    'CC(C)(C)NCC(O)C1=CC(=C(C=C1)O)CO'  # Salbutamol
]

features_file_path = 'descriptors.txt'
features = read_descriptors(features_file_path)
# Generate descriptors
descriptors_df = generate_mordred_descriptors(smiles_list, features)
non_numeric_columns = descriptors_df.select_dtypes(exclude=[np.number]).columns
print("Non-numeric columns in descriptors_df:", non_numeric_columns.tolist())
filled_descriptors_df = descriptors_df.apply(pd.to_numeric, errors='coerce').fillna(0)


Non-numeric columns in descriptors_df: ['SMILES', mordred.ABCIndex.ABCIndex(), mordred.ABCIndex.ABCGGIndex(), mordred.GeometricalIndex.Diameter3D(), mordred.GeometricalIndex.Radius3D(), mordred.GeometricalIndex.GeometricalShapeIndex(), mordred.GeometricalIndex.PetitjeanIndex3D(), mordred.GravitationalIndex.GravitationalIndex(True, False), mordred.GravitationalIndex.GravitationalIndex(True, True), mordred.MoRSE.MoRSE(None, 1), mordred.MoRSE.MoRSE('m', 1), mordred.MoRSE.MoRSE('v', 1), mordred.MoRSE.MoRSE('p', 1), mordred.MomentOfInertia.MomentOfInertia('X'), mordred.MomentOfInertia.MomentOfInertia('Y'), mordred.MomentOfInertia.MomentOfInertia('Z'), mordred.PBF.PBF()]


In [14]:
filled_descriptors_df

Unnamed: 0,SMILES,ABC,ABCGG,nAcid,nBase,nAromAtom,nAromBond,nAtom,nHeavyAtom,nSpiro,...,SRW09,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb2
0,0.0,0.0,0.0,1,0,6,6,21,13,0,...,0.0,9.151333,43.556121,180.042259,8.573441,246,16,60.0,66.0,2.972222
1,0.0,0.0,0.0,0,0,9,10,24,14,0,...,6.842683,9.824498,60.521485,194.080376,8.086682,258,25,76.0,94.0,3.027778
2,0.0,0.0,0.0,0,1,6,6,38,17,0,...,0.0,9.519662,49.23903,239.152144,6.293477,560,22,82.0,90.0,3.763889


In [17]:
X = filled_descriptors_df.iloc[:, 1:]
# Load models and make predictions
predictions = []
for target_index in range(0,29):
    print(f"Predicting target {target_index + 1}...")
    predictions.append(eval_model(X, target_index+1))
    
# Stack predictions into array
predictions = np.hstack(predictions)

# Create a DataFrame for predictions
#predictions_df = pd.DataFrame(predictions, columns=[f'Prediction_{i}' for i in range(predictions.shape[1])])

# Save predictions to a CSV file
#predictions_df.to_csv('predictions.csv', index=False)

print(f"Predictions shape: {predictions.shape}")

Predicting target 1...
Predicting target 2...
Predicting target 3...
Predicting target 4...
Predicting target 5...
Predicting target 6...
Predicting target 7...
Predicting target 8...
Predicting target 9...
Predicting target 10...
Predicting target 11...
Predicting target 12...
Predicting target 13...
Predicting target 14...
Predicting target 15...
Predicting target 16...
Predicting target 17...
Predicting target 18...
Predicting target 19...
Predicting target 20...
Predicting target 21...
Predicting target 22...
Predicting target 23...
Predicting target 24...
Predicting target 25...
Predicting target 26...
Predicting target 27...
Predicting target 28...
Predicting target 29...
Predictions shape: (3, 29)


In [18]:
predictions

array([[ 0.64802194,  0.6763011 ,  0.6720045 ,  0.67016065,  0.67170584,
         0.6552182 , -0.82114154, -0.9100838 ,  0.6595404 , -0.5081216 ,
        -1.0814366 ,  0.05959841,  0.02388677, -0.18653761, -0.69126606,
        -0.24044074,  0.29728162, -0.13252875,  0.17962259,  0.1698237 ,
         0.29177457, -0.04388487,  0.883669  ,  0.07067779, -0.20569256,
         0.5556835 , -0.10551096, -0.44832402,  0.04236376],
       [ 0.6738402 ,  0.6561382 ,  0.63686115,  0.6457978 ,  0.61609817,
         0.63263655, -0.44888505, -0.16474499, -0.28109866,  0.45998436,
        -0.6971879 , -0.75150657, -0.8275612 ,  0.5387262 ,  0.33591998,
         1.2273989 ,  1.262187  , -0.8855711 ,  0.67656326,  0.19945568,
         0.03132928,  0.5865191 ,  1.3763336 ,  0.25950426,  0.4072792 ,
         0.4472001 , -0.29103306, -0.63100445,  0.14188881],
       [ 0.5220504 ,  0.49777406,  0.5086727 ,  0.540165  ,  0.5316565 ,
         0.5020793 ,  0.5621456 ,  0.9338471 , -0.8875675 ,  0.7965739 ,
  