In [1]:
import os
os.chdir('/rds/general/user/cb1319/home/STK_search')

In [15]:
import pandas as pd
from src.stk_search import Database_utils
#%% 
# Load the searched space
df_path = '/rds/general/user/cb1319/home/GEOM3D/STK_path/data/output/Full_dataset/df_total_subset_16_11_23.csv'
df_precursors_path = '/rds/general/user/cb1319/home/GEOM3D/STK_path/data/output/Prescursor_data/calculation_data_precursor_071123_clean.pkl'
df_total, df_precursors = Database_utils.load_data_from_file(df_path, df_precursors_path)

In [16]:
# take the first 500 rows of df_total
df_total = df_total.head(1100)

In [17]:
# Get NumPy arrays from DataFrame for the input and target
y_IP = df_total['ionisation potential (eV)'].values
X_6mer_inch = df_total['BB'].values
X_frag_mol = df_precursors['mol_opt'].values
X_frag_inch = df_precursors['InChIKey'].values
keys_6mer = df_total['InChIKey'].values

In [18]:
import os
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw, rdFMCS
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from sklearn.decomposition import PCA
from sklearn.cluster import HDBSCAN
from tqdm import tqdm

In [21]:
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from sklearn.feature_selection import VarianceThreshold
import torch

def featurise(X, keys, params=None):
    """
    Function that takes your fingerprint generator and returns a matrix of molecular descriptors.

    X: Input the values of the array in the dataframe
    params: a dictionary of parameters for the featurizer (optional, if no params put params=None)
    keys: list of InchIkeys of the 6mer
    """

    features = []
    m, n = X.shape

    # Implement tqdm to show progress bar
    for i in tqdm(range(m), desc="Featurizing molecules"):
        feature = np.zeros(m)
        for j in range(n):
            mol = X[i, j]
            if mol is not None:
                # try:
                fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
                fingerprint_array = np.array(list(map(int, fingerprint.ToBitString())))
                feature[0:len(fingerprint_array)] = fingerprint_array
                # except ValueError:
                #     print(f"Skipping invalid fingerprint for molecule at position ({i}, {j})")

        features.append(feature)
    
    features = np.array(features)

    print(f'Features before cleaning up invalid and zero variance values: {features.shape[0]}')

    # Drop the features containing invalid values
    features = features[:, ~np.isnan(features).any(axis=0)]

    # Here, we removed all zero-variance features, i.e. features that have the same value in all samples.
    selector = VarianceThreshold(threshold=0.0)
    features = selector.fit_transform(features)
    print(f'Number of molecular descriptors after removing invalid and zero variance features: {features.shape[1]}')
    print(f'Shape of features: {features.shape}.\n')
    
    features_tensor = torch.tensor(features, dtype=torch.float32)

    return features_tensor


In [22]:
print('Number of Oligomers in the dataset:', len(keys_6mer))

X_6mer_mol = [[] for _ in range(6)]  # Create a list of lists to store molecules for each position
inchkey_to_molecule = dict(zip(X_frag_inch, X_frag_mol))  # Creates a dictionary in the precursor data to associate one InchiKey to a molecule
conversion_fail = 0

for i in np.arange(0, 6, 1):  # For each molecule
    mol_list = []
    temp_list = df_total[f'InChIKey_{i}'].values  # Looks at the InChIKeys in each of the 6 columns
    for j in range(len(temp_list)):
        inchkey = temp_list[j]  # Looks at the individual InChIKey in each row
        if inchkey in inchkey_to_molecule:
            mol_list.append(inchkey_to_molecule[inchkey])
        else:
            conversion_fail += 1
    X_6mer_mol[i] = mol_list  # Assign the list of molecules to the corresponding position

print(f"Elements in X_6mers not converted: {conversion_fail}")

# Adjusted part to create a NumPy array
max_molecules = max(len(position) for position in X_6mer_mol)
X_6mer_array = np.full((max_molecules, 6), None, dtype=object)

for i, position in enumerate(X_6mer_mol):
    X_6mer_array[:len(position), i] = position

print(f"Shape of X_6mer_array: {X_6mer_array.shape}")    # make an array containing the different RDKit molecules in each fragment

# Generate Morgan fingerprints for the dataset
morgan_fingerprints = featurise(X_6mer_array, keys_6mer)

print("MorganFP generated for the dataset")



Number of Oligomers in the dataset: 1100
Elements in X_6mers not converted: 0
Shape of X_6mer_array: (1100, 6)


Featurizing molecules: 100%|██████████| 1100/1100 [00:00<00:00, 1293.79it/s]


Features before cleaning up invalid and zero variance values: 1100
Number of molecular descriptors after removing invalid and zero variance features: 66
Shape of features: (1100, 66).

MorganFP generated for the dataset


In [5]:
import importlib
import os
from src.stk_search.Search_algorithm import Representation_morgan_fp

importlib.reload(Representation_morgan_fp)

os.chdir('/rds/general/user/cb1319/home/STK_search')
%run src/stk_search/Search_algorithm/Representation_morgan_fp.py

Number of Oligomers in the dataset: 1500


Featurizing molecules: 100%|██████████| 1500/1500 [00:01<00:00, 991.90it/s]


size of the features tensor: torch.Size([1500, 66])
Finished featurising molecules and saved the dataframe with Morgan fingerprints


In [5]:
import os
os.chdir('/rds/general/user/cb1319/home/STK_search/src')

In [8]:
import importlib
import os
from dev_scripts import run_search

importlib.reload(run_search)

os.chdir('/rds/general/user/cb1319/home/STK_search/src')
%run src/dev_scripts/run_search.py --num_iteration 1 --test_name 'CB_Exp1' --case 'morgan_fp'

ModuleNotFoundError: No module named 'stko.calculators'

In [10]:
import stko.calculators

ModuleNotFoundError: No module named 'stko.calculators'