## This is the main preprocessing file that takes activity data by chemical class and creates the necessary features and target data for the model. We create the input features (differences of molecule fingerprints) and the target data (binary outcome of 'more' or 'less' active). We move here from an absolute potency space to a relative one.

In [1]:
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
import pdb
import multiprocessing as mp
from functools import partial
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity, BulkTanimotoSimilarity
from rdkit import RDLogger
from rdkit.Chem import rdMolDescriptors
import pickle
RDLogger.DisableLog('rdApp.*')
import random


def Morgan_Fingerprint(smile, nbits = 512):
    """Return the feature-based Morgan bit-vector fingerprint of a SMILES string.

    `smile` is parsed with RDKit and fingerprinted with the value 3 as the
    second positional argument (the radius), `nbits` bits and
    `useFeatures=True`.
    """
    molecule = Chem.MolFromSmiles(smile)
    radius = 3
    return AllChem.GetMorganFingerprintAsBitVect(
        molecule,
        radius,
        nBits=nbits,
        useFeatures=True,
    )

def Atom_Pair(smile):
    """Return the hashed atom-pair fingerprint of a SMILES string as a 512-bit vector."""
    molecule = Chem.MolFromSmiles(smile)
    return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(molecule, nBits=512)

def TopologicalTorsion(smile):
    """Return the hashed topological-torsion fingerprint of a SMILES string as a 512-bit vector."""
    molecule = Chem.MolFromSmiles(smile)
    return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(molecule, nBits=512)

def concat_fingerprints(smile): # our default feature set throughout this model is a concatenation of 3 fingerprint sets
    """Build the default feature vector for one SMILES string.

    The Morgan, atom-pair and topological-torsion fingerprints are computed
    in that order, joined with the bit vectors' own `+` operator, and the
    result is converted to a NumPy array.
    """
    morgan_bits = Morgan_Fingerprint(smile)
    atom_pair_bits = Atom_Pair(smile)
    torsion_bits = TopologicalTorsion(smile)

    joined_bits = morgan_bits + atom_pair_bits
    joined_bits = joined_bits + torsion_bits
    return np.array(joined_bits)

def get_similarity(ref, comps):
    """Return the Tanimoto similarity of `ref` against every fingerprint in `comps`."""
    similarities = BulkTanimotoSimilarity(ref, comps)
    return similarities

def IC50_diff(compound_a, compound_b):
    """Return `compound_a - compound_b`, or NaN when either value is +infinity.

    An infinite IC50 has no meaningful numeric difference to another value,
    so such pairs are reported as NaN rather than +/-inf.

    Args:
        compound_a: IC50 value of the first compound (scalar).
        compound_b: IC50 value of the second compound (scalar).

    Returns:
        The signed difference, or `np.nan` if either input equals `np.inf`.
    """
    # `np.nan` is the canonical spelling; the `np.NaN` alias no longer exists
    # in NumPy 2.0 and raises AttributeError there.
    if compound_a == np.inf or compound_b == np.inf:
        return np.nan
    return compound_a - compound_b

def duplicate_meta_data(meta_data): # needed when doing the 'Train-Test' split to account for the 'reverses'
    """Append the 'reversed' counterpart of every pair record to the meta data.

    For each record the reversed copy swaps `Compound_A` with `Compound_B`,
    swaps `Compound_A_IC50` with `Compound_B_IC50` and negates `IC50_diff`.
    Every other column (e.g. `Data_Split`, `Test_Style`) is carried over
    unchanged.

    Args:
        meta_data: Records accepted by `pd.DataFrame` (here a list of dicts,
            one per compound pair).

    Returns:
        A DataFrame with the original rows followed by the reversed rows,
        re-indexed from 0. Empty input yields an empty DataFrame.
    """
    original_meta = pd.DataFrame(meta_data)
    if original_meta.empty:
        # Nothing to mirror; the column lookups below would fail on a frame
        # that has no columns.
        return original_meta

    duplicate_meta = original_meta.copy()
    swapped_columns = (
        ('Compound_A', 'Compound_B'),
        ('Compound_A_IC50', 'Compound_B_IC50'),
    )
    for first, second in swapped_columns:
        duplicate_meta[first] = original_meta[second]
        duplicate_meta[second] = original_meta[first]
    duplicate_meta['IC50_diff'] = -original_meta['IC50_diff']

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    return pd.concat([original_meta, duplicate_meta], ignore_index = True)

## This function deserves its own commentary. 

1. It's the main function that turns absolute data into relative data. We generate fingerprint differences only if there is a sufficient difference between the IC50 data and avoid differences with one's self.


2. Moreover, the data splits are very important -- if we want a totally distinct train and test set (no molecule is implicitly seen in both; this is more 'challenging' for the model and we use it as default) then there is no need to add the 'reverse' differences. However, if we want a partial overlap in train/test (as this is closer to reality at inference) we do need to add the reverse differences -- the type of split can be specified with the parameter 'test_split'.


3. It's easier to just carry around the vector of fingerprint diffs than all the ancillary data such as the original SMILES, IC50 etc., so we also create a meta_data CSV of these details that can be picked up and included in analysis (most likely after we've trained the model and want to do some inference).

In [None]:
def generate_difference_data(train_df, test_df, type_split, test_split = 'Test-Test', min_ic50_diff = 5):
    """Turn absolute activity data into pairwise (relative) model data.

    Every row of the chosen frame is paired with every row of the reference
    frame. A pair is kept only if the two fingerprints differ (no pairing
    with one's self) and the absolute IC50 difference exceeds
    `min_ic50_diff`. For each kept pair the fingerprint difference
    (row - reference) is the feature vector and the label is 1 when the row
    has the lower IC50, otherwise -1.

    Args:
        train_df: Frame with 'fingerprint', 'IC50' and 'SMILES' columns.
        test_df: Frame with the same columns.
        type_split: 'Train' pairs `train_df` with itself; any other value
            (the preprocessing loop passes 'Valid') selects rows from
            `test_df`. The value is stored verbatim in the meta data.
        test_split: With a non-'Train' `type_split`, 'Test-Test' pairs
            `test_df` with itself and any other value pairs it against
            `train_df`. When it equals 'Train-Test' the reversed pairs
            (negated features, labels and meta data) are appended to the
            output. The value is stored verbatim in the meta data.
        min_ic50_diff: Minimum absolute IC50 difference (exclusive) for a
            pair to be kept. Defaults to 5.

    Returns:
        A tuple `(fp_diffs, activity_diffs, meta)`:
        `fp_diffs` is a 2-D array with one row per pair; `activity_diffs`
        holds the +/-1 labels (a NumPy array for 'Train-Test', otherwise a
        list); `meta` is a DataFrame describing each pair. If no pair
        qualifies, all three are empty.
    """
    # type_split is either 'Train' or 'Test' while test_split is either 'Test-Test' or 'Train-Test'

    if type_split == 'Train': # training set internal diffs
        df = train_df
        ref_df = train_df
    else: # cross diffs or test set internal diffs
        df = test_df
        ref_df = test_df if test_split == 'Test-Test' else train_df

    # Convert the reference fingerprints once instead of once per pair.
    ref_records = [
        (np.asarray(ref_fp), ref_ic50, ref_smiles)
        for ref_fp, ref_ic50, ref_smiles in zip(ref_df['fingerprint'], ref_df['IC50'], ref_df['SMILES'])
    ]

    fp_diffs = []
    activity_diffs = []
    meta_data = []
    for fp, ic50, smiles in zip(df['fingerprint'], df['IC50'], df['SMILES']):
        fp = np.asarray(fp)
        for ref_fp, ref_ic50, ref_smiles in ref_records:
            if np.array_equal(fp, ref_fp):
                continue # same fingerprint: avoid differences with one's self
            # Written as `not (... > ...)` so that NaN differences (e.g. inf - inf) are skipped too.
            if not abs(ic50 - ref_ic50) > min_ic50_diff:
                continue
            fp_diffs.append(fp - ref_fp)
            meta_data.append({ 'Data_Split' : type_split, 'Test_Style' : test_split,
                              'Compound_A' : smiles, 'Compound_B' : ref_smiles,
                              'Compound_A_IC50' : ic50, 'Compound_B_IC50' : ref_ic50,
                              'IC50_diff' : IC50_diff(ic50, ref_ic50)})
            activity_diffs.append(1 if ic50 < ref_ic50 else -1)

    if not fp_diffs:
        # np.vstack raises on an empty list, so build the empty outputs explicitly.
        if ref_records:
            width, dtype = ref_records[0][0].size, ref_records[0][0].dtype
        else:
            width, dtype = 0, float
        meta_columns = ['Data_Split', 'Test_Style', 'Compound_A', 'Compound_B',
                        'Compound_A_IC50', 'Compound_B_IC50', 'IC50_diff']
        empty_labels = np.array([], dtype = int) if test_split == 'Train-Test' else []
        return np.empty((0, width), dtype = dtype), empty_labels, pd.DataFrame(columns = meta_columns)

    fp_diffs = np.vstack(fp_diffs)

    if test_split == 'Train-Test':
        # Partial train/test overlap: add the reversed pair for every pair found.
        labels = np.array(activity_diffs)
        return np.concatenate((fp_diffs, -fp_diffs)), np.concatenate((labels, -labels)), duplicate_meta_data(meta_data)
    return fp_diffs, activity_diffs, pd.DataFrame(meta_data)

## Allow for the preprocessing of several datasets if we want

In [None]:
# Absolute activity tables to preprocess, keyed by the label used to name the output files.
datasets = dict(
    noncovalent = pd.read_csv('known_noncovalent_activity.csv'),
)

## This function should be quite self explanatory except for the 'split' line. 

- Basically this is used to split the ABSOLUTE data into a train and test set which is then converted into a RELATIVE train and test set. 
- The quirk is that we double the training data of acrylamides using noncovalent samples. My aim is to always produce a RELATIVE train/test split of 80:20 and (a bit of maths later) this implies that the splits shown below are needed when splitting the ABSOLUTE data -- these are only approximations as they don't account for cases where the IC50s are not significantly different.

In [None]:
# For every dataset: fingerprint the molecules, randomly split the ABSOLUTE data,
# convert each part into RELATIVE (pairwise) data, then save the arrays and meta data.
for label, df in datasets.items():

    with mp.Pool(processes = mp.cpu_count()) as pool:
        df['fingerprint'] = pool.map(concat_fingerprints, df['SMILES'], 1) # get fingerprints

    # Fraction of the ABSOLUTE data assigned to the training part.
    if label == 'acrylamide':
        split = np.sqrt(2)/(1 + np.sqrt(2))
    else:
        split = 2/3

    # Random row mask; it is not seeded, so the split differs from run to run.
    msk = np.random.rand(len(df)) < split
    train_df = df[msk]
    test_df = df[~msk]

    print( label, ' Train active/inactive split:', train_df.groupby(by = 'activity').count(), '\n')
    print( label, ' Test active/inactive split:', test_df.groupby(by = 'activity').count(), '\n')

    # 'Train' pairs the training rows with each other (the '_' test_split is only recorded
    # in the meta data); 'Valid' with 'Test-Test' pairs the held-out rows with each other.
    X_train, y_train, meta_train = generate_difference_data(train_df, test_df, 'Train', '_')
    X_valid, y_valid, meta_valid = generate_difference_data(train_df, test_df, 'Valid', 'Test-Test')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    meta_data = pd.concat([meta_train, meta_valid], ignore_index = True)

    print(label + ' shape:', X_train.shape, X_valid.shape, '\n')

    # NOTE: the same path is written on every iteration, so with several datasets only
    # the last one's meta data remains on disk.
    meta_data.to_csv('meta_data.csv', index = False)

    with open(label + '_model_data', 'wb') as filename:
        pickle.dump([X_train, y_train, X_valid, y_valid], filename)

### Additional function to combine datasets together if needed

In [None]:
# with open('acrylamide_model_data', 'rb') as filehandle:
#     data_1 = pickle.load(filehandle)

# with open('noncovalent_model_data', 'rb') as filehandle:
#     data_2 = pickle.load(filehandle)

# X_train_1, y_train_1, X_valid_1, y_valid_1 = np.array(data_1[0]), np.array(data_1[1]), np.array(data_1[2]), np.array(data_1[3])
# X_train_2, y_train_2, X_valid_2, y_valid_2 = np.array(data_2[0]), np.array(data_2[1]), np.array(data_2[2]), np.array(data_2[3])

# rand_idx = random.sample(range(X_train_2.shape[0]), X_train_1.shape[0]) # double the training data by adding noncovalent data
# X_train, y_train = np.vstack([X_train_1, X_train_2[rand_idx]]), np.append(y_train_1, y_train_2[rand_idx])
# X_valid, y_valid = X_valid_1, y_valid_1

In [None]:
# Pickle the current X_train / y_train / X_valid / y_valid arrays into 'combined_model_data'.
# NOTE(review): the cell that would build the combined arrays is commented out above, so as
# written this presumably saves the arrays left over from the preprocessing loop — confirm.
with open('combined_model_data', 'wb') as filename:
    pickle.dump([X_train, y_train, X_valid, y_valid], filename)