In [None]:
import sys
sys.path.append('/mnt/d/Research/PHD/DLEPS/code/DLEPS')

from rdkit.Chem import MolFromSmiles, MolToSmiles
from rdkit.Chem import Draw

import numpy as np  
import pandas as pd
import molecule_vae

In [None]:
# Load the training and test SMILES tables (both are read for their 'smiles' column).
dt1, dt2 = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/mnt/d/Research/PHD/DLEPS/results/train_SMILES_demo.csv',
        '/mnt/d/Research/PHD/DLEPS/results/test_SMILES_demo.csv',
    )
]

# Stack both SMILES columns into one flat array, training rows first.
smiles = np.concatenate([frame['smiles'].values for frame in (dt1, dt2)], axis=0)

print("Number of SMILES from dt1 and dt2:", len(smiles))


In [None]:
# Load the L1000 gene-expression table.
l1000_df = pd.read_csv('/mnt/d/Research/PHD/DLEPS/results/L1000_landmark.csv')

# One row per entry in the CSV, so the row count is the number of SMILES.
print("Number of SMILES in L1000_landmark.csv:", l1000_df.shape[0])


In [None]:
# Inspect the column labels of the L1000 table.
print(l1000_df.columns)


In [None]:
# The same molecule can be written as different SMILES strings, so every entry
# of the combined list is normalised to RDKit's canonical SMILES.
# Entries RDKit cannot parse (MolFromSmiles returns None) or that make RDKit
# raise are recorded as None, which keeps canonical_smiles aligned 1:1 with
# `smiles`.
canonical_smiles = []
for smi in smiles:
    try:
        mol = MolFromSmiles(smi)
        if mol is not None:
            can_smi = MolToSmiles(mol)
            canonical_smiles.append(can_smi)
        else:
            canonical_smiles.append(None)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate instead of being recorded as an invalid molecule.
        canonical_smiles.append(None)

# Pair every raw SMILES with its canonical form (None when unparseable).
smiles_df = pd.DataFrame({'smiles': smiles, 'canonical_smiles': canonical_smiles})


In [None]:
# Canonicalise the 'smiles' column of the L1000 table with RDKit.
# Entries RDKit cannot parse (MolFromSmiles returns None) or that make RDKit
# raise are recorded as None, so the list stays aligned with l1000_df's rows.
canonical_smiles_l1000 = []
for smi in l1000_df['smiles']:
    try:
        mol = MolFromSmiles(smi)
        if mol is not None:
            can_smi = MolToSmiles(mol)
            canonical_smiles_l1000.append(can_smi)
        else:
            canonical_smiles_l1000.append(None)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate instead of being recorded as an invalid molecule.
        canonical_smiles_l1000.append(None)

# Store the canonical form next to the original SMILES.
l1000_df['canonical_smiles'] = canonical_smiles_l1000


In [None]:
# Turn each frame's index into an explicit column so rows can be traced back
# after the merge. The new column is inserted first, ahead of the data columns.
#
# The guards make this cell safe to re-run: the previous in-place
# reset_index + rename added another, duplicate '*_index' column on every
# extra execution, which turns merged_df['smiles_index'] into a 2-D frame.
if 'smiles_index' not in smiles_df.columns:
    smiles_df = smiles_df.reset_index().rename(columns={'index': 'smiles_index'})

if 'l1000_index' not in l1000_df.columns:
    l1000_df = l1000_df.reset_index().rename(columns={'index': 'l1000_index'})


In [None]:
# Inner-join the two tables on the canonical SMILES.
# Rows whose canonical SMILES is missing (unparseable input) are dropped first:
# pandas matches null join keys against each other, so leaving them in would
# pair every invalid SMILES on one side with every invalid SMILES on the other
# and inflate the match count reported below.
merged_df = pd.merge(
    smiles_df.dropna(subset=['canonical_smiles']),
    l1000_df.dropna(subset=['canonical_smiles']),
    on='canonical_smiles',
    how='inner',
    suffixes=('_smiles', '_l1000'),
)

print("Number of matched SMILES after merging:", len(merged_df))


In [None]:
# Extract the matched row ids (positions in smiles_df) and the gene-expression matrix.
# NOTE(review): the slice takes every column from '780' through the last one, so it
# assumes no non-expression column follows '780' in merged_df -- confirm against the CSV.
matched_indices = merged_df['smiles_index'].values
L962 = merged_df.iloc[:, merged_df.columns.get_loc('780'):].values  # assumes the gene-expression data starts at the column named '780'


In [None]:
# Take the SMILES to process: the 'smiles' column that came from smiles_df
# (the merge suffixed it with '_smiles').
smiles_to_process = merged_df['smiles_smiles'].values


In [None]:
# Convert every matched SMILES to RDKit canonical form and remember which
# positions were valid. `iid` holds positions within smiles_to_process, so
# smiles_rdkit[k] corresponds to smiles_to_process[iid[k]].
smiles_rdkit = []
iid = []
for i, smi in enumerate(smiles_to_process):
    try:
        mol = MolFromSmiles(smi)
        if mol is not None:
            smiles_rdkit.append(MolToSmiles(mol))
            iid.append(i)
        else:
            print("Invalid molecule at index %d" % i)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate instead of being reported as a bad SMILES.
        print("Error processing SMILES at index %d" % i)


In [None]:
# Report how many SMILES were kept by the RDKit pass.
print("Number of valid SMILES after RDKit processing:", len(smiles_rdkit))


In [None]:
# Keep only the gene-expression rows whose SMILES was valid
# (iid lists the surviving row positions).
L962_valid = L962[iid]


In [None]:
# 定义辅助函数
def xlength(y):
    """Return the number of items produced by iterating over ``y``.

    Works for any iterable, including ones without ``__len__``; an iterator
    passed in is consumed by the count.
    """
    count = 0
    for _ in y:
        count += 1
    return count

def get_zinc_tokenizer(cfg):
    """Build a SMILES tokenizer for the terminals of grammar ``cfg``.

    Every key of ``cfg._lexical_index`` longer than one character is treated
    as a multi-character token. Each such token is temporarily swapped for a
    single placeholder character so the string can be split character by
    character, and is swapped back in the returned token list.

    Args:
        cfg: grammar object exposing a ``_lexical_index`` mapping whose keys
            are the terminal strings.

    Returns:
        A function ``tokenize(smiles) -> list[str]``.

    Raises:
        AssertionError: if the grammar does not have exactly three
            multi-character terminals, or if a placeholder character is
            itself a terminal of the grammar.
    """
    long_tokens = [a for a in cfg._lexical_index.keys() if len(a) > 1]
    replacements = ['$', '%', '^']
    assert len(long_tokens) == len(replacements)
    for token in replacements:
        assert token not in cfg._lexical_index

    # placeholder character -> the multi-character token it stands for
    restore = dict(zip(replacements, long_tokens))

    def tokenize(smiles):
        """Split ``smiles`` into grammar terminals."""
        # Collapse each multi-character token to its placeholder, in the same
        # order as long_tokens, so the string becomes one character per token.
        for token, placeholder in zip(long_tokens, replacements):
            smiles = smiles.replace(token, placeholder)
        # Map placeholders back; every other character is its own token.
        # A dict lookup replaces the former list.index() wrapped in a bare
        # `except:`, which hid unrelated errors.
        return [restore.get(ch, ch) for ch in smiles]

    return tokenize


In [None]:
import zinc_grammar
import nltk

# Tokenizer and chart parser built from the ZINC grammar.
_tokenize = get_zinc_tokenizer(zinc_grammar.GCFG)
_parser = nltk.ChartParser(zinc_grammar.GCFG)

# Every grammar production gets the integer position it has in the grammar's
# production list; that position is its one-hot column.
_productions = zinc_grammar.GCFG.productions()
_prod_map = {prod: ix for ix, prod in enumerate(_productions)}
_n_chars = len(_productions)

# Number of production steps (rows) in each one-hot encoded molecule.
MAX_LEN = 277


In [None]:
# 对 SMILES 进行解析和编码
from multiprocessing import Pool, cpu_count

def parse_smiles_with_index(args):
    """Parse one tokenised SMILES and report the outcome together with its index.

    Args:
        args: ``(i, t)`` pair where ``i`` is the position of the SMILES and
            ``t`` is its token sequence.

    Returns:
        ``(i, tree, None)`` with the first parse tree on success, or
        ``(i, None, message)`` with a non-empty error message on failure.
    """
    i, t = args
    try:
        # Default of None: an exhausted parse iterator used to surface as a
        # StopIteration whose str() is empty, leaving the error text blank.
        tp = next(_parser.parse(t), None)
    except Exception as e:
        # Fall back to the exception type when the exception has no message.
        return (i, None, str(e) or type(e).__name__)
    if tp is None:
        return (i, None, "no parse tree found for token sequence")
    return (i, tp, None)

# Parse all tokenised SMILES in parallel.
if __name__ == "__main__":
    tokens = [_tokenize(smi) for smi in smiles_rdkit]
    parse_trees = []
    badi = []

    # One worker process per CPU core; each task is an (index, tokens) pair.
    with Pool(cpu_count()) as pool:
        results = pool.map(parse_smiles_with_index, enumerate(tokens))

    # Keep the successful trees; log and remember the positions that failed.
    for i, tree, error in results:
        if tree is None:
            print(f"Parse tree error at index {i}: {error}")
            badi.append(i)
            continue
        parse_trees.append(tree)


In [None]:
# Drop the entries whose SMILES failed grammar parsing.
# `badi` holds positions within smiles_rdkit / iid, so the surviving row ids
# are taken from `iid` and the expression rows are re-sliced from the untouched
# `L962`. Slicing from L962 rather than from L962_valid itself keeps this cell
# idempotent (re-running it no longer drops additional rows), and the set makes
# each membership test O(1) instead of scanning the list.
bad_positions = set(badi)
iid2 = [row for pos, row in enumerate(iid) if pos not in bad_positions]
L962_valid = L962[np.asarray(iid2, dtype=int)]


In [None]:
# Build the one-hot encoding: one (MAX_LEN, _n_chars) matrix per parse tree,
# with row k marking the k-th production applied in that tree.
productions_seq = [tree.productions() for tree in parse_trees]
indices = [np.array([_prod_map[p] for p in seq], dtype=int) for seq in productions_seq]
one_hot = np.zeros((len(indices), MAX_LEN, _n_chars), dtype=np.float32)
for row, prod_ids in enumerate(indices):
    n_used = len(prod_ids)
    if n_used <= MAX_LEN:
        one_hot[row, np.arange(n_used), prod_ids] = 1.
        # Unused trailing steps are padded with the last production column.
        one_hot[row, n_used:, -1] = 1.
    else:
        # Longer than MAX_LEN: keep only the first MAX_LEN productions.
        print("Too large molecule at index %d, truncating" % row)
        one_hot[row, np.arange(MAX_LEN), prod_ids[:MAX_LEN]] = 1.


In [None]:
# Check the sizes of the processed arrays.
print("Size of one-hot encoded SMILES:", one_hot.shape)
print("Size of gene expression data:", L962_valid.shape)


In [None]:
# Shuffle both arrays with one shared permutation, then split off a test set.
SEED = 0          # fixed seed so a fresh Restart & Run All reproduces the same split
TEST_SIZE = 3000  # number of shuffled examples held out for testing

num_examples = L962_valid.shape[0]
# Both arrays are indexed with the same permutation, so they must have the
# same number of rows; a mismatch means earlier cells ran out of order.
if one_hot.shape[0] != num_examples:
    raise ValueError(
        "one_hot has %d rows but L962_valid has %d; the two arrays are not aligned"
        % (one_hot.shape[0], num_examples)
    )
if num_examples <= TEST_SIZE:
    # The slices below still work, but everything lands in the test split.
    print("Warning: only %d examples for TEST_SIZE=%d; the training split will be empty"
          % (num_examples, TEST_SIZE))

# A local generator keeps the split reproducible without touching NumPy's
# global random state.
perm = np.random.RandomState(SEED).permutation(num_examples)
L962_shuffled = L962_valid[perm]
one_hot_shuffled = one_hot[perm]

L962_test = L962_shuffled[:TEST_SIZE]
L962_train = L962_shuffled[TEST_SIZE:]
one_hot_test = one_hot_shuffled[:TEST_SIZE]
one_hot_train = one_hot_shuffled[TEST_SIZE:]


In [None]:
# 保存数据为 .h5 文件
import h5py

# Write each array to its own HDF5 file under the dataset name 'data'.
# The `with` block closes the file even if create_dataset raises, which the
# previous explicit open()/close() pairs did not guarantee.
_h5_outputs = [
    # gene-expression data
    ('/mnt/d/Research/PHD/DLEPS/results/L1000_train.h5', L962_train),
    ('/mnt/d/Research/PHD/DLEPS/results/L1000_test.h5', L962_test),
    # one-hot encoded SMILES
    ('/mnt/d/Research/PHD/DLEPS/results/SMILE_train_demo.h5', one_hot_train),
    ('/mnt/d/Research/PHD/DLEPS/results/SMILE_test_demo.h5', one_hot_test),
]
for h5_path, array in _h5_outputs:
    with h5py.File(h5_path, 'w') as h5f:
        h5f.create_dataset('data', data=array)
