In [34]:
import pandas as pd
import random
from rdkit import Chem

In [35]:
def read_data_train(path):
    train = pd.read_csv(path)
    data_train = pd.concat([train['Smiles'], train['IC50_nM'], train['pIC50']], axis=1)
    
    return data_train

In [36]:
train = read_data_train("../data/train.csv")

In [37]:
len(train)

1952

In [38]:
train.head()

Unnamed: 0,Smiles,IC50_nM,pIC50
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,0.022,10.66
1,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,0.026,10.59
2,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,0.078,10.11
3,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,0.081,10.09
4,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...,0.099,10.0


In [39]:
df_count = train.groupby(['IC50_nM'])['Smiles'].count()

ic50_nm = list(df_count.index)
frequency = list(df_count.values)

len(ic50_nm), len(frequency)

(713, 713)

In [40]:
ic50_nm_val = random.choices(ic50_nm, k = 100)
list_df_val = []
for ic50 in ic50_nm_val:
    df_tmp = train[train['IC50_nM'] == ic50].iloc[:1]
    list_df_val.append(pd.DataFrame(columns=train.columns, data=df_tmp.values))

df_val = pd.concat(list_df_val)

In [41]:
len(df_val)

100

In [42]:
df_train = train[~train['Smiles'].isin(df_val['Smiles'])]

In [43]:
len(df_train)

1858

In [52]:
def create_dict_df(df):
    dict_dataframe = {}
    max_smile = 20

    df_count = df.groupby(['IC50_nM'])['Smiles'].count()
    ic50_nm = list(df_count.index)
    frequency = list(df_count.values)
    set_smiles = set(df['Smiles'].values)

    for ic, freq in zip(ic50_nm, frequency):
        df_tmp = df[df['IC50_nM'] == ic]
        num_smile = 0
        list_new_smiles = []
        attemp = 0
        while num_smile <= max_smile:
            idx = num_smile % len(df_tmp)
            row = df_tmp.iloc[idx].values
            smile, v_ic50, v_pic50 = row[0], row[1], row[2]

            mol = Chem.MolFromSmiles(smile)
            new_smile = Chem.MolToSmiles(mol, doRandom=True)

            if new_smile not in set_smiles:
                list_new_smiles.append([new_smile, v_ic50, v_pic50])

                num_smile += 1

                set_smiles.add(new_smile)

                attemp = 0
            
            else:
                attemp += 1
            
            if attemp == 10:
                num_smile += 1
                attemp = 0
        
        df_new_smiles = pd.DataFrame(columns=df_tmp.columns, data=list_new_smiles)
        
        df_concat = pd.concat([df_tmp, df_new_smiles])

        dict_dataframe[ic] = df_concat
    list_dataframe = []
    for k in dict_dataframe:
        list_dataframe.append(dict_dataframe[k])

    df_all = pd.concat(list_dataframe)

    return df_all

In [53]:
df_train.reset_index(inplace=True, drop=True)
df_val.reset_index(inplace=True, drop=True)

In [54]:
df_train_all = create_dict_df(df_train)

  df_concat = pd.concat([df_tmp, df_new_smiles])
  df_concat = pd.concat([df_tmp, df_new_smiles])
  df_concat = pd.concat([df_tmp, df_new_smiles])
  df_concat = pd.concat([df_tmp, df_new_smiles])
  df_concat = pd.concat([df_tmp, df_new_smiles])
  df_concat = pd.concat([df_tmp, df_new_smiles])
  df_concat = pd.concat([df_tmp, df_new_smiles])


In [55]:
df_train_all.head()

Unnamed: 0,Smiles,IC50_nM,pIC50
0,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...,0.022,10.66
0,O(CC#Cc1c2c(c(nc1)OC[C@H]1NC(=O)CC1)cc(c(c2)C(...,0.022,10.66
1,NC(c1cc2c(cc1OC)c(OC[C@H]1NC(CC1)=O)ncc2C#CCOC...,0.022,10.66
2,C([C@@H](NC)C)(N[C@@H](C1CCCCC1)C(N1[C@H](C(N[...,0.022,10.66
3,c1c2c(c(OC[C@H]3NC(=O)CC3)ncc2C#CCOCCOCCOCCC(=...,0.022,10.66


In [56]:
len(df_train_all)

13163

In [57]:
df_train_all.to_csv("../data/balance/train_10.csv", index=False)

In [58]:
len(df_train_all)

13163

In [59]:
df_val.to_csv("../data/balance/val_10.csv", index=False)

In [200]:
df_train_all.head()

Unnamed: 0,Smiles,IC50_nM,pIC50
0,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...,0.026,10.59
0,c1c(c(N2CCC(N3CCC3)CC2)cc2C(N(C[C@@H](F)C(O)(C...,0.026,10.59
1,c1(cc2c(cc1N1CCC(CC1)N1CCC1)C(=O)N(C[C@@H](F)C...,0.026,10.59
2,N1(CCC1)C1CCN(c2cc3C(N(Cc3cc2NC(c2c3n(cccn3)nc...,0.026,10.59
3,c1(N2CCC(CC2)N2CCC2)cc2c(CN(C2=O)C[C@H](C(C)(O...,0.026,10.59


In [201]:
len(df_train_all), len(df_val)

(290501, 250)

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors

def calculate_molecular_features(smiles):
    mol = Chem.MolFromSmiles(smiles)
    features = {
        'MolecularWeight': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumRings': Descriptors.RingCount(mol),
        'NumHeavyAtoms': Descriptors.HeavyAtomCount(mol),
        'NumDoubleBonds': Descriptors.NumDoubleBonds(mol),
        'NumTripleBonds': Descriptors.NumTripleBonds(mol),
        'PolarSurfaceArea': Descriptors.TPSA(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
    }
    return features

# Ví dụ sử dụng
smiles = "CCO"
features = calculate_molecular_features(smiles)
print(features)