In [3]:
import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
from sklearn.model_selection import train_test_split
import dgl
from dgllife.utils import mol_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from enum import Enum
import torch
import json



def remove_duplicates(df):
    indexes_to_remove=[71, 20031, 17054, 28867, 29078, 28112, 25117, 33252, 39561, 39293, 40500, 39604, 40510, 39894, 23457, 1883, 1848, 450, 10499, 5476, 6679, 20046, 14769, 5950, 12216, 22430, 22426, 9034, 9876, 2202, 27142, 1299, 1805, 29622, 25009, 29646, 9218, 17673, 10325, 31053, 24736, 12815, 20952, 27910, 23716, 7245, 1968, 14653, 26318, 25042, 22425, 22432, 2060, 20957, 21664, 8818, 20383, 33539, 5452, 24667, 24674, 4947, 34815, 24611, 11048, 1141, 19526, 11095, 20645, 1973, 4607, 32554, 14255, 13193, 3, 1395, 8835, 6438, 27908, 22703, 27907, 6407, 30873, 23053, 13838, 30935, 13839, 22295, 15075, 1301, 22305, 1300, 26808, 35183, 15033, 16141, 17807, 22785, 11843, 4692, 28596, 14250, 8742, 17059, 23086]
    print(f'Number of indices to remove is: {len(indexes_to_remove)}')
    df['IndexNum'] = df['QSPRID'].str.extract(r'_(\d+)$').astype(int)
    df_filtered = df[~df['IndexNum'].isin(indexes_to_remove)].copy()
    df_filtered.drop(columns=['IndexNum'], inplace=True)
    return df_filtered


def molecules_to_graph(data_frame):
    print("Converting molecules to graphs...")
    #smiles_labels = data_frame[['smiles', 'pki']] 
    chemblid, graphs, labels = [], [], []
    for index, row in data_frame.iterrows():
        #chemblid = row['molecule_chembl_id']   TODO
        #smi = row['smiles']    TODO
        smi = row['Drug']
        mol = Chem.MolFromSmiles(smi)
        #print("Number of edges:", g.number_of_edges())  # Get the number of edges in the graph
        #print("Number of features:", len(CanonicalBondFeaturizer(mol)))  # Check the number of features returne
        graph = mol_to_bigraph(    # converts RDKit mol object into a DGL graph, bigraph - ensuring undirected conectivity (A->B, B->A)
            mol,                   # adding self loops to nodes --> they can use their own features in message passing
            node_featurizer=CanonicalAtomFeaturizer(),
            edge_featurizer=CanonicalBondFeaturizer(),
            explicit_hydrogens=False
        )
        graph = dgl.add_self_loop(graph)
        graphs.append(graph)
        #label = row['pki'] TODO
        #label = row['pchembl_value_Mean'] 
        label = row['Y']
        labels.append(label)
    print(f"LENGTH OF LABELS: {len(labels)}")
    labels = torch.tensor(labels).unsqueeze(1)  # torch.tensor ... converts python list into pytorch tensor
                                                    # unsqueeze(1) ... adds an extra dimension at the position 1 (some models require explicitly the 2nd dimention)
    labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
    print("Graphs done.")
    
    return chemblid, labels_dictionary, graphs


def run_script():
    #data_frame = pd.read_csv("/Users/martinpycha/Desktop/Machine_learning/My_project/A2AR_data/a2ar_val_1")
    for i in range(1, 11):
        for set_type in ["train", "val", "test"]:
            #data_frame = pd.read_csv(f"./A2AR/a2ar_{set_type}_{i}")
            data_frame = pd.read_csv(f"./HIV/hiv_{set_type}_{i}")
            print(type(data_frame))
            data_frame_class = data_frame
            # FOR CLASSIFICATION
            #data_frame_class = prepare_for_classification(data_frame)
            print(data_frame_class.info())
            print(data_frame_class.head())
            print(data_frame["Y"].value_counts())
            
            print(type(data_frame_class))
            _, labels_dict, graphs = molecules_to_graph(data_frame_class)
            # saving the results
            print("Saving the graphs and labels")
            
            labels_dict_int = {}
            for key, item in labels_dict.items():
                labels_dict_int[key] = torch.tensor(item, dtype=torch.int)
            print(f"Number of graphs: {len(graphs)}, Number of labels: {len(labels_dict_int)}")
            dgl.save_graphs(f'./HIV_graphs/HIV_graphs_{set_type}{i}.bin', graphs, labels_dict_int)  # graphs... list of graphs, labels_dict ... dictionary containing labels
            print("Graphs and labels saved.")

def clean_dataset():
    for i in range(1, 11):
        for set_type in ["train", "val", "test"]:
            data_frame = pd.read_csv(f"./raw_HIV/hiv_{set_type}_{i}")
            print(data_frame.info())
            data_frame_clean = remove_duplicates(data_frame)
            print(data_frame_clean.info())
            data_frame_clean.to_csv(f"./HIV/hiv_{set_type}_{i}", index=False)

clean_dataset()
#run_script()




    
    
    
    
        
        
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21728 entries, 0 to 21727
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   QSPRID  21728 non-null  object
 1   Y       21728 non-null  bool  
 2   Drug    21728 non-null  object
dtypes: bool(1), object(2)
memory usage: 360.8+ KB
None
105
<class 'pandas.core.frame.DataFrame'>
Index: 21671 entries, 0 to 21727
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   QSPRID  21671 non-null  object
 1   Y       21671 non-null  bool  
 2   Drug    21671 non-null  object
dtypes: bool(1), object(2)
memory usage: 529.1+ KB
None


OSError: Cannot save file into a non-existent directory: 'HIV'