In [1]:
import pandas as pd
from rdkit.Chem import PandasTools
from rdkit import Chem
from sklearn.model_selection import train_test_split
import dgl
from dgllife.utils import mol_to_bigraph, CanonicalAtomFeaturizer, CanonicalBondFeaturizer
from enum import Enum
import torch
import json



def remove_duplicates(df):
    indexes_to_remove=[71, 20031, 17054, 28867, 29078, 28112, 25117, 33252, 39561, 39293, 40500, 39604, 40510, 39894, 23457, 1883, 1848, 450, 10499, 5476, 6679, 20046, 14769, 5950, 12216, 22430, 22426, 9034, 9876, 2202, 27142, 1299, 1805, 29622, 25009, 29646, 9218, 17673, 10325, 31053, 24736, 12815, 20952, 27910, 23716, 7245, 1968, 14653, 26318, 25042, 22425, 22432, 2060, 20957, 21664, 8818, 20383, 33539, 5452, 24667, 24674, 4947, 34815, 24611, 11048, 1141, 19526, 11095, 20645, 1973, 4607, 32554, 14255, 13193, 3, 1395, 8835, 6438, 27908, 22703, 27907, 6407, 30873, 23053, 13838, 30935, 13839, 22295, 15075, 1301, 22305, 1300, 26808, 35183, 15033, 16141, 17807, 22785, 11843, 4692, 28596, 14250, 8742, 17059, 23086]
    #print(f'Number of indices to remove is: {len(indexes_to_remove)}')
    df['IndexNum'] = df['QSPRID'].str.extract(r'_(\d+)$').astype(int)
    df_filtered = df[~df['IndexNum'].isin(indexes_to_remove)].copy()
    df_filtered.drop(columns=['IndexNum'], inplace=True)
    return df_filtered

def remove_invalid_smi(df):
    valid_rows = []
    for idx, row in df.iterrows():
        smiles = row['Drug']
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            valid_rows.append(row)
        else:
            print(f"Removed invalid SMILES at index {idx}: {smiles}")
    return pd.DataFrame(valid_rows).reset_index(drop=True)


def molecules_to_graph(data_frame):
    print("Converting molecules to graphs...")
    #smiles_labels = data_frame[['smiles', 'pki']] 
    chemblid, graphs, labels = [], [], []
    data_frame = remove_invalid_smi(data_frame)
    for index, row in data_frame.iterrows():
        #chemblid = row['molecule_chembl_id']   TODO
        #smi = row['smiles']    TODO
        smi = row['Drug']
        mol = Chem.MolFromSmiles(smi)
        #print("Number of edges:", g.number_of_edges())  # Get the number of edges in the graph
        #print("Number of features:", len(CanonicalBondFeaturizer(mol)))  # Check the number of features returne

        if mol is None:
            print(f"Invalid SMILES at index {index}: {smi}")
            continue  # Skip invalid molecules
        
        try:
            graph = mol_to_bigraph(    # converts RDKit mol object into a DGL graph, bigraph - ensuring undirected conectivity (A->B, B->A)
                mol,                   # adding self loops to nodes --> they can use their own features in message passing
                node_featurizer=CanonicalAtomFeaturizer(),
                edge_featurizer=CanonicalBondFeaturizer(),
                explicit_hydrogens=False
            )
            graph = dgl.add_self_loop(graph)
            graphs.append(graph)
        except Exception as e:
            print(f"Graph conversion failed at index {index}: {e}")
            continue
        #label = row['pki'] TODO
        #label = row['pchembl_value_Mean'] 
        label = row['Y']
        labels.append(label)
    print(f"LENGTH OF LABELS: {len(labels)}")
    labels = torch.tensor(labels).unsqueeze(1)  # torch.tensor ... converts python list into pytorch tensor
                                                    # unsqueeze(1) ... adds an extra dimension at the position 1 (some models require explicitly the 2nd dimention)
    labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
    print("Graphs done.")
    
    return chemblid, labels_dictionary, graphs


def run_script():
    #data_frame = pd.read_csv("/Users/martinpycha/Desktop/Machine_learning/My_project/A2AR_data/a2ar_val_1")
    for i in range(1, 11):
        for set_type in ["train", "val", "test"]:
            #data_frame = pd.read_csv(f"./A2AR/a2ar_{set_type}_{i}")
            data_frame = pd.read_csv(f"./HIV/hiv_{set_type}_{i}")
            #print(f"NUMBER OF NAN's is {data_frame.isna().sum().sum()}")
            #print(type(data_frame))
            data_frame_class = data_frame
            # FOR CLASSIFICATION
            #data_frame_class = prepare_for_classification(data_frame)
            #print(data_frame_class.info())
            #print(data_frame_class.head())
            #print(data_frame["Y"].value_counts())
            
            print(type(data_frame_class))
            _, labels_dict, graphs = molecules_to_graph(data_frame_class)
            # saving the results
            print("Saving the graphs and labels")
            
            labels_dict_int = {}
            for key, item in labels_dict.items():
                labels_dict_int[key] = torch.tensor(item, dtype=torch.int)
            print(f"Number of graphs: {len(graphs)}, Number of labels: {len(labels_dict_int)}")
            dgl.save_graphs(f'./HIV_graphs/HIV_graphs_{set_type}{i}.bin', graphs, labels_dict_int)  # graphs... list of graphs, labels_dict ... dictionary containing labels
            print("Graphs and labels saved.")

def clean_dataset():
    for i in range(1, 11):
        for set_type in ["train", "val", "test"]:
            data_frame = pd.read_csv(f"./raw_HIV/hiv_{set_type}_{i}")
            print(data_frame.info())
            data_frame_clean = remove_duplicates(data_frame)
            print(data_frame_clean.info())
            data_frame_clean.to_csv(f"./HIV/hiv_{set_type}_{i}", index=False)

#clean_dataset()
run_script()

    
        
        
        

  from .autonotebook import tqdm as notebook_tqdm


<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 495: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 9251: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
Removed invalid SMILES at index 16140: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 16141: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 18815: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 21666
Graphs done.
Saving the graphs and labels
Number of graphs: 21666, Number of labels: 1


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 2910: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
LENGTH OF LABELS: 8829
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8829, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 30: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
LENGTH OF LABELS: 10063
Graphs done.
Saving the graphs and labels
Number of graphs: 10063, Number of labels: 1


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 57: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 6433: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 16368: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 19163: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 22123
Graphs done.
Saving the graphs and labels
Number of graphs: 22123, Number of labels: 1


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 4009: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
LENGTH OF LABELS: 8643
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8643, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 265: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 7402: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
LENGTH OF LABELS: 9792
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9792, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 54: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 486: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 9254: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
Removed invalid SMILES at index 19003: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 21987
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 21987, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 2917: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
LENGTH OF LABELS: 8902
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8902, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 7240: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 7241: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
LENGTH OF LABELS: 9669
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9669, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 466: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 9437: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
Removed invalid SMILES at index 19164: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 22229
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 22229, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
LENGTH OF LABELS: 8818
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8818, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 37: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 3269: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 7215: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 7216: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
LENGTH OF LABELS: 9511
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9511, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 507: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 9466: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
Removed invalid SMILES at index 16569: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 16570: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 19194: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 22143
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 22143, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 25: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
LENGTH OF LABELS: 8814
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8814, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 3097: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
LENGTH OF LABELS: 9601
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9601, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 64: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 16519: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 16520: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 19174: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 22169
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 22169, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 2849: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 4072: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
LENGTH OF LABELS: 8769
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8769, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 275: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
LENGTH OF LABELS: 9620
Graphs done.
Saving the graphs and labels
Number of graphs: 9620, Number of labels: 1


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 62: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 6328: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 9182: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
Removed invalid SMILES at index 16060: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 16061: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
LENGTH OF LABELS: 21466
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 21466, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 7853: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 9103
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9103, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 275: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
LENGTH OF LABELS: 9989
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9989, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 70: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 509: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 6900: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 9849: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
Removed invalid SMILES at index 16829: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 16830: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 19441: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 22218
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 22218, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
LENGTH OF LABELS: 8826
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 8826, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
LENGTH OF LABELS: 9514
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 9514, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 2356: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
LENGTH OF LABELS: 6937
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 6937, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 9465: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 10971
Graphs done.
Saving the graphs and labels
Number of graphs: 10971, Number of labels: 1


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 90: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 612: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 7787: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 17448: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 17449: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
LENGTH OF LABELS: 22650
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 22650, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 2236: CC1=C2O[AlH3-3]34(OC5=C(C)N(C)C=CC5=[O+]3)(OC3=C(C)N(C)C=CC3=[O+]4)[O+]=C2C=CN1C
LENGTH OF LABELS: 6607
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 6607, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 10169: c1ccc2c(c1)[SH+][GeH2+]([SH+]c1ccccc1[SH+][GeH2+]1[SH+]c3ccccc3[SH+]1)[SH+]2
LENGTH OF LABELS: 11605
Graphs done.
Saving the graphs and labels
Number of graphs: 11605, Number of labels: 1


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)
  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
<class 'pandas.core.frame.DataFrame'>
Converting molecules to graphs...
Removed invalid SMILES at index 86: O=C1O[Al]23(OC1=O)(OC(=O)C(=O)O2)OC(=O)C(=O)O3
Removed invalid SMILES at index 650: Cc1ccc([B-2]2(c3ccc(C)cc3)=NCCO2)cc1
Removed invalid SMILES at index 7767: Oc1ccc(C2Oc3cc(O)cc4c3C(=[O+][AlH3-3]35(O4)(Oc4cc(O)cc6c4C(=[O+]3)C(O)C(c3ccc(O)cc3O)O6)Oc3cc(O)cc4c3C(=[O+]5)C(O)C(c3ccc(O)cc3O)O4)C2O)c(O)c1
Removed invalid SMILES at index 17025: CC(c1cccs1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
Removed invalid SMILES at index 17026: CC(c1ccccn1)=[N+]1NC(N)=[S+][AlH3-]12[OH+]B(c1ccccc1)[OH+]2
LENGTH OF LABELS: 22346
Graphs done.


  labels_dictionary = {'labels':torch.tensor(labels)} # dgl expects tensors for metadata (we convert labels to a dictionary with tensor)


Saving the graphs and labels
Number of graphs: 22346, Number of labels: 1


  labels_dict_int[key] = torch.tensor(item, dtype=torch.int)


Graphs and labels saved.
