In [None]:
#importing libraries
import numpy as np
from tqdm.notebook import tqdm

import tensorflow

def read_sdf(file):
    """Parse an SDF file into a list of graph samples.

    The file content is split on the ``$$$$`` record delimiter and every
    non-empty chunk is converted into a ``(nodes, links, label)`` tuple:

    * ``nodes`` -- list of atom symbols (4th whitespace-separated field of
      every line that starts with four spaces),
    * ``links`` -- ``np.array`` of zero-based ``(atom, atom)`` index pairs
      taken from the first two fields of the remaining space-prefixed lines,
    * ``label`` -- 1 if the last label line seen was ``1.0``; 0 for ``-1.0``
      or when the record has no label line.

    Args:
        file: path of the SDF file to read.

    Returns:
        list of ``(nodes, links, label)`` tuples in file order.

    Note:
        Chunks that are completely empty are skipped.  Whitespace-only chunks
        (for example the newline after the final delimiter) are still parsed
        and produce a sample with no nodes and no links.
    """
    with open(file, 'r') as rf:
        content = rf.read()
    samples = content.split('$$$$')

    def parse_sample(s):
        """Extract atom symbols, bond index pairs and the label from one record."""
        lines = s.splitlines()    # one entry per text line of the record
        links = []                # (atom, atom) index pairs
        nodes = []                # atom symbols, in file order
        label = 0
        for l in lines:
            # a line consisting only of 1.0 / -1.0 carries the class label
            if l.strip() == '1.0':
                label = 1
            if l.strip() == '-1.0':
                label = 0
            if l.startswith('    '):
                # atom line: the fourth field is the element symbol
                feature = l.split()
                node = feature[3]
                nodes.append(node)
            elif l.startswith(' '):
                # other space-prefixed line: kept as a bond only when its first
                # index refers to an atom that has already been read
                lnk = l.split()
                if int(lnk[0]) - 1 < len(nodes):
                    links.append((
                        int(lnk[0])-1,
                        int(lnk[1])-1,

                    ))
        return nodes, np.array(links), label

    # `len(s)` instead of `len(s[0])`: indexing an empty chunk raised IndexError
    # (empty file, or a file that ends exactly on the delimiter).
    return [parse_sample(s) for s in tqdm(samples) if len(s) > 0]
from sklearn.model_selection import train_test_split

# Parse the labelled training molecules: one (nodes, links, label) tuple per record
training_set = read_sdf('train.sdf')

  0%|          | 0/25024 [00:00<?, ?it/s]

In [None]:
# Parse the molecules to be scored, using the same (nodes, links, label) layout.
# NOTE(review): if test_x.sdf has no label lines, every label is the parser's default 0.
testing_set  = read_sdf('test_x.sdf')

  0%|          | 0/12326 [00:00<?, ?it/s]

In [None]:
#Tokenizer importing
from tensorflow.keras.preprocessing.text import Tokenizer

# Upper bound on the token vocabulary; also used as the embedding table size later on
max_vocab = 500
# NOTE(review): max_len is not referenced anywhere else in the visible cells;
# batches are padded to the longest molecule in the batch instead.
max_len = 100 # maximum length of the tokenized vector



# One list of atom symbols per training molecule (element 0 of each parsed sample)
all_nodes = [s[0] for s in training_set]

# Fit the atom-symbol vocabulary on the training molecules
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(all_nodes)
#importing libraries
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random
random.seed(0) # seeds Python's `random`, which gen_batch uses for shuffling



def prepare_single_batch(samples):
    """Collate parsed molecules into one flat batch for the graph model.

    Every molecule's atom symbols are tokenized and post-padded to the longest
    molecule in the batch, then all molecules are laid out back to back so
    that molecule ``i`` owns node slots ``i * max_nodes_len`` up to
    ``(i + 1) * max_nodes_len - 1``.  Bond indices are shifted accordingly.

    Args:
        samples: sequence of ``(nodes, links, label)`` tuples as produced by
            ``read_sdf``.

    Returns:
        ``(inputs, labels)`` where ``inputs`` is a dict with

        * ``'data'``      -- 1-D array of token ids, one per node slot
          (0 marks padding),
        * ``'edges'``     -- ``(num_edges, 2)`` array of batch-wide node index
          pairs; shape ``(0, 2)`` when no molecule in the batch has a bond,
        * ``'node2grah'`` -- 1-D array giving the molecule index of every
          node slot,

        and ``labels`` is a 1-D array with one label per molecule.
    """
    sample_nodes = [s[0] for s in samples]
    sample_nodes = tokenizer.texts_to_sequences(sample_nodes)
    sample_nodes = pad_sequences(sample_nodes, padding='post')
    max_nodes_len = np.shape(sample_nodes)[1]
    # shift each molecule's bond indices into its own block of node slots
    edges = [s[1] + i * max_nodes_len for i, s in enumerate(samples)]
    # molecules without bonds contribute an empty array; drop those
    edges = [e for e in edges if len(e) > 0]

    node_to_graph = [[i] * max_nodes_len for i in range(len(samples))]

    all_nodes = np.reshape(sample_nodes, -1)
    if edges:
        all_edges = np.concatenate(edges)
    else:
        # np.concatenate([]) raises ValueError; emit an empty edge list instead
        all_edges = np.zeros((0, 2), dtype=int)

    node_to_graph = np.reshape(node_to_graph, -1)
    return {
        'data': all_nodes,
        'edges': all_edges,
        'node2grah': node_to_graph,
    }, np.array([s[2] for s in samples])


def gen_batch(dataset, batch_size=16, repeat=False, shuffle=True):
    """Yield collated batches from ``dataset``.

    Args:
        dataset: iterable of parsed samples; it is materialised into a list.
        batch_size: number of samples per batch (the last one may be smaller).
        repeat: when true, start another pass after each full pass, forever.
        shuffle: when true, reshuffle the samples with ``random.shuffle``
            before every pass.

    Yields:
        The result of ``prepare_single_batch`` for each consecutive slice.
    """
    items = list(dataset)
    first_pass = True
    while first_pass or repeat:
        first_pass = False
        if shuffle:
            # in-place: every pass reshuffles the order left by the previous one
            random.shuffle(items)
        total = len(items)
        for start in range(0, total, batch_size):
            stop = start + batch_size  # slicing clamps at the end of the list
            yield prepare_single_batch(items[start:stop])


In [None]:
!pip install --quiet tf2_gnn
from tf2_gnn.layers.gnn import GNN, GNNInput

In [None]:
import tensorflow as tf
from tensorflow.math import segment_mean #to calculate segmented mean
from tensorflow import keras
from tensorflow.keras import Input, Model #layers and model
from tensorflow.keras.layers import Embedding, Dense #layers
from tensorflow.keras.optimizers import Adam #optimizer
# Baseline graph classifier: embedding -> GNN -> per-graph mean -> sigmoid.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, overriding only the hidden size
params = GNN.get_default_hyperparameters()
params["hidden_dim"] = 40
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)

# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

gnn_out KerasTensor(type_spec=TensorSpec(shape=(None, 40), dtype=tf.float32, name=None), name='gnn/StatefulPartitionedCall:0', description="created by layer 'gnn'")
mean: KerasTensor(type_spec=TensorSpec(shape=(None, 40), dtype=tf.float32, name=None), name='tf.math.segment_mean/SegmentMean:0', description="created by layer 'tf.math.segment_mean'")
pred: KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), name='dense/Sigmoid:0', description="created by layer 'dense'")
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None,)]            0           []                      

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)


In [None]:
# Partition the training molecules by label (element 2 of each sample):
# label 0 goes to zero_class, everything else to one_class.
zero_class = [sample for sample in training_set if sample[2] == 0]
one_class = [sample for sample in training_set if sample[2] != 0]

In [None]:
from sklearn.utils import resample
# Oversample the label-1 molecules with replacement until there are as many
# of them as label-0 molecules; random_state makes the draw repeatable.
one_upsample = resample(one_class,
             replace=True,
             n_samples=len(zero_class),
             random_state=19)

In [None]:
# Class-balanced pool: all label-0 molecules followed by the oversampled label-1 molecules
training_balanced = [*zero_class,*one_upsample]

In [None]:
# Build the balanced train/validation partitions without leakage.
# Splitting the already oversampled pool put copies of the same molecule on
# both sides of the split, which inflates the validation score.  Instead each
# class is split first (15% held out) and the label-1 molecules are then
# oversampled separately inside each partition, so both stay class-balanced
# and share no molecule.
zero_train, zero_val = train_test_split(zero_class, test_size=0.15, random_state=19)
one_train, one_val = train_test_split(one_class, test_size=0.15, random_state=19)
training_balanced = [
    *zero_train,
    *resample(one_train, replace=True, n_samples=len(zero_train), random_state=19),
]
validation_balanced = [
    *zero_val,
    *resample(one_val, replace=True, n_samples=len(zero_val), random_state=19),
]

In [None]:
import math

batch_size = 8
# The validation generator emits batches of this size, so the number of
# validation steps must be derived from it rather than from `batch_size`
# (with 8 vs 16 every epoch evaluated two passes over the validation data).
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(training_balanced) / batch_size)
num_batchs_validation = math.ceil(len(validation_balanced) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        training_balanced, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=50,
    validation_data=gen_batch(
        validation_balanced, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x16620a147c0>

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_1.csv')

In [None]:
from  tf2_gnn.layers.message_passing import rgat,rgin,rgcn,gnn_film,ggnn,gnn_edge_mlp
#importing tensorflow and other libraries
import tensorflow as tf
from tensorflow.math import segment_mean #to calculate segmented mean
from tensorflow import keras
from tensorflow.keras import Input, Model #layers and model
from tensorflow.keras.layers import Embedding, Dense #layers
from tensorflow.keras.optimizers import Adam #optimizer



# Variant with a smaller embedding (75), hidden size 32 and an extra 8-unit dense layer.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 75-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 75)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, overriding only the hidden size
params = GNN.get_default_hyperparameters()
params["hidden_dim"] = 32
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# Small fully connected layer between the pooled graph vector and the output
fc1 = Dense(8,activation='relu')(avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(fc1)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer is passed, so Keras uses its default.
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)
import math

batch_size = 64
# The validation generator emits batches of this size, so the number of
# validation steps must be derived from it rather than from `batch_size`
# (with 64 vs 16 each epoch only evaluated about a quarter of the hold-out).
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(training_balanced) / batch_size)

num_batchs_validation = math.ceil(len(validation_balanced) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        training_balanced, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=2,
    validation_data=gen_batch(
        validation_balanced, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)


In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_2.csv')

In [None]:
from  tf2_gnn.layers.message_passing import RGAT,  MessagePassing, MessagePassingInput
#importing tensorflow and other libraries
import tensorflow as tf
from tensorflow.math import segment_mean #to calculate segmented mean
from tensorflow import keras
from tensorflow.keras import Input, Model #layers and model
from tensorflow.keras.layers import Embedding, Dense #layers
from tensorflow.keras.optimizers import Adam #optimizer
# Attention variant: same pipeline, with the 'rgat' message-passing class.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, switched to 'rgat' message passing with 3 heads and hidden size 12
# NOTE(review): 12 is a multiple of num_heads - presumably required by the layer; confirm.
params = GNN.get_default_hyperparameters()
params["message_calculation_class"] = 'rgat'
params["num_heads"] = 3
params["hidden_dim"] = 12
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
import math

# Binary cross-entropy for the single sigmoid output, tracking AUC
model.compile(loss='BinaryCrossentropy', metrics=['AUC'])

batch_size = 16

# Steps per pass over each split; the last batch may be partial, hence the ceil
num_batchs, num_batchs_validation = (
    math.ceil(len(split) / batch_size)
    for split in (training_balanced, validation_balanced)
)

# Endless generators; Keras ends each pass after the given number of steps
train_batches = gen_batch(training_balanced, batch_size=batch_size, repeat=True)
validation_batches = gen_batch(validation_balanced, batch_size=16, repeat=True)

model.fit(
    train_batches,
    steps_per_epoch=num_batchs,
    epochs=50,
    validation_data=validation_batches,
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_3.csv')

In [None]:
# Baseline graph classifier rebuilt from scratch: embedding 100, hidden size 40.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, overriding only the hidden size
params = GNN.get_default_hyperparameters()
params["hidden_dim"] = 40
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)

# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)


In [None]:
import math
# Hold out 15% of the molecules for validation.  The training part is bound to
# a new name: overwriting `training_set` with its own split made every run of
# this statement shrink the data by a further 15% and discard the previous
# hold-out.  A fixed random_state keeps the split identical between runs.
train_split, validation_set = train_test_split(
    training_set, test_size=0.15, random_state=0
)
batch_size = 8
# The validation generator emits batches of this size, so the number of
# validation steps must be derived from it rather than from `batch_size`.
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(train_split) / batch_size)

num_batchs_validation = math.ceil(len(validation_set) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        train_split, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=50,
    validation_data=gen_batch(
        validation_set, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_4.csv')

In [None]:
# Variant with a smaller embedding (75), hidden size 32 and an extra 8-unit dense layer.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 75-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 75)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, overriding only the hidden size
params = GNN.get_default_hyperparameters()
params["hidden_dim"] = 32
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# Small fully connected layer between the pooled graph vector and the output
fc1 = Dense(8,activation='relu')(avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(fc1)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)

In [None]:
import math
# Hold out 15% of the molecules for validation.  The training part is bound to
# a new name: overwriting `training_set` with its own split made every run of
# this statement shrink the data by a further 15% and discard the previous
# hold-out.  A fixed random_state keeps the split identical between runs.
train_split, validation_set = train_test_split(
    training_set, test_size=0.15, random_state=0
)
batch_size = 64
# The validation generator emits batches of this size, so the number of
# validation steps must be derived from it rather than from `batch_size`
# (with 64 vs 16 each epoch only evaluated about a quarter of the hold-out).
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(train_split) / batch_size)

num_batchs_validation = math.ceil(len(validation_set) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        train_split, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=2,
    validation_data=gen_batch(
        validation_set, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_5.csv')

In [None]:
# Attention variant: same pipeline, with the 'rgat' message-passing class.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, switched to 'rgat' message passing with 3 heads and hidden size 12
# NOTE(review): 12 is a multiple of num_heads - presumably required by the layer; confirm.
params = GNN.get_default_hyperparameters()
params["message_calculation_class"] = 'rgat'
params["num_heads"] = 3
params["hidden_dim"] = 12
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)

In [None]:
import math
# Hold out 15% of the molecules for validation.  The training part is bound to
# a new name: overwriting `training_set` with its own split made every run of
# this statement shrink the data by a further 15% and discard the previous
# hold-out.  A fixed random_state keeps the split identical between runs.
train_split, validation_set = train_test_split(
    training_set, test_size=0.15, random_state=0
)
batch_size = 16
# Batch size of the validation generator; the validation step count is derived
# from it so the two cannot drift apart when `batch_size` is tuned.
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(train_split) / batch_size)

num_batchs_validation = math.ceil(len(validation_set) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        train_split, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=2,
    validation_data=gen_batch(
        validation_set, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_6.csv')

In [None]:
# Attention variant: same pipeline, with the 'rgat' message-passing class.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, switched to 'rgat' message passing with 3 heads and hidden size 12
# NOTE(review): 12 is a multiple of num_heads - presumably required by the layer; confirm.
params = GNN.get_default_hyperparameters()
params["message_calculation_class"] = 'rgat'
params["num_heads"] = 3
params["hidden_dim"] = 12
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)

In [None]:
import math
# Hold out 15% of the molecules for validation.  The training part is bound to
# a new name: overwriting `training_set` with its own split made every run of
# this statement shrink the data by a further 15% and discard the previous
# hold-out.  A fixed random_state keeps the split identical between runs.
train_split, validation_set = train_test_split(
    training_set, test_size=0.15, random_state=0
)
batch_size = 16
# Batch size of the validation generator; the validation step count is derived
# from it so the two cannot drift apart when `batch_size` is tuned.
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(train_split) / batch_size)

num_batchs_validation = math.ceil(len(validation_set) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        train_split, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=30,
    validation_data=gen_batch(
        validation_set, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_7.csv')

In [None]:
# Variant with a smaller embedding (75), hidden size 32 and an extra 8-unit dense layer.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 75-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 75)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, overriding only the hidden size
params = GNN.get_default_hyperparameters()
params["hidden_dim"] = 32
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# Small fully connected layer between the pooled graph vector and the output
fc1 = Dense(8,activation='relu')(avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(fc1)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)

In [None]:
import math
# Hold out 15% of the molecules for validation.  The training part is bound to
# a new name: overwriting `training_set` with its own split made every run of
# this statement shrink the data by a further 15% and discard the previous
# hold-out.  A fixed random_state keeps the split identical between runs.
train_split, validation_set = train_test_split(
    training_set, test_size=0.15, random_state=0
)
batch_size = 16
# Batch size of the validation generator; the validation step count is derived
# from it so the two cannot drift apart when `batch_size` is tuned.
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(train_split) / batch_size)

num_batchs_validation = math.ceil(len(validation_set) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        train_split, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=10,
    validation_data=gen_batch(
        validation_set, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_8.csv')

In [None]:
# Baseline graph classifier rebuilt from scratch: embedding 100, hidden size 40.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, overriding only the hidden size
params = GNN.get_default_hyperparameters()
params["hidden_dim"] = 40
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)

# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)

In [None]:
batch_size = 64
# The validation generator emits batches of this size, so the number of
# validation steps must be derived from it rather than from `batch_size`
# (with 64 vs 16 each epoch only evaluated about a quarter of the hold-out).
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(training_balanced) / batch_size)

num_batchs_validation = math.ceil(len(validation_balanced) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        training_balanced, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=10,
    validation_data=gen_batch(
        validation_balanced, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_9.csv')

In [None]:
# Attention variant: same pipeline, with the 'rgat' message-passing class.
# Symbolic inputs, matching the dict produced by prepare_single_batch:
#   data       - flat vector of token ids, one per (padded) node slot in the batch
#   edge       - one row per bond: a pair of batch-wide node indices
#   node2graph - for every node slot, the index of the molecule it belongs to
data = keras.Input(batch_shape=(None,))
edge = keras.Input(batch_shape=(None, 2), dtype=tf.int32)
node2graph = keras.Input(batch_shape=(None,), dtype=tf.int32)
# Look up a 100-dimensional vector for each token id
embeded = Embedding(tokenizer.num_words, 100)(data)
# Graph ids start at 0, so the largest id + 1 is the number of graphs in the batch
num_graph = tf.reduce_max(node2graph)+1
# Single-element adjacency tuple: all bonds are passed as one edge list
gnn_input = GNNInput(
    node_features=embeded,
    adjacency_lists=(edge,),
    node_to_graph_map=node2graph,
    num_graphs=num_graph,
)
# Library defaults, switched to 'rgat' message passing with 3 heads and hidden size 12
# NOTE(review): 12 is a multiple of num_heads - presumably required by the layer; confirm.
params = GNN.get_default_hyperparameters()
params["message_calculation_class"] = 'rgat'
params["num_heads"] = 3
params["hidden_dim"] = 12
gnn_layer = GNN(params)
# Node-level output of the GNN (printed to inspect its shape)
gnn_out = gnn_layer(gnn_input)
print('gnn_out', gnn_out)
# Average the node rows that share a graph id -> one row per molecule.
# NOTE(review): padded slots also carry a graph id (see prepare_single_batch),
# so they appear to be included in this mean - confirm this is intended.
avg = segment_mean(
    data=gnn_out,
    segment_ids=node2graph
    )
print('mean:', avg)
# One sigmoid unit per molecule: the predicted probability of label 1
pred = Dense(1, activation='sigmoid')(avg)
print('pred:', pred)
# Input keys must match the batch dict keys (including the 'node2grah' spelling)
model = Model(
    inputs={
        'data': data,
        'edges': edge,
        'node2grah': node2graph,
    },
    outputs=pred
)
# Print the layer-by-layer summary
model.summary()

In [None]:
# Binary cross-entropy for the single sigmoid output; AUC is the tracked metric.
# No optimizer argument is passed, so Keras falls back to its default
# (NOTE(review): 'rmsprop' per the Keras docs - confirm for the installed version).
model.compile(
    loss='BinaryCrossentropy',
    metrics=['AUC']
)

In [None]:

batch_size = 64
# The validation generator emits batches of this size, so the number of
# validation steps must be derived from it rather than from `batch_size`
# (with 64 vs 16 each epoch only evaluated about a quarter of the hold-out).
val_batch_size = 16

# One epoch = one full pass; the last batch may be partial, hence the ceil
num_batchs = math.ceil(len(training_balanced) / batch_size)

num_batchs_validation = math.ceil(len(validation_balanced) / val_batch_size)

# Both generators repeat forever; Keras stops each pass after the given step count
model.fit(
    gen_batch(
        training_balanced, batch_size=batch_size, repeat=True
    ),
    steps_per_epoch=num_batchs,
    epochs=10,
    validation_data=gen_batch(
        validation_balanced, batch_size=val_batch_size, repeat=True
    ),
    validation_steps=num_batchs_validation,
)

In [None]:
import pandas as pd

# Score the test molecules in file order (no shuffling), so row i of the
# submission belongs to the i-th parsed test sample.
test_batches = gen_batch(testing_set, batch_size=16, shuffle=False)
y_pred = np.reshape(model.predict(test_batches), -1)

# One predicted probability per row, indexed by an 'id' column
submission = pd.DataFrame({'label': y_pred}).rename_axis('id')
submission.to_csv('trail_10.csv')

# Based on the provided template, describe the format of the input file (sdf file).
The input file is a structure-data file (SDF). It contains information about the chemical composition of a molecule. An SDF file stores the position of each individual atom in the chemical compound (the atom block) and also describes the connections between the atoms. The bond block describes the bonding structure of the compound. Both of these blocks are used in this assignment to extract information about the compound and store it in the form of nodes and edges. Each node is an atom of the molecule. Different molecules are delimited by the '$$$$' expression.
Each sample/molecule starts with a header which gives the name/title of the compound. Other sections include information about the atom count, version number, connections, etc. The atom block lists the elements of the compound.

# What are the input tensors to the neural network model (their meaning, not just symbol)? What is each of their dims and their meaning (e.g. batch_size?)?
 The input tensors:
data: The shape for each batch is (batch_size * max_len_nodes,), where batch_size is the number of samples in the batch and max_len_nodes is the length of each tokenized node sequence after padding. The data tensor contains the nodes (atoms) of the chemical compounds in tokenized form. The nodes of each compound are extracted, tokenized with the tokenizer, and finally padded with the pad_sequences method.
node2graph: The shape for each batch is (batch_size * max_len_nodes,), with batch_size and max_len_nodes as defined above.
It holds, for every node position in data, the index of the sample (graph) that the node belongs to; these indices are the segment ids used by the segmented mean.
edge: The shape of edge is (sum_of_all_edges, 2), where sum_of_all_edges is the total number of edges over all samples in the batch. For example, in a batch of 3 samples with 21, 20 and 40 edges respectively, the edge tensor has shape (81, 2). Each row holds the indices of the two connected atoms, so edge is the input tensor which carries the information about connections between atoms.

# For each dim of gnn_out, what does it represent? For each dim of avg, what does it represent?
gnn_out: gnn_out has shape (batch_size * max_len_nodes, hidden_dim). The first dimension indexes every node position of the flattened batch (the length of the tokenized data vector for the complete batch). The second dimension is the size of the hidden representation (hidden_dim, e.g. 40 in the first model) that the GNN produces for each node after aggregating the messages from its neighbours.

avg: The avg tensor has shape (batch_size, hidden_dim). It collects the information of each sample and represents it in the form of a mean. Each sample has one segment id, so segment_mean averages all rows of gnn_out that belong to that sample and represents the sample with one number per hidden dimension. In other words, avg is the segmented mean of gnn_out based on the segment ids: for each sample in the batch, the (max_len_nodes, hidden_dim) slice of gnn_out is reduced to a single vector of length hidden_dim.

# What is the difference between segment_mean and tf.reduce_mean? For each dim of pred, what does it represent?
 segment_mean takes the mean over the rows of the data that share the same segment id, so it returns one mean per segment (here, one per graph).
pred: The final output is a number which represents the probability associated with each chemical compound about its activity, i.e. the probability that the compound is active against the cancer cell. The shape of pred is (batch_size, 1): one probability for each sample.
reduce_mean: computes the mean of elements across the given dimensions of a tensor, without any grouping by id; with no axis argument it reduces the whole tensor to a single value.



# What is the motivation / theory / idea to use multiple gcn layers compared to just one? How many layers were used in the template?
The default number of layers is 4, as given in the documentation. The default message passing method is rgcn (graph convolution layers). A single layer only lets a node aggregate information from its direct neighbours; stacking k layers lets information travel k bonds, so each atom's representation reflects a larger part of the molecule. Using multiple gcn layers therefore helps to capture the full complexity of the graph and creates a better model. The template uses the default setting for the number of layers in the gcn network.

Problem Description: This is a binary classification problem on graph data. The task is to predict the anticancer activity of a chemical compound from its chemical structure. A compound can be positive or negative against the lung cancer cell and is therefore labelled as either 0 or 1. The data is in the form of a graph which represents the chemical structure of the compound. Each sample contains information about the atoms of the molecule and the connections between them, so in this problem the features are the atoms and the connections.

Methods
The nodes (atoms) are given as characters. They are therefore treated as a sequence of text data, and the best way to represent such a sequence is to tokenize it and then add an embedding layer.
The first step is to read the sdf file to get the information about the atoms and their connectivity in the compound. The atoms are described as nodes and connections are described as edges. The read_sdf method is used to read sdf file and the chemical composition of the compound.
Graph convolutional network is used in this assignment to calculate the probability of the output class. Different methods differ in implementing message passing methods as:
R-GIN Method: (Relational Graph Isomorphism Network message propagation layer) Computes new graph states by neural message passing, using MLPs for state updates and message computation.
GGNN Method: (Gated graph neural network layer) Compute new graph states by neural message passing and gated units on the nodes. This method works best for this problem as given in Aggregation Method 2 below.
R-GCN Method: (Graph convolution layers) Compute new graph states by neural message passing.
R-GAT Method: (Relation graph attention network layer) Compute new graph states by neural message passing using attention.



