In [2]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
import torch_geometric
from torch_geometric.data import Data
import networkx as nx
import torch
from transformers import GraphormerModel, GraphormerConfig
from transformers import T5EncoderModel
from transformers import T5Tokenizer
SEED = 42
import random
torch.manual_seed(SEED)
torch.random.manual_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.cuda.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

ModuleNotFoundError: No module named 'torch_geometric'

In [3]:
# pip install torch_geometric datasets transformers accelerate

In [3]:
from torch.utils.data import Dataset

class QuestionAnswerDataset(Dataset):

    def __init__(self, df, tokenizer, max_length, context_key="answerEntity",
                 tokenizer_truncation="only_first"):
        super(QuestionAnswerDataset).__init__()

        self.questions = df.question.values
        self.contexts = df[context_key].values
        self.labels = torch.tensor(df.correct.values, dtype=torch.float32)#torch.tensor([0]*10961, dtype=torch.float32)##torch.tensor(df.correct.values, dtype=torch.float32)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.tokenized_input = [tokenizer.encode_plus(x, y,
                              max_length=self.max_length,
                              padding="max_length",
                              truncation=tokenizer_truncation,
                              return_tensors="pt", ) \
                          for x, y in zip(self.questions,
                                          self.contexts)]
        print(len(self.questions))
        print(len(self.contexts))
        print(len(self.labels))
        assert len(self.questions) == len(self.contexts) == len(self.labels)

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return {
                "input_ids": self.tokenized_input[idx]["input_ids"][0],
                "attention_mask" : self.tokenized_input[idx]["attention_mask"][0],
                "labels": self.labels[idx]}

In [4]:
training_data = "/kaggle/input/kbqa-classification/train.tsv"#"/kaggle/input/kbqa-classification/train.tsv"#"/kaggle/input/kbqa-classification/train.tsv"#"/kaggle/input/kbqa-classification/train.tsv"
train_data = pd.read_csv(training_data, sep="\t")
train_data["graph"] = train_data["graph"].apply(eval)
train_data.head(2)

Unnamed: 0,sample_id,question,questionEntity,answerEntity,groundTruthAnswerEntity,answerEntityId,questionEntityId,groundTruthAnswerEntityId,correct,graph
0,0,Whst is the name of the head of state and high...,Iran,Ruhollah Khomeini's return to Iran,Office of the Supreme Leader of Iran,Q7293530,Q794,Q16045000,False,"{'nodes': [{'type': 'QUESTIONS_ENTITY', 'name_..."
1,1,Whst is the name of the head of state and high...,Iran,Ruhollah Khomeini's letter to Mikhail Gorbachev,Office of the Supreme Leader of Iran,Q5952984,Q794,Q16045000,False,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q417..."


In [7]:
# train_data['correct']=train_data['GT']

In [8]:
# graph_json = train_data.iloc[0].graph
# graph_json['directed']=True
# nx_graph = nx.node_link_graph(graph_json, )


In [9]:
# nx_graph

In [10]:
def linearize_graph(graph_dict):
    nodes = sorted((node_dict for node_dict in graph_dict["nodes"]), key=lambda d:d["id"])
    for n_id, node_dict in enumerate(nodes):
        assert n_id == node_dict["id"]
    src_node_id2links = {}
    # print("graph_dict", graph_dict)
    # print("links", graph_dict["links"])
    for link_dict in graph_dict["links"]:
        link_src =  link_dict["source"]
        if src_node_id2links.get(link_src) is None:
            src_node_id2links[link_src] = []
        src_node_id2links[link_src].append(link_dict)
    graph_s = ""
    # print("src_node_id2links", src_node_id2links)
    for n_id, node_dict in enumerate(nodes):
        links = src_node_id2links.get(n_id, list())
        start_label = node_dict["label"]
        if node_dict["type"] == "ANSWER_CANDIDATE_ENTITY":
            start_label = f"{SEP_TOKEN} {start_label} {SEP_TOKEN}"
        for link_dict in links:
            target_label = nodes[link_dict["target"]]["label"]
            if nodes[link_dict["target"]]["type"] == "ANSWER_CANDIDATE_ENTITY":
                target_label = f"{SEP_TOKEN} {target_label} {SEP_TOKEN}"
            link_s = f" {start_label}, {link_dict['label']}, {target_label} "
            graph_s += link_s
        # graph_s += node_dict["label"]
        # print("n_id, node_dict", n_id, node_dict)
        # if n_id != len(nodes) - 1:


        #     link_label = link["label"]
        #     graph_s += link_label
    # print('--')
    return graph_s

In [11]:
# train_data['qa'] = train_data['question']+" ; "+train_data['answerEntity']
tokenizer = T5Tokenizer.from_pretrained("DeepPavlov/t5-wikidata5M-with-neighbors")#("s-nlp/t5_large_ssm_nq_mintaka")#("DeepPavlov/t5-wikidata5M-with-neighbors")
# qa_tokens = []
SEP_TOKEN = tokenizer.sep_token
train_data["linearized_graph"] = train_data["graph"].apply(linearize_graph)


tokenizer_config.json:   0%|          | 0.00/1.94k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.81k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [1]:
train_data[27]

NameError: name 'train_data' is not defined

In [12]:
train_data['linearized_graph'][27]

' English, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  World War I, on focus list of Wikimedia project, Wikipedia:Vital articles/Level/4  [SEP] Franz Ferdinand [SEP], language of work or name, English '

In [13]:
train_data['graph'][27]

{'nodes': [{'type': 'INTERNAL',
   'name_': 'Q6173448',
   'id': 0,
   'label': 'Wikipedia:Vital articles/Level/4'},
  {'type': 'INTERNAL', 'name_': 'Q1860', 'id': 1, 'label': 'English'},
  {'type': 'QUESTIONS_ENTITY',
   'name_': 'Q361',
   'id': 2,
   'label': 'World War I'},
  {'type': 'ANSWER_CANDIDATE_ENTITY',
   'name_': 'Q829973',
   'id': 3,
   'label': 'Franz Ferdinand'}],
 'links': [{'name_': 'P5008',
   'source': 1,
   'target': 0,
   'label': 'on focus list of Wikimedia project'},
  {'name_': 'P5008',
   'source': 2,
   'target': 0,
   'label': 'on focus list of Wikimedia project'},
  {'name_': 'P407',
   'source': 3,
   'target': 1,
   'label': 'language of work or name'}]}

In [296]:
# model_name="sentence-transformers/all-mpnet-base-v2"
# from transformers import AutoTokenizer, AutoModel
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# bert_model = AutoModel.from_pretrained(model_name)
# SEP_TOKEN = tokenizer.sep_token
# train_data["linearized_graph"] = train_data["graph"].apply(linearize_graph)


In [297]:
# train_data['correct'] = [0]*10961

In [305]:
train_dataset = QuestionAnswerDataset(train_data, tokenizer=tokenizer, max_length=128,context_key="linearized_graph",
                                      tokenizer_truncation="only_second")
train_dataset

37672
37672
37672


<__main__.QuestionAnswerDataset at 0x7f469d2308b0>

In [306]:
train_data['qa_tokens']=[dict(item)['input_ids'] for item in train_dataset]

In [139]:
# true_labels_df = train_data[train_data['correct'] ==True].copy()
# duplicated_true_labels = pd.concat([true_labels_df] * 3, ignore_index=True)

# # Concatenate the original DataFrame and duplicated true labels
# train_data = pd.concat([train_data, duplicated_true_labels], ignore_index=True)
# train_data.info()

In [140]:
# train_data.to_csv('prep_dataset.csv', index=False)

In [307]:
# # Copyright (c) Microsoft Corporation and HuggingFace
# # Licensed under the MIT License.

from typing import Any, Dict, List, Mapping


class GraphormerDataCollator:
    def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
        # if not is_cython_available():
            # raise ImportError("Graphormer preprocessing needs Cython (pyximport)")

        self.spatial_pos_max = spatial_pos_max
        self.on_the_fly_processing = on_the_fly_processing

    def __call__(self, features: List[dict]) -> Dict[str, Any]:
        # if self.on_the_fly_processing:
        #     features = [preprocess_item(i) for i in features]
        try:
            if not isinstance(features[0], Mapping):
                features = [vars(f) for f in features]
            batch = {}
            max_node_num = max(len(i["input_nodes"]) for i in features)
            node_feat_size = len(features[0]["input_nodes"][0])
            edge_feat_size = len(features[0]["attn_edge_type"][0][0])
            max_dist = max(len(i["input_edges"][0][0]) for i in features)
            edge_input_size = len(features[0]["input_edges"][0][0][0])
            batch_size = len(features)

            batch["attn_bias"] = torch.zeros(batch_size, max_node_num + 1, max_node_num + 1, dtype=torch.float)
            batch["attn_edge_type"] = torch.zeros(batch_size, max_node_num, max_node_num, edge_feat_size, dtype=torch.long)
            batch["spatial_pos"] = torch.zeros(batch_size, max_node_num, max_node_num, dtype=torch.long)
            batch["in_degree"] = torch.zeros(batch_size, max_node_num, dtype=torch.long)
            batch["input_nodes"] = torch.zeros(batch_size, max_node_num, node_feat_size, dtype=torch.long)
            batch["input_edges"] = torch.zeros(
                batch_size, max_node_num, max_node_num, max_dist, edge_input_size, dtype=torch.long
            )
            batch["qa_tokens"] = torch.zeros(batch_size,1, 128, dtype=torch.int)  # Initialize qa_tokens tensor

            for ix, f in enumerate(features):
                for k in ["attn_bias", "attn_edge_type", "spatial_pos", "in_degree", "input_nodes", "input_edges", "qa_tokens"]:
                    f[k] = torch.tensor(f[k])

                if len(f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max]) > 0:
                    f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max] = float("-inf")

                batch["attn_bias"][ix, : f["attn_bias"].shape[0], : f["attn_bias"].shape[1]] = f["attn_bias"]
                batch["attn_edge_type"][ix, : f["attn_edge_type"].shape[0], : f["attn_edge_type"].shape[1], :] = f[
                    "attn_edge_type"
                ]
                batch["spatial_pos"][ix, : f["spatial_pos"].shape[0], : f["spatial_pos"].shape[1]] = f["spatial_pos"]
                batch["in_degree"][ix, : f["in_degree"].shape[0]] = f["in_degree"]
                batch["input_nodes"][ix, : f["input_nodes"].shape[0], :] = f["input_nodes"]
                batch['qa_tokens'][ix] = f['qa_tokens'][0]
                batch["input_edges"][
                    ix, : f["input_edges"].shape[0], : f["input_edges"].shape[1], : f["input_edges"].shape[2], :
                ] = f["input_edges"]

            batch["out_degree"] = batch["in_degree"]
    #         batch["qa"]
            sample = features[0]["labels"]
            if len(sample) == 1:  # one task
                if isinstance(sample[0], float):  # regression
                    batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
                else:  # binary classification
                    batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
            else:  # multi task classification, left to float to keep the NaNs
                batch["labels"] = torch.from_numpy(np.stack([i["labels"] for i in features], axis=0))

            return batch
        except:
            pass

In [308]:
# import MultiModalDiscussionTransformer.mDT.src.data.pyg_datasets.pre_processing as p
from transformers.models.graphormer.collating_graphormer import preprocess_item #GraphormerDataCollator
# from MultiModalDiscussionTransformer.mDT.src.data import algos # import floyd_warshall, gen_edge_input  # Import Cython functions
import numpy as np
import json

def json_graph_to_pyg_data(G, graph_json):
    # Create a NetworkX graph from the JSON data
    # G = nx.Graph()

    # # Add nodes with features
    # for node in json_graph['nodes']:
    #     G.add_node(node['id'], type=node['type'], name=node['name_'], label=node['label'])

    # # Add edges with attributes
    # for link in json_graph['links']:
    #     G.add_edge(link['source'], link['target'], name=link['name_'], label=link['label'])
    # G = nx.node_link_graph(graph_json, )
    # Convert NetworkX graph to PyTorch Geometric data
    edges = torch.tensor(list(G.edges()), dtype=torch.long).t().contiguous()
    # print(edges)
    edge_label_map = {label: idx for idx, label in enumerate(set(link['label'] for link in graph_json['links']))}
    # print(edge_label_map)
    edge_attrs = torch.tensor([edge_label_map[G[u][v][0]['label']] for u, v in list(G.edges())], dtype=torch.long).unsqueeze(1)

    # print(edge_attrs)
    x = torch.tensor([[node['id']] for node in graph_json['nodes']], dtype=torch.float)  # Using node IDs as features

    dist_matrix = np.zeros((len(G.nodes), len(G.nodes)), dtype=np.float32)
    for u, v, d in G.edges(data=True):
        dist_matrix[u][v] = 1  # For non-weighted graph, set distance as 1

    for i in range(len(G.nodes)):
        dist_matrix[i][i] = 0  # Diagonal elements

    dist_matrix = nx.floyd_warshall_numpy(G, weight=None)
    data = Data(node_feat=x, edge_index=edges, edge_attr=edge_attrs, distance_matrix=[dist_matrix], num_nodes = x.size(0), y=[0])

    return data

In [315]:
# Mine
from tqdm import tqdm
dataset = []
indexes = []
for ind, item in tqdm(train_data.iterrows()):
    try:
        graph_json = item.graph
        graph_json["directed"] = True
        G = nx.node_link_graph(graph_json, )

        data = json_graph_to_pyg_data(G, graph_json)
        data=preprocess_item(data)
#         print(data)
#         print(item.qa_tokens)
        data['qa_tokens'] = item.qa_tokens
        data.y =[1 if item.correct==True else 0]
        # data  = preprocess_item(data)
        for key in data.keys():
            data[key] = torch.tensor(data[key])
            data[key] = data[key].unsqueeze(0)
        dataset.append(data)
    except:
        print(ind)
        indexes.append(ind)
#         dataset.append(Data(x=[0]))
        pass

  data[key] = torch.tensor(data[key])
319it [00:00, 649.92it/s]

190


2152it [00:03, 689.20it/s]

2072


2439it [00:03, 705.10it/s]

2337


4655it [00:07, 593.15it/s]

4549


6189it [00:09, 678.42it/s]

6068


7224it [00:11, 694.47it/s]

7148
7215


7581it [00:11, 682.45it/s]

7478


8055it [00:12, 651.16it/s]

7944


9161it [00:14, 654.76it/s]

9042


12272it [00:18, 642.50it/s]

12206


12649it [00:19, 598.56it/s]

12578
12619


13037it [00:20, 633.49it/s]

12911


13754it [00:21, 645.60it/s]

13665
13744
13766


13897it [00:21, 646.04it/s]

13811
13834
13871
13888


15664it [00:24, 617.64it/s]

15589


16455it [00:25, 674.79it/s]

16354


17110it [00:26, 624.55it/s]

17005


18322it [00:31, 532.16it/s]

18203


18668it [00:31, 647.18it/s]

18563
18574
18585


19585it [00:33, 653.65it/s]

19459
19481
19493
19503
19574


19801it [00:33, 692.87it/s]

19660


19938it [00:33, 657.91it/s]

19825


21129it [00:35, 625.94it/s]

21011
21113


21261it [00:35, 638.84it/s]

21173
21186
21207
21237


21394it [00:35, 642.84it/s]

21310


22088it [00:37, 652.94it/s]

22000
22043
22063
22086
22109


22221it [00:37, 634.73it/s]

22151
22179


22806it [00:38, 645.69it/s]

22729
22837


23403it [00:39, 649.29it/s]

23287
23303
23359


24138it [00:40, 575.23it/s]

24060


24826it [00:41, 599.04it/s]

24744
24756


25292it [00:42, 642.96it/s]

25194


26209it [00:43, 650.43it/s]

26120
26171


26345it [00:43, 656.67it/s]

26274
26304


27251it [00:45, 624.88it/s]

27127
27138
27233


27446it [00:45, 639.03it/s]

27363
27376


27700it [00:45, 617.79it/s]

27581
27604
27662


27976it [00:46, 670.99it/s]

27901


28518it [00:47, 631.65it/s]

28425


28782it [00:47, 634.85it/s]

28657


29131it [00:48, 690.48it/s]

29051
29059
29072
29139
29149


29343it [00:48, 696.06it/s]

29209
29229
29260
29296
29305


29697it [00:48, 683.20it/s]

29571
29645
29669


30525it [00:50, 627.75it/s]

30406


30913it [00:50, 639.27it/s]

30815


31122it [00:51, 671.93it/s]

31037
31103
31129
31169


31259it [00:51, 673.76it/s]

31178


31607it [00:51, 682.04it/s]

31520


32104it [00:52, 684.13it/s]

32030


32315it [00:52, 687.67it/s]

32232
32236


32657it [00:53, 665.31it/s]

32529


32798it [00:53, 683.96it/s]

32676
32708
32756
32773


33005it [00:53, 681.70it/s]

32874
32895
32973
32983
33009


33148it [00:54, 690.43it/s]

33046


33495it [00:54, 672.18it/s]

33401


33848it [00:55, 698.07it/s]

33750
33760


34290it [00:55, 723.41it/s]

34157
34178


34434it [00:56, 679.52it/s]

34351
34393
34439
34477


34717it [00:56, 700.40it/s]

34577
34586


35073it [00:56, 677.42it/s]

35004
35049
35065
35092
35120


35283it [00:57, 687.89it/s]

35144
35153
35166
35195


35985it [00:58, 563.54it/s]

35919


36249it [00:58, 640.79it/s]

36128


36778it [00:59, 612.94it/s]

36664


37672it [01:01, 615.49it/s]


In [316]:
from torch_geometric.loader import DataLoader
data_collator=GraphormerDataCollator()
loader = DataLoader(dataset, batch_size=1, collate_fn=data_collator, shuffle=False)

In [317]:
len(loader)

37541

In [318]:
generator1 = torch.Generator().manual_seed(SEED)
test_dataset, dev_dataset = torch.utils.data.random_split(dataset, [3700, 33841], generator=generator1)

In [319]:
train_loader = DataLoader(dev_dataset, batch_size=1, collate_fn=data_collator, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=data_collator, shuffle=False)

In [24]:
from datasets import DatasetDict
dataset_dict = DatasetDict({'train': dataset}, )

In [320]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class AttentionPooling(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AttentionPooling, self).__init__()
        self.linear = nn.Linear(input_size, hidden_size)
        self.attention_weights = nn.Linear(hidden_size, 1)

    def forward(self, last_hidden_state):
        # Apply linear transformation to obtain attention scores
        attention_scores = self.attention_weights(torch.tanh(self.linear(last_hidden_state)))

        # Apply softmax to get attention weights
        attention_weights = F.softmax(attention_scores, dim=0)

        # Apply attention weights to node embeddings and sum to get graph embedding
        graph_embedding = torch.sum(torch.mul(last_hidden_state, attention_weights), dim=0)

        return graph_embedding


class InterHead(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.text_fc = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.graph_fc = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.softmax = nn.Softmax(dim=0)
        self.atn_layer = AttentionPooling(input_size=512, hidden_size=512)

    def forward(self, text, graph):
        # text = torch.mean(text[0], dim=0)
        # graph = self.atn_layer(graph[0])
        text_sen = self.text_fc(text)
        graph_sen = self.graph_fc(graph)

        a_tt = torch.mul(text, text_sen).sum(-1)
        a_tg = torch.mul(text, graph_sen).sum(-1)
        a_gt = torch.mul(graph, text_sen).sum(-1)
        a_gg = torch.mul(graph, graph_sen).sum(-1)

        a_tt, a_tg = self.softmax(torch.stack([a_tt, a_tg])).split([1, 1], dim=0)
        a_gt, a_gg = self.softmax(torch.stack([a_gt, a_gg])).split([1, 1], dim=0)

        a_tt = a_tt.squeeze(0).unsqueeze(-1)
        a_tg = a_tg.squeeze(0).unsqueeze(-1)
        a_gt = a_gt.squeeze(0).unsqueeze(-1)
        a_gg = a_gg.squeeze(0).unsqueeze(-1)

        text = torch.mul(a_tt, text) + torch.mul(a_tg, graph)
        graph = torch.mul(a_gt, text) + torch.mul(a_gg, graph)
        return text, graph

class TextModule(nn.Module):
    def __init__(self):
        super().__init__()
        T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
        self.auto_model = T5EncoderModel.from_pretrained("DeepPavlov/t5-wikidata5M-with-neighbors")
        self.tokenizer = T5Tokenizer.from_pretrained("DeepPavlov/t5-wikidata5M-with-neighbors")
        # self.auto_model = bert_model
    def forward(self, inputs):
        hidden_state = self.auto_model(inputs)
        return hidden_state['last_hidden_state']


class GraphormerModule(nn.Module):
    def __init__(self):
        super().__init__()
        config = GraphormerConfig(embedding_dim=512)
        self.model = GraphormerModel.from_pretrained("clefourrier/pcqm4mv2_graphormer_base", ignore_mismatched_sizes=True,config=config )
#         for layer in self.model.children():
#             if hasattr(layer, 'reset_parameters'):
#                 layer.reset_parameters()
    def forward(self, input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type):
        graph_encoding = self.model(input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type)
        outputs, hidden_states = graph_encoding["last_hidden_state"], graph_encoding["hidden_states"]
        return outputs, hidden_states

class T5GrapormerInteractModel(nn.Module):
    def __init__(self, layers=10, dropout=0.1):
        super().__init__()
        # self.layers=layers
        # self.dropout=droput
        self.text_encoder = TextModule()
        self.graph_encoder = GraphormerModule()
        for param in self.text_encoder.parameters():
            param.requires_grad = True
        for param in self.graph_encoder.parameters():
            param.requires_grad = True
        self.inter = nn.ModuleList()
        for i in range(10):
            self.inter.append(InterHead(hidden_dim=512))
        self.cls = nn.Linear(512, 1)
        self.act_func = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        # self.dense = nn.linear(512, 512)
        self.fc = nn.Linear(2 * 512, 512)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(512, 1),
        )
        self.atn_layer=AttentionPooling(input_size=512, hidden_size=512)
    def forward(self, inputs):
        text_rep = self.text_encoder(inputs['qa_tokens'])
        # print(text_rep)
        graph_rep, _ = self.graph_encoder(torch.tensor(inputs['input_nodes']), torch.tensor(inputs['input_edges']), torch.tensor(inputs['attn_bias']), torch.tensor(inputs['in_degree']), torch.tensor(inputs['out_degree']), torch.tensor(inputs['spatial_pos']), torch.tensor(inputs['attn_edge_type']))
        # print(graph_rep)
#         print(text_rep, graph_rep)
        text_rep = torch.stack([elem[0, :] for elem in text_rep])#torch.stack([elem[0, :] for elem in text_rep])
        graph_rep = torch.stack([elem[0, :] for elem in graph_rep])#torch.stack([elem[0, :] for elem in graph_rep])
        org_text_rep = text_rep
        org_graph_rep = graph_rep
        for i in range(10-1):
            text_rep, graph_rep = self.inter[i](text_rep, graph_rep)
#         print(text_rep, graph_rep)
        text_code = org_text_rep+text_rep#torch.mul(org_text_rep, text_rep)
        graph_code = org_text_rep+graph_rep#torch.mul(org_graph_rep,graph_rep)
        text_code = self.dropout(self.act_func(text_code))
        graph_code = self.dropout(self.act_func(graph_code))
        rep = torch.cat([text_code,
                             graph_code],dim=-1)
#         print(rep)
        rep = self.act_func(self.dropout(self.fc(rep)))
        class_logits = self.classifier(rep)
#         class_logits = torch.sigmoid(class_logits)
        return class_logits






In [324]:
interact_model = T5GrapormerInteractModel()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of GraphormerModel were not initialized from the model checkpoint at clefourrier/pcqm4mv2_graphormer_base and are newly initialized: ['graph_encoder.graph_attn_bias.edge_dis_encoder.weight', 'graph_encoder.graph_attn_bias.edge_encoder.weight', 'graph_encoder.graph_attn_bias.graph_token_virtual_distance.weight', 'graph_encoder.graph_attn_bias.spatial_pos_encoder.weight', 'graph_encoder.graph_node_feature.atom_encoder.weight', 'graph_encoder.graph_node_feature.graph_token.weight', 'graph_encoder.graph_node_feature.in_degree_encoder.weight', 'graph_encoder.graph_node_feature.out_degree_encoder.weight', 'graph_encoder.layers.0.fc1.bias', 'graph_encoder.layers.0.fc1.weight', 'graph_encoder.layers.0.fc2.bias', 'graph_encoder.layers.0.fc2.weight', 'graph_encoder.layers.0.final_layer_norm.bias', 'graph_encoder.layers.0.final_layer_norm.weight', 'graph_encoder.layer

In [43]:
# interact_model.load_state_dict(torch.load('/content/drive/MyDrive/results_epoch16.pt', map_location=torch.device('cpu')))

In [28]:
# pip install wandb

In [322]:
import wandb
wandb.init(
    # set the wandb project where this run will be logged
    project="kadduformer",
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
avg_test_loss,▇█▁▃
avg_train_loss,█▄▂▁
epoch,▁▁▃▃▆▆██
training_loss,█▆▄▂▅▄▅▄▄▂▂▃▄▄▅▆▃▃▃▄▃▃▃▅▂▃▁▁▂▃▁▃▄▃▂▁▄▃▂▄

0,1
avg_test_loss,0.17928
avg_train_loss,0.16118
epoch,3.0
training_loss,0.18538


In [46]:
# from google.colab import drive
# drive.mount('/content/drive')

In [323]:
import numpy as np

def calculate_pos_weights(num_positives, num_negatives):
  """
  Calculates class weight for positive class in binary crossentropy with logits loss.

  Args:
      num_positives: int Number of positive samples.
      num_negatives: int Number of negative samples.

  Returns:
      float Class weight for the positive class.
  """
  if num_positives == 0:
    raise ValueError("Number of positive samples cannot be zero.")
  pos_weight = num_negatives / num_positives
  return pos_weight

# Example usage
num_positives = 3754
num_negatives = 33918
pos_weight = calculate_pos_weights(num_positives, num_negatives)
print(pos_weight)

9.035162493340437


In [None]:
import torch.optim as optim
import torch.nn as nn
from torch.optim.lr_scheduler import StepLR
# weight = torch.tensor()
optimizer = optim.Adam(interact_model.parameters(), lr=3e-5,weight_decay=1e-5)

# Move model to device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.BCELoss()#BCEWithLogitsLoss()#(pos_weight = torch.tensor([.9035162493340437]).to(device))#(weight=torch.tensor([0.09964960713527288]))#(pos_weight = torch.tensor([0.09964960713527288]))  # Binary Cross-Entropy Loss

interact_model.to(device)
train_dataloader=train_loader
# Training loop
print_frequency = 1000
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)  # Reduce LR by a factor of 0.1 every 5 epochs
epochs = 4

for epoch in range(epochs):
    interact_model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    step_counter = 0
    running_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}"):
#         if step_counter<20:
            try:

                optimizer.zero_grad()
                inputs, labels = batch.to(device),torch.tensor(batch['y'], dtype=torch.float).squeeze(1).to(device)

                # Forward pass
                outputs = interact_model(inputs)
#                 print(outputs[0])
#                 print(labels) # Remove the second dimension
                loss = criterion(outputs[0], labels)
                total_loss += loss.item()
                running_loss +=loss.item()
        #             grads = torch.autograd.grad(outputs=outputs, inputs=inputs, grad_outputs=torch.ones_like(outputs), allow_unused=True)

                loss.backward()
                  # Monitor gradients
                optimizer.step()

                step_counter += 1

                if step_counter % print_frequency == 0:
                    epoch_loss = total_loss / print_frequency  # Calculate average loss for the last print_frequency steps
                    print(f"output:{outputs.float().to(device)} , act: {labels.float()}")  # Calculate average loss for the last print_frequency steps
                    print("loss::::::::::::::::::::",epoch_loss)
                    wandb.log({"training_loss": epoch_loss})

                    total_loss = 0  # Reset total loss after printing
            except:
#                 print('error')
                pass

    scheduler.step()
# Compute average training loss for the epoch
    avg_train_loss = running_loss / len(train_loader)

    # Log average training loss for the epoch
    wandb.log({"epoch": epoch, "avg_train_loss": avg_train_loss})

    # Set model to evaluation mode
    interact_model.eval()

    # Evaluate on test data loader
    test_loss = 0.0
    with torch.no_grad():
        for test_batch in test_loader:
            try:
                inputs, labels = test_batch.to(device),torch.tensor(test_batch['y'], dtype=torch.float).squeeze(0).to(device)
                output = interact_model(inputs)
                test_loss += criterion(output[0], labels).item()
            except:
                pass
    avg_test_loss = test_loss / len(test_loader)

    # Log test loss for the epoch
    wandb.log({"epoch": epoch, "avg_test_loss": avg_test_loss})

    # Set model back to training mode
    interact_model.train()

    # Print epoch results
    torch.save(interact_model.state_dict(), f"results_epoch{epoch}.pt")
    print(f'Epoch {epoch + 1}, Avg. Training Loss: {avg_train_loss}, Avg. Test Loss: {avg_test_loss}')


  inputs, labels = batch.to(device),torch.tensor(batch['y'], dtype=torch.float).squeeze(1).to(device)
  graph_rep, _ = self.graph_encoder(torch.tensor(inputs['input_nodes']), torch.tensor(inputs['input_edges']), torch.tensor(inputs['attn_bias']), torch.tensor(inputs['in_degree']), torch.tensor(inputs['out_degree']), torch.tensor(inputs['spatial_pos']), torch.tensor(inputs['attn_edge_type']))
Epoch 1/4:   3%|▎         | 1002/33841 [01:17<46:29, 11.77it/s]

output:tensor([[0.0306]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.28899315743305487


Epoch 1/4:   6%|▌         | 2002/33841 [02:37<42:59, 12.34it/s]

output:tensor([[0.0003]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.23086554735901882


Epoch 1/4:   9%|▉         | 3001/33841 [04:01<42:33, 12.08it/s]

output:tensor([[0.1852]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.23945683776495572


Epoch 1/4:  12%|█▏        | 4001/33841 [05:21<38:49, 12.81it/s]

output:tensor([[0.0005]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19974208805931265


Epoch 1/4:  15%|█▍        | 5001/33841 [06:40<38:21, 12.53it/s]

output:tensor([[0.0042]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19034314837081365


Epoch 1/4:  18%|█▊        | 6001/33841 [08:00<37:24, 12.40it/s]

output:tensor([[0.0001]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1770724367986186


Epoch 1/4:  21%|██        | 7001/33841 [09:20<35:56, 12.45it/s]

output:tensor([[0.0029]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.2803094615686805


Epoch 1/4:  24%|██▎       | 8003/33841 [10:41<34:31, 12.48it/s]

output:tensor([[0.1995]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1970541564737505


Epoch 1/4:  27%|██▋       | 9003/33841 [12:00<33:18, 12.43it/s]

output:tensor([[0.0644]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.184112429039189


Epoch 1/4:  30%|██▉       | 10003/33841 [13:20<31:58, 12.43it/s]

output:tensor([[0.0127]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18978122478858223


Epoch 1/4:  33%|███▎      | 11003/33841 [14:41<30:36, 12.44it/s]

output:tensor([[0.0726]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19303592819990445


Epoch 1/4:  35%|███▌      | 12003/33841 [16:02<29:23, 12.38it/s]

output:tensor([[5.2119e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18381011885594306


Epoch 1/4:  38%|███▊      | 13003/33841 [17:22<27:37, 12.57it/s]

output:tensor([[3.5019e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17497683089721922


Epoch 1/4:  41%|████▏     | 14003/33841 [18:42<27:02, 12.22it/s]

output:tensor([[0.0693]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19343094325434548


Epoch 1/4:  44%|████▍     | 15003/33841 [20:02<26:29, 11.85it/s]

output:tensor([[3.3437e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18123023134700997


Epoch 1/4:  47%|████▋     | 16003/33841 [21:22<24:05, 12.34it/s]

output:tensor([[2.7793e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18847192206709962


Epoch 1/4:  50%|█████     | 17003/33841 [22:43<22:35, 12.42it/s]

output:tensor([[0.1714]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17991702344754795


Epoch 1/4:  53%|█████▎    | 18003/33841 [24:03<20:59, 12.57it/s]

output:tensor([[0.1068]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1844886033937263


Epoch 1/4:  56%|█████▌    | 19003/33841 [25:24<20:05, 12.31it/s]

output:tensor([[4.1476e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16198911218967244


Epoch 1/4:  59%|█████▉    | 20003/33841 [26:46<18:37, 12.39it/s]

output:tensor([[5.1446e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.22525725162866087


Epoch 1/4:  62%|██████▏   | 21003/33841 [28:09<18:44, 11.42it/s]

output:tensor([[0.0003]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1853877426605261


Epoch 1/4:  65%|██████▌   | 22003/33841 [29:33<16:23, 12.04it/s]

output:tensor([[3.9396e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16580614396942125


Epoch 1/4:  68%|██████▊   | 23003/33841 [30:57<14:36, 12.37it/s]

output:tensor([[0.2205]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.15777711342761525


Epoch 1/4:  71%|███████   | 24003/33841 [32:17<13:11, 12.43it/s]

output:tensor([[0.0020]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19008740424143664


Epoch 1/4:  74%|███████▍  | 25003/33841 [33:38<11:55, 12.35it/s]

output:tensor([[0.2095]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17746277367102906


Epoch 1/4:  77%|███████▋  | 26003/33841 [34:58<10:32, 12.39it/s]

output:tensor([[1.2327e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17895224616343938


Epoch 1/4:  80%|███████▉  | 27003/33841 [36:20<09:18, 12.24it/s]

output:tensor([[3.0391e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18863658782997875


Epoch 1/4:  83%|████████▎ | 28003/33841 [37:41<07:51, 12.38it/s]

output:tensor([[0.1562]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17717235386292762


Epoch 1/4:  86%|████████▌ | 29003/33841 [39:02<06:38, 12.13it/s]

output:tensor([[0.0145]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.2630529742764827


Epoch 1/4:  89%|████████▊ | 30003/33841 [40:23<05:11, 12.33it/s]

output:tensor([[0.0014]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.26020559722435793


Epoch 1/4:  92%|█████████▏| 31003/33841 [41:44<03:48, 12.41it/s]

output:tensor([[0.3075]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.20033519974331465


Epoch 1/4:  95%|█████████▍| 32003/33841 [43:06<02:27, 12.50it/s]

output:tensor([[1.8320e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.21487589402871704


Epoch 1/4:  98%|█████████▊| 33003/33841 [44:25<01:06, 12.57it/s]

output:tensor([[0.1143]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16412527432325708


Epoch 1/4: 100%|██████████| 33841/33841 [45:33<00:00, 12.38it/s]
  inputs, labels = test_batch.to(device),torch.tensor(test_batch['y'], dtype=torch.float).squeeze(0).to(device)


Epoch 1, Avg. Training Loss: 0.19914906283285494, Avg. Test Loss: 0.2000404862183853


Epoch 2/4:   3%|▎         | 1002/33841 [01:22<45:19, 12.08it/s]

output:tensor([[7.7020e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17366940645477275


Epoch 2/4:   6%|▌         | 2002/33841 [02:43<43:32, 12.19it/s]

output:tensor([[0.2808]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19869831997032292


Epoch 2/4:   9%|▉         | 3002/33841 [04:04<41:32, 12.37it/s]

output:tensor([[9.3317e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1722705922673131


Epoch 2/4:  12%|█▏        | 4002/33841 [05:26<41:43, 11.92it/s]

output:tensor([[2.5437e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16535332440739103


Epoch 2/4:  15%|█▍        | 5002/33841 [06:47<39:28, 12.17it/s]

output:tensor([[2.0386e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1771968226296371


Epoch 2/4:  18%|█▊        | 6002/33841 [08:09<38:07, 12.17it/s]

output:tensor([[2.0103e-08]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17662593418531897


Epoch 2/4:  21%|██        | 7002/33841 [09:29<36:19, 12.31it/s]

output:tensor([[0.0003]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.2156088676443268


Epoch 2/4:  24%|██▎       | 8002/33841 [10:50<34:33, 12.46it/s]

output:tensor([[0.0374]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16952918147478832


Epoch 2/4:  27%|██▋       | 9002/33841 [12:12<36:37, 11.30it/s]

output:tensor([[0.1140]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18669662088068953


Epoch 2/4:  30%|██▉       | 10002/33841 [13:32<32:03, 12.39it/s]

output:tensor([[2.7749e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19555586908869732


Epoch 2/4:  33%|███▎      | 11002/33841 [14:53<30:17, 12.57it/s]

output:tensor([[1.3101e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18792190012013496


Epoch 2/4:  35%|███▌      | 12002/33841 [16:13<29:23, 12.38it/s]

output:tensor([[2.5959e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17220350378947338


Epoch 2/4:  38%|███▊      | 13002/33841 [17:33<27:41, 12.54it/s]

output:tensor([[0.1920]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.18068313671075845


Epoch 2/4:  41%|████▏     | 14002/33841 [18:53<26:56, 12.27it/s]

output:tensor([[4.9485e-08]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19392954441837412


Epoch 2/4:  44%|████▍     | 15002/33841 [20:14<24:58, 12.57it/s]

output:tensor([[3.6275e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16863292359433787


Epoch 2/4:  47%|████▋     | 16002/33841 [21:34<25:06, 11.84it/s]

output:tensor([[0.1121]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17373230645754195


Epoch 2/4:  50%|█████     | 17002/33841 [22:53<22:18, 12.58it/s]

output:tensor([[0.0772]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16651323906533166


Epoch 2/4:  53%|█████▎    | 18002/33841 [24:13<20:58, 12.58it/s]

output:tensor([[0.0020]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.21187723285792553


Epoch 2/4:  56%|█████▌    | 19002/33841 [25:33<19:51, 12.45it/s]

output:tensor([[0.1839]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.17468449155991947


Epoch 2/4:  59%|█████▉    | 20002/33841 [26:53<18:07, 12.72it/s]

output:tensor([[0.0004]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17004129168680673


Epoch 2/4:  62%|██████▏   | 21002/33841 [28:12<17:05, 12.52it/s]

output:tensor([[0.0017]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17800349857406764


Epoch 2/4:  65%|██████▌   | 22002/33841 [29:31<15:44, 12.54it/s]

output:tensor([[1.9953e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1630981792574248


Epoch 2/4:  68%|██████▊   | 23002/33841 [30:50<14:08, 12.77it/s]

output:tensor([[0.1048]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17810798845315293


Epoch 2/4:  71%|███████   | 24002/33841 [32:10<13:15, 12.36it/s]

output:tensor([[4.3650e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1616611392503225


Epoch 2/4:  74%|███████▍  | 25002/33841 [33:29<11:38, 12.65it/s]

output:tensor([[1.2113e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16185795263624692


Epoch 2/4:  77%|███████▋  | 26002/33841 [34:48<10:16, 12.71it/s]

output:tensor([[1.6747e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.2063555211873662


Epoch 2/4:  80%|███████▉  | 27003/33841 [36:07<09:41, 11.77it/s]

output:tensor([[0.1031]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.19146789046463267


Epoch 2/4:  83%|████████▎ | 28003/33841 [37:26<07:48, 12.45it/s]

output:tensor([[1.7595e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.14636645209318314


Epoch 2/4:  86%|████████▌ | 29003/33841 [38:46<06:32, 12.31it/s]

output:tensor([[0.1635]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18321870931097534


Epoch 2/4:  89%|████████▊ | 30003/33841 [40:05<05:12, 12.27it/s]

output:tensor([[2.1663e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16268194974212435


Epoch 2/4:  92%|█████████▏| 31003/33841 [41:24<03:44, 12.65it/s]

output:tensor([[0.1952]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1865305948090127


Epoch 2/4:  95%|█████████▍| 32003/33841 [42:43<02:26, 12.52it/s]

output:tensor([[0.0862]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.14893136887379854


Epoch 2/4:  98%|█████████▊| 33003/33841 [44:03<01:07, 12.42it/s]

output:tensor([[0.3871]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19639142544788007


Epoch 2/4: 100%|██████████| 33841/33841 [45:10<00:00, 12.49it/s]


Epoch 2, Avg. Training Loss: 0.1786566185264126, Avg. Test Loss: 0.1830154994647529


Epoch 3/4:   3%|▎         | 1002/33841 [01:19<43:18, 12.64it/s]

output:tensor([[1.4563e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18226591613080245


Epoch 3/4:   6%|▌         | 2002/33841 [02:37<41:48, 12.69it/s]

output:tensor([[0.9861]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.17891981050053235


Epoch 3/4:   9%|▉         | 3002/33841 [03:57<42:03, 12.22it/s]

output:tensor([[9.4076e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.16852964081487634


Epoch 3/4:  12%|█▏        | 4002/33841 [05:17<41:22, 12.02it/s]

output:tensor([[0.0357]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.2213666052445679


Epoch 3/4:  15%|█▍        | 5002/33841 [06:35<40:00, 12.01it/s]

output:tensor([[0.0616]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.17256719531585862


Epoch 3/4:  18%|█▊        | 6002/33841 [07:55<36:28, 12.72it/s]

output:tensor([[0.9999]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.1762898229133179


Epoch 3/4:  21%|██        | 7002/33841 [09:14<34:34, 12.94it/s]

output:tensor([[0.1221]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19112146751240672


Epoch 3/4:  24%|██▎       | 8002/33841 [10:32<33:46, 12.75it/s]

output:tensor([[0.9870]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.17714506876858876


Epoch 3/4:  27%|██▋       | 9002/33841 [11:51<37:00, 11.19it/s]

output:tensor([[0.0275]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.14821139872583114


Epoch 3/4:  30%|██▉       | 10002/33841 [13:11<30:48, 12.90it/s]

output:tensor([[2.6409e-07]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19450659730979544


Epoch 3/4:  33%|███▎      | 11002/33841 [14:29<29:46, 12.79it/s]

output:tensor([[0.0140]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.18891859438414013


Epoch 3/4:  35%|███▌      | 12002/33841 [15:47<29:41, 12.26it/s]

output:tensor([[0.0770]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.18160636566121013


Epoch 3/4:  38%|███▊      | 13003/33841 [17:05<27:15, 12.74it/s]

output:tensor([[0.2094]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.1506111251471138


Epoch 3/4:  41%|████▏     | 14003/33841 [18:22<25:30, 12.96it/s]

output:tensor([[0.2168]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.16284709126989008


Epoch 3/4:  44%|████▍     | 15003/33841 [19:40<24:34, 12.78it/s]

output:tensor([[0.9956]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.18013673903771368


Epoch 3/4:  47%|████▋     | 16003/33841 [20:57<23:18, 12.75it/s]

output:tensor([[4.0568e-06]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.15164019790923844


Epoch 3/4:  50%|█████     | 17003/33841 [22:15<22:17, 12.59it/s]

output:tensor([[1.0021e-08]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17187048940252925


Epoch 3/4:  53%|█████▎    | 18003/33841 [23:32<20:21, 12.97it/s]

output:tensor([[0.9917]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.1695408259077716


Epoch 3/4:  56%|█████▌    | 19003/33841 [24:49<19:03, 12.97it/s]

output:tensor([[0.9901]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.1297496743509777


Epoch 3/4:  59%|█████▉    | 20004/33841 [26:07<18:06, 12.73it/s]

output:tensor([[0.1236]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.17489157674050204


Epoch 3/4:  62%|██████▏   | 21004/33841 [27:25<16:31, 12.94it/s]

output:tensor([[0.1954]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.15436210093912023


Epoch 3/4:  65%|██████▌   | 22004/33841 [28:43<15:15, 12.94it/s]

output:tensor([[0.2423]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.162468820981452


Epoch 3/4:  68%|██████▊   | 23004/33841 [30:03<14:29, 12.47it/s]

output:tensor([[0.9542]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([1.], device='cuda:0')
loss:::::::::::::::::::: 0.17730856279509224


Epoch 3/4:  71%|███████   | 24004/33841 [31:22<12:46, 12.84it/s]

output:tensor([[3.3140e-05]], device='cuda:0', grad_fn=<SigmoidBackward0>) , act: tensor([0.], device='cuda:0')
loss:::::::::::::::::::: 0.19389816702651352


Epoch 3/4:  73%|███████▎  | 24650/33841 [32:12<11:44, 13.05it/s]