### Install relevant libraries and packages

In [1]:
import pandas as pd
import torch
from torch.nn import Linear
from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.nn import RGCNConv
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import matthews_corrcoef
from pytorch_metric_learning import losses, distances, reducers, testers
#from pcgrad import PCGrad
from tqdm import tqdm

###### ******* LOAD  COCRYSTAL GRAPG DATA *******

In [2]:
dataset = torch.load('mordred_graph_data.pt')
dataset

Data(x=[740, 3226], edge_index=[2, 1480], y=[740], edge_type=[1480])

### Initialize RGCNConv model 

In [3]:
class Net(torch.nn.Module):
    def __init__(self, dim1, dim2, dim3, dropout):
        super(Net, self).__init__()
        self.gene_emb = Parameter(torch.randn(740, 3226))
        self.conv1 = RGCNConv(3226, dim1, 4)
#         self.conv2 = RGCNConv(dim1, dim2, 4)
        self.lin1 = Linear(dim1, dim2)
        self.lin2 = Linear(dim2, 2)
        self.dropout = dropout

    def forward(self, x, edge_index, edge_type):
        x = torch.cat((x, self.gene_emb), dim=0)
        x = F.relu(self.conv1(x, edge_index, edge_type))
        x = F.dropout(x, self.dropout, training=self.training)
#         x = F.relu(self.conv2(x, edge_index, edge_type))
#         x = F.dropout(x, self.dropout, training=self.training)
        x = F.relu(self.lin1(x))
        emb = x
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.lin2(x)

        return F.log_softmax(x, dim=-1), emb

#### Define Model for training

In [4]:
def model_train(train_idx):
    model.train()
    optimizer.zero_grad()
    out, emb = model(data.x, data.edge_index, data.edge_type)
    print('The out: ', out)
    print('The emb: ', emb)
    loss_tri = Loss_triplet(emb[train_idx], data.y[train_idx])
    loss_nll = F.nll_loss(out[train_idx], data.y[train_idx]).backward()
    loss_list = [loss_tri, loss_nll]
    #loss_list.backward()
    optimizer.step()

#### Check the device

In [5]:
# Re-introducing the device just incase you have access on the cuda GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

#### Define model for evaluation

In [6]:
@torch.no_grad()
def model_val(val_idx, plus_test=False):
    model.eval()
    out, emb = model(data.x, data.edge_index, data.edge_type)
    out_val = out.exp()[val_idx]
    pred_val = out_val.argmax(dim=-1)
    auroc_val = roc_auc_score(data.y[val_idx].cpu(), out_val[:, 1].cpu())
    
    if plus_test == False:
        acc_val = accuracy_score(data.y[val_idx].cpu(), pred_val.cpu())
        auprc_val = average_precision_score(
                data.y[val_idx].cpu(), out_val[:, 0].cpu(), pos_label=0)
        sens_val = recall_score(data.y[val_idx].cpu(), pred_val.cpu())
        spec_val = recall_score(data.y[val_idx].cpu(), pred_val.cpu(), pos_label=0)
        mcc_val = matthews_corrcoef(data.y[val_idx].cpu(), pred_val.cpu())
        return acc_val, sens_val, spec_val, mcc_val, auroc_val, auprc_val
    
    else:
        out_test = out.exp()[test_index]
        pred_test = out_test.argmax(dim=-1)
        acc_test = accuracy_score(data.y[test_index].cpu(), pred_test.cpu())
        auroc_test = roc_auc_score(data.y[test_index].cpu(), out_test[:, 1].cpu())
        auprc_test = average_precision_score(
                data.y[test_index].cpu(), out_test[:, 0].cpu(), pos_label=0)
        sens_test = recall_score(data.y[test_index].cpu(), pred_test.cpu())
        spec_test = recall_score(data.y[test_index].cpu(), pred_test.cpu(), pos_label=0)
        mcc_test = matthews_corrcoef(data.y[test_index].cpu(), pred_test.cpu())
        return auroc_val, acc_test, sens_test, spec_test, mcc_test, auroc_test, auprc_test

###### ******* RECHECK  COVID-19 GRAPH DATA *******

In [7]:
dataset

Data(x=[740, 3226], edge_index=[2, 1480], y=[740], edge_type=[1480])

In [8]:
len(dataset)

4

In [9]:
data = dataset # Tensor representation of the Cocrstal data
print('covid_drugs_to_target_organism:', data)

covid_drugs_to_target_organism: Data(x=[740, 3226], edge_index=[2, 1480], y=[740], edge_type=[1480])


In [10]:
len(data)

4

In [11]:
data.num_features

3226

In [12]:
data.y

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,

In [13]:
data.y.unique()

tensor([0, 1])

In [14]:
data.y.unique

<bound method Tensor.unique of tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
        1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0

In [15]:
len(data.y.unique())

2

In [16]:
# Load data into cuda dataset
data = data.to(device)
data

Data(x=[740, 3226], edge_index=[2, 1480], y=[740], edge_type=[1480])

### Shuffling, Data split, and Kfold cross validation

###### Train and test data split initially

In [17]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=661)
sss

StratifiedShuffleSplit(n_splits=1, random_state=661, test_size=0.15,
            train_size=None)

In [18]:
train_index, test_index = next(sss.split(data.x.cpu(), data.y.cpu()))

In [19]:
train_index

array([ 59, 535, 235, 639, 575,  25, 472, 254, 513, 580, 506, 108, 638,
       719, 455, 543, 512, 198, 686, 481, 394, 439, 398, 143, 207, 679,
        89, 739, 395, 684, 660, 205, 404, 416, 256, 495, 166,  19, 268,
        66, 712, 402, 294, 607, 530, 258, 290, 531, 221, 134, 626, 285,
       714,   4,  72, 209, 598,  21,  41,  55, 365,  68, 555, 423, 161,
       447, 694, 692, 300, 318, 128, 240, 637, 737, 581, 471, 572, 484,
       403, 217, 346, 561, 255, 538, 247, 163, 590,  43, 587, 141, 690,
       528, 738, 241,  98, 529, 702,  71,   6, 704, 381, 523, 276,  86,
         8, 400,  88, 588, 685, 386, 364, 467,  30, 668,  10,  78,  39,
       180, 114, 218, 478, 532,  15, 722, 253, 474, 159, 171, 734, 200,
       459, 377, 696, 706, 248,  69, 133, 526, 713, 541, 408, 152, 288,
       454, 591, 178, 260, 616, 138, 601, 606, 689, 413, 307, 120, 328,
       194, 687, 597, 537, 146, 278, 341, 525,  38, 226, 515, 415, 220,
        63, 354,  57,  42, 488, 187, 263, 319, 453, 733, 682, 43

In [20]:
len(train_index)

629

In [21]:
test_index

array([608, 179, 110, 726, 375, 169, 594, 659,  83, 292, 125, 720, 489,
       468, 396, 674, 103, 336, 476, 458, 466, 366, 287,  35, 233, 257,
       723, 633, 349, 320, 578, 210, 426, 553, 188, 101, 244, 279, 558,
       245, 329, 444, 224, 170, 174, 184, 595, 119,  13, 231, 353,  77,
        85, 339, 440, 183, 605, 618, 412, 261, 678, 196, 109,   3, 160,
        49, 391, 554, 619, 729, 229, 655, 446, 308,  16, 409, 105, 465,
       211, 612, 111,  91,  27, 368, 641, 282, 262, 469, 482, 275, 189,
       150, 583, 363, 552, 420, 646, 107, 314, 611, 360, 367, 431, 192,
       651, 479, 705, 185, 136,  12, 158])

In [22]:
len(test_index)

111

##### Train and validation data split

In [23]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15/0.85, random_state=154)
sss

StratifiedShuffleSplit(n_splits=1, random_state=154,
            test_size=0.17647058823529413, train_size=None)

In [24]:
train_slice, val_slice = next(sss.split(data.x[train_index].cpu(), data.y[train_index].cpu()))

In [25]:
train_slice

array([204, 117, 416,  74, 244, 502, 529, 233, 206, 323,  73, 622, 157,
       615, 445, 621, 181, 557, 313, 143,  95,  48, 443, 468, 129, 123,
       249, 365,  98, 319, 377, 360,  88, 454,  18, 359, 186,  12, 486,
       243, 381, 246,  35, 515, 453, 138, 590,  29, 466,   8, 480, 259,
       321, 222, 178, 446, 613, 540,  84, 405, 440,  62, 537, 485, 192,
       126, 501, 458, 518, 482, 119, 324, 492, 560, 103, 265,  66, 426,
        33, 520, 550, 489, 358, 318, 627, 370, 465, 373, 279, 432, 109,
       113, 462,   4, 255, 242, 374, 538, 378, 450,  92, 276, 196, 341,
       451, 447, 301, 372, 457, 163, 198,   1,  27, 592, 425, 346,  41,
       471, 174,  71, 351, 283, 456, 337, 626, 235, 167, 314, 415, 585,
       170, 122, 331, 524, 221, 343, 477, 412, 407, 227, 141, 541, 580,
       384, 250, 162, 326, 579, 232,  86, 547,  14,  80, 185, 438, 555,
       164, 559, 380,  53, 487, 125, 315, 439, 532, 317, 536, 367, 612,
       183,  11, 203, 553, 421, 187, 236, 402, 542, 332, 202, 60

In [26]:
len(train_slice)

517

In [27]:
val_slice

array([571, 228, 581, 488, 175, 544, 409, 363, 200, 272,  42, 496, 430,
       339,   2, 508, 572,   7, 127, 172, 108, 497,  45, 411, 306, 589,
       609, 514, 493, 517, 441, 110, 154, 189, 410, 155,  47,  28, 511,
       564, 494, 254, 470, 625, 264, 575, 513, 519, 543, 177, 605, 404,
       302,  58, 509, 121, 490, 472, 437, 467,  50, 413, 132, 134, 601,
       574, 417, 152, 182, 274,  26, 399, 361, 388, 217, 237, 368,  97,
       219, 149, 463, 423, 295, 116, 362, 455, 312, 479, 355,  17, 512,
       271, 270, 398, 205,  59, 320, 224,  54, 208, 584,  51, 275, 527,
       356, 603, 350, 586,  68,  43, 469, 573])

In [28]:
len(val_slice)

112

In [29]:
val_index = train_index[val_slice]
val_index

array([727, 615, 671, 579, 263,  37,  18, 242, 113, 104, 294, 274, 338,
       604, 235,  96, 151, 254, 171,  42, 685, 475, 258, 518, 317, 334,
       310, 315, 557, 564, 701, 364, 120, 680, 142, 328, 531, 395,  36,
       340, 499, 624,  28, 432, 519, 259, 422, 243, 647, 453, 130,  87,
       345,  41, 193, 532, 693, 524, 272, 710, 626, 149, 696, 248,  11,
       442, 485, 413, 145, 725,  89,  32, 238, 172, 456, 718, 352,  71,
       124, 601, 568, 648, 664,  39, 289, 225,  56, 715, 384, 198, 536,
       492, 424, 313,  82,  55, 510, 355,  72,  31, 620, 285, 636, 534,
       460, 131, 717, 405, 300, 607, 491, 304])

In [30]:
len(val_index)

112

In [31]:
train_index = train_index[train_slice]
train_index

array([436, 180, 298, 581, 462, 625, 487, 335, 332, 599, 737, 311, 687,
       121, 542, 520, 204, 316, 337, 454, 529, 221, 707, 252, 200, 722,
       401, 388,   6, 333, 663,  50, 587, 593, 686, 249, 100, 638, 322,
       202, 503, 551, 495, 284, 505, 713, 299, 684, 418, 513, 643, 156,
       430, 246, 733, 182, 574, 494, 247, 708, 560, 555, 411, 451, 406,
       159, 154, 438, 545, 387, 218, 213, 357,  52,  86, 216, 694, 628,
       416, 621, 577, 206,  97, 372, 667, 173, 570,  95,  45, 350, 386,
       668, 303, 575,  34, 297, 266, 306, 425, 609, 738, 571, 374,  79,
       675, 699,  53, 271, 517, 525, 427, 535, 739, 548, 613, 407, 402,
       281, 187, 240, 731, 473, 508, 137,  46,  90, 415,  26, 445, 295,
       354,  15, 283,  47, 135, 457, 511, 709, 448, 203, 152, 698, 644,
       176, 452, 341, 177, 378, 650, 590, 450, 455, 346, 123, 127, 392,
        38, 239,  94,   4, 563, 474, 106, 691, 681, 312, 343, 157, 230,
       490, 108, 550, 117, 582, 305,  48, 652, 164, 376, 700, 59

In [32]:
len(train_index)

517

##### 10_fold cross validation

In [33]:
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=18)
skf

StratifiedKFold(n_splits=10, random_state=18, shuffle=True)

In [34]:
# Hyperparameters
lr = 0.00195
weight_decay = 0.00738
margin = 0.274
dropout = 0.645
triplets_per_anchor = 60
dim1 = 1340
dim2 = 920
dim3 = 740
low = 0.274

In [35]:
# triplet margin loss
distance = distances.CosineSimilarity()
reducer = reducers.ThresholdReducer(low=low)
Loss_triplet = losses.TripletMarginLoss(margin=margin, distance=distance, reducer=reducer,
                                triplets_per_anchor=triplets_per_anchor)

In [36]:
distance

CosineSimilarity()

In [37]:
reducer

ThresholdReducer()

In [38]:
Loss_triplet

TripletMarginLoss(
  (distance): CosineSimilarity()
  (reducer): ThresholdReducer()
)

In [39]:
# cross-validation
acc_list = []
sens_list = []
spec_list = []
mcc_list = []
auroc_list = []
auprc_list = []
epoch_list = []

In [50]:
print('Cross-validation progressing...')
for train_mask, val_mask in tqdm(skf.split(data.x[train_index].cpu(), data.y[train_index].cpu())):
    model = Net(dim1, dim2, dim3, dropout).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    #optimizer = PCGrad(torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay))

    auroc_max = 0
    epoch_count = 0
    for epoch in range(500):
        epoch_count += 1
        model_train(train_index[train_mask])
        acc, sens, spec, mcc, auroc, auprc = model_val(train_index[val_mask])
        if auroc > auroc_max:
            epoch_count = 0
            acc_max = acc
            auroc_max = auroc
            auprc_max = auprc
            sens_max = sens
            spec_max = spec
            mcc_max = mcc
            epoch_max = epoch
        if epoch_count == 30:
            break

    acc_list.append(acc_max)
    sens_list.append(sens_max)
    spec_list.append(spec_max)
    mcc_list.append(mcc_max)
    auroc_list.append(auroc_max)
    auprc_list.append(auprc_max)
    epoch_list.append(epoch_max)

Cross-validation progressing...


0it [00:00, ?it/s]

The out:  tensor([[-0.6342, -0.7558],
        [-0.9674, -0.4782],
        [-0.6174, -0.7751],
        ...,
        [-0.6539, -0.7340],
        [-0.3267, -1.2775],
        [-0.8069, -0.5910]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.2502, 0.2971,  ..., 0.6029, 0.0000, 0.0000],
        [0.0000, 0.5991, 0.0000,  ..., 1.9227, 0.7859, 0.0000],
        [0.4437, 0.9028, 0.7403,  ..., 0.9074, 0.0802, 0.0000],
        ...,
        [0.4529, 1.1371, 0.0000,  ..., 1.3049, 0.0000, 1.1282],
        [0.9914, 0.2006, 0.0000,  ..., 0.1320, 0.2283, 0.5785],
        [0.0000, 1.1359, 0.0190,  ..., 0.0000, 0.5448, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.4235e+00, -9.2788e-02],
        [-3.8712e-04, -7.8570e+00],
        [-3.7806e-04, -7.8807e+00],
        ...,
        [-1.7204e+00, -1.9723e-01],
        [-2.7335e+00, -6.7202e-02],
        [-3.1127e+00, -4.5503e-02]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-1.0015, -0.4578],
        [-0.0664, -2.7451],
        [-0.1645, -1.8860],
        ...,
        [-0.9297, -0.5020],
        [-0.6936, -0.6927],
        [-0.7831, -0.6106]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-1.2967, -0.3194],
        [-0.1608, -1.9068],
        [-0.0358, -3.3471],
        ...,
        [-0.7353, -0.6527],
        [-0.7464, -0.6426],
        [-0.9627, -0.4810]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..

The out:  tensor([[-0.6250, -0.7663],
        [-0.0215, -3.8513],
        [-0.0054, -5.2282],
        ...,
        [-0.6272, -0.7638],
        [-0.6943, -0.6920],
        [-1.2533, -0.3363]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-0.6833, -0.7031],
        [-0.0307, -3.5001],
        [-0.1177, -2.1982],
        ...,
        [-0.6710, -0.7158],
        [-0.6232, -0.7683],
        [-1.1339, -0.3883]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..

The out:  tensor([[-4.0395e-01, -1.1016e+00],
        [-2.5391e-05, -1.0581e+01],
        [-1.6553e-03, -6.4046e+00],
        ...,
        [-6.3851e-01, -7.5095e-01],
        [-6.3264e-01, -7.5755e-01],
        [-1.6166e+00, -2.2137e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1584, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0870, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.9356, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0455, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0529, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0261, 0.0202]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.1397, -2.0372],
        [-0.0154, -4.1791],
        [-0.0069, -4.9802],
        ...,
        [-0.5501, -0.8602],
        [-0.4911, -0.9466],
        [-1.3880, -0.2871]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-3.3806e-01, -1.2488e+00],
        [-1.8341e-02, -4.0078e+00],
        [-3.6149e-03, -5.6245e+00],
        ...,
        [-4.9334e-01, -9.4311e-01],
        [-5.4931e-01, -8.6120e-01],
        [-1.5324e+00, -2.4337e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.9807, 0.2009],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 2.8000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 3.9398, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2047, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2697, 0.0141],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0556, 0.0411]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.2356e-01, -2.1522e+00],
        [-2.5307e-03, -5.9805e+00],
        [-7.9068e-03, -4.8440e+00],
        ...,
        [-6.7900e-01, -7.0750e-01],
        [-4.4468e-01, -1.0245e+00],
        [-1.0623e+00, -4.2412e-01]], device='

The out:  tensor([[-6.1749e-01, -7.7500e-01],
        [-7.9390e-05, -9.4414e+00],
        [-8.2254e-06, -1.1711e+01],
        ...,
        [-4.2223e-01, -1.0659e+00],
        [-3.7972e-01, -1.1522e+00],
        [-1.0169e+00, -4.4895e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.9465, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 5.0370, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 5.5943, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2363, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3295, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0117, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-5.5917e-01, -8.4789e-01],
        [-2.6556e-04, -8.2339e+00],
        [-4.8995e-03, -5.3211e+00],
        ...,
        [-4.1562e-01, -1.0786e+00],
        [-5.0503e-01, -9.2505e-01],
        [-1.2829e+00, -3.2465e-01]], device='

1it [00:01,  1.86s/it]

The out:  tensor([[-1.9749e-01, -1.7192e+00],
        [-3.4146e-03, -5.6814e+00],
        [-2.3290e-03, -6.0635e+00],
        ...,
        [-4.7516e-01, -9.7230e-01],
        [-5.4932e-01, -8.6119e-01],
        [-9.1570e-01, -5.1122e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.5585, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 4.0389, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 4.9253, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.4089, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3389, 0.0140],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1203, 0.0276]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-8.7267e-03, -4.7457e+00],
        [-5.0709e-03, -5.2868e+00],
        [-2.3365e-02, -3.7682e+00],
        ...,
        [-4.9374e-01, -9.4248e-01],
        [-5.1835e-01, -9.0511e-01],
        [-1.1007e+00, -4.0442e-01]], device='

The out:  tensor([[-1.0744, -0.4178],
        [-0.4918, -0.9455],
        [-0.4233, -1.0638],
        ...,
        [-0.8973, -0.5237],
        [-0.6874, -0.6990],
        [-0.6807, -0.7057]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.6343, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0232, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.9289, -0.1570],
        [-0.3234, -1.2863],
        [-0.3578, -1.2015],
        ...,
        [-0.9190, -0.5090],
        [-0.7445, -0.6443],
        [-0.9131, -0.5129]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000

The out:  tensor([[-8.5326e-01, -5.5517e-01],
        [-5.5053e-02, -2.9269e+00],
        [-2.2018e-03, -6.1196e+00],
        ...,
        [-1.0387e+00, -4.3684e-01],
        [-8.7346e-01, -5.4043e-01],
        [-8.8811e-01, -5.3006e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 5.5469, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 5.6957, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.7525, -0.6371],
        [-0.0100, -4.6149],
        [-0.0119, -4.4396],
        ...,
        [-1.3445, -0.3020],
        [-0.8757, -0.5389],
        [-0.8784, -0.5369]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-5.1651e-01, -9.0782e-01],
        [-1.7265e-02, -4.0677e+00],
        [-1.7358e-03, -6.3571e+00],
        ...,
        [-1.1963e+00, -3.5998e-01],
        [-6.3427e-01, -7.5571e-01],
        [-1.1222e+00, -3.9387e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 9.8343e-01, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.5895e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 6.5550e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 2.5561e-02,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 2.9179e-01, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 6.1116e-03,
         0.0000e+00]], device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-5.6419e-01, -8.4123e-01],
        [-3

The out:  tensor([[-0.3986, -1.1126],
        [-0.0502, -3.0175],
        [-0.0216, -3.8458],
        ...,
        [-1.1458, -0.3827],
        [-0.3134, -1.3129],
        [-1.0491, -0.4312]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 4.8509e-01, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.5190e+00, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.7640e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 1.1817e-02,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 7.5923e-01, 0.0000e+00,
         0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 4.3071e-04,
         0.0000e+00]], device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-4.1781e-01, -1.0744e+00],
        [-5.3802e-03, -5.2277e+00],
        [-1.1363e-03, -6.7806e

The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.7037, 0.0257, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 3.6577, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 3.6406, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0222, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.6222, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.3282e-01, -1.5716e+00],
        [-2.1086e-04, -8.4647e+00],
        [-2.5829e-04, -8.2615e+00],
        ...,
        [-1.2441e+00, -3.3997e-01],
        [-4.3644e-01, -1.0394e+00],
        [-8.5691e-01, -5.5246e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 1.1203, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 4.1673, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 4.1344, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000, 

2it [00:03,  1.91s/it]

The out:  tensor([[-1.5001e+00, -2.5246e-01],
        [-3.2682e-04, -8.0262e+00],
        [-5.8471e-03, -5.1447e+00],
        ...,
        [-1.5300e+00, -2.4403e-01],
        [-4.1135e-01, -1.0870e+00],
        [-1.0956e+00, -4.0698e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.5038, 0.0948, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 4.3314, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0381,  ..., 3.3490, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0162, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.6048, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0076, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-3.7624e-01, -1.1598e+00],
        [-1.3802e-02, -4.2898e+00],
        [-3.1516e-03, -5.7614e+00],
        ...,
        [-1.6050e+00, -2.2426e-01],
        [-4.0578e-01, -1.0980e+00],
        [-8.5761e-01, -5.5195e-01]], device='

The out:  tensor([[-1.5621, -0.2353],
        [-0.3873, -1.1360],
        [-0.6201, -0.7720],
        ...,
        [-1.0008, -0.4582],
        [-0.8810, -0.5351],
        [-0.8106, -0.5881]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.8536, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 1.1374, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.4596, 0.0000,  ..., 0.6344, 0.0000, 0.0000],
        [0.0000, 0.1074, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.1721, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.2182, -0.3506],
        [-0.1941, -1.7347],
        [-0.1776, -1.8155],
        ...,
        [-1.1310, -0.3897],
        [-0.8161, -0.5837],
        [-0.8163, -0.5835]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.3839, 0.0000

The out:  tensor([[-0.4870, -0.9531],
        [-0.0104, -4.5732],
        [-0.1135, -2.2323],
        ...,
        [-1.2430, -0.3404],
        [-1.2669, -0.3309],
        [-0.5859, -0.8133]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0926],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.4035],
        ...,
        [0.0000, 0.5084, 0.0000,  ..., 0.3110, 0.0000, 0.0000],
        [0.0000, 0.1732, 0.0000,  ..., 0.4810, 0.0000, 0.0256],
        [0.0000, 0.0000, 0.0000,  ..., 0.1312, 0.0000, 0.0516]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.4325, -1.0467],
        [-0.0371, -3.3130],
        [-0.0071, -4.9545],
        ...,
        [-1.1238, -0.3931],
        [-1.0303, -0.4414],
        [-0.5398, -0.8743]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1023, 0.0000

The out:  tensor([[-0.0379, -3.2924],
        [-0.0787, -2.5806],
        [-0.0079, -4.8501],
        ...,
        [-1.6623, -0.2104],
        [-1.2656, -0.3314],
        [-0.4049, -1.0998]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0038, 0.2454,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2206, 0.8932,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.4615, 0.9611,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.5498, 0.0000,  ..., 0.3470, 0.0000, 0.0000],
        [0.0000, 0.5237, 0.0000,  ..., 0.3846, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0222, 0.0000, 0.0101]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-7.6427e-01, -6.2675e-01],
        [-4.8425e-03, -5.3328e+00],
        [-2.0981e-05, -1.0775e+01],
        ...,
        [-1.6510e+00, -2.1301e-01],
        [-1.6058e+00, -2.2405e-01],
        [-4.0803e-01, -1.0935e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-1.6488e-01, -1.8838e+00],
        [-2.0895e-04, -8.4735e+00],
        [-9.4869e-04, -6.9609e+00],
        ...,
        [-1.8346e+00, -1.7397e-01],
        [-1.3695e+00, -2.9335e-01],
        [-4.6229e-01, -9.9382e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.1889, 0.1050,  ..., 0.0137, 0.0000, 0.0090],
        [0.0000, 0.0000, 0.7523,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.7949,  ..., 0.0000, 0.0000, 0.0247],
        ...,
        [0.0000, 0.4003, 0.0000,  ..., 0.3322, 0.0000, 0.0000],
        [0.0000, 0.4068, 0.0000,  ..., 0.2065, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0751,  ..., 0.0000, 0.0000, 0.0049]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-3.4446e-01, -1.2331e+00],
        [-1.7105e-03, -6.3718e+00],
        [-1.3618e-02, -4.3032e+00],
        ...,
        [-1.4190e+00, -2.7700e-01],
        [-1.6075e+00, -2.2364e-01],
        [-3.7399e-01, -1.1647e+00]], device='

The out:  tensor([[-3.3805e-02, -3.4040e+00],
        [-2.2850e-04, -8.3843e+00],
        [-1.1852e-03, -6.7385e+00],
        ...,
        [-1.6926e+00, -2.0340e-01],
        [-1.8476e+00, -1.7152e-01],
        [-3.1040e-01, -1.3211e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.3367,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.6586,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.3413, 0.0000,  ..., 0.1781, 0.0000, 0.0000],
        [0.0000, 0.3634, 0.0000,  ..., 0.1991, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0068,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.2769e-01, -2.1213e+00],
        [-1.9834e-04, -8.5256e+00],
        [-8.8688e-05, -9.3310e+00],
        ...,
        [-1.4166e+00, -2.7779e-01],
        [-1.1823e+00, -3.6609e-01],
        [-3.8823e-01, -1.1340e+00]], device='

The out:  tensor([[-3.1692e-01, -1.3034e+00],
        [-5.5334e-04, -7.4998e+00],
        [-1.3556e-02, -4.3077e+00],
        ...,
        [-1.4015e+00, -2.8266e-01],
        [-1.9414e+00, -1.5490e-01],
        [-4.0852e-01, -1.0925e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.4809, 0.4876,  ..., 0.4784, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.3615,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6067, 1.4859,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.3088, 0.0000,  ..., 0.1353, 0.0000, 0.0000],
        [0.0000, 0.4563, 0.0000,  ..., 0.3309, 0.0000, 0.0017],
        [0.0000, 0.0105, 0.0793,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.5241e-01, -1.9564e+00],
        [-1.3491e-03, -6.6090e+00],
        [-2.8010e-04, -8.1806e+00],
        ...,
        [-1.4198e+00, -2.7676e-01],
        [-1.8069e+00, -1.7931e-01],
        [-4.0438e-01, -1.1008e+00]], device='

The out:  tensor([[-3.8731e-02, -3.2704e+00],
        [-8.9331e-04, -7.0210e+00],
        [-2.3350e-04, -8.3623e+00],
        ...,
        [-1.8525e+00, -1.7060e-01],
        [-2.2672e+00, -1.0937e-01],
        [-3.8073e-01, -1.1500e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.2680,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 1.4049,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.3899, 0.0000,  ..., 0.2059, 0.0067, 0.0000],
        [0.0000, 0.5118, 0.0000,  ..., 0.3448, 0.0064, 0.0000],
        [0.0000, 0.0000, 0.0683,  ..., 0.0031, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-6.7372e-02, -2.7310e+00],
        [-2.3299e-03, -6.0631e+00],
        [-4.0654e-04, -7.8080e+00],
        ...,
        [-1.5805e+00, -2.3050e-01],
        [-1.7667e+00, -1.8742e-01],
        [-4.1018e-01, -1.0893e+00]], device='

The out:  tensor([[-2.7340e-02, -3.6130e+00],
        [-1.2159e-05, -1.1319e+01],
        [-5.0186e-05, -9.8994e+00],
        ...,
        [-1.5668e+00, -2.3410e-01],
        [-1.7971e+00, -1.8126e-01],
        [-5.1648e-01, -9.0787e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0847, 0.3757, 0.8969,  ..., 0.0000, 0.0000, 0.0000],
        [0.3010, 0.0000, 1.1136,  ..., 0.0000, 0.0000, 0.0000],
        [0.4192, 0.0000, 1.5555,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.3215, 0.0000,  ..., 0.2253, 0.0174, 0.0000],
        [0.0000, 0.4405, 0.0000,  ..., 0.2607, 0.0031, 0.0000],
        [0.0225, 0.0000, 0.0352,  ..., 0.0147, 0.0038, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-5.1126e-02, -2.9989e+00],
        [-3.6597e-05, -1.0216e+01],
        [-3.5050e-03, -5.6553e+00],
        ...,
        [-1.5606e+00, -2.3573e-01],
        [-1.6320e+00, -2.1757e-01],
        [-4.9514e-01, -9.4028e-01]], device='

3it [00:06,  2.27s/it]

The out:  tensor([[-4.7367e-01, -9.7476e-01],
        [-2.2909e-04, -8.3817e+00],
        [-7.6229e-04, -7.1796e+00],
        ...,
        [-2.5629e+00, -8.0210e-02],
        [-1.6465e+00, -2.1410e-01],
        [-3.6249e-01, -1.1905e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[1.7203e-01, 8.6084e-01, 4.5102e-01,  ..., 2.9774e-01, 0.0000e+00,
         0.0000e+00],
        [7.1281e-01, 0.0000e+00, 1.8380e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.0338e+00, 0.0000e+00, 2.0419e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.1977e-02, 4.8217e-01, 0.0000e+00,  ..., 3.9505e-01, 2.1090e-02,
         0.0000e+00],
        [0.0000e+00, 2.8996e-01, 0.0000e+00,  ..., 2.2187e-01, 7.3555e-04,
         0.0000e+00],
        [5.5921e-02, 0.0000e+00, 6.2386e-02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]], device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-6.8485e-02, -2.7152e+00],
        [-1

The out:  tensor([[-1.5086, -0.2500],
        [-0.4479, -1.0187],
        [-0.6803, -0.7062],
        ...,
        [-0.8097, -0.5888],
        [-0.9351, -0.4985],
        [-0.8577, -0.5519]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.3104,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.3132, -0.3133],
        [-0.2229, -1.6103],
        [-0.4542, -1.0078],
        ...,
        [-0.7412, -0.6473],
        [-0.8691, -0.5435],
        [-0.8644, -0.5470]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 

The out:  tensor([[-0.8506, -0.5572],
        [-0.1401, -2.0347],
        [-0.0101, -4.6010],
        ...,
        [-0.5462, -0.8654],
        [-0.6286, -0.7621],
        [-1.2997, -0.3183]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-9.6652e-01, -4.7869e-01],
        [-4.9600e-03, -5.3088e+00],
        [-2.2769e-02, -3.7937e+00],
        ...,
        [-6.6883e-01, -7.1807e-01],
        [-6.6209e-01, -7.2520e-01],
        [-1.2821e+00, -3.2498e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [

The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-0.8782, -0.5370],
        [-0.0295, -3.5374],
        [-0.0147, -4.2255],
        ...,
        [-0.5958, -0.8011],
        [-0.4955, -0.9398],
        [-1.7633, -0.1881]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-2.5245e-01, -1.5001e+00],
        [-3.6639e-03, -5.6111e+00],
        [-1.0167e-03, -6.8918e+00],
        ...,
   

4it [00:08,  2.00s/it]

The out:  tensor([[-0.6942, -0.6921],
        [-1.1500, -0.3807],
        [-1.1858, -0.3645],
        ...,
        [-1.5275, -0.2447],
        [-0.7612, -0.6295],
        [-0.4253, -1.0600]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.3054,  ..., 0.8927, 0.0000, 0.0000],
        [1.2499, 0.0000, 0.0000,  ..., 0.3711, 0.0000, 0.0000],
        [0.0378, 0.0000, 1.2299,  ..., 1.7996, 0.0000, 0.0524],
        ...,
        [0.0000, 0.0000, 0.2894,  ..., 0.5799, 0.0000, 0.0000],
        [0.4058, 0.0000, 0.0000,  ..., 0.0000, 1.7138, 0.0000],
        [0.0000, 0.1565, 0.5538,  ..., 0.0000, 0.0000, 1.1026]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.1571e+00, -3.7745e-01],
        [-1.1921e-07, -1.5642e+01],
        [-3.0994e-06, -1.2697e+01],
        ...,
        [-1.5752e+00, -2.3188e-01],
        [-7.2919e-01, -6.5836e-01],
        [-1.8076e+00, -1.7918e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-2.4012, -0.0950],
        [-0.2754, -1.4242],
        [-0.0953, -2.3982],
        ...,
        [-1.2864, -0.3233],
        [-0.9217, -0.5072],
        [-1.0481, -0.4317]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5633, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1482, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3873, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1931, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.3624, -0.2958],
        [-0.2222, -1.6133],
        [-0.2481, -1.5155],
        ...,
        [-1.2138, -0.3525],
        [-1.3451, -0.3018],
        [-0.9750, -0.4735]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2371

The out:  tensor([[-1.3191, -0.3111],
        [-0.0488, -3.0446],
        [-0.0231, -3.7786],
        ...,
        [-1.7468, -0.1916],
        [-1.2765, -0.3271],
        [-1.2418, -0.3409]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.9104, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0074, 0.0000,  ..., 0.0000, 0.6421, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3011, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1885, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-7.4659e-01, -6.4242e-01],
        [-1.0262e-01, -2.3276e+00],
        [-3.9084e-03, -5.5466e+00],
        ...,
        [-1.2780e+00, -3.2655e-01],
        [-1.9286e+00, -1.5706e-01],
        [-1.0178e+00, -4.4843e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-7.3825e-01, -6.4999e-01],
        [-1.1633e-02, -4.4597e+00],
        [-5.2367e-04, -7.5550e+00],
        ...,
        [-1.7640e+00, -1.8797e-01],
        [-1.4198e+00, -2.7677e-01],
        [-1.4527e+00, -2.6649e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.4981, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0618, 0.0000,  ..., 0.0000, 0.6606, 0.0000],
        [0.0000, 0.0840, 0.0000,  ..., 0.0000, 0.2937, 0.0000],
        [0.0000, 0.0729, 0.0000,  ..., 0.0000, 0.3712, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.3600e-01, -2.0624e+00],
        [-1.0291e-03, -6.8795e+00],
        [-5.4857e-04, -7.5084e+00],
        ...,
        [-2.5306e+00, -8.2961e-02],
        [-1.6186e+00, -2.2086e-01],
        [-1.4726e+00, -2.6049e-01]], device='

5it [00:10,  1.98s/it]

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0726, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0589, 0.0000,  ..., 0.0000, 0.3211, 0.0000],
        [0.0000, 0.0195, 0.0000,  ..., 0.0000, 0.2160, 0.0000],
        [0.0000, 0.0668, 0.0000,  ..., 0.0000, 0.3307, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-3.8516e-02, -3.2759e+00],
        [-6.9513e-03, -4.9723e+00],
        [-1.8833e-03, -6.2757e+00],
        ...,
        [-2.2616e+00, -1.1002e-01],
        [-1.6980e+00, -2.0218e-01],
        [-2.4821e+00, -8.7264e-02]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2666, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0015, 0.0000,  ..., 0.00

The out:  tensor([[-1.5544, -0.2374],
        [-0.4682, -0.9839],
        [-0.4404, -1.0322],
        ...,
        [-0.6801, -0.7064],
        [-0.8005, -0.5962],
        [-0.8294, -0.5732]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-1.4936, -0.2543],
        [-0.1339, -2.0769],
        [-0.1499, -1.9720],
        ...,
        [-0.7048, -0.6816],
        [-0.9641, -0.4802],
        [-0.9315, -0.5008]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..

The out:  tensor([[-4.8259, -0.0081],
        [-0.0428, -3.1729],
        [-0.1639, -1.8891],
        ...,
        [-0.6430, -0.7459],
        [-2.2266, -0.1142],
        [-1.3783, -0.2904]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0421, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0032]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-3.4266, -0.0330],
        [-0.0404, -3.2291],
        [-0.0650, -2.7660],
        ...,
        [-0.6891, -0.6972],
        [-2.6406, -0.0740],
        [-0.9597, -0.4829]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000

The out:  tensor([[-5.2154e+00, -5.4473e-03],
        [-1.0670e-02, -4.5456e+00],
        [-4.2515e-02, -3.1791e+00],
        ...,
        [-5.7478e-01, -8.2743e-01],
        [-5.6093e+00, -3.6705e-03],
        [-8.6041e-01, -5.4989e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0772, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.9596, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.7179, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0501, 0.0000, 0.0000],
        [0.0000, 0.0149, 0.0000,  ..., 0.0000, 0.0000, 0.1553],
        [0.0000, 0.0000, 0.0000,  ..., 0.0180, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-4.5096e+00, -1.1064e-02],
        [-6.0903e-03, -5.1041e+00],
        [-2.2158e-03, -6.1133e+00],
        ...,
        [-5.4932e-01, -8.6119e-01],
        [-4.7161e+00, -8.9905e-03],
        [-1.4610e+00, -2.6396e-01]], device='

The out:  tensor([[-6.4156e+00, -1.6371e-03],
        [-4.5957e-04, -7.6854e+00],
        [-1.6984e-03, -6.3789e+00],
        ...,
        [-4.2278e-01, -1.0649e+00],
        [-3.4594e+00, -3.1955e-02],
        [-1.2921e+00, -3.2114e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.2941, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.3271, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-4.6689, -0.0094],
        [-0.0189, -3.9768],
        [-0.0677, -2.7263],
        ...,
        [-0.4699, -0.9811],
        [-3.3581, -0.0354],
        [-1.3284, -0.3077]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:

6it [00:11,  1.95s/it]

The out:  tensor([[-2.2095e+00, -1.1626e-01],
        [-9.6334e-04, -6.9456e+00],
        [-4.9068e-03, -5.3196e+00],
        ...,
        [-3.1384e-01, -1.3117e+00],
        [-4.2505e+00, -1.4360e-02],
        [-1.5172e+00, -2.4759e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1021, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.3945, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.2516, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0036, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0589, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0122, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-4.2217e+00, -1.4783e-02],
        [-2.0383e-04, -8.4983e+00],
        [-5.3401e-03, -5.2352e+00],
        ...,
        [-4.3696e-01, -1.0385e+00],
        [-4.5360e+00, -1.0775e-02],
        [-1.0453e+00, -4.3324e-01]], device='

The out:  tensor([[-0.8543, -0.5544],
        [-0.3857, -1.1393],
        [-0.5112, -0.9158],
        ...,
        [-0.5674, -0.8370],
        [-0.7309, -0.6568],
        [-0.6110, -0.7827]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-0.9022, -0.5203],
        [-0.3515, -1.2162],
        [-0.5606, -0.8460],
        ...,
        [-0.6738, -0.7129],
        [-0.8989, -0.5226],
        [-0.7882, -0.6063]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..

The out:  tensor([[-0.6879, -0.6984],
        [-0.1412, -2.0274],
        [-0.0177, -4.0430],
        ...,
        [-0.6233, -0.7683],
        [-0.5510, -0.8589],
        [-0.8006, -0.5961]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-0.7681, -0.6234],
        [-0.0229, -3.7895],
        [-0.0148, -4.2205],
        ...,
        [-0.5728, -0.8299],
        [-0.5728, -0.8299],
        [-0.7242, -0.6630]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..

The out:  tensor([[-8.2721e-01, -5.7495e-01],
        [-8.3085e-05, -9.3962e+00],
        [-1.2159e-02, -4.4157e+00],
        ...,
        [-4.0526e-01, -1.0990e+00],
        [-5.3233e-01, -8.8488e-01],
        [-9.0808e-01, -5.1634e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0634, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.7275,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1725,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.8147e-01, -1.7960e+00],
        [-7.6193e-04, -7.1800e+00],
        [-9.4917e-04, -6.9604e+00],
        ...,
        [-4.4334e-01, -1.0269e+00],
        [-5.8504e-01, -8.1437e-01],
        [-8.5349e-01, -5.5500e-01]], device='

The out:  tensor([[-3.3850e-01, -1.2477e+00],
        [-1.8776e-03, -6.2787e+00],
        [-9.7132e-04, -6.9373e+00],
        ...,
        [-4.4091e-01, -1.0313e+00],
        [-3.4301e-01, -1.2366e+00],
        [-1.0216e+00, -4.4633e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.2207,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.4719,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0322,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0551,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0418, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-4.0402e-02, -3.2290e+00],
        [-5.8801e-04, -7.4391e+00],
        [-1.7248e-04, -8.6652e+00],
        ...,
        [-3.6015e-01, -1.1959e+00],
        [-4.8472e-01, -9.5677e-01],
        [-1.2455e+00, -3.3939e-01]], device='

The out:  tensor([[-5.0793e-02, -3.0053e+00],
        [-5.6271e-03, -5.1830e+00],
        [-5.8789e-04, -7.4393e+00],
        ...,
        [-2.6856e-01, -1.4460e+00],
        [-2.5639e-01, -1.4865e+00],
        [-1.5858e+00, -2.2914e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0453,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0018,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0430, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.4308e-01, -1.5335e+00],
        [-4.1937e-03, -5.4763e+00],
        [-1.0023e-03, -6.9060e+00],
        ...,
        [-2.3150e-01, -1.5767e+00],
        [-3.7469e-01, -1.1632e+00],
        [-1.1043e+00, -4.0266e-01]], device='

7it [00:13,  1.99s/it]

The out:  tensor([[-7.3116e-02, -2.6520e+00],
        [-4.0859e-03, -5.5023e+00],
        [-1.3568e-02, -4.3068e+00],
        ...,
        [-2.6353e-01, -1.4625e+00],
        [-3.7761e-01, -1.1568e+00],
        [-8.3907e-01, -5.6583e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0354, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3154, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5865, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.1168, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-6.6636e-02, -2.7416e+00],
        [-4.9889e-04, -7.6035e+00],
        [-4.5528e-04, -7.6948e+00],
        ...,
        [-1.9564e-01, -1.7277e+00],
        [-3.5289e-01, -1.2129e+00],
        [-8.8652e-01, -5.3118e-01]], device='

The out:  tensor([[-1.8002, -0.1806],
        [-0.2057, -1.6826],
        [-0.4819, -0.9613],
        ...,
        [-0.7010, -0.6854],
        [-0.8305, -0.5724],
        [-0.7335, -0.6544]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.1066, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1802, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.1059, -0.4018],
        [-0.3203, -1.2945],
        [-0.1479, -1.9843],
        ...,
        [-0.7049, -0.6815],
        [-0.7961, -0.5998],
        [-0.6981, -0.6882]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.8090, 0.0000,  ..., 0.0000, 0.0000

The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0364, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.7928, -0.6026],
        [-0.0556, -2.9165],
        [-0.1089, -2.2716],
        ...,
        [-0.8166, -0.5833],
        [-1.3489, -0.3005],
        [-0.7487, -0.6405]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1008,

The out:  tensor([[-1.2062e-01, -2.1748e+00],
        [-1.2681e-03, -6.6709e+00],
        [-4.3640e-03, -5.4365e+00],
        ...,
        [-7.4927e-01, -6.4001e-01],
        [-1.2950e+00, -3.2007e-01],
        [-6.1538e-01, -7.7748e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.1887, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0802, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.6040e-01, -1.9092e+00],
        [-1.1509e-03, -6.7677e+00],
        [-9.7632e-04, -6.9323e+00],
        ...,
        [-7.1145e-01, -6.7518e-01],
        [-1.0939e+00, -4.0782e-01],
        [-8.6079e-01, -5.4961e-01]], device='

8it [00:15,  1.86s/it]

The out:  tensor([[-3.0849e-01, -1.3263e+00],
        [-1.8909e-03, -6.2716e+00],
        [-5.9603e-05, -9.7271e+00],
        ...,
        [-7.1560e-01, -6.7119e-01],
        [-1.2363e+00, -3.4315e-01],
        [-6.5192e-01, -7.3614e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0535, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0041, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-6.0358e-01, -7.9154e-01],
        [-4.1670e-03, -5.4826e+00],
        [-1.2885e-03, -6.6549e+00],
        ...,
        [-7.1579e-01, -6.7100e-01],
        [-1.3584e+00, -2.9716e-01],
        [-6.6945e-01, -7.1742e-01]], device='

The out:  tensor([[-1.2565, -0.3350],
        [-0.3873, -1.1360],
        [-0.3966, -1.1165],
        ...,
        [-0.7711, -0.6209],
        [-0.7826, -0.6110],
        [-0.9111, -0.5143]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0344,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.1918, -0.3619],
        [-0.4525, -1.0107],
        [-0.7109, -0.6757],
        ...,
        [-0.7797, -0.6135],
        [-0.8217, -0.5792],
        [-0.8288, -0.5737]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000

The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.8061],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.8483, -0.5588],
        [-0.0731, -2.6517],
        [-0.0190, -3.9714],
        ...,
        [-0.9662, -0.4789],
        [-0.9400, -0.4953],
        [-0.9263, -0.5042]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2392],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.6403],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000,

The out:  tensor([[-1.1400, -0.3854],
        [-0.0576, -2.8824],
        [-0.2098, -1.6647],
        ...,
        [-1.1513, -0.3801],
        [-1.2680, -0.3304],
        [-1.2501, -0.3376]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0767, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.4774],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1072],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0314, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0365, 0.0816,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-4.6005e-01, -9.9765e-01],
        [-5.4571e-04, -7.5137e+00],
        [-2.1114e-02, -3.8684e+00],
        ...,
        [-7.4024e-01, -6.4817e-01],
        [-1.7629e+00, -1.8818e-01],
        [-1.5392e+00, -2.4149e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

9it [00:17,  1.74s/it]

The out:  tensor([[-1.4112e-01, -2.0279e+00],
        [-3.3133e-03, -5.7115e+00],
        [-9.3595e-04, -6.9744e+00],
        ...,
        [-5.7383e-01, -8.2866e-01],
        [-1.2365e+00, -3.4304e-01],
        [-1.3775e+00, -2.9064e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.4564],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.4815],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.2542],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.1013],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0073, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-8.0324e-02, -2.5616e+00],
        [-3.0469e-02, -3.5062e+00],
        [-3.0560e-03, -5.7921e+00],
        ...,
        [-4.9783e-01, -9.3610e-01],
        [-1.1464e+00, -3.8241e-01],
        [-1.9579e+00, -1.5217e-01]], device='

The out:  tensor([[-1.3824, -0.2890],
        [-0.2575, -1.4829],
        [-0.0667, -2.7408],
        ...,
        [-0.7787, -0.6144],
        [-0.7377, -0.6505],
        [-0.6587, -0.7288]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.1345, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 4.8268, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5400, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.0455, -0.4331],
        [-0.2029, -1.6949],
        [-0.2655, -1.4558],
        ...,
        [-0.7108, -0.6758],
        [-0.7064, -0.6801],
        [-0.7068, -0.6797]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000

The out:  tensor([[-0.8436, -0.5624],
        [-0.5074, -0.9214],
        [-0.2838, -1.3981],
        ...,
        [-0.8596, -0.5505],
        [-0.6051, -0.7897],
        [-0.5881, -0.8106]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.4519, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.6781, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.2100, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1044, 0.0000, 0.0000,  ..., 0.0000, 0.1065, 0.0000],
        [0.1552, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.6432, -0.7457],
        [-0.0201, -3.9159],
        [-0.0528, -2.9672],
        ...,
        [-1.4175, -0.2775],
        [-0.6933, -0.6930],
        [-0.5743, -0.8280]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0040, 0.0000, 0.0000,  ..., 0.0000, 0.0000

The out:  tensor([[-2.1443e-01, -1.6451e+00],
        [-4.0984e-03, -5.4992e+00],
        [-5.2546e-04, -7.5514e+00],
        ...,
        [-1.4812e+00, -2.5794e-01],
        [-5.4369e-01, -8.6894e-01],
        [-6.5700e-01, -7.3065e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2759, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.7198, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.2739, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0206, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.1528e-01, -1.6415e+00],
        [-3.4894e-03, -5.6598e+00],
        [-1.8702e-04, -8.5842e+00],
        ...,
        [-1.0601e+00, -4.2530e-01],
        [-5.7394e-01, -8.2851e-01],
        [-5.4979e-01, -8.6056e-01]], device='

The out:  tensor([[-0.7766, -0.6162],
        [-0.0087, -4.7440],
        [-0.0146, -4.2320],
        ...,
        [-1.0349, -0.4389],
        [-0.5015, -0.9304],
        [-0.5064, -0.9230]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5341, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5996, 0.0000],
        [0.0000, 0.0169, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.0432e-01, -1.6885e+00],
        [-8.0720e-04, -7.1223e+00],
        [-1.0011e-02, -4.6091e+00],
        ...,
        [-9.4512e-01, -4.9206e-01],
        [-5.4564e-01, -8.6624e-01],
        [-4.9753e-01, -9.3657e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-1.2117e+00, -3.5336e-01],
        [-5.1757e-02, -2.9870e+00],
        [-1.6327e-03, -6.4184e+00],
        ...,
        [-1.4481e+00, -2.6790e-01],
        [-7.7990e-01, -6.1332e-01],
        [-4.6074e-01, -9.9646e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3635, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.2103, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.4880, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0182, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0151, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0944, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.7906e-02, -3.5928e+00],
        [-4.6949e-03, -5.3636e+00],
        [-2.4816e-04, -8.3015e+00],
        ...,
        [-1.7820e+00, -1.8429e-01],
        [-7.2519e-01, -6.6210e-01],
        [-6.3969e-01, -7.4963e-01]], device='

The out:  tensor([[-2.6189e-01, -1.4679e+00],
        [-3.0136e-03, -5.8061e+00],
        [-1.5520e-04, -8.7708e+00],
        ...,
        [-1.4406e+00, -2.7023e-01],
        [-5.8374e-01, -8.1601e-01],
        [-5.5855e-01, -8.4873e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.5934, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3213, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0091,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0274, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0410, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-7.8089e-02, -2.5887e+00],
        [-1.5425e-03, -6.4751e+00],
        [-7.8198e-05, -9.4569e+00],
        ...,
        [-1.2071e+00, -3.5534e-01],
        [-5.3811e-01, -8.7672e-01],
        [-3.7604e-01, -1.1602e+00]], device='

10it [00:19,  1.96s/it]

The out:  tensor([[-4.8478e-01, -9.5667e-01],
        [-6.6032e-04, -7.3230e+00],
        [-5.8888e-05, -9.7395e+00],
        ...,
        [-9.7565e-01, -4.7312e-01],
        [-7.2268e-01, -6.6446e-01],
        [-5.9802e-01, -7.9829e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.3160, 0.0000],
        [0.0000, 0.0775, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1003, 0.0000,  ..., 0.0000, 0.0471, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0102, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-1.5481e-02, -4.1759e+00],
        [-7.0094e-04, -7.2634e+00],
        [-1.6101e-03, -6.4323e+00],
        ...,
        [-9.5613e-01, -4.8512e-01],
        [-7.0761e-01, -6.7889e-01],
        [-4.1853e-01, -1.0730e+00]], device='




##### result in pandas dataframe

In [51]:
cv_result = pd.DataFrame([acc_list, sens_list, spec_list, mcc_list, auroc_list, auprc_list],
                         index=['Accuracy', 'Sensitivity', 'Specificity', 'MCC', 'AUROC', 'AUPRC'])
cv_result

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Accuracy,0.865385,0.884615,0.903846,0.923077,0.923077,0.923077,0.884615,0.980392,0.921569,0.901961,0.865385,0.942308,0.903846,0.942308,0.884615,0.942308,0.865385,0.960784,0.941176,0.921569
Sensitivity,0.892857,0.892857,0.821429,0.888889,0.851852,0.888889,0.851852,0.962963,0.888889,0.925926,0.892857,0.928571,0.892857,0.888889,0.777778,0.925926,0.851852,0.962963,0.888889,0.925926
Specificity,0.833333,0.875,1.0,0.96,1.0,0.96,0.92,1.0,0.958333,0.875,0.833333,0.958333,0.916667,1.0,1.0,0.96,0.88,0.958333,1.0,0.916667
MCC,0.728907,0.767857,0.824502,0.848889,0.856945,0.848889,0.771852,0.96148,0.845918,0.803409,0.728907,0.884932,0.807723,0.890871,0.791985,0.88527,0.73131,0.921296,0.888889,0.842593
AUROC,0.980655,0.947917,0.962054,0.983704,0.983704,0.977778,0.948148,0.993827,0.986111,0.984568,0.980655,0.947917,0.970982,0.988148,0.982222,0.974815,0.942222,0.993827,0.987654,0.978395
AUPRC,0.97837,0.96451,0.950674,0.982608,0.983256,0.976939,0.930333,0.992892,0.985534,0.984117,0.979032,0.963547,0.963597,0.985628,0.983054,0.971341,0.922398,0.992892,0.986366,0.979155


In [52]:
cv_result['Mean'] = cv_result.mean(axis=1)
cv_result['Mean']

Accuracy       0.914065
Sensitivity    0.890146
Specificity    0.940250
MCC            0.831621
AUROC          0.974765
AUPRC          0.972812
Name: Mean, dtype: float64

In [53]:
print('Cross-validation results:')
print('Stopping Epochs:', epoch_list)

Cross-validation results:
Stopping Epochs: [34, 30, 102, 36, 36, 83, 65, 46, 42, 98, 54, 59, 98, 41, 61, 56, 66, 42, 37, 92]


In [54]:
print(cv_result)

                    0         1         2         3         4         5  \
Accuracy     0.865385  0.884615  0.903846  0.923077  0.923077  0.923077   
Sensitivity  0.892857  0.892857  0.821429  0.888889  0.851852  0.888889   
Specificity  0.833333  0.875000  1.000000  0.960000  1.000000  0.960000   
MCC          0.728907  0.767857  0.824502  0.848889  0.856945  0.848889   
AUROC        0.980655  0.947917  0.962054  0.983704  0.983704  0.977778   
AUPRC        0.978370  0.964510  0.950674  0.982608  0.983256  0.976939   

                    6         7         8         9  ...        11        12  \
Accuracy     0.884615  0.980392  0.921569  0.901961  ...  0.942308  0.903846   
Sensitivity  0.851852  0.962963  0.888889  0.925926  ...  0.928571  0.892857   
Specificity  0.920000  1.000000  0.958333  0.875000  ...  0.958333  0.916667   
MCC          0.771852  0.961480  0.845918  0.803409  ...  0.884932  0.807723   
AUROC        0.948148  0.993827  0.986111  0.984568  ...  0.947917  0.9709

### Testing result

In [55]:
print('Testing progressing...')
model = Net(dim1, dim2, dim3, dropout).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
#optimizer = PCGrad(torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay))

Testing progressing...


In [56]:
model

Net(
  (conv1): RGCNConv(3226, 1340, num_relations=4)
  (lin1): Linear(in_features=1340, out_features=920, bias=True)
  (lin2): Linear(in_features=920, out_features=2, bias=True)
)

In [57]:
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: False
    lr: 0.00195
    maximize: False
    weight_decay: 0.00738
)

In [58]:
auroc_val_max = 0
epoch_count = 0
for epoch in range(500):
    epoch_count += 1
    model_train(train_index)
    auroc_val, acc, sens, spec, mcc, auroc, auprc = model_val(val_index, plus_test=True)
    if auroc_val > auroc_val_max:
        epoch_count = 0
        auroc_val_max = auroc_val
        acc_max = acc
        auroc_max = auroc
        auprc_max = auprc
        sens_max = sens
        spec_max = spec
        mcc_max = mcc
        epoch_max = epoch
    if epoch_count == 30:
        break
        
test_result = pd.Series([acc_max, sens_max, spec_max, mcc_max, auroc_max, auprc_max],
                        index=['Accuracy', 'Sensitivity', 'Specificity', 'MCC', 'AUROC', 'AUPRC'])
print('Testing results at Epoch {}:'.format(epoch_max))

The out:  tensor([[-0.5072, -0.9217],
        [-1.1174, -0.3962],
        [-0.5891, -0.8093],
        ...,
        [-0.6261, -0.7651],
        [-1.2933, -0.3207],
        [-0.5974, -0.7991]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.4576, 0.0000, 0.7840,  ..., 0.0000, 0.2693, 0.7963],
        [0.8565, 0.5925, 0.2407,  ..., 0.6146, 1.7330, 0.3215],
        [1.3922, 1.1361, 0.0000,  ..., 0.7728, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0665, 0.5255, 0.5379],
        [1.9930, 0.5841, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.5670, 0.5696, 0.0000,  ..., 1.0442, 0.0000, 0.0082]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.1093e+00, -1.2934e-01],
        [-1.6998e-04, -8.6801e+00],
        [-4.5662e-03, -5.3913e+00],
        ...,
        [-1.8505e+00, -1.7097e-01],
        [-2.5784e+00, -7.8932e-02],
        [-2.0317e+00, -1.4054e-01]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:

The out:  tensor([[-0.9595, -0.4830],
        [-0.1036, -2.3188],
        [-0.2928, -1.3710],
        ...,
        [-0.9259, -0.5045],
        [-1.0175, -0.4486],
        [-0.7349, -0.6531]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<ReluBackward0>)
The out:  tensor([[-1.2383, -0.3423],
        [-0.0678, -2.7254],
        [-0.0694, -2.7027],
        ...,
        [-0.7035, -0.6829],
        [-0.8244, -0.5771],
        [-0.6965, -0.6898]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..

The out:  tensor([[-1.0630, -0.4238],
        [-0.0463, -3.0953],
        [-0.0640, -2.7809],
        ...,
        [-0.6249, -0.7664],
        [-0.7485, -0.6407],
        [-0.3813, -1.1488]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.0411, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-0.8221, -0.5789],
        [-0.0469, -3.0829],
        [-0.0159, -4.1498],
        ...,
        [-0.5658, -0.8391],
        [-0.9685, -0.4774],
        [-0.4876, -0.9522]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 

The out:  tensor([[-3.2462e-01, -1.2830e+00],
        [-1.0133e-05, -1.1505e+01],
        [-1.4551e-03, -6.5334e+00],
        ...,
        [-4.1527e-01, -1.0793e+00],
        [-7.0395e-01, -6.8246e-01],
        [-2.6527e-01, -1.4567e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)
The emb:  tensor([[0.0000, 0.2405, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0', grad_fn=<ReluBackward0>)
The out:  tensor([[-2.2624e-01, -1.5971e+00],
        [-1.1844e-03, -6.7392e+00],
        [-8.3852e-04, -7.0843e+00],
        ...,
        [-5.7004e-01, -8.3356e-01],
        [-7.7752e-01, -6.1535e-01],
        [-2.6595e-01, -1.4545e+00]], device='

In [59]:
print(test_result)

Accuracy       0.900901
Sensitivity    0.881356
Specificity    0.923077
MCC            0.802864
AUROC          0.939700
AUPRC          0.907457
dtype: float64
