In [12]:
import torch
import torch.nn as nn
from torchtext.data import Field, Dataset, Example, Iterator
import numpy as np
import math
import random

In [13]:
ls ../../context2vec2/src/

dataset.py


In [14]:
# %%writefile ../../context2vec2/src/dataset.py

class WikiDataset:
    def __init__(self, X, batch_size, min_freq, device, pad_token='<PAD>', unk_token='<UNK>', 
                                      bos_token='<BOS>', eos_token='<EOS>', seed=100):
        super().__init__() 
        np.random.seed(seed)
        self.sent_dict = self._gathered_by_lengths(X)
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.bos_token = bos_token
        self.eos_token = eos_token
        self.device = device
        # set up torchtext Fields
        self.sentence_field = Field(use_vocab=True, unk_token=self.unk_token, pad_token=self.pad_token,
                                         init_token=self.bos_token, eos_token=self.eos_token,
                                         batch_first=True, include_lengths=False)
        self.sentence_field_id = Field(use_vocab=False, batch_first=True)
        # build vocal
        self.sentence_field.build_vocab(X, min_freq=min_freq)
        self.vocab = self.sentence_field.vocab
        if self.pad_token: self.pad_idx = self.sentence_field.vocab.stoi[self.pad_token]
        self.dataset = self._create_dataset(self.sent_dict, X)
    
    def get_raw_sentence(self, X):
        return [[self.vocab.itos[idx] for idx in sentence] for sentence in X]   
     
        
    def _gathered_by_lengths(self, X):
        lengths = [(index, len(sent)) for index, sent in enumerate(X)]
        lengths = sorted(lengths, key=lambda x: x[1], reverse=True)

        sent_dict = {}
        current_length = -1
        for i, length in lengths:
            if current_length == length:
                sent_dict[length].append(i)
            else:
                sent_dict[length] = [i]
                current_length = length

        return sent_dict
    
    def _create_dataset(self, sent_dict, X):
        datasets = {}
        _fields = [('sentence', self.sentence_field),
                   ('id', self.sentence_field_id)]
        for length, index in sent_dict.items():
            index = np.array(index)
            items = [*zip(X[index], index[:, numpy.newaxis])]
            datasets[length] = Dataset(self._get_examples(items, _fields), _fields)
        return np.random.permutation(list(datasets.values()))
    
    
    def _get_examples(self, items, fields):
        return [Example.fromlist(item, fields) for item in items]

    
    def get_batch_iter(self, batch_size):

        def sort(data):
            return len(getattr(data, 'sentence'))

        for dataset in self.dataset:
            yield Iterator(dataset=dataset,
                                batch_size=batch_size,
                                sort_key=sort,
                                train=True,
                                repeat=False,
                                device=self.device)

In [22]:
%%writefile ../../context2vec2/src/walker_alias.py

import numpy
# Taken from here 
# https://github.com/chainer/chainer/blob/v5.2.0/chainer/utils/walker_alias.py#L6

class WalkerAlias(object):
    """Implementation of Walker's alias method.
    This method generates a random sample from given probabilities
    :math:`p_1, \\dots, p_n` in :math:`O(1)` time.
    It is more efficient than :func:`~numpy.random.choice`.
    This class works on both CPU and GPU.
    Args:
        probs (float list): Probabilities of entries. They are normalized with
                            `sum(probs)`.
    See: `Wikipedia article <https://en.wikipedia.org/wiki/Alias_method>`_
    """

    def __init__(self, probs):
        prob = numpy.array(probs, numpy.float32)
        prob /= numpy.sum(prob)
        threshold = numpy.ndarray(len(probs), numpy.float32)
        values = numpy.ndarray(len(probs) * 2, numpy.int32)
        il, ir = 0, 0
        pairs = list(zip(prob, range(len(probs))))
        pairs.sort()
        for prob, i in pairs:
            p = prob * len(probs)
            while p > 1 and ir < il:
                values[ir * 2 + 1] = i
                p -= 1.0 - threshold[ir]
                ir += 1
            threshold[il] = p
            values[il * 2] = i
            il += 1
        # fill the rest
        for i in range(ir, len(probs)):
            values[i * 2 + 1] = 0

        assert((values < len(threshold)).all())
        self.threshold = threshold
        self.values = values
        self.use_gpu = False

    def to_gpu(self):
        """Make a sampler GPU mode.
        """
        if not self.use_gpu:
            self.threshold = cuda.to_gpu(self.threshold)
            self.values = cuda.to_gpu(self.values)
            self.use_gpu = True

    def to_cpu(self):
        """Make a sampler CPU mode.
        """
        if self.use_gpu:
            self.threshold = cuda.to_cpu(self.threshold)
            self.values = cuda.to_cpu(self.values)
            self.use_gpu = False

    def sample(self, shape):
        """Generates a random sample based on given probabilities.
        Args:
            shape (tuple of int): Shape of a return value.
        Returns:
            Returns a generated array with the given shape. If a sampler is in
            CPU mode the return value is a :class:`numpy.ndarray` object, and
            if it is in GPU mode the return value is a :class:`cupy.ndarray`
            object.
        """
        if self.use_gpu:
            return self.sample_gpu(shape)
        else:
            return self.sample_cpu(shape)

    def sample_cpu(self, shape):
        ps = numpy.random.uniform(0, 1, shape)
        pb = ps * len(self.threshold)
        index = pb.astype(numpy.int32)
        left_right = (self.threshold[index] < pb - index).astype(numpy.int32)
        return self.values[index * 2 + left_right]

    def sample_gpu(self, shape):
        ps = cuda.cupy.random.uniform(size=shape, dtype=numpy.float32)
        vs = cuda.elementwise(
            'T ps, raw T threshold , raw S values, int32 b',
            'int32 vs',
            '''
            T pb = ps * b;
            int index = __float2int_rd(pb);
            // fill_uniform sometimes returns 1.0, so we need to check index
            if (index >= b) {
              index = 0;
            }
            int lr = threshold[index] < pb - index;
            vs = values[index * 2 + lr];
            ''',
            'walker_alias_sample'
        )(ps, self.threshold, self.values, len(self.threshold))
        return vs
    
  
# import numpy
    
# class WalkerAlias(object):
#     '''
#     This is from Chainer's implementation.
#     You can find the original code at
#     https://github.com/chainer/chainer/blob/v4.4.0/chainer/utils/walker_alias.py
#     This class is
#         Copyright (c) 2015 Preferred Infrastructure, Inc.
#         Copyright (c) 2015 Preferred Networks, Inc.
#     '''
#     def __init__(self, probs):
#         prob = numpy.array(probs, numpy.float32)
#         prob /= numpy.sum(prob)
#         threshold = numpy.ndarray(len(probs), numpy.float32)
#         values = numpy.ndarray(len(probs) * 2, numpy.int32)
#         il, ir = 0, 0
#         pairs = list(zip(prob, range(len(probs))))
#         pairs.sort()
#         for prob, i in pairs:
#             p = prob * len(probs)
#             while p > 1 and ir < il:
#                 values[ir * 2 + 1] = i
#                 p -= 1.0 - threshold[ir]
#                 ir += 1
#             threshold[il] = p
#             values[il * 2] = i
#             il += 1
#         # fill the rest
#         for i in range(ir, len(probs)):
#             values[i * 2 + 1] = 0

#         assert((values < len(threshold)).all())
#         self.threshold = threshold
#         self.values = values

#     def sample(self, shape):
#         ps = numpy.random.uniform(0, 1, shape)
#         pb = ps * len(self.threshold)
#         index = pb.astype(numpy.int32)
#         left_right = (self.threshold[index] < pb - index).astype(numpy.int32)
#         return self.values[index * 2 + left_right]

Writing ../../context2vec2/src/walker_alias.py


In [24]:
%%writefile ../../context2vec2/src/negative_sampling.py

from walker_alias import WalkerAlias
from torch import tensor
import torch
import torch.nn as nn
import numpy as np

def init_embeddings(x):
    x = x.weight.data
    value = 2 / (x.size(1) + 1)
    x.uniform_(-value, value)
    

class NegativeSampling(nn.Module):
    def __init__(self, embed_size, counter, num_neg, power, device, pad_idx):
        super().__init__()
        self.counter = counter
        self.num_neg = num_neg
        self.power = power
        self.device = device
        
        self.W = nn.Embedding(len(counter), embedding_dim=embed_size, padding_idx=pad_idx)
        init_embeddings(self.W)
        # self.W.weight.data.zero_()
        self.log_loss = nn.LogSigmoid()
#         self.sum_log_sampled = t.bmm(noise, input.unsqueeze(2)).sigmoid().log().sum(1).squeeze()
        self.sampler = WalkerAlias(np.power(counter, power))
        
    def negative_sampling(self, shape):
        return tensor(self.sampler.sample(shape=shape), dtype=torch.long, device=self.device)
    
    def forward(self, X, context):
        batch_size, seq_len = X.size()
        embedding = self.W(X)
        pos_loss = self.log_loss((embedding * context).sum(2))

        neg_samples = self.negative_sampling(shape=(batch_size, seq_len, self.num_neg))
        neg_embedding = self.W(neg_samples)
        neg_loss = self.log_loss((-neg_embedding * context.unsqueeze(2)).sum(3)).sum(2)
        return -(pos_loss + neg_loss).sum()
        
        
        

Writing ../../context2vec2/src/negative_sampling.py


In [17]:
# train_path = '../../../../../data/processed/raw_wikitext-2.npy'
# sentences = np.load(train_path)
# batch_size = 100
# min_freq = 1
# device = 'cpu'

# dataset = WikiDataset(sentences, batch_size, min_freq, device)

In [26]:
%%writefile ../../context2vec2/src/model.py

from negative_sampling import NegativeSampling
from torch.nn.init import kaiming_normal
from torch import tensor
import torch
import torch.nn as nn
import numpy as np


def create_embedding_layer(vocab_size, word_embed_size, pad_idx):
    return nn.Embedding(num_embeddings=vocab_size,
                                    embedding_dim=word_embed_size,
                                    padding_idx=pad_idx)

def create_rnn_layer(word_embed_size, hidden_size, n_layers, batch_first, layer_type=nn.LSTM):
    return layer_type(input_size=word_embed_size,
                               hidden_size=hidden_size,
                               num_layers=n_layers,
                               batch_first=batch_first) 

class Context2vec(nn.Module):
    def __init__(self, vocab_size, counter, word_embed_size, hidden_size, n_layers, bidirectional, dropout,
                 pad_idx, device, inference):

        super().__init__()
        self.vocab_size = vocab_size
        self.word_embed_size = word_embed_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.device = device
        self.inference = inference
        self.rnn_output_size = hidden_size
        
        # embedding
        self.left2right_embed = create_embedding_layer(vocab_size, word_embed_size, pad_idx)
        self.right2left_embed = create_embedding_layer(vocab_size, word_embed_size, pad_idx)
        for embed in [self.left2right_embed, self.right2left_embed]:
            init_embeddings(embed)
        # rnn
        self.left2right_rnn = create_rnn_layer(word_embed_size, word_embed_size, n_layers, batch_first=True)
        self.right2left_rnn = create_rnn_layer(word_embed_size, word_embed_size, n_layers, batch_first=True)
        # dropout
        self.dropout = nn.Dropout(dropout)
        # loss
        self.neg_sample_loss = NegativeSampling(hidden_size, counter, pad_idx=pad_idx, num_neg=10, power=0.75,
                                          device=device) # num_neg=10, power=0.75 used in paper
        
        self.top_model = NeuralNet(input_size=hidden_size*2, mid_size=hidden_size*2, output_size=hidden_size,
                                                               dropout=dropout)
        
    def forward(self, X, y, target_pos=None):
        batch_size, seq_len = X.size()
        X_reversed = X.flip(1)[:, :-1]
        X = X[:, :-1]
        
        left2right_embed = self.left2right_embed(X)
        right2left_embed = self.right2left_embed(X_reversed)
        
        left2right_out, _ = self.left2right_rnn(left2right_embed)
        right2left_out, _ = self.right2left_rnn(right2left_embed)
        
        left2right_out = left2right_out[:, :-1, :]
        right2left_out = right2left_out[:, :-1, :].flip(1)
        # TESTING
        if self.inference:
            left2right_out = left2right_out[0, target_pos]
            right2left_out = right2left_out[0, target_pos]
            out = self.top_model(torch.cat((left2right_out, right2left_out), dim=0))
            return out
        # TRAINING 
        else:
            out = self.top_model(torch.cat((left2right_out, right2left_out), dim=2)) # dim = 2
            loss = self.neg_sample_loss(y, out)
            return loss 
        
    def run_inference(self, input_tokens, target, target_pos, k=10):
        context_vector = self.forward(input_tokens, target=None, target_pos=target_pos)
        if target is None:
            topv, topi = ((self.neg_sample_loss.W.weight*context_vector).sum(dim=1)).data.topk(k)
            return topv, topi
        else:
            context_vector /= torch.norm(context_vector, p=2)
            target_vector = self.neg_sample_loss.W.weight[target]
            target_vector /= torch.norm(target_vector, p=2)
            similarity = (target_vector * context_vector).sum()
            return similarity.item()
        
        
        
class NeuralNet(nn.Module):

    def __init__(self, input_size, mid_size, output_size, n_layers=2, dropout=0.3, activation_function='relu'):
        super().__init__()
        self.input_size = input_size
        self.mid_size = mid_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop = nn.Dropout(dropout)

        self.MLP = nn.ModuleList()
        if n_layers == 1:
            self.MLP.append(nn.Linear(input_size, output_size))
        else:
            self.MLP.append(nn.Linear(input_size, mid_size))
            for _ in range(n_layers - 2):
                self.MLP.append(nn.Linear(mid_size, mid_size))
            self.MLP.append(nn.Linear(mid_size, output_size))

        if activation_function == 'tanh':
            self.activation_function = nn.Tanh()
        elif activation_function == 'relu':
            self.activation_function = nn.ReLU()
        else:
            raise NotImplementedError

    def forward(self, x):
        out = x
        for i in range(self.n_layers-1):
            out = self.MLP[i](self.drop(out))
            out = self.activation_function(out)
        return self.MLP[-1](self.drop(out))
    
    
# class NeuralNet(nn.Module):
#     def __init__(self, out_sz, sizes, drops, y_range=None, use_bn=False, f=F.relu)
#     def __init__(self, input_size, mid_size, output_size, dropout):
#         super().__init__()
        
#         self.linear = nn.ModuleList([nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)])
#         self.bns = nn.ModuleList([nn.BatchNorm1d(size) for size in sizes[1:]])
#         for layer in self.linear:
#             kaiming_normal(layer.weight.data)
#         self.dropout = [nn.Dropout(drop) for drop in drops]
#         self.output = nn.Linear(sizes[-1], 1)
#         kaiming_normal(self.output.weight.data)
#         self.f = f
#         self.use_bn = use_bn
            
        
#     def forward(self, X):
#         for linear, drop, norm in zip(self.linear, self.dropout, self.bns):
#             X = self.f(linear(X))
#             if self.use_bn: 
#                 X = norm(X)
#             X = drop(X)
#         X = self.output(X)

Overwriting ../../context2vec2/src/model.py


In [19]:
from torch.nn.init import kaiming_normal


class NeuralNet(nn.Module):

    def __init__(self, input_size, mid_size, output_size, n_layers=2, dropout=0.3, activation_function='relu'):
        super().__init__()
        self.input_size = input_size
        self.mid_size = mid_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.drop = nn.Dropout(dropout)

        self.MLP = nn.ModuleList()
        if n_layers == 1:
            self.MLP.append(nn.Linear(input_size, output_size))
        else:
            self.MLP.append(nn.Linear(input_size, mid_size))
            for _ in range(n_layers - 2):
                self.MLP.append(nn.Linear(mid_size, mid_size))
            self.MLP.append(nn.Linear(mid_size, output_size))

        if activation_function == 'tanh':
            self.activation_function = nn.Tanh()
        elif activation_function == 'relu':
            self.activation_function = nn.ReLU()
        else:
            raise NotImplementedError

    def forward(self, x):
        out = x
        for i in range(self.n_layers-1):
            out = self.MLP[i](self.drop(out))
            out = self.activation_function(out)
        return self.MLP[-1](self.drop(out))
    
    
# class NeuralNet(nn.Module):
#     def __init__(self, out_sz, sizes, drops, y_range=None, use_bn=False, f=F.relu)
#     def __init__(self, input_size, mid_size, output_size, dropout):
#         super().__init__()
        
#         self.linear = nn.ModuleList([nn.Linear(sizes[i], sizes[i+1]) for i in range(len(sizes)-1)])
#         self.bns = nn.ModuleList([nn.BatchNorm1d(size) for size in sizes[1:]])
#         for layer in self.linear:
#             kaiming_normal(layer.weight.data)
#         self.dropout = [nn.Dropout(drop) for drop in drops]
#         self.output = nn.Linear(sizes[-1], 1)
#         kaiming_normal(self.output.weight.data)
#         self.f = f
#         self.use_bn = use_bn
            
        
#     def forward(self, X):
#         for linear, drop, norm in zip(self.linear, self.dropout, self.bns):
#             X = self.f(linear(X))
#             if self.use_bn: 
#                 X = norm(X)
#             X = drop(X)
#         X = self.output(X)

In [27]:
%%writefile ../../context2vec2/main.py

import numpy as np
import os
import time
import torch
from torch import optim
from src.mscc_eval import mscc_evaluation
from src.model import Context2vec
# from src.util.args import parse_args
from src.dataset import WikiDataset
# from src.util.config import Config
# from src.util.io import write_embedding, write_config, read_config, load_vocab
import boto3
from io import BytesIO


array_file = True  
text_file = False
use_s3 = False
use_validation_set = True
# validation_data = '../../../../data/processed/rawwikitext-2-valid.npy'
validation_data = '../../../../../data/processed/rawwikitext-103-valid.npy'
S3_BUCKET = 'handwrittingdetection'
S3_WIKI_TRAIN_PATH = 'data/wiki_train/rawwikitext-2-train.npy'
S3_WIKI_VAL_PATH = 'data/wiki_valid/rawwikitext-2-valid.npy'
log_dir_name = 'logs'
log_filename = 'log_dir1.txt'



train = True
word_embed_size = 300
hidden_size = 300
n_layers = 1
dropout = 0.00
n_epochs = 1
batch_size = 100
min_freq = 1
ns_power = 0.75
learning_rate = 1e-4
gpu_id = 0

def main(train_path):
#     use_cuda = torch.cuda.is_available()
    use_cuda = torch.cuda.is_available() and gpu_id > -1
    max_sent_length = 64
    if use_cuda:
        device = torch.device('cuda:{}'.format(gpu_id))
        torch.cuda.set_device(gpu_id)
    else:
        device = torch.device('cpu')

    if train:
#         batch_size = batch_size
#         n_epochs = n_epochs
#         word_embed_size = word_embed_size
#         hidden_size = hidden_size
#         learning_rate = learning_rate

        
        if use_s3:
            print('Loading Training Data from S3 bucket {} -- {}'.format(S3_BUCKET, S3_WIKI_TRAIN_PATH))
            client = boto3.resource('s3')
            bucket = client.Bucket(S3_BUCKET)
            sentences = np.load(BytesIO(bucket.Object(S3_WIKI_TRAIN_PATH).get()['Body'].read()))
        else:
            sentences = np.load(train_path)
            
        
        
        print('Creating dataset')
        dataset = WikiDataset(sentences, batch_size, min_freq, device)
        counter = np.array([dataset.vocab.freqs[word] if word in dataset.vocab.freqs else 0
                            for word in dataset.vocab.itos])
        model = Context2vec(vocab_size=len(dataset.vocab),
                            counter=counter,
                            word_embed_size=word_embed_size,
                            hidden_size=hidden_size,
                            n_layers=n_layers,
                            bidirectional=True,
                            dropout=dropout,
                            pad_idx=dataset.pad_idx,
                            device=device,
                            inference=False).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)

        print('batch_size:{}, n_epochs:{}, word_embed_size:{}, hidden_size:{}, device:{}'.format(
                                                batch_size, n_epochs, word_embed_size, hidden_size, device))
        print(model)
        
        if use_validation_set:
            if use_s3:
                print('Loading Validation Data from S3 bucket {} -- {}'.format(S3_BUCKET, S3_WIKI_VAL_PATH))
                val_sentences = np.load(BytesIO(bucket.Object(S3_WIKI_VAL_PATH).get()['Body'].read()))
            else:
                val_sentences = np.load(validation_data)
            
            print('Creating Validation dataset')
            val_dataset = WikiDataset(val_sentences, batch_size, min_freq, device)
            val_counter = np.array([val_dataset.vocab.freqs[word] if word in val_dataset.vocab.freqs else 0
                                for word in val_dataset.vocab.itos])
            
        log_dir = os.path.dirname(log_dir_name)
        if log_dir and not os.path.exists(log_dir):
            os.makedirs(log_dir)
            
            
        best_val_score = float('inf')
        print('Training Begins')
        interval = 1e6
        for epoch in range(n_epochs):
            begin_time = time.time()
            cur_at = begin_time
            total_loss = 0.0
            val_total_loss = 0.0
            word_count = 0
            next_count = interval
            last_accum_loss = 0.0
            last_word_count = 0
            
            model.train() 
            for iterator in dataset.get_batch_iter(batch_size):
                for batch in iterator:
                    sentence = getattr(batch, 'sentence')
                    target = sentence[:, 1:-1]
                    if target.size(0) == 0:
                        continue
                    optimizer.zero_grad()
                    loss = model(sentence, target)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.data.mean()

                    minibatch_size, sentence_length = target.size()
                    word_count += minibatch_size * sentence_length
                    accum_mean_loss = float(total_loss)/word_count if total_loss > 0.0 else 0.0
                    if word_count >= next_count:
                        now = time.time()
                        duration = now - cur_at
                        throuput = float((word_count-last_word_count)) / (now - cur_at)
                        cur_mean_loss = (float(total_loss)-last_accum_loss)/(word_count-last_word_count)
                        print('{} words, {:.2f} sec, {:.2f} words/sec, {:.4f} accum_loss/word, {:.4f} cur_loss/word'
                              .format(word_count, duration, throuput, accum_mean_loss, cur_mean_loss))
                        next_count += interval
                        cur_at = now
                        last_accum_loss = float(total_loss)
                        last_word_count = word_count


            
            # ---------
            # VAL PHASE
            model.eval()
            for val_iterator in val_dataset.get_batch_iter(batch_size):
                with torch.no_grad():
                    for batch in val_iterator:
                        val_sentence = getattr(batch, 'sentence')
                        val_target = val_sentence[:, 1:-1]
                        if val_target.size(0) == 0:
                            continue
                        val_loss = model(val_sentence, val_target)
                        val_total_loss += val_loss.data.mean()
            print('Train loss: {} -- Valid loss: {}'.format(total_loss.item(), val_total_loss.item()))
            print()
            
    
        # ---------
            print(os.path.join(log_dir_name, log_filename))
            print(log_dir_name + '/' + log_filename)
            with open(os.path.join(log_dir_name, log_filename), 'a') as f:
                f.write(str(epoch) + ' ' + str(total_loss.item()) + ' ' + str(val_total_loss.item()) + '\n')


Overwriting ../../context2vec2/main.py


In [21]:
train_path = '../../../../../data/processed/rawwikitext-103-valid.npy'
main(train_path)

Creating dataset
batch_size:100, n_epochs:1, word_embed_size:300, hidden_size:300, device:cuda:0
Context2vec(
  (left2right_embed): Embedding(33201, 300, padding_idx=1)
  (right2left_embed): Embedding(33201, 300, padding_idx=1)
  (left2right_rnn): LSTM(300, 300, batch_first=True)
  (right2left_rnn): LSTM(300, 300, batch_first=True)
  (dropout): Dropout(p=0.0)
  (neg_sample_loss): NegativeSampling(
    (W): Embedding(33201, 300, padding_idx=1)
    (log_loss): LogSigmoid()
  )
  (top_model): NeuralNet(
    (drop): Dropout(p=0.0)
    (MLP): ModuleList(
      (0): Linear(in_features=600, out_features=600, bias=True)
      (1): Linear(in_features=600, out_features=300, bias=True)
    )
    (activation_function): ReLU()
  )
)
Creating Validation dataset
Training Begins




1000180 words, 8.02 sec, 124633.15 words/sec, 3.7575 accum_loss/word, 3.7575 cur_loss/word




Train loss: 6812877.0 -- Valid loss: 5959606.0

logs/log_dir1.txt
logs/log_dir1.txt


FileNotFoundError: [Errno 2] No such file or directory: 'logs/log_dir1.txt'

In [None]:
ls ../../../../../data/processed/rawwikitext-103-valid.npy