In [1]:
import collections
import re
import torch
from d2l import torch as d2l

There was a problem when trying to write in your cache folder (C:\Users\HP\.cache\huggingface\hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.


In [2]:
class TimeMachine(d2l.DataModule):
    """The Time Machine dataset.

    Subclass of ``d2l.DataModule``. Only the download step is defined in the
    class body; further steps are attached to the class afterwards with
    ``d2l.add_to_class``.
    """

    def _download(self):
        """Fetch ``timemachine.txt`` and return its entire contents as a string.

        The file is obtained through ``d2l.download`` with ``self.root`` as the
        second argument and a hex digest as the third (presumably the target
        directory and an integrity checksum -- confirm against ``d2l.download``).

        Returns:
            str: The raw, unprocessed text of the file.
        """
        fname = d2l.download(d2l.DATA_URL + 'timemachine.txt', self.root,
                             '090b5e7e70c295757f55df93cb0a180b9691891a')
        # Pin the encoding: without it ``open`` uses the locale default, which
        # differs between platforms (e.g. cp1252 on Windows) and can mis-decode
        # or reject bytes that are valid UTF-8.
        with open(fname, encoding='utf-8') as f:
            return f.read()
@d2l.add_to_class(TimeMachine)
def _preprocess(self, text):
    """Normalise raw text to lowercase letters separated by single spaces.

    Every run of characters outside ``A-Z``/``a-z`` is replaced by one space,
    and the result is lowercased.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned, lowercased string.
    """
    letters_and_spaces = re.sub('[^A-Za-z]+', ' ', text)
    return letters_and_spaces.lower()

@d2l.add_to_class(TimeMachine)
def _tokenize(self, text):
    """Split ``text`` into a list holding one element per character.

    Args:
        text: The string to split.

    Returns:
        list: The individual characters of ``text``, in order.
    """
    return [char for char in text]

@d2l.add_to_class(TimeMachine)
def build(self, raw_text, vocab=None):
    """Turn raw text into a list of token indices plus the vocabulary used.

    The text is passed through ``self._preprocess`` and then
    ``self._tokenize``; each resulting token is looked up in ``vocab``.

    Args:
        raw_text: Unprocessed input text.
        vocab: Mapping used for the lookup (indexed as ``vocab[token]``).
            When ``None``, a new ``Vocab`` is built from the tokens.

    Returns:
        tuple: ``(corpus, vocab)`` where ``corpus`` is the list of looked-up
        values, one per token, and ``vocab`` is the mapping that produced it.
    """
    cleaned = self._preprocess(raw_text)
    tokens = self._tokenize(cleaned)
    if vocab is None:
        vocab = Vocab(tokens)
    corpus = []
    for token in tokens:
        corpus.append(vocab[token])
    return corpus, vocab

class Vocab:
    """Two-way mapping between tokens and integer indices.

    Attributes:
        token_freqs: ``(token, count)`` pairs ordered by descending count.
        idx_to_token: Sorted list of the unique tokens kept; a token's
            position in this list is its index.
        token_to_idx: Dict mapping each kept token to its index.
    """

    def __init__(self, tokens=None, min_freq=0, reversed_tokens=None):
        """Count ``tokens`` and build the index tables.

        Args:
            tokens: A flat list of tokens, or a list of token lists (which is
                flattened one level). ``None`` or omitted means no tokens.
            min_freq: Tokens that occur fewer than this many times are left
                out of the vocabulary, so looking them up yields the
                ``'<unk>'`` index.
            reversed_tokens: Extra tokens that are always included, whatever
                their frequency. ``None`` or omitted means none.
        """
        # NOTE(review): ``reversed_tokens`` looks like a typo for
        # "reserved_tokens"; the name is kept so keyword callers keep working.
        # ``None`` sentinels replace the former mutable ``[]`` defaults.
        if tokens is None:
            tokens = []
        if reversed_tokens is None:
            reversed_tokens = []

        # Flatten a list of token lists into a single list of tokens.
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]

        # Count the frequencies.
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)

        # The sorted list of unique tokens; '<unk>' is always present.
        frequent = [token for token, freq in self.token_freqs
                    if freq >= min_freq]
        self.idx_to_token = sorted(set(['<unk>', *reversed_tokens] + frequent))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        """Return the number of distinct tokens, ``'<unk>'`` included."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a list/tuple of tokens, to index/indices.

        Tokens missing from the vocabulary map to the ``'<unk>'`` index.
        Lists and tuples are handled recursively, so nesting is preserved.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a sequence of indices, back to token(s).

        Args:
            indices: A single index, a list/tuple of indices, or another
                sized container of indices.

        Returns:
            A list of tokens for a list/tuple (of any length, including 0 and
            1) and for any other sized container holding more than one
            element; otherwise the single token at ``indices``.
        """
        # Lists/tuples always map element-wise. Previously a list or tuple of
        # length 0 or 1 fell through to ``self.idx_to_token[indices]`` and
        # raised TypeError.
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[int(index)] for index in indices]
        # Other sized containers keep their original behaviour.
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):
        """Index of the ``'<unk>'`` token."""
        return self.token_to_idx['<unk>']