In [1]:
%load_ext autoreload
%autoreload 2
import os

import torch 
from torch.utils.data import DataLoader

import fairseq
from fairseq import options
from fairseq_cli.preprocess import main as preprocess
from fairseq.data import Dictionary, TokenBlockDataset, MonolingualDataset

## Preprocess dataset Wiki103

In [2]:
raw_dataset = os.path.join('/mnt/dl/fairseq/Language_Model/wikitext-103-v1/wikitext-103/')

In [3]:
preprocessed_dataset = os.path.join(raw_dataset, 'data_bin')

In [4]:
os.listdir(raw_dataset)

['wiki.test.tokens', 'data_bin', 'wiki.valid.tokens', 'wiki.train.tokens']

In [5]:
parser = options.get_preprocessing_parser()

In [6]:
args = parser.parse_args([])

In [7]:
args.only_source = True
args.trainpref = os.path.join(raw_dataset, 'wiki.train.tokens')
args.validpref = os.path.join(raw_dataset, 'wiki.valid.tokens')
args.testpref = os.path.join(raw_dataset, 'wiki.test.tokens')
args.destdir = preprocessed_dataset
args.workers = 6

In [8]:
# TODO uncomment
# preprocess(args)

In [9]:
train_data_path = os.path.join(preprocessed_dataset, "train")

In [10]:
dictionary = Dictionary.load(os.path.join(preprocessed_dataset, 'dict.txt'))

In [11]:
dictionary.symbols[:10]

['<s>', '<pad>', '</s>', '<unk>', 'the', ',', '.', 'of', 'and', 'to']

In [12]:
len(dictionary)

267744

In [13]:
output_dictionary = dictionary

In [14]:
dataset = fairseq.data.data_utils.load_indexed_dataset(train_data_path, dictionary, args.dataset_impl)

2023-06-20 16:52:07 | INFO | fairseq.data.data_utils | loaded 1,801,350 examples from: /mnt/dl/fairseq/Language_Model/wikitext-103-v1/wikitext-103/data_bin/train


In [15]:
dataset

<fairseq.data.indexed_dataset.MMapIndexedDataset at 0x7f3c8ec118b0>

In [16]:
len(dataset)

1801350

In [17]:
dataset.sizes

array([  1,   6,   1, ..., 157,  14,   1], dtype=int32)

In [18]:
for i in range(10):
    string = dictionary.string(dataset[i].unsqueeze(0))
    print(string)
    print("---")



---
= Valkyria Chronicles III =
---

---
Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " .
---
The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series

In [19]:
with open(os.path.join(raw_dataset, 'wiki.train.tokens')) as f:
    for i, line in enumerate(f):
        if i == 10:
            break
        print(line)
        print("---")    


 

---
 = Valkyria Chronicles III = 

---
 

---
 Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . 

---
 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving

In [20]:
tokens_per_sample = 512
max_tokens = 2048
shorten_method = "none"
shorten_data_split_list = ""
seed = 0
split = "train"
sample_break_mode = "none"
split_path = os.path.join(train_data_path)

In [21]:
shorten_dataset = fairseq.data.shorten_dataset.maybe_shorten_dataset(
            dataset,
            split,
            shorten_data_split_list,
            shorten_method,
            tokens_per_sample,
            seed,
        )

In [22]:
shorten_dataset is dataset

True

In [23]:
token_dataset = TokenBlockDataset(shorten_dataset, shorten_dataset.sizes,
                                  block_size=tokens_per_sample,
                                   pad=dictionary.pad(),
                                   eos=dictionary.eos(),
                                   break_mode=sample_break_mode,
                                   include_targets=True,
                                   split_path=split_path,
                                )

In [24]:
token_dataset.sizes

array([512, 512, 512, ..., 512, 512, 141], dtype=uint16)

In [25]:
len(token_dataset[0])

3

In [26]:
dictionary.string(token_dataset[0][0].unsqueeze(0)) # source

'= Valkyria Chronicles III = Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Char

In [27]:
dictionary.string(token_dataset[0][1].unsqueeze(0)) # item

'= Valkyria Chronicles III = Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Char

In [28]:
dictionary.string(token_dataset[0][2].unsqueeze(0)) # past_target

'<pad> = Valkyria Chronicles III = Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers 

In [29]:
dictionary.string(token_dataset[1][0].unsqueeze(0))

'them having a higher difficulty than those found in the rest of the game . There are also love simulation elements related to the game \'s two main heroines , although they take a very minor role . The game \'s battle system , the <unk> system , is carried over directly from <unk> Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills uni

In [30]:
dictionary.string(token_dataset[-1][0].unsqueeze(0))

'podcast , " we [ Bungie ] are focusing on getting our listeners and fans familiar with a bunch of the different faces at Bungie studios . " Brought back after close to a year @-@ long hiatus , the podcast now features Bungie news and interviews with staff members about their jobs and working at the studio . Smith had the title of " Bungie Community Manager " at Bungie , and has given interviews with the press about the company \'s recent products , including Halo 3 : ODST . Smith was among other writers @-@ turned @-@ game developers who held a discussion on the topic at the 2009 Game Developers Conference . Smith worked on player investment for Halo : Reach . He worked as design lead on Bungie \'s 2014 video game Destiny .'

In [31]:
dictionary.bos(), dictionary.eos(), dictionary.pad()

(0, 2, 1)

In [32]:
token_dataset[0][0]

tensor([     2,      2,     12,  52468,  11394,   1141,     12,      2,      2,
        158338,    129,  52468,     94,     45,      3,  11394,     25,    465,
            45, 267683,      5,   6444,      6,  52468,      7,      4,  21647,
            94,     24,      5,   1999,   1149,      9,     21,  52468,  11394,
          1141,    757,    663,      5,     26,     11,   6744,    296,     15,
           551,    290,     84,    436,     23,   5107,      8, 169397,     20,
             4,   2104,  13920,      6,  10455,     10,    226,    310,     10,
           663,      5,     31,     26,      4,    233,     84,     10,      4,
         52468,    127,      6,  65394,      4,    161,   7432,      7,   6744,
             8,    838,     15,     64,   2274,     21,     48,   7454,      5,
             4,    331,    639,   2807,      9,      4,     42,     84,      8,
          1763,      4,     13,  41313,     13,      5,     11,  23595,    411,
          1240,   1795,      4,   1873, 

In [33]:
shorten_dataset[0]

tensor([2])

In [34]:
shorten_dataset[1]

tensor([   12, 52468, 11394,  1141,    12,     2])

In [35]:
shorten_dataset[2]

tensor([2])

In [36]:
shorten_dataset[3]

tensor([158338,    129,  52468,     94,     45,      3,  11394,     25,    465,
            45, 267683,      5,   6444,      6,  52468,      7,      4,  21647,
            94,     24,      5,   1999,   1149,      9,     21,  52468,  11394,
          1141,    757,    663,      5,     26,     11,   6744,    296,     15,
           551,    290,     84,    436,     23,   5107,      8, 169397,     20,
             4,   2104,  13920,      6,  10455,     10,    226,    310,     10,
           663,      5,     31,     26,      4,    233,     84,     10,      4,
         52468,    127,      6,  65394,      4,    161,   7432,      7,   6744,
             8,    838,     15,     64,   2274,     21,     48,   7454,      5,
             4,    331,    639,   2807,      9,      4,     42,     84,      8,
          1763,      4,     13,  41313,     13,      5,     11,  23595,    411,
          1240,   1795,      4,   1873,      7,  86993,     76,      4,   1037,
        166355,    195,     54,   1857, 

In [37]:
shorten_dataset[4]

tensor([    16,     84,    142,    424,     10,    288,      5,   2379,     74,
            11,    213,   1412,      7,      4,    143,    994,     18,  52468,
         11394,    360,      6,    370,     31,   2421,      4,   1257,    523,
             7,      4,    127,      5,     31,     46,   4672,   1441,  17770,
             5,     96,     21,    403,      4,     84,     69,  45644,     20,
           127,  20004,      6,  12040,   3531,      3, 237830,      8,   2620,
         51822,  38407,    125,    371,     28,    441,   8170,      5,    182,
            22,  52468,  11394,    360,    720,  39953,  51883,      6,     77,
           213,    131,      7,   1448,   4832,      4,   1291,      6,     16,
            84,     17,    682,   1256,     14,   4664,     23,    186,  47260,
             6,      2])

In [38]:
token_dataset[1][0]

tensor([   107,    333,     11,    977,   2961,     83,    261,    158,     10,
             4,    684,      7,      4,     84,      6,    298,     39,     46,
           743,  13910,    776,    997,      9,      4,     84,     17,     53,
           272,  20891,      5,    286,     61,    350,     11,    260,   1183,
           296,      6,      2,     16,     84,     17,    521,    236,      5,
             4,      3,    236,      5,     26,    872,     74,   1276,     28,
             3,  11394,      6,    215,   2824,      5,    485,   5768,    185,
          1240,    397,     11,    318,     15,    225,   4019,      7,      4,
          6605,   3808,     45,    536,     11,    235,     26,   1158,      5,
             4,    346,   2840,      4,    235,    168,      4,   6605,     10,
           233,     15,    844,      6,     77,    235,    128,     78,   1096,
           536,    406,     15,    870,      5,     38,    429,    128,     37,
          1915,   1441,   1935,     29, 

In [39]:
dictionary.string(token_dataset[1][0].unsqueeze(0)) 

'them having a higher difficulty than those found in the rest of the game . There are also love simulation elements related to the game \'s two main heroines , although they take a very minor role . The game \'s battle system , the <unk> system , is carried over directly from <unk> Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills uni

In [40]:
dictionary.string(token_dataset[1][1].unsqueeze(0)) 

'having a higher difficulty than those found in the rest of the game . There are also love simulation elements related to the game \'s two main heroines , although they take a very minor role . The game \'s battle system , the <unk> system , is carried over directly from <unk> Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills unique t

In [41]:
dictionary.string(token_dataset[1][2].unsqueeze(0)) 

'of them having a higher difficulty than those found in the rest of the game . There are also love simulation elements related to the game \'s two main heroines , although they take a very minor role . The game \'s battle system , the <unk> system , is carried over directly from <unk> Chronicles . During missions , players select each unit using a top @-@ down perspective of the battlefield map : once a character is selected , the player moves the character around the battlefield in third @-@ person . A character can only act once per @-@ turn , but characters can be granted multiple turns at the expense of other characters \' turns . Each character has a field and distance of movement limited by their Action Gauge . Up to nine characters can be assigned to a single mission . During gameplay , characters will call out if something happens to them , such as their health points ( HP ) getting low or being knocked out by enemy attacks . Each character has specific " Potentials " , skills 

In [42]:
token_dataset[0][0].unsqueeze(0)

tensor([[     2,      2,     12,  52468,  11394,   1141,     12,      2,      2,
         158338,    129,  52468,     94,     45,      3,  11394,     25,    465,
             45, 267683,      5,   6444,      6,  52468,      7,      4,  21647,
             94,     24,      5,   1999,   1149,      9,     21,  52468,  11394,
           1141,    757,    663,      5,     26,     11,   6744,    296,     15,
            551,    290,     84,    436,     23,   5107,      8, 169397,     20,
              4,   2104,  13920,      6,  10455,     10,    226,    310,     10,
            663,      5,     31,     26,      4,    233,     84,     10,      4,
          52468,    127,      6,  65394,      4,    161,   7432,      7,   6744,
              8,    838,     15,     64,   2274,     21,     48,   7454,      5,
              4,    331,    639,   2807,      9,      4,     42,     84,      8,
           1763,      4,     13,  41313,     13,      5,     11,  23595,    411,
           1240,   1795,    

In [43]:
token_dataset[0][1].unsqueeze(0)

tensor([[     2,     12,  52468,  11394,   1141,     12,      2,      2, 158338,
            129,  52468,     94,     45,      3,  11394,     25,    465,     45,
         267683,      5,   6444,      6,  52468,      7,      4,  21647,     94,
             24,      5,   1999,   1149,      9,     21,  52468,  11394,   1141,
            757,    663,      5,     26,     11,   6744,    296,     15,    551,
            290,     84,    436,     23,   5107,      8, 169397,     20,      4,
           2104,  13920,      6,  10455,     10,    226,    310,     10,    663,
              5,     31,     26,      4,    233,     84,     10,      4,  52468,
            127,      6,  65394,      4,    161,   7432,      7,   6744,      8,
            838,     15,     64,   2274,     21,     48,   7454,      5,      4,
            331,    639,   2807,      9,      4,     42,     84,      8,   1763,
              4,     13,  41313,     13,      5,     11,  23595,    411,   1240,
           1795,      4,   1

In [44]:
token_dataset[0][2].unsqueeze(0)

tensor([[     1,      2,      2,     12,  52468,  11394,   1141,     12,      2,
              2, 158338,    129,  52468,     94,     45,      3,  11394,     25,
            465,     45, 267683,      5,   6444,      6,  52468,      7,      4,
          21647,     94,     24,      5,   1999,   1149,      9,     21,  52468,
          11394,   1141,    757,    663,      5,     26,     11,   6744,    296,
             15,    551,    290,     84,    436,     23,   5107,      8, 169397,
             20,      4,   2104,  13920,      6,  10455,     10,    226,    310,
             10,    663,      5,     31,     26,      4,    233,     84,     10,
              4,  52468,    127,      6,  65394,      4,    161,   7432,      7,
           6744,      8,    838,     15,     64,   2274,     21,     48,   7454,
              5,      4,    331,    639,   2807,      9,      4,     42,     84,
              8,   1763,      4,     13,  41313,     13,      5,     11,  23595,
            411,   1240,   1

In [45]:
targets = ["future"]
batch_size = 32

In [46]:
lm_dataset = MonolingualDataset(
            dataset=token_dataset,
            sizes=dataset.sizes,
            src_vocab=dictionary,
            tgt_vocab=output_dictionary,
            add_eos_for_other_targets=False,
            shuffle=False,
            # shuffle=True,
            targets=targets,
            add_bos_token=False,
            fixed_pad_length=None,
            pad_to_bsz=batch_size,
        )

In [47]:
lm_dataset[0]["source"]

tensor([     2,      2,     12,  52468,  11394,   1141,     12,      2,      2,
        158338,    129,  52468,     94,     45,      3,  11394,     25,    465,
            45, 267683,      5,   6444,      6,  52468,      7,      4,  21647,
            94,     24,      5,   1999,   1149,      9,     21,  52468,  11394,
          1141,    757,    663,      5,     26,     11,   6744,    296,     15,
           551,    290,     84,    436,     23,   5107,      8, 169397,     20,
             4,   2104,  13920,      6,  10455,     10,    226,    310,     10,
           663,      5,     31,     26,      4,    233,     84,     10,      4,
         52468,    127,      6,  65394,      4,    161,   7432,      7,   6744,
             8,    838,     15,     64,   2274,     21,     48,   7454,      5,
             4,    331,    639,   2807,      9,      4,     42,     84,      8,
          1763,      4,     13,  41313,     13,      5,     11,  23595,    411,
          1240,   1795,      4,   1873, 

In [48]:
lm_dataset[0]["target"]

tensor([     2,     12,  52468,  11394,   1141,     12,      2,      2, 158338,
           129,  52468,     94,     45,      3,  11394,     25,    465,     45,
        267683,      5,   6444,      6,  52468,      7,      4,  21647,     94,
            24,      5,   1999,   1149,      9,     21,  52468,  11394,   1141,
           757,    663,      5,     26,     11,   6744,    296,     15,    551,
           290,     84,    436,     23,   5107,      8, 169397,     20,      4,
          2104,  13920,      6,  10455,     10,    226,    310,     10,    663,
             5,     31,     26,      4,    233,     84,     10,      4,  52468,
           127,      6,  65394,      4,    161,   7432,      7,   6744,      8,
           838,     15,     64,   2274,     21,     48,   7454,      5,      4,
           331,    639,   2807,      9,      4,     42,     84,      8,   1763,
             4,     13,  41313,     13,      5,     11,  23595,    411,   1240,
          1795,      4,   1873,      7, 

In [49]:
train_dataloader = DataLoader(dataset=lm_dataset, collate_fn=lm_dataset.collater,
                              shuffle=False, batch_size=3)

In [50]:
for i, batch in enumerate(train_dataloader):
    print(batch)
    break

{'id': tensor([0, 1, 2]), 'nsentences': 3, 'ntokens': 1536, 'net_input': {'src_tokens': tensor([[     2,      2,     12,  ...,      5,    105,      7],
        [   107,    333,     11,  ...,      4,   2855,      5],
        [ 15584,     23,     43,  ..., 201798,      8,  12003],
        ...,
        [     1,      1,      1,  ...,      1,      1,      1],
        [     1,      1,      1,  ...,      1,      1,      1],
        [     1,      1,      1,  ...,      1,      1,      1]]), 'src_lengths': tensor([512, 512, 512])}, 'target': tensor([[    2,    12, 52468,  ...,   105,     7,   107],
        [  333,    11,   977,  ...,  2855,     5, 15584],
        [   23,    43, 11484,  ...,     8, 12003,     4],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])}


In [51]:
batch['net_input']['src_tokens']

tensor([[     2,      2,     12,  ...,      5,    105,      7],
        [   107,    333,     11,  ...,      4,   2855,      5],
        [ 15584,     23,     43,  ..., 201798,      8,  12003],
        ...,
        [     1,      1,      1,  ...,      1,      1,      1],
        [     1,      1,      1,  ...,      1,      1,      1],
        [     1,      1,      1,  ...,      1,      1,      1]])

In [52]:
batch['target']

tensor([[    2,    12, 52468,  ...,   105,     7,   107],
        [  333,    11,   977,  ...,  2855,     5, 15584],
        [   23,    43, 11484,  ...,     8, 12003,     4],
        ...,
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1],
        [    1,     1,     1,  ...,     1,     1,     1]])

In [53]:
batch['net_input']['src_tokens'].size()

torch.Size([32, 512])

## Model

In [54]:
from fairseq.models.transformer_lm import TransformerLanguageModel, transformer_lm_baevski_wiki103
from fairseq.tasks.language_modeling import LanguageModelingTask

In [55]:
model_args = options.get_parser("Model", default_task='language_modeling')

In [56]:
transformer_lm_baevski_wiki103(model_args)

In [57]:
model_args.__dict__

{'description': None,
 'argument_default': None,
 'prefix_chars': '-',
 'conflict_handler': 'error',
 '_registries': {'action': {None: argparse._StoreAction,
   'store': argparse._StoreAction,
   'store_const': argparse._StoreConstAction,
   'store_true': argparse._StoreTrueAction,
   'store_false': argparse._StoreFalseAction,
   'append': argparse._AppendAction,
   'append_const': argparse._AppendConstAction,
   'count': argparse._CountAction,
   'help': argparse._HelpAction,
   'version': argparse._VersionAction,
   'parsers': argparse._SubParsersAction,
   'extend': argparse._ExtendAction},
  'type': {None: <function argparse.ArgumentParser.__init__.<locals>.identity(string)>}},
 '_actions': [_HelpAction(option_strings=['-h', '--help'], dest='help', nargs=0, const=None, default='==SUPPRESS==', type=None, choices=None, help='show this help message and exit', metavar=None),
  _StoreTrueAction(option_strings=['--no-progress-bar'], dest='no_progress_bar', nargs=0, const=True, default=Fa

In [58]:
train_data_path

'/mnt/dl/fairseq/Language_Model/wikitext-103-v1/wikitext-103/data_bin/train'

In [59]:
args.data = preprocessed_dataset
args.output_dictionary_size = -1

In [60]:
task = LanguageModelingTask.setup_task(args)

2023-06-20 16:52:11 | INFO | fairseq.tasks.language_modeling | dictionary: 267744 types


In [61]:
model = TransformerLanguageModel.build_model(model_args, task)

In [62]:
model.cuda()

TransformerLanguageModel(
  (decoder): TransformerDecoder(
    (dropout_module): FairseqDropout()
    (embed_tokens): AdaptiveInput(
      (embeddings): ModuleList(
        (0): Sequential(
          (0): Embedding(20000, 1024, padding_idx=1)
          (1): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (1): Sequential(
          (0): Embedding(40000, 256)
          (1): Linear(in_features=256, out_features=1024, bias=False)
        )
        (2): Sequential(
          (0): Embedding(207744, 64)
          (1): Linear(in_features=64, out_features=1024, bias=False)
        )
      )
    )
    (embed_positions): SinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-15): 16 x TransformerDecoderLayerBase(
        (dropout_module): FairseqDropout()
        (self_attn): MultiheadAttention(
          (dropout_module): FairseqDropout()
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, o

In [63]:
model.eval()

TransformerLanguageModel(
  (decoder): TransformerDecoder(
    (dropout_module): FairseqDropout()
    (embed_tokens): AdaptiveInput(
      (embeddings): ModuleList(
        (0): Sequential(
          (0): Embedding(20000, 1024, padding_idx=1)
          (1): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (1): Sequential(
          (0): Embedding(40000, 256)
          (1): Linear(in_features=256, out_features=1024, bias=False)
        )
        (2): Sequential(
          (0): Embedding(207744, 64)
          (1): Linear(in_features=64, out_features=1024, bias=False)
        )
      )
    )
    (embed_positions): SinusoidalPositionalEmbedding()
    (layers): ModuleList(
      (0-15): 16 x TransformerDecoderLayerBase(
        (dropout_module): FairseqDropout()
        (self_attn): MultiheadAttention(
          (dropout_module): FairseqDropout()
          (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
          (v_proj): Linear(in_features=1024, o

In [64]:
with torch.no_grad():
    model(batch['net_input']['src_tokens'].cuda())

