In [153]:
import torch
import torch.nn as nn

In [6]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce GTX 1660 SUPER'

In [1]:
class TrainerConfig:
    # optimization parameters
    max_epochs = 10
    batch_size = 64
    learning_rate = 3e-4
    betas = (0.9, 0.95)
    grad_norm_clip = 1.0
    weight_decay = 0.1 # only applied on matmul weights
    # learning rate decay params: linear warmup followed by cosine decay to 10% of original
    lr_decay = False
    warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper, but may not be good defaults elsewhere
    final_tokens = 260e9 # (at what point we reach 10% of original LR)
    # checkpoint settings
    ckpt_path = None
    num_workers = 0 # for DataLoader
    writer = None
    
    def __init__(self, **kwargs):
        for k,v in kwargs.items():
            setattr(self, k, v)

In [8]:
hyperparameters = {
            "max_epochs": 75,
            "batch_size": 256,
            "learning_rate": 6e-4,
            "lr_decay": True,
            "warmup_tokens": 512*20,
            "final_tokens": 200,
            "num_workers": 4
        }

In [15]:
tconf = TrainerConfig(**hyperparameters)

In [16]:
tconf.num_workers

4

In [17]:
import dataset


In [18]:
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)
pretrain_dataset

NameError: name 'text' is not defined

In [9]:
import random
import torch
from torch.utils.data import Dataset
import argparse

"""
The input-output pairs (x, y) of the NameDataset are of the following form:

  x: Where was Khatchig Mouradian born?⁇Lebanon⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
  y: □□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□⁇Lebanon⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
  x: Where was Jacob Henry Studer born?⁇Columbus⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
  y: □□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□⁇Columbus⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□

Using the PAD_CHAR characters in y before the ⁇[place] keeps the trainer from
optimizing the model to predict the question, "Where was...".

Note that the NameDataset should take the pretraining_dataset defined in run.py
as an input. This is to allow the vocab specification of the NameDataset to be
the same as that of the pretraining dataset.

You don't need to implement anything in NameDataset.
"""

class NameDataset(Dataset):
    def __init__(self, pretraining_dataset, data):
        self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask
        self.PAD_CHAR = u"\u25A1" # the empty square character, for pad
        self.itos = pretraining_dataset.itos 
        self.stoi = pretraining_dataset.stoi 
        self.block_size = pretraining_dataset.block_size
        self.data = list(data.encode('utf-8').decode('ascii', errors='ignore').split('\n'))

    def __len__(self):
        # returns the length of the dataset
        return len(self.data) - 1

    def __getitem__(self, idx):
        inp, oup = self.data[idx].split('\t')
        x = inp + self.MASK_CHAR + oup + self.MASK_CHAR
        x = x + self.PAD_CHAR*(self.block_size - len(x))
        y = self.PAD_CHAR*(len(inp)-1) + x[len(inp):]
        
        x = x[:-1]
        x = torch.tensor([self.stoi[c] for c in x], dtype=torch.long)
        y = torch.tensor([self.stoi[c] for c in y], dtype=torch.long)
        return x, y


"""
[part e]

Write a class that yields examples of a simplified span corruption objective.
Do not change the signature of the __init__ or __getitem__ functions.

Make sure to implement the full spec for full credit -- we list below the
criteria that must be satisfied for a full implementation.

--------------
Vocabulary Specification

Your vocabulary is to be accessible via two dictionaries:
  self.stoi: a dictionary from characters in the vocabulary to indices of type
      int
  self.itos: a dictionary from indices of type int to characters in the
      vocabulary

Your vocabulary must have the following form: 

  Identifier 0 must be assigned to the unicode element u"\u25A1".
      This is the empty_square_character.
      Further, let self.PAD_CHAR = u"\u25A1"
  Identifier 1 must be assigned to the unicode element u"\u2047".
      This is the doublequestionmark character, which we'll use
      as a sentinel to represent that text is missing from the input
      Further, let self.MASK_CHAR = u"\u2047"
  Identifiers 2, ..., len(self.itos)-1 should be the sorted list of characters
      that appear in the data argument.

--------------
Masking Specification

The __getitem__ function takes an index and returns a data point (x, y) where
x and y are Long tensors of length self.block_size. x encodes the input
sequence, and y encodes the output sequence.

0. Use the idx argument of __getitem__ to retrieve the element of self.data
at the given index. We'll call the resulting data entry a document.

1. Randomly truncate the document to a length no less than 4 characters,
and no more than int(self.block_size*7/8) characters.

- IMPORTANT: You are free to decide how to perform this random truncation, but
make sure that the length is picked _randomly_ (every possible length from 4
to int(self.block_size*7/8) has a chance of being picked) for full credit.

2. Now, break the (truncated) document into three substrings:
    
    [prefix] [masked_content] [suffix]

  In other words, choose three strings prefix, masked_content and suffix
    such that prefix + masked_content + suffix = [the original document].
  The length of [masked_content] should be random, and 1/4 the length of the
    truncated document on average.

- IMPORTANT: You are free to decide how to perform this operation, but
make sure that the length is picked _randomly_ (has a chance of being more or
less than 1/4 the length of the truncated document) for full credit.

3. Rearrange these substrings into the following form:

    [prefix] MASK_CHAR [suffix] MASK_CHAR [masked_content] [pads]
  
  This resulting string, denoted masked_string, serves as the output example.
  Here MASK_CHAR is the masking character defined in Vocabulary Specification,
    and [pads] is a string of repeated PAD_CHAR characters chosen so that the
    entire string is of length self.block_size.
  Intuitively, the [masked_content], a string, is removed from the document and
    replaced with MASK_CHAR (the masking character defined in Vocabulary
    Specification). After the suffix of the string, the MASK_CHAR is seen again,
    followed by the content that was removed, and the padding characters.

4. We now use masked_string to construct the input and output example pair. To
do so, simply take the input string to be masked_string[:-1], and the output
string to be masked_string[1:]. In other words, for each character, the goal is
to predict the next character in the masked string.

5. Making use of the vocabulary that you defined, encode the resulting input
and output strings as Long tensors and return the resulting data point.

----------------
Here are some examples of input-output pairs (x, y):

  x: Khatchig Mouradian. Khatchig Mouradian is a jour⁇and tran⁇nalist, writer ⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
  y: hatchig Mouradian. Khatchig Mouradian is a jour⁇and tran⁇nalist, writer ⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□

  x: Jaco⁇enry ⁇b H⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
  y: aco⁇enry ⁇b H⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□

  x: John Stephen. Born in Glasgow, Steph⁇lder's apprentice on⁇en became a we⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
  y: ohn Stephen. Born in Glasgow, Steph⁇lder's apprentice on⁇en became a we⁇□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□


"""
class CharCorruptionDataset(Dataset):
    def __init__(self, data, block_size):
        self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask
        self.PAD_CHAR = u"\u25A1" # the empty square character, for pad

        chars = list(sorted(list(set(data))))
        assert self.MASK_CHAR not in chars 
        assert self.PAD_CHAR not in chars
        chars.insert(0, self.MASK_CHAR)
        chars.insert(0, self.PAD_CHAR)

        self.stoi = { ch:i for i,ch in enumerate(chars) }
        self.itos = { i:ch for i,ch in enumerate(chars) }

        data_size, vocab_size = len(data), len(chars)
        print('data has %d characters, %d unique.' % (data_size, vocab_size))

        self.block_size = block_size
        self.vocab_size = vocab_size
        self.data = data.split('\n')

    def __len__(self):
        # returns the length of the dataset
        return len(self.data)

    def __getitem__(self, idx):
        doc = self.data[idx]
        truncated_len = int(torch.randint(low=4, high=int(self.block_size * 7/8) + 1, size=(1,))[0]) #random.randint(4, int(self.block_size*7/8))
        truncated_doc = doc[:truncated_len]

        #masked_content_len = int(random.normalvariate(truncated_len/4, 1))
        masked_content_len = int(torch.randint(low=1, high=2*int(truncated_len/4), size=(1,))[0])
        masked_content_index = int(torch.randint(low=0, high=truncated_len - int(truncated_len/4) + 1, size=(1,))[0])
        
        prefix = truncated_doc[:masked_content_index]
        suffix = truncated_doc[masked_content_index + masked_content_len:]
        masked_content = truncated_doc[masked_content_index : masked_content_index + masked_content_len]

        masked_string = prefix + self.MASK_CHAR + suffix + self.MASK_CHAR + masked_content + self.PAD_CHAR*(self.block_size - len(prefix + self.MASK_CHAR + suffix + self.MASK_CHAR + masked_content) + 1)

        x = masked_string[:-1]
        y = masked_string[1:]
        
        x = torch.tensor([self.stoi[c] for c in x], dtype=torch.long)
        y = torch.tensor([self.stoi[c] for c in y], dtype=torch.long)
        return x, y
"""
Code under here is strictly for your debugging purposes; feel free to modify
as desired.
"""

'\nCode under here is strictly for your debugging purposes; feel free to modify\nas desired.\n'

In [11]:
class NameDataset(Dataset):
    def __init__(self, pretraining_dataset, data):
        self.MASK_CHAR = u"\u2047" # the doublequestionmark character, for mask
        self.PAD_CHAR = u"\u25A1" # the empty square character, for pad
        self.itos = pretraining_dataset.itos 
        self.stoi = pretraining_dataset.stoi 
        self.block_size = pretraining_dataset.block_size
        self.data = list(data.encode('utf-8').decode('ascii', errors='ignore').split('\n'))

    def __len__(self):
        # returns the length of the dataset
        return len(self.data) - 1

    def __getitem__(self, idx):
        inp, oup = self.data[idx].split('\t')
        x = inp + self.MASK_CHAR + oup + self.MASK_CHAR
        x = x + self.PAD_CHAR*(self.block_size - len(x))
        y = self.PAD_CHAR*(len(inp)-1) + x[len(inp):]
        
        x = x[:-1]
        x = torch.tensor([self.stoi[c] for c in x], dtype=torch.long)
        y = torch.tensor([self.stoi[c] for c in y], dtype=torch.long)
        return x, y


In [6]:
data = open('../wiki.txt', 'r', encoding='utf-8').read()
data = data.split('\n')

In [16]:
corruption_dataset = CharCorruptionDataset(open('../wiki.txt', encoding='utf-8').read(), 128)
name_dataset = NameDataset(corruption_dataset,open('../birth_places_train.tsv', encoding='utf-8').read())
for _, example in zip(range(4), corruption_dataset):
    x,y = example
    print('x:', ''.join([name_dataset.itos[int(c)] for c in x]))

data has 418352 characters, 256 unique.
x: Khatchig Mouradian. Khatchig⁇ writer and transl⁇ Mouradian is a journalist,□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
x: Jacob Henry Stu⁇ Studer (26 February 1840 Columbus, Ohio - 2 August 1904 New⁇der. Jacob Henry□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
x: Joh⁇ in Glasgow, Step⁇n Stephen. Born□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□
x: Georgina Willis. Georgina Willis is an award winning film ⁇ was born⁇director who□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□□


In [26]:
corruption_dataset.__len__()

2938

In [19]:
for example in corruption_dataset:
    print(example)

(tensor([39, 63, 56, 75, 58, 63, 64, 62,  3, 41, 70, 76, 73, 56, 59, 64, 56, 69,
        14,  3, 39, 63, 56, 75, 58, 63, 64, 62,  3,  1, 70, 76, 73, 69,  1, 41,
        70, 76, 73, 56, 59, 64, 56, 69,  3, 64, 74,  3, 56,  3, 65,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]), tensor([63, 56, 75, 58, 63, 64, 62,  3, 41, 70, 76, 73, 56, 59, 64, 56, 69, 14,
         3, 39, 63, 56, 75, 58, 63, 64, 62,  3,  1, 70, 76, 73, 69,  1, 41, 70,
        76, 73, 56, 59, 64, 56, 69,  3, 64, 74,  3, 56,  3, 65,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  

In [24]:
data = open('../wiki.txt', encoding='utf-8').read()
chars = list(sorted(list(set(data))))
chars

['\n',
 ' ',
 '!',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '\xa0',
 '£',
 '\xad',
 'Á',
 'Å',
 'É',
 'Ó',
 'Ö',
 'Ø',
 'Ü',
 'ß',
 'à',
 'á',
 'ã',
 'ä',
 'å',
 'æ',
 'ç',
 'è',
 'é',
 'ë',
 'í',
 'ï',
 'ñ',
 'ó',
 'ô',
 'ö',
 'ø',
 'ü',
 'ý',
 'ă',
 'ą',
 'ć',
 'Č',
 'č',
 'ě',
 'ğ',
 'ī',
 'İ',
 'ı',
 'ł',
 'ń',
 'ō',
 'Ő',
 'ő',
 'œ',
 'ř',
 'ś',
 'ş',
 'Š',
 'š',
 'ť',
 'ū',
 'Ż',
 'ż',
 'Ž',
 'ž',
 'ș',
 'Γ',
 'Μ',
 'ά',
 'έ',
 'α',
 'γ',
 'η',
 'ι',
 'κ',
 'ν',
 'ο',
 'ρ',
 'ς',
 'τ',
 'υ',
 'ω',
 'ώ',
 'Ј',
 'А',
 'В',
 'Г',
 'И',
 'К',
 'П',
 'Р',
 'С',
 'а',
 'б

In [29]:
corruption_dataset.stoi

{'□': 0,
 '⁇': 1,
 '\n': 2,
 ' ': 3,
 '!': 4,
 '%': 5,
 '&': 6,
 "'": 7,
 '(': 8,
 ')': 9,
 '*': 10,
 '+': 11,
 ',': 12,
 '-': 13,
 '.': 14,
 '/': 15,
 '0': 16,
 '1': 17,
 '2': 18,
 '3': 19,
 '4': 20,
 '5': 21,
 '6': 22,
 '7': 23,
 '8': 24,
 '9': 25,
 ':': 26,
 ';': 27,
 '?': 28,
 'A': 29,
 'B': 30,
 'C': 31,
 'D': 32,
 'E': 33,
 'F': 34,
 'G': 35,
 'H': 36,
 'I': 37,
 'J': 38,
 'K': 39,
 'L': 40,
 'M': 41,
 'N': 42,
 'O': 43,
 'P': 44,
 'Q': 45,
 'R': 46,
 'S': 47,
 'T': 48,
 'U': 49,
 'V': 50,
 'W': 51,
 'X': 52,
 'Y': 53,
 'Z': 54,
 '`': 55,
 'a': 56,
 'b': 57,
 'c': 58,
 'd': 59,
 'e': 60,
 'f': 61,
 'g': 62,
 'h': 63,
 'i': 64,
 'j': 65,
 'k': 66,
 'l': 67,
 'm': 68,
 'n': 69,
 'o': 70,
 'p': 71,
 'q': 72,
 'r': 73,
 's': 74,
 't': 75,
 'u': 76,
 'v': 77,
 'w': 78,
 'x': 79,
 'y': 80,
 'z': 81,
 '\xa0': 82,
 '£': 83,
 '\xad': 84,
 'Á': 85,
 'Å': 86,
 'É': 87,
 'Ó': 88,
 'Ö': 89,
 'Ø': 90,
 'Ü': 91,
 'ß': 92,
 'à': 93,
 'á': 94,
 'ã': 95,
 'ä': 96,
 'å': 97,
 'æ': 98,
 'ç': 99,
 'è

In [3]:
class Example:
    def __init__(self, items):
        self.items = items

    def __getitem__(self, index):
        print(f"__getitem__ method called with index: {index}")
        return self.items[index]

# Creating an instance of the Example class
example_instance = Example([1, 2, 3, 4, 5])



__getitem__ method called with index: 2


3

In [10]:
truncated_len = int(torch.randint(low=4, high=int(128 * 7/8) + 1, size=(1,))[0]) #random.randint(4, int(self.block_size*7/8))
truncated_len

59

In [18]:
masked_content_len = int(torch.randint(low=1, high=2*int(truncated_len/4), size=(1,))[0])
masked_content_len

22

In [32]:
import torch.nn as nn

ln1 = nn.LayerNorm(1)
ln1

LayerNorm((1,), eps=1e-05, elementwise_affine=True)

In [33]:
x=torch.tensor([[1,2,3,4,5,6,7,8,9]], dtype=torch.float)
x

tensor([[1., 2., 3., 4., 5., 6., 7., 8., 9.]])

In [36]:
B=torch.tensor([[1,2,3,4],[5,6,7,8]], dtype=torch.float)
T = torch.tensor([[1,2,3,4],[5,6,7,8]], dtype=torch.float)

In [35]:
key = nn.Linear(3, 4)
key

Linear(in_features=3, out_features=4, bias=True)

In [46]:

x.size()
y = x.view(2,8)


In [47]:
x

tensor([[ 1.7636,  1.1262, -0.8324, -0.8365],
        [ 0.3033, -1.5052,  0.5895, -0.4196],
        [-0.7496,  0.4425, -0.2253, -1.5287],
        [ 0.1431, -1.9453,  0.4785, -2.5301]])

In [48]:
y

tensor([[ 1.7636,  1.1262, -0.8324, -0.8365,  0.3033, -1.5052,  0.5895, -0.4196],
        [-0.7496,  0.4425, -0.2253, -1.5287,  0.1431, -1.9453,  0.4785, -2.5301]])

In [76]:
key = nn.Linear(3, 4,False)
key(x)

tensor([[-0.1884, -0.0708,  0.3601, -0.5366],
        [-0.3250, -0.1338, -0.1603, -1.2233],
        [ 0.3715,  0.0050,  0.2308, -0.3758],
        [ 0.7728,  0.0558, -0.2291, -0.3763]], grad_fn=<MmBackward0>)

In [59]:
x

tensor([[ 0.9786,  1.4042,  0.0904],
        [-0.6065,  1.9472, -0.8633],
        [ 1.8183,  1.0048, -0.2881],
        [ 1.3286,  0.4521, -1.0205]])

In [53]:
key

Linear(in_features=3, out_features=4, bias=True)

In [3]:
import torch.nn as nn
tok_emb = nn.Embedding(12, 13)

In [4]:
tok_emb

Embedding(12, 13)

In [7]:
tok_emb([3])

TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [10]:
import torch
import torch.nn as nn


In [14]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)


In [15]:
input

tensor([[ 0.3685, -0.4384, -0.4744,  0.3143, -0.5882],
        [-0.1096,  0.4474,  1.0513, -0.8226,  2.0705],
        [-1.2948, -1.6219, -0.7120,  1.8780, -0.1090]], requires_grad=True)

In [16]:
target

tensor([2, 4, 1])

In [17]:
output = loss(input, target)
output.backward()

In [22]:
output

tensor(2.1016, grad_fn=<NllLossBackward0>)

In [21]:
import torch
import torch.nn.functional as F

# Assume we have 5 samples and 3 classes
logits = torch.randn(5, 3)  # Random logits
targets = torch.tensor([0, 1, 2, 0, 1])  # Ground truth labels

# Compute cross entropy loss
loss = F.cross_entropy(logits, targets)

print(loss)

tensor(1.3299)


In [23]:
logits

tensor([[ 0.2323,  0.2462,  0.5343],
        [-0.4068,  0.3998,  1.1895],
        [ 0.0352, -0.4770, -1.1875],
        [-0.9014, -0.5229,  0.1068],
        [ 0.7278,  1.7265,  1.0562]])

In [24]:
targets

tensor([0, 1, 2, 0, 1])

In [47]:
import torch
import torch.nn as nn
import torch.optim as optim

# Assume we have 5 samples and 3 classes
logits = torch.randn(5, 3, requires_grad=True)  # Random logits
targets = torch.tensor([0, 1, 2, 0, 1])  # Ground truth labels

# Define the loss function
loss_fn = nn.CrossEntropyLoss()

# Compute the loss
loss = loss_fn(logits, targets)

# Create an optimizer
optimizer = optim.SGD([logits], lr=0.3)


In [48]:
logits

tensor([[-0.3789, -0.0390, -1.4545],
        [-2.5439,  0.9085,  1.1939],
        [ 1.2779,  0.5343, -1.7550],
        [-0.1842, -0.6656,  0.8557],
        [ 0.1732,  1.1750,  0.4120]], requires_grad=True)

In [49]:
target

tensor([2, 4, 1])

In [50]:
loss

tensor(1.4844, grad_fn=<NllLossBackward0>)

In [51]:
optimizer.zero_grad()


In [52]:
loss.backward()


In [53]:
loss

tensor(1.4844, grad_fn=<NllLossBackward0>)

In [54]:
optimizer.step()

In [55]:
logits

tensor([[-0.3408, -0.0697, -1.4620],
        [-2.5447,  0.9431,  1.1601],
        [ 1.2385,  0.5156, -1.6969],
        [-0.1377, -0.6740,  0.8175],
        [ 0.1612,  1.2022,  0.3967]], requires_grad=True)

In [56]:
loss = loss_fn(logits, targets)


In [57]:
loss

tensor(1.4352, grad_fn=<NllLossBackward0>)

In [61]:
import pprint
pp = pprint.PrettyPrinter()

In [66]:
# Create an example tensor
# requires_grad parameter tells PyTorch to store gradients
x = torch.tensor([2.], requires_grad=True)

# Print the gradient if it is calculated
# Currently None since x is a scalar
pp.pprint(x.grad)

None


In [63]:
# Calculating the gradient of y with respect to x
y = x * x * 3 # 3x^2
y.backward()
pp.pprint(x.grad) # d(y)/d(x) = d(3x^2)/d(x) = 6x = 12

tensor([12.])


In [69]:
z = x * x * 3 # 3x^2
z.backward()
pp.pprint(x.grad)

tensor([36.])


In [71]:
input = torch.ones(2,3,4)
input

tensor([[[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]],

        [[1., 1., 1., 1.],
         [1., 1., 1., 1.],
         [1., 1., 1., 1.]]])

In [72]:
class MultilayerPerceptron(nn.Module):

  def __init__(self, input_size, hidden_size):
    # Call to the __init__ function of the super class
    super(MultilayerPerceptron, self).__init__()

    # Bookkeeping: Saving the initialization parameters
    self.input_size = input_size 
    self.hidden_size = hidden_size 

    # Defining of our layers
    self.linear = nn.Linear(self.input_size, self.hidden_size)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(self.hidden_size, self.input_size)
    self.sigmoid = nn.Sigmoid()
    
  def forward(self, x):
    linear = self.linear(x)
    relu = self.relu(linear)
    linear2 = self.linear2(relu)
    output = self.sigmoid(linear2)
    return output

In [73]:
import torch.optim as optim

In [119]:
# Create the y data
y = torch.ones(10, 5)

# Add some noise to our goal y to generate our x
# We want out model to predict our original data, albeit the noise
x = y + torch.randn_like(y)
x

tensor([[ 2.2380,  1.9940,  0.8967,  0.6325,  1.7934],
        [ 0.9121,  0.5858,  0.6514,  1.2518,  0.4956],
        [ 1.5544,  2.1912,  2.4079,  1.2920,  1.2117],
        [ 2.5505,  0.3734,  1.1715,  1.5561,  0.7659],
        [ 0.0755,  2.0241,  1.2270, -0.0053,  0.9008],
        [ 0.4248,  1.0350,  2.1698,  1.8442,  2.2809],
        [-1.1347,  1.4964,  0.0485,  1.3198,  1.4059],
        [ 0.1856,  1.6858, -0.8963, -1.1318, -0.5919],
        [ 2.4441,  2.5690,  0.3633,  0.8984,  2.2018],
        [-0.1601,  0.8451,  1.0034,  0.8799, -0.4442]])

In [130]:
# Instantiate the model
model = MultilayerPerceptron(5, 3)

# Define the optimizer
adam = optim.Adam(model.parameters(), lr=1e-1)

# Define loss using a predefined loss function
loss_function = nn.CrossEntropyLoss()

# Calculate how our model is doing now
y_pred = model(x)
loss_function(y_pred, y).item()

8.098772048950195

In [111]:
model(x)

tensor([[0.4714, 0.4420, 0.3863, 0.3196, 0.4875],
        [0.4694, 0.4836, 0.3220, 0.2516, 0.4793],
        [0.4005, 0.5518, 0.3147, 0.2755, 0.5232],
        [0.5000, 0.4244, 0.3970, 0.3418, 0.4558],
        [0.4848, 0.4476, 0.3685, 0.3053, 0.4690],
        [0.4806, 0.4635, 0.3460, 0.2800, 0.4703],
        [0.4164, 0.4360, 0.4340, 0.3516, 0.5593],
        [0.4033, 0.4849, 0.3857, 0.3200, 0.5524],
        [0.5109, 0.4437, 0.3584, 0.3036, 0.4371],
        [0.4358, 0.4745, 0.4125, 0.3974, 0.4979]], grad_fn=<SigmoidBackward0>)

In [112]:
y

tensor([[1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1.]])

In [128]:
for epoch in range(1000):
  # Set the gradients to 0
  adam.zero_grad()

  # Get the model predictions
  y_pred = model(x)

  # Get the loss
  loss = loss_function(y_pred, y)

  # Print stats
  print(f"Epoch {epoch+1}: traing loss: {loss}")

  # Compute the gradients
  loss.backward()

  # Take a step to optimize the weights
  adam.step()


Epoch 1: traing loss: 8.047189712524414
Epoch 2: traing loss: 8.047189712524414
Epoch 3: traing loss: 8.047189712524414
Epoch 4: traing loss: 8.047189712524414
Epoch 5: traing loss: 8.047189712524414
Epoch 6: traing loss: 8.047189712524414
Epoch 7: traing loss: 8.047189712524414
Epoch 8: traing loss: 8.047189712524414
Epoch 9: traing loss: 8.047189712524414
Epoch 10: traing loss: 8.047189712524414
Epoch 11: traing loss: 8.047189712524414
Epoch 12: traing loss: 8.047189712524414
Epoch 13: traing loss: 8.047189712524414
Epoch 14: traing loss: 8.047189712524414
Epoch 15: traing loss: 8.047189712524414
Epoch 16: traing loss: 8.047189712524414
Epoch 17: traing loss: 8.047189712524414
Epoch 18: traing loss: 8.047189712524414
Epoch 19: traing loss: 8.047189712524414
Epoch 20: traing loss: 8.047189712524414
Epoch 21: traing loss: 8.047189712524414
Epoch 22: traing loss: 8.047189712524414
Epoch 23: traing loss: 8.047189712524414
Epoch 24: traing loss: 8.047189712524414
Epoch 25: traing loss: 8.

In [126]:
x

tensor([[ 2.2380,  1.9940,  0.8967,  0.6325,  1.7934],
        [ 0.9121,  0.5858,  0.6514,  1.2518,  0.4956],
        [ 1.5544,  2.1912,  2.4079,  1.2920,  1.2117],
        [ 2.5505,  0.3734,  1.1715,  1.5561,  0.7659],
        [ 0.0755,  2.0241,  1.2270, -0.0053,  0.9008],
        [ 0.4248,  1.0350,  2.1698,  1.8442,  2.2809],
        [-1.1347,  1.4964,  0.0485,  1.3198,  1.4059],
        [ 0.1856,  1.6858, -0.8963, -1.1318, -0.5919],
        [ 2.4441,  2.5690,  0.3633,  0.8984,  2.2018],
        [-0.1601,  0.8451,  1.0034,  0.8799, -0.4442]])

In [129]:
model(x)

tensor([[0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536],
        [0.5536, 0.5536, 0.5536, 0.5536, 0.5536]], grad_fn=<SigmoidBackward0>)

In [104]:
import torch.nn as nn

# Define a simple model
model = nn.Sequential(
    nn.Linear(10, 5),
    nn.ReLU(),
    nn.Linear(5, 2),
)

# Print model parameters
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values : {param[:2]} \n")

Layer: 0.weight | Size: torch.Size([5, 10]) | Values : tensor([[ 0.2020,  0.2826, -0.2620, -0.2888,  0.2682, -0.2204, -0.2793,  0.3060,
          0.2224,  0.0625],
        [-0.1405,  0.0901,  0.1986, -0.0807, -0.0470, -0.2374,  0.2201, -0.0554,
          0.1359,  0.0497]], grad_fn=<SliceBackward0>) 

Layer: 0.bias | Size: torch.Size([5]) | Values : tensor([0.2226, 0.2083], grad_fn=<SliceBackward0>) 

Layer: 2.weight | Size: torch.Size([2, 5]) | Values : tensor([[-0.0994,  0.2941, -0.3815, -0.2457,  0.3082],
        [ 0.1129, -0.1426,  0.1425, -0.4010, -0.0539]],
       grad_fn=<SliceBackward0>) 

Layer: 2.bias | Size: torch.Size([2]) | Values : tensor([-0.0136,  0.0030], grad_fn=<SliceBackward0>) 



In [134]:
key = nn.Linear(128, 128)
    

Linear(in_features=128, out_features=128, bias=True)

In [144]:
m = nn.Linear(20, 30)
input = torch.randn(128, 20)
output = m(input)
print(output.size())

torch.Size([128, 30])


In [145]:
output

tensor([[ 1.4549,  0.0586, -1.0942,  ..., -0.4846,  0.6926,  0.8425],
        [ 1.6630,  0.7181, -0.3331,  ..., -0.7131,  0.0490, -0.3553],
        [-0.1423,  0.4225,  0.6148,  ...,  0.2812, -0.6399,  0.7453],
        ...,
        [-0.3899, -0.1400,  0.4106,  ...,  0.2905, -0.7879,  0.5667],
        [ 0.2438, -0.1464,  0.4203,  ...,  0.2475, -0.3726,  1.9641],
        [ 0.3818,  0.9384, -0.0820,  ...,  0.4024,  0.0724,  0.1813]],
       grad_fn=<AddmmBackward0>)

In [150]:
# NLP Example
batch, sentence_length, embedding_dim = 20, 5, 10
embedding = torch.randn(batch, sentence_length, embedding_dim)
layer_norm = nn.LayerNorm(embedding_dim)
# Activate module
layer_norm(embedding)
# Image Example
N, C, H = 2,3,4
input = torch.randn(N, C, H)
# Normalize over the last three dimensions (i.e. the channel and spatial dimensions)
# as shown in the image below
layer_norm = nn.LayerNorm([C, H])
output = layer_norm(input)

In [151]:
input

tensor([[[ 0.8793, -0.4320, -0.3556, -1.0118],
         [-0.2783, -0.2801, -1.4074,  1.0380],
         [-1.5029,  0.6193,  1.4001, -0.5501]],

        [[-1.3142, -0.5088, -0.3630,  0.3931],
         [ 1.2143, -2.4591, -1.6205,  0.3985],
         [ 0.7279,  0.5629,  0.1058, -0.1416]]])

In [152]:
output

tensor([[[ 1.1387, -0.3024, -0.2185, -0.9397],
         [-0.1336, -0.1355, -1.3745,  1.3131],
         [-1.4795,  0.8529,  1.7112, -0.4322]],

        [[-1.0339, -0.2511, -0.1094,  0.6254],
         [ 1.4235, -2.1466, -1.3316,  0.6306],
         [ 0.9508,  0.7904,  0.3462,  0.1057]]],
       grad_fn=<NativeLayerNormBackward0>)

In [155]:
C = nn.Parameter(nn.init.xavier_uniform_(torch.empty(1, 128, 210)))

In [164]:
A= nn.Parameter(nn.init.xavier_uniform_(torch.empty(1, 2, 2)))
A

Parameter containing:
tensor([[[0.6377, 0.7076],
         [0.2699, 0.0889]]], requires_grad=True)