In [9]:
import torch
import torch.nn as nn
# from torch.nn import functional as F

In [10]:
####### PRE-RELEASE CONFIG ########
torch.manual_seed(1337)

<torch._C.Generator at 0x7f3e4209b050>

In [12]:
################################## DATASET LOADING #############################
with open('sentences.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [13]:
################################## TOKENIZATION ################################
def encoder(text: str):
  """
  the encoder() function takes text as an input and return back its tokenized
  form.
  :param text : the text you're willing to encode.
  :return: the tokenized form of the text passed as a parameter.
  """
  chars = sorted(list(set(text)))
  words_to_int_map = { ch:i for i,ch in enumerate(chars) }
  encoded = [words_to_int_map[c] for c in text]
  chars_length = len(chars)
  return encoded, chars_length

def decoder(vector: str):
  """
  the decoder() function takes encoded takes as an input and returns back its
  original form.
  :param vector : the text you're willing to encode.
  :return: the original form of the text passed as a parameter.
  """
  chars = sorted(list(set(text)))
  int_to_words_map = {i:ch for i,ch in enumerate(chars)}
  decoded = ''.join([int_to_words_map[i] for i in vector])
  chars_length = len(chars)
  return decoded, chars_length

In [14]:
############################### DATASET SPLITS #################################
encoded, vocab_size = encoder(text)
######### DATASET = TRAINING SET + TESTING SET ##############
data = torch.tensor(encoded, dtype=torch.long) # The encoded tensor
tl = int(0.8*len(data)) 
train_data = data[:tl]
test_data = data[tl:]

In [28]:
############################## HYPERPARAMETERS #################################
batch_size = 64
block_size = 256 
max_iters = 2000
eval_interval = 200
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 400
n_embd = 300
n_head = 4
n_layer = 4
dropout = 0.2

In [29]:

def batchifier(split):
    """
    batchifier is used to generate a small batch of inputs and targets.
    :param split: the split from which you want to generate the batch.
    :return: the inputs and targets.
    """
    data = train_data if split == 'train' else test_data
    random_set = torch.randint(len(data) - block_size, (batch_size,))
    x_batch= torch.stack([data[i:i+block_size] for i in random_set])
    y_batch = torch.stack([data[i+1:i+block_size+1] for i in random_set])
    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
    return x_batch, y_batch

@torch.no_grad()
def loss_estimator():
    """
    loss_estimator is used to estimate the loss of at every training step.
    """
    out = {}
    model.eval()
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = batchifier(split)
            tokenized_words, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ The head class represents the head of the attention mechanism in the 
    transformer."""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        """
        batches,time,channels = x.shape
        k = self.key(x)
        q = self.query(x)
        weights = q @ k.transpose(-2,-1) * channels**-0.5
        weights = weights.masked_fill(self.tril[:time, :time] == 0, float('-inf'))
        weights = nn.functional.softmax(weights, dim=-1)
        weights = self.dropout(weights)
        v = self.value(x)
        out = weights @ v
        return out

class MultiHeadAttention(nn.Module):
    """ A parallel attention mechanism that uses mulitple heads working in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        MultiHeadAttention.forward() is used to concatenate between different output
        in a parallel context.
        """
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ The Feed Forward Neural Network is used reformulate the inputs in a 
    digestable form."""

    def __init__(self, n_embd):
        super().__init__()
        self.freeforward_net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.freeforward_net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.layer_norm_1 = nn.LayerNorm(n_embd)
        self.layer_norm_2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.layer_norm_1(x))
        x = x + self.ffwd(self.layer_norm_2(x))
        return x

class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        batches, time = idx.shape
        token_embeddings = self.token_embedding_table(idx)
        positional_embedding = self.position_embedding_table(torch.arange(time, device=device))
        x = token_embeddings + positional_embedding
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            batches, time, channels = logits.shape
            logits = logits.view(batches*time, channels)
            targets = targets.view(batches*time)
            loss = nn.functional.cross_entropy(logits, targets)

        return logits, loss

    def generate_tokens(self, wid, max_new_tokens):
        """
        generate_tokens is used to generate new words based on the training that
        has been doing.

        :param wid: is an array that holds the indices in the current context
        of size (B, T).
        :param max_new_tokens: the maximum characters to generate.
        """
      
        for _ in range(max_new_tokens):
            # crop wid to the last block_size tokens
            wid_cond = wid[:, -block_size:]
            # Predict
            tokenized_words, loss = self(wid_cond)
            # Focus only on the last time step
            tokenized_words = tokenized_words[:, -1, :]
            # Apply softmax to get probabilities
            probalistic_dist = nn.functional.softmax(tokenized_words, dim=-1)
            # Sample from the distribution
            wid_next = torch.multinomial(probalistic_dist, num_samples=1)
            # Concatenate sampled index to the running sequence
            wid = torch.cat((wid, wid), dim=1)
        return wid
    
model = Transformer()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'Million Parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = loss_estimator()
        print(f"Step {iter}: Train Loss {losses['train']:.4f}, Test Loss {losses['test']:.4f}")

    # sample a batch of data
    xb, yb = batchifier('train')

    # evaluate the loss
    tokenized_words, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


4.444859 Million Parameters
Step 0: Train Loss 4.2254, Test Loss 4.2274
Step 200: Train Loss 2.2631, Test Loss 2.4177
Step 400: Train Loss 1.0520, Test Loss 2.1896
Step 600: Train Loss 0.1559, Test Loss 3.2147
Step 800: Train Loss 0.0724, Test Loss 4.0043
Step 1000: Train Loss 0.0593, Test Loss 4.3786
Step 1200: Train Loss 0.0519, Test Loss 4.6639
Step 1400: Train Loss 0.0482, Test Loss 4.8161
Step 1600: Train Loss 0.0457, Test Loss 4.8988
Step 1800: Train Loss 0.0439, Test Loss 5.0649
Step 1999: Train Loss 0.0429, Test Loss 5.1515


In [16]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate_tokens(context, max_new_tokens=2000)[0].tolist())) ######### Input 1 - MOHAMED CHOUKRI


is suddenly asleeping alone. Then I turned to Inher tree a pleasy that third I rubbed
slept their as gener, sad attnames that Quess the matter back
159

MOHAMED C H O U K R I

merrip in ordered, and I reached thinking the
trutting stomalking towards throat, wonder object s bend overhead,
how to the Comes dog.
Come on. Get out of the watches. W hen I got a speak up it on
like a litd car neither?
No.
19

MOHAMED C H O UK R I

The tire in bed, he climbed ove his it out into a
room into the alread with all, with
the rapart notice to Tetuan,
and the police of the gatech. What dogs him? Choukri open the way country up
the btablackets.
And looks at the old door is the cows. I continued the bottl
and sitting the darks with it.
W hat a difficulwards the Rif stood two that lefty is not my little. One
generminuto some so for see where it seven inte me, she saids even two
or by the entrance was job in So norry. I thouse evet one
of a busy years footsteps and asking a syuffered, until as I did.


In [21]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate_tokens(context, max_new_tokens=1000)[0].tolist())) ###### Generation 2 - Darija


wach kent li af?
kan nak f jami3 l7alat ljow f merrakch zine.
kat3jbni chemch.
bghit ghir ntbessi bl aSfo f ch7al had nkon rasna awla karta.
hadchi bikhir o 3aSir dial limoun sghir?
o wach bghiti telj fikra dial bghiti l9hwa fih?
Caputshino?
"bghit chi kass kbir dial limoun m3Sor, bla telj."
ayeh mer7at l program dial ter
o kayban lia llah merrak li kaybo lia lina merra o tsowe
jmhdma walo man kay3jboni lia lmossansa marsh la dial chi wa7d yji y3t9na
"ayeh, sir 3ellah, njerbo!"
Aaaaaaah
"katre33ed, yak chi bass makayn?"
bessa7?
"tana mstressi, frask wa7d tiyara ta7t lbar7 f Texas?"
idan 7na f nefs l7ala
Tbib mabghi lia chwia dial ta bghiti.
"mangoulch l kolchi ghadi ykheli l balk melhi.
achmen l3bat 3zaz 3lik?
Poker!
wla 3lach mat9tare7ch chi 7aja
"Ah, nl3b poker."
hadi hia ll3ba piano? kantmenna
kanl3b lpiano
kandir piano merra merra o kayl3b lia lwa7d
o men lkhdma wach kaykhaSS khrj m3a3ndich mak lsin tkhiyya bezzaf dial l3ilaj?
rak kat7eTTini f wD3ia S3iba.
ana rah Tbib nefsi dialk

In [22]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate_tokens(context, max_new_tokens=1000)[0].tolist())) ####### Generation 3 - Darija


mlli tan9der njib.
o chi kass kbir had chi 9Ser nsber l dakchi.
"fekkenti kantSenneT lik o o men hadchi kachfek... anta 3ndich kandir had li kank.
"machi mochkil 3ndi la did bezzaf dial lkhdma, walakin lan."
"mazal 3ndi snani kamlin t9riban, hamdollah!"
bghitih tgherer!
"klghalib atgoul l alkarta diali kayn kanharj 7sen men dima 7aS kon hna fl3mer dialk.
wach ma3ndi l3arD 3la Rachid.
"achi bia ghadi ykhal mo7al l3mer, hadi makan dima ana la makana 3zal 9bel jadawil Darb"
jadawil DDarb?
"ayeh, 3reft fchkel had l9adia"
walakin l ostada diali diali dik lw9t kant kadowwez lwa cha o teha hadial  kaykhiyyna chi sohel.
hadi fikra momtana o lmada lli ghadi nha chab ghir fl9ent, o b9a kayjerr f ser ykh leb bezzaf dial l7lwa bach tkon bikhir.
"fekkertini, kantmenna matkonch daba, bach tbent lia howa tla bghiti."
walakin chkon daba hadik Anya kantkheliwha bou7dha
ila li wellaSSa la khar
o 3ndi moun ghadi yjib lntibah md lkhrin.
"walakin chwia dial ttout bezzaf.
"hadchi kantmenna kanl3ik f jami3 