# This is quite important for practice in ML/DL model building, always start from a sample set from the large data set.

In [1]:
import os
import math
import copy
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import jieba
import nltk
from nltk import word_tokenize
from collections import Counter
from torch.autograd import Variable

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# NLTK resource https://www.nltk.org/data.html
# Download resources for part-of-speech tagging
nltk.download('averaged_perceptron_tagger')

# Download WordNet resources (for tasks like synonym extraction)
nltk.download('nltk_data/corpora/wordnet')

# Download Open Multilingual WordNet resource
nltk.download('nltk_data/corpora/omw-1.4')

# Download pre-trained model for sentence tokenization (especially for English)
nltk.download('punkt')


#################### Corpora ###############################
# Download the Brown Corpus
nltk.download('brown')

# Download a collection of English texts from Project Gutenberg
nltk.download('gutenberg')

# Download other corpora (replace names with desired ones)
nltk.download('shakespeare')
nltk.download('cmudict')
nltk.download('cess_cat')

############## Stop words ######################################
# Download stopwords for a specific language (replace 'english' with the code)
nltk.download('stopwords/english')

# Download stopwords for other languages (e.g., 'french', 'german')
nltk.download('stopwords/<language_name>')

############ Additional resource#################
# Download gazetteers (geographical name lists)
nltk.download('gazetteers')

# Download names (personal name lists)
nltk.download('names')

# Download data for Snowball stemmers
nltk.download('snowball_data')

# Download Wall Street Journal parsed corpus (for advanced tasks)
nltk.download('treebank')

# Download sample tweets from Twitter
nltk.download('twitter_samples')




[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Error loading nltk_data/corpora/wordnet: Package
[nltk_data]     'nltk_data/corpora/wordnet' not found in index
[nltk_data] Error loading nltk_data/corpora/omw-1.4: Package
[nltk_data]     'nltk_data/corpora/omw-1.4' not found in index
[nltk_data] Downloading package punkt to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package shakespeare to
[nltk_data]     /home/loveplay1983/nltk_data...
[nltk_data]   Package shakes

True

In [3]:
#  Init parameters

UNK = 0 # unknow word-id
PAD = 1 # padding word-id
BATCH_SIZE = 64

DEBUG = True
# DEBUG = False # model building, GPU CUDA is preferred

if DEBUG:
    EPOCHS = 2
    LAYERS = 3
    H_NUM = 8
    D_MODEL = 128
    D_FF = 256
    DROPOUT = 0.1
    MAX_LENGTH = 60
    TRAIN_FILE = "./data/nmt/en-cn/train_mini.txt"
    DEV_FILE = "./data/nmt/en-cn/dev_mini.txt"
    SAVE_FILE = "./save/models/model.pt"

else:
    EPOCHS = 20
    LAYERS = 6
    H_NUM = 8
    D_MODEL = 256
    D_FF = 1024
    DROPUT = .1
    MAX_LENGTH = 60
    TRAIN_FILE = "./data/nmt/en-cn/train.txt"
    DEV_FILE = "./data/nmt/en-cn/dev.txt"
    SAVE_FILE = "./save/models/large_model.pt"

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Preprocessing 
1. Load the sentence and tokenize the sentence and add start/end marks(Begin of Sentence /End of Sentence vs BOS/ EOS).
2. Build dictionaries including ‘word-to-id’ and inverted dictionary ‘id-to-word’: English and Chinese, ‘word: index}, i.e, {‘english’: 1234}, {1234: ‘english’}.
3. Sort the dictionaries to reduce padding.
4. Split the dataset into patches for training and validation.

In [4]:
def seq_padding(X, padding=0):
    """
    Add padding to a batch of data
    """
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([
            x, [padding] * (ML - len(x))
        ]) if len(x) < ML else x for x in X
    ])

In [5]:
class PrepareData:
    def __init__(self, train_file, dev_file):
        # 1. Read the data and tokenize
        self.train_en, self.train_cn = self.load_data(train_file)
        self.dev_en, self.dev_cn = self.load_data(dev_file)

        # 2. build dictionary: En and CN
        self.en_word_dict, self.en_total_words, self.en_index_dict = self.build_dict(self.train_en)
        self.cn_word_dict, self.cn_total_words, self.cn_index_dict = self.build_dict(self.train_cn)

        # 3. word to id by dictionary
        self.train_en, self.train_cn = self.wordToID(self.train_en, self.train_cn, 
                                                     self.en_word_dict, self.cn_word_dict)
        self.dev_en, self.dev_cn = self.wordToID(self.dev_en, self.dev_cn, 
                                                 self.en_word_dict, self.cn_word_dict)

        # 4. batch, padding, and masking
        self.train_data = self.splitBatch(self.train_en, self.train_cn, BATCH_SIZE)
        self.dev_data = self.splitBatch(self.dev_en, self.dev_cn, BATCH_SIZE)

    # Utility functions
    def load_data(self, path):
        """
        read data, tokenize the seence and add start and end marks(bos, eos)
        for example:
        en = [
            ["BOS", "i", "love", "you", "EOS"],
            ["BOS", "me", "too", "EOS"],
            ...
        ]
        cn = [
            ["BOS", "我", "爱", "你", "EOS"],
            ["BOS", "我", "也", ,"是", "EOS"],
            ...
        ]
        """
        en = []
        cn = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip().split("\t")
                en.append(["BOS"] + word_tokenize(line[0].lower()) + ["EOS"])
                cn.append(["BOS"] + word_tokenize(" ".join([w for w in line[1]])) + ["EOS"])
        return en, cn
    
    def build_dict(self, sentences, max_words = 50000):
        """
        sentences: list of word list
        build dictionary as {key(word): value(id)}
        """
        word_count = Counter()
        for setence in sentences:
            for s in sentence:
                word_count[s] += 1
                
        ls = word_count.most_common(max_words)
        total_words = len(ls) + 2 # BOS + EOS = 2
        word_dict = {w[0]: index + 2  for index, w in enumerate(ls)}
        word_dict["UNK"] = UNK
        word_dict["PAD"] = PAD
        # inverted index:  {key(id): value(word)}
        index_dict = {v: k for k, v in word_dict.items()}
        return word_dict, total_words, index_dict
        
        
    def wordToID(self, en, cn, en_dict, cn_dict, sort=True):
        """
        convert input/output word lists to id lists
        use input word list length to sort, reduce padding
        """
        length = len(en)
        out_en_ids = [[en_dict.get(w, 0) for w in sent] for sent in en]
        out_cn_ids = [[cn_dict.get(w, 0) for w in sent] for sent in cn]
        
        def len_argsort(seq):
            """
            get sorted index w.r.t length.
            """
            
            return sorted(range(len(seq)), key=lambda x: len(seq[x]))
        
        if sort:
            sorted_index = len_argsort(out_en_ids) # English
            out_en_ids = [out_en_ids[id] for id in sorted_index]
            out_cn_ids = [out_cn_ids[id] for id in sorted_index]
            
        return out_en_ids, out_cn_Ids
    
    def splitBatch(self, en, cn, batch_size, shuffle=True):
        """
        get data into batches
        """
        idx_list = np.arange(0, len(en), batch_size) # start, stop, step
        if shuffle:
            np.random.shuffle(idx_list)

        batch_indexs = []
        for idx in idx_list:
            # batch index between current index and the min index o
            batch_indexs.append(np.arange(idx, min(idx+batch_size, len(en)))) 

        batches = []
        for batch_index in batch_indexs:
            batch_en = [en[index] for index in batch_index]
            batch_cn = [cn[index] for index in batch_index]
            # paddings: batch, batch_size, batch_maxlen
            batch_cn = seq_padding(batch_cn)
            batch_en = seq_padding(batch_en)
            # Batch class will be defined later which is the masking batch of data during training
            # "Object for holding a batch of data with mask during training."
            batches.append(Batch(batch_en, batch_cn)) 
            
        return batches

In [6]:
# class Batch:
#     "Object for holding a batch of data with mask during training."
#     def __init__(self, src, trg=None, pad=0):
#         # convert words id to long format.  
#         src = torch.from_numpy(src).to(DEVICE).long()
#         trg = torch.from_numpy(trg).to(DEVICE).long()
#         self.src = src
#         # get the padding postion binary mask
#         # change the matrix shape to  1×seq.length
#         self.src_mask = (src != pad).unsqueeze(-2)
#         # 如果输出目标不为空，则需要对decoder要使用到的target句子进行mask
#         if trg is not None:
#             # decoder input from target 
#             self.trg = trg[:, :-1]
#             # decoder target from trg 
#             self.trg_y = trg[:, 1:]
#             # add attention mask to decoder input  
#             self.trg_mask = self.make_std_mask(self.trg, pad)
#             # check decoder output padding number
#             self.ntokens = (self.trg_y != pad).data.sum()
    
#     # Mask 
#     @staticmethod
#     def make_std_mask(tgt, pad):
#         "Create a mask to hide padding and future words."
#         tgt_mask = (tgt != pad).unsqueeze(-2)
#         tgt_mask = tgt_mask & Variable(subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data))
#         return tgt_mask # subsequent_mask is defined in 'decoder' section.

Understanding Initialization:

When you create an nn.Embedding layer, it initializes a lookup table with random embedding vectors for each word in the vocabulary.
These initial vectors have a specific dimensionality (d_model) but their values are randomly chosen within a certain range.
Normalization and Gradient Vanishing:

Without the math.sqrt(d_model) factor, the initial values of the embedding vectors can have a large magnitude (very high or very low values).
This can lead to two potential issues:
Normalization: If the initial values have a large magnitude, the gradients during training might become very small when backpropagated through the network. This is known as the vanishing gradient problem, which can hinder the learning process.
Activation Functions: If the network uses activation functions with bounded outputs (like sigmoid or tanh), large initial values can cause these activations to saturate, effectively making them insensitive to further changes.
The Role of math.sqrt(d_model):

Multiplying the embedding vectors by math.sqrt(d_model) essentially scales their initial values. This scaling helps address the issues mentioned above:
Normalization: By dividing the variance of the initial values by d_model, the gradients tend to have a more manageable magnitude during backpropagation, improving learning efficiency.
Activation Functions: Scaling the initial values ensures they are within a range where activation functions can operate effectively, allowing for more nuanced gradients during training.
Alternative Initializations:

While math.sqrt(d_model) is a common scaling factor, it's not the only approach. Some researchers use other techniques like uniform initialization within a specific range or initialization based on pre-trained word embeddings from sources like Word2Vec or GloVe.

In [7]:
# input and output embeddngs
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        # lut -> lookup table
        self.lut = nn.Embeddings(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        # return x's embedding vector (times math.sqrt(d_model))
        return self.lut(x) * math.sqrt(self.d_model)

# Positional encoding
[max_sequence_len, embedding_dim] 
$$PE_{(pos, 2i)} = sin(\frac{pos}{10000^{2i/d_{model}}})$$
$$PE_{(pos, 2i+1)} = cos(\frac{pos}{10000^{2i/d_{model}}})$$

**1. Standard Formula and Scaling Factor:**

The standard formula for positional encoding in Transformers defines a scaling factor that influences the influence of position on the final embedding:

```
PE(pos, 2i) = sin(pos * 10000.0^(2i / d_model))
PE(pos, 2i + 1) = cos(pos * 10000.0^(2i / d_model))
```

Here, the key term is `10000.0^(2i / d_model)`. Raising 10000.0 to a power that decreases with position (`i`) (due to the division by `d_model`) creates a scaling effect:

- For positions closer to zero (smaller `i`), the value is closer to 10000.0, giving the position a stronger influence.
- As the position (`i`) increases, the power term gets smaller, reducing the influence of position on the sine or cosine function.

**2. Code and Its Scaling Effect:**

The code snippet calculates a component for the positional encoding, focusing on even positions (`i`):

```
torch.exp(torch.arange(0., d_model, 2, device=DEVICE) * -(math.log(10000.0) / d_model))
```

This code achieves a similar scaling effect as the standard formula, but using logarithms:

- `torch.exp(...)`: Applies the exponential function (e raised to the power of...).
- `torch.arange(0., d_model, 2, device=DEVICE)`: Creates a sequence of increasing values for even positions.
- `- (math.log(10000.0) / d_model)`: A constant term calculated as the negative logarithm of 10000.0 divided by `d_model`. The negative sign ensures the base of the exponential term is less than 1, creating a decaying sequence.

**3. The Connection:**

Here's why the code achieves a similar effect:

- **Decaying Sequence:** The exponential term (`torch.exp(...)`) with the negative scaling factor creates a sequence of values that decay as the position (`i`) increases in the `torch.arange` part. This mimics the decreasing power term in the standard formula.
- **Scaling Factor and Logarithm:** The constant term `- (math.log(10000.0) / d_model)` plays a crucial role. Let's analyze it:
  - `math.log(10000.0)`: This calculates the natural logarithm (base e) of 10000.0. A larger value like 10000.0 in the logarithm typically results in a value close to 4 (logarithm of 10000 to base e is approximately 4.6).
  - `- (...)`: The negative sign flips the result, ensuring the base of the exponential term in `torch.exp` is less than 1.
  - `/ d_model`: This division scales the effect based on the model's dimension (`d_model`).

**Essentially, raising `e` (base of the natural logarithm) to the power of `- (math.log(10000.0) / d_model)` results in a value close to 1/(10000.0^1), scaled by `d_model`. This acts similarly to the inverse of the scaling factor in the standard formula.**

**In simpler terms:**

- The standard formula uses `10000.0` raised to a power that decreases with position (`i`) to create a scaling effect.
- The code achieves a similar effect by taking the inverse logarithm of 10000.0, dividing by `d_model`, and using that as the base of a decaying exponential term. The final result is mathematically equivalent (or very close) to the inverse of the scaling factor in the standard formula.

**Deriving the Formula:**

While the code doesn't directly calculate the inverse of the standard formula's scaling factor, it achieves a functionally equivalent outcome. Here's a breakdown of the steps involved:

1. Standard Formula Scaling Factor:
   - `10000.0^(2i / d_model)` (this term controls the influence of position in the standard formula).
2. Code's Scaling Effect:
   - The negative logarithm term (`-math.log(10000.0)`) in the code roughly scales the values down to

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model, device=DEVICE)
        position=torch.arange(0., max_len, device=DEVICE).unsqueeze(1)
        div_term = torch.exp(torch.arange(0., d_model, 2, 
                                          device=DEVICE) * -(math.log(10000.0) / d_model))
        pe_pos = torch.mul(position, div_term)
        pe[:, 0::2] = torch.sin(pe_pos)  # even embedding dimension
        pe[:, 1::2] = torch.cos(pe_pos)  # odd  embedding dimension
        pe = pe.unsqueeze(0)
        
        self.register_buffer("pe", pe)  # pe
        
    def forward(self, x):
        
        

In [10]:
test = torch.zeros(50, 50, device=DEVICE)

In [11]:
test.shape

torch.Size([50, 50])

In [13]:
test[:, 0::2].shape

torch.Size([50, 25])

In [14]:
test[:, 1::2]

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')