<a href="https://colab.research.google.com/github/mr-alamdari/NLP-Sequence-Models-Beginner/blob/main/NLP_Sequence_Models_Beginner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import numpy
import pickle

In [2]:
!pip install -q -U trax

import trax

import trax.fastmath.numpy as np

from trax import layers as tl

[K     |████████████████████████████████| 637 kB 5.5 MB/s 
[K     |████████████████████████████████| 4.9 MB 51.4 MB/s 
[K     |████████████████████████████████| 462 kB 43.9 MB/s 
[?25h

# **Practices**

In [3]:
# Scalar (0-d) array built from an integer literal, so its dtype is an integer type.
a = np.array(325)



In [4]:
display(a)

DeviceArray(325, dtype=int32, weak_type=True)

In [5]:
type(a)

jaxlib.xla_extension.DeviceArray

In [6]:
def f(x):
  """Evaluate the quadratic ``3 * x**2 - 32`` at ``x``."""
  squared = x ** 2
  return 3 * squared - 32

In [7]:
# Evaluate f at a. Despite its name, d_a holds the function value f(a),
# not a derivative.
d_a = f(a)
d_a

DeviceArray(316843, dtype=int32, weak_type=True)

In [8]:
# Wrap f with trax's grad transform; the result is a new callable.
# NOTE(review): `a` above is an integer array and gradient transforms
# generally expect floating-point inputs -- confirm before calling grad_f(a).
grad_f = trax.fastmath.grad(fun=f)

In [9]:
type(grad_f)

function

In [10]:
def tweet_to_tensor(tweet, vocab_dict, unk_token='__UNK__', verbose=False):
    """Convert a tweet into a list of vocabulary IDs.

    Args:
        tweet: raw tweet; it is split into words by ``process_tweet``.
        vocab_dict: mapping from word to ID; must contain ``unk_token``.
        unk_token: key whose ID replaces words missing from ``vocab_dict``.
        verbose: accepted for interface compatibility; not used here.

    Returns:
        A list with one ID per word returned by ``process_tweet``.

    Raises:
        KeyError: if ``unk_token`` is not a key of ``vocab_dict``.
    """
    # NOTE(review): process_tweet is not defined in the visible cells; it must
    # already exist in the kernel for this function to run.
    words = process_tweet(tweet)
    fallback_id = vocab_dict[unk_token]

    # dict.get substitutes the unknown-token ID for out-of-vocabulary words.
    return [vocab_dict.get(word, fallback_id) for word in words]

In [11]:
class Layer(object):
    """Minimal base class for layers.

    Subclasses implement ``forward`` and, when they own parameters, override
    ``init_weights_and_state`` to populate ``self.weights``.
    """

    # Class-level default so ``weights`` is readable even on subclasses whose
    # ``__init__`` does not call ``super().__init__()``.
    weights = None

    def __init__(self):
        self.weights = None

    def init_weights_and_state(self, input_signature, random_key):
        """Create the layer's weights.

        The default does nothing, which is the right behavior for layers
        without parameters; layers with parameters override this method.
        """
        return None

    def forward(self, x):
        """Compute the layer's output for ``x``; must be overridden."""
        raise NotImplementedError(
            type(self).__name__ + " must implement forward(x)")

    def init(self, input_signature, random_key):
        """Initialize the layer and return its weights (None if it has none)."""
        self.init_weights_and_state(input_signature, random_key)
        return self.weights

    def __call__(self, x):
        """Calling the layer is shorthand for ``forward(x)``."""
        return self.forward(x)

In [12]:
class Relu(Layer):
    """Rectified linear unit: element-wise maximum of the input and zero."""

    def forward(self, x):
        """Return ``x`` with every negative entry replaced by zero."""
        floor = 0
        return np.maximum(x, floor)

In [13]:
class Dense(Layer):
    """Fully connected layer without a bias term: ``forward(x) = x . weights``."""

    def __init__(self, n_units, init_stdev=0.1):
        """Store the layer configuration.

        Args:
            n_units: number of output units (second dimension of the weights).
            init_stdev: factor multiplied into the normal samples used as
                initial weights.
        """
        # Run the base initializer so ``self.weights`` exists (as None) before
        # ``init`` is called.
        super().__init__()
        self._n_units = n_units
        self._init_stdev = init_stdev

    def forward(self, x):
        """Return the dot product of ``x`` with the weight matrix.

        Raises:
            RuntimeError: if the weights have not been created yet.
        """
        if self.weights is None:
            raise RuntimeError(
                "Dense weights are not initialized; call init(...) first")
        return np.dot(x, self.weights)

    def init_weights_and_state(self, input_signature, random_key):
        """Sample the weight matrix and store it on the layer.

        Args:
            input_signature: object with a ``shape`` attribute; the last entry
                of that shape is used as the number of input features.
            random_key: key passed to ``trax.fastmath.random.normal``.

        Returns:
            The new weights, of shape ``(input_shape[-1], n_units)``.
        """
        input_shape = input_signature.shape
        weight_shape = (input_shape[-1], self._n_units)
        self.weights = self._init_stdev * trax.fastmath.random.normal(
            key=random_key, shape=weight_shape)
        return self.weights

In [14]:
tmp_embed = np.array([[1,2,3], [4,5,6]])

# axis=0 averages down the rows, giving one value per column.
display(np.mean(tmp_embed,axis=0))

# axis=1 averages across each row, giving one value per row.
display(np.mean(tmp_embed,axis=1))

DeviceArray([2.5, 3.5, 4.5], dtype=float32)

DeviceArray([2., 5.], dtype=float32)

In [15]:
def classifier(vocab, embedding_dim=256, output_dim=2, mode='train'):
    """Build the classification model as a ``tl.Serial`` stack.

    The stack is: Embedding -> Mean over axis 1 -> Dense -> LogSoftmax.

    Args:
        vocab: sized collection; its length is used as the embedding
            vocabulary size.
        embedding_dim: feature size of each embedding vector.
        output_dim: number of units in the final dense layer.
        mode: accepted for interface compatibility; not used here.

    Returns:
        The assembled ``tl.Serial`` model.
    """
    return tl.Serial(
        tl.Embedding(vocab_size=len(vocab), d_feature=embedding_dim),
        tl.Mean(axis=1),
        tl.Dense(n_units=output_dim),
        tl.LogSoftmax(),
    )

In [16]:
from trax.supervised import training

def train_model(classifier, train_task, eval_task, n_steps, output_dir):
    """Train ``classifier`` with a trax ``training.Loop`` and return the loop.

    Args:
        classifier: the model to train.
        train_task: the training task handed to the loop.
        eval_task: the evaluation task handed to the loop.
        n_steps: number of steps to run.
        output_dir: directory passed to the loop as ``output_dir``.

    Returns:
        The ``training.Loop`` instance after ``run`` has completed.
    """
    import inspect  # local import: only needed to probe the Loop signature

    # Some trax releases name the evaluation keyword ``eval_tasks`` instead of
    # ``eval_task``. Pick whichever the installed Loop accepts, defaulting to
    # the original spelling, so an unpinned ``pip install -U trax`` still works.
    loop_params = inspect.signature(training.Loop.__init__).parameters
    eval_keyword = 'eval_tasks' if 'eval_tasks' in loop_params else 'eval_task'

    training_loop = training.Loop(
        classifier,
        train_task,
        output_dir=output_dir,
        **{eval_keyword: eval_task}
    )
    training_loop.run(n_steps=n_steps)
    return training_loop

In [17]:
def compute_accuracy(preds, y, y_weights):
    """Compute the weighted accuracy of two-column predictions.

    An example is predicted as class 1 when column 1 of ``preds`` is strictly
    greater than column 0, otherwise as class 0.

    Args:
        preds: array indexed as ``preds[:, 0]`` and ``preds[:, 1]``.
        y: target labels, compared element-wise with the predicted classes.
        y_weights: per-example weights.

    Returns:
        Tuple ``(accuracy, weighted_num_correct, sum_weights)``, where
        ``accuracy`` is ``weighted_num_correct / sum_weights``.
    """
    predicted_class = (preds[:, 1] > preds[:, 0]).astype(np.int32)
    hits = (predicted_class == y).astype(np.float32)

    weighted_num_correct = np.sum(hits * y_weights)
    sum_weights = np.sum(y_weights)
    return weighted_num_correct / sum_weights, weighted_num_correct, sum_weights

In [18]:
def test_model(generator, model):
    """Compute the overall weighted accuracy of ``model`` over ``generator``.

    Args:
        generator: iterable of batches; each batch is indexed as
            ``batch[0]`` (inputs), ``batch[1]`` (targets) and ``batch[2]``
            (example weights). It must be finite, because it is consumed to
            exhaustion.
        model: callable mapping inputs to predictions.

    Returns:
        Total weighted number of correct predictions divided by the total
        weight. An empty generator leads to a division of 0 by 0.
    """
    correct_so_far = 0
    weight_so_far = 0
    for batch in generator:
        inputs, targets, example_weight = batch[0], batch[1], batch[2]
        # The per-batch accuracy is not needed; only the running totals are.
        _, n_correct, n_weight = compute_accuracy(
            model(inputs), targets, example_weight)
        correct_so_far += n_correct
        weight_so_far += n_weight
    return correct_so_far / weight_so_far

In [19]:
def predict(model, Vocab, sentence):
    """Classify the sentiment of a single sentence.

    Args:
        model: callable returning a two-column score array for a batch.
        Vocab: vocabulary mapping forwarded to ``tweet_to_tensor``.
        sentence: text to classify.

    Returns:
        Tuple ``(preds, sentiment)``: ``preds`` is 1 when the score in
        column 1 exceeds the score in column 0, else 0; ``sentiment`` is
        'positive' for 1 and 'negative' for 0.
    """
    token_ids = tweet_to_tensor(sentence, vocab_dict=Vocab)
    # Add a leading axis so the single example forms a batch of size one.
    batch = np.array(token_ids)[None, :]

    scores = model(batch)
    preds = int(scores[0, 1] > scores[0, 0])

    sentiment = 'positive' if preds == 1 else "negative"
    return preds, sentiment

# N-Grams

In [20]:
!wget https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Sequence%20Models/Week%202/shakespeare.png

--2022-04-21 05:15:48--  https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Sequence%20Models/Week%202/shakespeare.png
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1713575 (1.6M) [image/png]
Saving to: ‘shakespeare.png’


2022-04-21 05:15:48 (28.4 MB/s) - ‘shakespeare.png’ saved [1713575/1713575]



In [21]:
!wget https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Sequence%20Models/Week%202/data/1kinghenryiv.txt

--2022-04-21 05:15:49--  https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Sequence%20Models/Week%202/data/1kinghenryiv.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 145002 (142K) [text/plain]
Saving to: ‘1kinghenryiv.txt’


2022-04-21 05:15:49 (6.20 MB/s) - ‘1kinghenryiv.txt’ saved [145002/145002]



In [23]:
# Read the play line by line, dropping surrounding whitespace and skipping
# lines that are empty after stripping.
lines = []
with open('1kinghenryiv.txt', 'r') as files:
    for line in files:
        pure_line = line.strip()
        if not pure_line:
            continue
        lines.append(pure_line)

In [26]:
lines[0: 20]

['1 KING HENRY IV',
 'DRAMATIS PERSONAE',
 'KING HENRY\tthe Fourth. (KING HENRY IV:)',
 'HENRY,',
 'Prince of Wales\t(PRINCE HENRY:)\t|',
 '| sons of the King',
 'JOHN of Lancaster\t(LANCASTER:)\t|',
 'WESTMORELAND:',
 'SIR WALTER BLUNT:',
 'THOMAS PERCY\tEarl of Worcester. (EARL OF WORCESTER:)',
 'HENRY PERCY\tEarl of Northumberland. (NORTHUMBERLAND:)',
 'HENRY PERCY\tsurnamed HOTSPUR, his son. (HOTSPUR:)',
 'EDMUND MORTIMER\tEarl of March. (MORTIMER:)',
 'RICHARD SCROOP\tArchbishop of York. (ARCHBISHOP OF YORK:)',
 'ARCHIBALD\tEarl of Douglas. (DOUGLAS:)',
 'OWEN GLENDOWER:',
 'SIR RICHARD VERNON\t(VERNON:)',
 'SIR JOHN FALSTAFF\t(FALSTAFF:)',
 'SIR MICHAEL\ta friend to the Archbishop of York.',
 'POINS:']

In [46]:
ord('M'), ord('r'), ord(' '), ord('A'), ord('l'), ord('a'), ord('m'), ord('d'), ord('a'), ord('r'), ord('i')

(77, 114, 32, 65, 108, 97, 109, 100, 97, 114, 105)

In [47]:
def line2Tensor(line, EoSentence='1'):
    """Encode a line as a list of character codes plus an end marker.

    Args:
        line: text to encode; each character is mapped with ``ord``.
        EoSentence: single character whose code is appended as the
            end-of-sentence marker. The default '1' encodes to 49, the same
            code as a literal '1' in the text.

    Returns:
        List of ints of length ``len(line) + 1``.
    """
    codes = [ord(ch) for ch in line]
    codes.append(ord(EoSentence))
    return codes

In [48]:
temp_line = 'Mr Alamdari'
line2Tensor(temp_line)

[77, 114, 32, 65, 108, 97, 109, 100, 97, 114, 105, 49]

In [90]:
def data_generator(lines, batch_size=2, max_length=10, shuffle=True):
  """Yield an endless stream of (batch, mask) pairs of encoded lines.

  Only lines with fewer than ``max_length`` characters are used, so that the
  encoded line plus its end-of-sentence code fits in ``max_length`` slots.
  Each selected line is encoded with ``line2Tensor`` and right-padded with
  zeros to ``max_length``. The mask holds 1 where the padded tensor is
  positive and 0 elsewhere.

  Args:
    lines: sequence of strings.
    batch_size: number of lines per yielded batch.
    max_length: width of every padded tensor.
    shuffle: if True, visit the lines in a random order that is reshuffled
      at the start of every pass over the data.

  Yields:
    Tuple ``(batch, mask)`` of arrays built with ``np.array``, each made of
    ``batch_size`` rows of ``max_length`` entries.

  Raises:
    ValueError: on the first ``next()`` call, if no line is shorter than
      ``max_length`` (this includes an empty ``lines``).
  """
  n = len(lines)
  # Fail fast: without at least one usable line the loop below would never
  # fill a batch and would spin forever.
  if not any(len(line) < max_length for line in lines):
    raise ValueError(
        "data_generator needs at least one line shorter than max_length")

  lines_index = [*range(n)]
  if shuffle:
    # Plain NumPy is used here: `np` is trax.fastmath.numpy in this notebook,
    # which is not the module that provides random.shuffle.
    numpy.random.shuffle(lines_index)

  current_batch = []
  index = 0
  while True:
    if index >= n:
      # End of a pass over the data: start over, optionally in a new order.
      index = 0
      if shuffle:
        numpy.random.shuffle(lines_index)

    # Go through lines_index so that the shuffled order is actually applied.
    line = lines[lines_index[index]]
    index += 1

    if len(line) < max_length:
      current_batch.append(line)

    if len(current_batch) == batch_size:
      batch = []
      mask = []
      for text in current_batch:
        tensor = line2Tensor(text)
        padded_tensor = tensor + [0] * (max_length - len(tensor))
        batch.append(padded_tensor)
        mask.append([1 if code > 0 else 0 for code in padded_tensor])

      yield np.array(batch), np.array(mask)

      current_batch = []

In [92]:
# Toy input for the generator: the first string has 11 characters and is
# skipped, because data_generator only keeps lines shorter than max_length.
# NOTE(review): this rebinds `lines`, discarding the text read from
# '1kinghenryiv.txt' earlier -- confirm later cells do not need it.
lines = ['12345678901','123456789','234567890','345678901']
data_gen = data_generator(lines, batch_size=2, max_length=10, shuffle=False)

In [93]:
next(data_gen)

(DeviceArray([[49, 50, 51, 52, 53, 54, 55, 56, 57, 49],
              [50, 51, 52, 53, 54, 55, 56, 57, 48, 49]], dtype=int32),
 DeviceArray([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int32))