<a href="https://colab.research.google.com/github/n-bzy/iannwtf/blob/main/homework10_Nico.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tensorflow_text

In [None]:
# Mount Google Drive inside the Colab runtime and make "MyDrive" the working directory
import os
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point)
os.chdir(os.path.join(mount_point, "MyDrive"))

Mounted at /content/drive


In [None]:
import re
from collections import Counter

import numpy as np
import tensorflow as tf
#import tensorflow_text as tf_text

# Location of the plain-text corpus on the mounted Google Drive
file_path = "/content/drive/MyDrive/bible.txt"

# Decode explicitly as UTF-8 so the result does not depend on the runtime's locale
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()

# Show the first 100 characters as a quick check that the file was read
print(text[:100])

The First Book of Moses:  Called Genesis


1:1 In the beginning God created the heaven and the earth


## Preprocess

In [None]:
# Option 1: Normalize the text, split it into words and build a vocabulary of at
# most `size` entries with a TextVectorization layer; words outside the
# vocabulary are represented by [UNK]

def preprocess(text, size):
    """Normalise raw text and derive a vocabulary from it.

    The text is lower-cased, every run of characters outside ``a-z`` is
    replaced by a single space and the result is split on whitespace. A
    ``TextVectorization`` layer limited to ``size`` tokens is then adapted on
    the resulting words.

    Args:
        text: Raw corpus as a Python string.
        size: Passed to the vectorization layer as ``max_tokens``.

    Returns:
        A ``(vocabulary, words)`` tuple: the vocabulary reported by the layer
        and the string tensor holding the split words.
    """
    cleaned = re.sub(r"[^a-z]+", " ", text.lower())
    words = tf.strings.split(cleaned)

    # The words are already normalised and split, so the layer does neither.
    vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=size,
        standardize=None,
        split=None,
        output_mode='int',
    )
    vectorizer.adapt(words)
    vocabulary = vectorizer.get_vocabulary()

    return vocabulary, words
# Build the Option 1 vocabulary and print its first 20 entries together with the word tensor.
# Note: `voc` and `tex` are assigned again by the Option 2 cell further down.
voc, tex = preprocess(text, size=10000)
print(voc[:20], tex)

['', '[UNK]', 'the', 'and', 'of', 'to', 'that', 'in', 'he', 'shall', 'unto', 'for', 'i', 'his', 'a', 'lord', 'they', 'be', 'is', 'him'] tf.Tensor([b'the' b'first' b'book' ... b'you' b'all' b'amen'], shape=(791829,), dtype=string)


In [None]:
# Option 2: Normalize text, split words and sort from most common down
def preprocess_voc(text, size):
    """Tokenise ``text`` and build a vocabulary of its most frequent words.

    The text is lower-cased, every run of characters outside ``a-z`` is
    replaced by a single space and the result is split on whitespace.

    Args:
        text: Raw corpus as a Python string.
        size: Maximum number of words kept in the vocabulary.

    Returns:
        A ``(words, voc)`` tuple. ``words`` is the string tensor of all words
        in corpus order. ``voc`` is a 1-D string tensor with at most ``size``
        distinct words, ordered by descending frequency; words with equal
        frequency are ordered by ascending byte value. An empty corpus yields
        an empty vocabulary.
    """
    text = text.lower()
    text = re.sub(r"[^a-z]+", " ", text)
    text = tf.strings.split(text)

    # Count every word in a single pass; this also handles an empty corpus.
    counts = Counter(text.numpy().tolist())

    # Most frequent first; the word itself is the tie-breaker so the order is deterministic.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    most_common_words = [word for word, _ in ranked[:size]]

    # An explicit dtype keeps the result a string tensor even when the list is empty.
    voc = tf.convert_to_tensor(most_common_words, dtype=tf.string)
    return text, voc

# Run Option 2; its results replace the `voc`/`tex` values produced by Option 1.
tex, voc = preprocess_voc(text,size=10000)
print(voc, tex[:10])

tf.Tensor([b'the' b'and' b'of' ... b'forbiddeth' b'forborn' b'forcible'], shape=(10000,), dtype=string) tf.Tensor(
[b'the' b'first' b'book' b'of' b'moses' b'called' b'genesis' b'in' b'the'
 b'beginning'], shape=(10,), dtype=string)


In [None]:
def words_to_number(text, voc, unk_index=None):
    """Replace every word by its position in the vocabulary.

    Args:
        text: Object whose ``.numpy().tolist()`` yields the words of the corpus.
        voc: Object whose ``.numpy().tolist()`` yields the vocabulary words.
        unk_index: Id assigned to words that are not in the vocabulary.
            Defaults to ``len(voc)``, i.e. the first id after the last
            vocabulary entry (10000 for a 10000-word vocabulary).

    Returns:
        A Python list with one integer id per word of ``text``.
    """
    tokens = text.numpy().tolist()
    vocabulary = voc.numpy().tolist()

    if unk_index is None:
        unk_index = len(vocabulary)

    # Dictionary lookup instead of scanning the vocabulary list for every token.
    # setdefault keeps the first position if a word occurs more than once.
    index_of = {}
    for position, word in enumerate(vocabulary):
        index_of.setdefault(word, position)

    return [index_of.get(token, unk_index) for token in tokens]

# Encode the corpus as integer ids, then print the number of tokens and the first 20 ids.
tex_n = words_to_number(tex,voc)
print(len(tex_n), tex_n[:20])

791829 [0, 216, 401, 2, 132, 160, 10000, 5, 0, 680, 26, 1297, 0, 170, 1, 0, 111, 1, 0, 111]


## Input-Target-Pairs

In [None]:
import math

def input_target_pairs(text, batch_size=64, shuffle_buffer=10000):
    """Build a batched dataset of skip-gram (input, target) pairs.

    Every token is paired with its neighbours at distance one and two, in both
    directions (token -> following neighbour and following neighbour -> token).

    Args:
        text: Sequence of token ids accepted by
            ``tf.data.Dataset.from_tensor_slices``.
        batch_size: Number of pairs per batch.
        shuffle_buffer: Buffer size passed to ``Dataset.shuffle``.

    Returns:
        A shuffled, batched and prefetched ``tf.data.Dataset`` yielding
        ``(input, target)`` tuples.
    """
    dataset = tf.data.Dataset.from_tensor_slices(text)

    # skip(k) starts k tokens later, so zipping it with `dataset` pairs token i
    # with token i + k. Unlike a shared Python iterator captured inside `map`,
    # this is stateless: each of the four pair datasets gets its own stream and
    # the result can be iterated again in every epoch.
    shift1 = dataset.skip(1)
    shift2 = dataset.skip(2)

    # zip stops at the shorter dataset, so no pair reaches past the end of the text.
    shift1up = tf.data.Dataset.zip((dataset, shift1))
    shift2up = tf.data.Dataset.zip((dataset, shift2))
    shift1down = tf.data.Dataset.zip((shift1, dataset))
    shift2down = tf.data.Dataset.zip((shift2, dataset))

    dataset = shift2down.concatenate(shift1up).concatenate(shift2up).concatenate(shift1down)
    dataset = dataset.shuffle(shuffle_buffer).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset


# The first 75 % of the token ids form the training data, the remainder the test data
split_at = math.ceil(len(tex_n) * 0.75)
train_ds = input_target_pairs(tex_n[:split_at])
test_ds = input_target_pairs(tex_n[split_at:])

# Print the shapes of one (input, target) batch as a sanity check
for x, t in train_ds.take(1):
    tf.print(x.shape, t.shape)

TensorShape([64]) TensorShape([64])


## Class

In [None]:
class SkipGram(tf.keras.layers.Layer):
    """Skip-gram embedding model trained with noise-contrastive estimation (NCE).

    Calling the layer looks up the embedding vectors of the given token ids.
    ``train`` and ``test`` compute ``tf.nn.nce_loss`` between those embeddings
    and the target ids; ``train`` additionally applies one optimizer step.

    Args:
        voc_size: Number of distinct token ids (rows of the embedding table and
            number of classes of the NCE loss).
        emb_size: Length of each embedding vector.
        num_sampled: Number of negative classes sampled per batch by the NCE loss.
    """

    def __init__(self, voc_size, emb_size, num_sampled=2):
        super().__init__()

        self.voc_size = voc_size
        self.emb_size = emb_size
        self.num_sampled = num_sampled

        # `train` applies gradients through `self.optimizer`, so the optimizer
        # has to be stored under exactly that name.
        self.optimizer = tf.keras.optimizers.Adam()

        self.metrics_list = [tf.keras.metrics.Mean(name="loss")]

    def build(self, input_shape=None):
        """Create the trainable weights.

        ``input_shape`` is accepted for compatibility with the Keras ``build``
        signature and is not used; ``build()`` may be called without arguments.
        """
        # Output weights of the NCE loss: one row per class.
        self.score = self.add_weight(shape=(self.voc_size, self.emb_size),
                                     initializer='random_normal',
                                     trainable=True)
        # tf.nn.nce_loss expects one bias per class, i.e. shape [num_classes].
        self.score_bias = self.add_weight(shape=(self.voc_size,),
                                          initializer='random_normal',
                                          trainable=True)
        # Embedding table: one row per token id.
        self.emb = self.add_weight(shape=(self.voc_size, self.emb_size),
                                   initializer='random_normal',
                                   trainable=True)

    def __call__(self, x, training=False):
        """Return the embedding vectors for the token ids in ``x``.

        ``training`` is accepted for interface compatibility and is not used.
        """
        emb = tf.nn.embedding_lookup(self.emb, x)
        return emb

    @property
    def metrics(self):
        """List of metric objects tracked by this model (currently the mean loss)."""
        return self.metrics_list

    def reset_metrics(self):
        """Reset the state of every tracked metric."""
        for metric in self.metrics:
            metric.reset_state()

    def _nce_loss(self, embedded, target):
        """Mean NCE loss between input embeddings and target token ids."""
        # nce_loss requires int64 labels of shape [batch_size, num_true]; the
        # dataset delivers one target id per example, so num_true is 1.
        labels = tf.reshape(tf.cast(target, tf.int64), (-1, 1))
        loss = tf.nn.nce_loss(weights=self.score, biases=self.score_bias,
                              labels=labels, inputs=embedded,
                              num_sampled=self.num_sampled,
                              num_classes=self.voc_size)
        return tf.reduce_mean(loss)

    def train(self, data):
        """Run one optimisation step on a batch.

        Args:
            data: ``(input, target)`` tuple of token-id batches.

        Returns:
            Dict mapping each metric name to its current result.
        """
        inputs, target = data

        with tf.GradientTape() as tape:
            y = self(inputs, training=True)
            loss = self._nce_loss(y, target)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        self.metrics[0].update_state(loss)

        return {m.name : m.result() for m in self.metrics}

    def test(self, data):
        """Compute the loss on a batch without updating any weights.

        Args:
            data: ``(input, target)`` tuple of token-id batches.

        Returns:
            Dict mapping each metric name to its current result.
        """
        inputs, target = data

        y = self(inputs, training=False)
        loss = self._nce_loss(y, target)

        self.metrics[0].update_state(loss)

        return {m.name : m.result() for m in self.metrics}

In [None]:
# 10001 classes = the 10000 vocabulary words plus index 10000 used for unknown words; 64 is the embedding size.
model_skip = SkipGram(10001, 64)
# Create the model's weights explicitly before training starts.
model_skip.build()

In [None]:
import tqdm

def training_loop(model, train, val, epochs):
    """Train and test the SkipGram for given epochs on given data.

    Args:
        model: Object providing ``train(batch)``, ``test(batch)`` (both
            returning a dict of metric results) and ``reset_metrics()``.
        train: Iterable of training batches.
        val: Iterable of validation batches.
        epochs: Number of passes over both datasets.

    Returns:
        A list with two metric dicts per epoch: the training metrics followed
        by the validation metrics. A pass over an empty dataset contributes an
        empty dict.
    """
    # Save loss in a list for visualization
    lists = []

    for n in range(epochs):
        print(f"Epoch {n}:")

        # Training pass first, validation pass second.
        for step, dataset in ((model.train, train), (model.test, val)):
            # Start from an empty dict so that an empty dataset neither raises
            # nor re-logs the metrics of the previous pass.
            metrics = {}
            for data in tqdm.tqdm(dataset, position=0, leave=True):
                metrics = step(data)

            # Add metrics to list
            lists.append(metrics)
            print([f"{key}: {value.numpy()}" for (key,value) in metrics.items()])
            model.reset_metrics()

    return lists

# Train for 10 epochs; `li` receives two metric dicts per epoch (training first, then validation).
li = training_loop(model_skip, train_ds, test_ds, epochs=10)

Epoch 0:


  0%|          | 0/2375488 [00:00<?, ?it/s]

() ()





InvalidArgumentError: ignored