<a href="https://colab.research.google.com/github/nicolai5965/Transformer_scratch_tensorflow/blob/main/Branch_3_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import string
import re
import time

from nltk.corpus import words
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('words')
nltk.download('punkt')

import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import train_test_split

from google.colab import drive
# Mount Google Drive to load the dataset
drive.mount('/content/drive')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Google Drive directory that holds the dataset CSV (relies on the Drive mount performed in the import cell)
filepath = '/content/drive/My Drive/Colab Notebooks/Machine Learning/TensorFlow/GRU/'

In [None]:
# Load the news dataset CSV from the directory stored in `filepath`
df = pd.read_csv(f'{filepath}fake_or_real_news.csv')

In [None]:
# Render the loaded DataFrame as a rich table in the notebook output
display(df)

Unnamed: 0,id,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [None]:
class TextPreprocessor:
    """Sample a fraction of a DataFrame and clean one of its text columns.

    Args:
        remove_list: Iterable of substrings; each occurrence is replaced by a
            single space before any other cleaning step runs.
        fraction: Share of the shuffled rows to keep (``int(len * fraction)``).
        column_name: Name of the column holding the raw text.
    """

    # "http..." or a literal "www." followed by non-whitespace. The dot is
    # escaped so ordinary words that merely contain "www" (e.g. "awwwesome")
    # are no longer swallowed.
    _URL_PATTERN = re.compile(r'http\S+|www\.\S+', flags=re.MULTILINE)
    # Parenthesised handles such as "(@KaydeeKing)".
    _HANDLE_PATTERN = re.compile(r'\(@\w+\)')

    def __init__(self, remove_list, fraction, column_name):
        self.remove_list = remove_list
        self.fraction = fraction
        self.column_name = column_name

    def preprocess(self, df):
        """Shuffle ``df``, keep the leading ``fraction`` of rows and clean the text column.

        The input frame is not modified; cleaning is applied to a copy.

        Returns:
            The cleaned values of ``column_name`` (``.values`` of the column).
        """
        df_shuffled = df.sample(frac=1).reset_index(drop=True)
        num_samples = int(len(df_shuffled) * self.fraction)
        df_subset = df_shuffled[:num_samples].copy()

        df_subset[self.column_name] = df_subset[self.column_name].apply(self._remove_strings)
        return df_subset[self.column_name].values

    def _remove_strings(self, text):
        """Return ``text`` lower-cased with unwanted substrings, URLs and handles removed.

        Non-string cells (for example NaN from a missing CSV field) yield an
        empty string instead of raising ``AttributeError``.
        """
        if not isinstance(text, str):
            return ''
        for unwanted in self.remove_list:
            text = text.replace(unwanted, ' ')
        text = self._URL_PATTERN.sub('', text)
        text = self._HANDLE_PATTERN.sub('', text)
        return text.lower()


class TextTokenizer:
    """Thin wrapper around the Keras ``Tokenizer`` that also registers
    ``<SOS>`` / ``<EOS>`` tokens and offers a regex-based word/punctuation split.
    """

    # A run of word characters, or one punctuation character. The punctuation
    # set is passed through re.escape so that "\", "]", "^" and "-" are taken
    # literally inside the character class; unescaped, the "\]" pair was read
    # as an escaped bracket and backslashes were silently dropped.
    _TOKEN_PATTERN = re.compile(r'\b\w+\b|[' + re.escape(string.punctuation) + ']')

    def __init__(self):
        self.tokenizer = Tokenizer()

    def fit(self, texts):
        """Fit the vocabulary on ``texts`` and append the two special tokens.

        The special tokens receive the next two free indices and are recorded
        in both ``word_index`` and ``index_word`` so ids can be mapped back to
        tokens.
        """
        self.tokenizer.fit_on_texts(texts)
        for token in ('<SOS>', '<EOS>'):
            index = len(self.tokenizer.word_index) + 1
            self.tokenizer.word_index[token] = index
            self.tokenizer.index_word[index] = token

    def tokenize(self, texts):
        """Split every text in ``texts`` into a list of word/punctuation tokens."""
        return [self._custom_tokenize_text(t) for t in texts]

    def texts_to_sequences(self, texts):
        """Delegate to the wrapped tokenizer's ``texts_to_sequences``."""
        return self.tokenizer.texts_to_sequences(texts)

    def _custom_tokenize_text(self, text):
        """Return the words and individual punctuation marks found in ``text``, in order."""
        return self._TOKEN_PATTERN.findall(text)

    @property
    def word_index(self):
        """Token -> id mapping of the wrapped tokenizer."""
        return self.tokenizer.word_index

    @property
    def index_word(self):
        """Id -> token mapping of the wrapped tokenizer."""
        return self.tokenizer.index_word



class PadToFixed:
    """Pads encoder inputs, decoder inputs and decoder labels to one fixed length."""

    def __init__(self, max_sequence_len):
        # Target length handed to ``pad_sequences`` as ``maxlen``.
        self.max_sequence_len = max_sequence_len

    def fit_transform(self, encoder_sequences, decoder_sequences, decoder_labels):
        """Public entry point; forwards to ``_pad_to_fixed``."""
        return self._pad_to_fixed(encoder_sequences, decoder_sequences, decoder_labels)

    def _pad_to_fixed(self, encoder_sequences, decoder_sequences, decoder_labels):
        """Post-pad each of the three sequence collections to ``max_sequence_len``.

        Returns:
            Tuple ``(encoder_sequences, decoder_sequences, decoder_labels)`` as
            returned by ``pad_sequences``.
        """
        def _pad(sequences):
            return tf.keras.preprocessing.sequence.pad_sequences(
                sequences, padding='post', maxlen=self.max_sequence_len)

        padded_encoder = _pad(encoder_sequences)
        padded_decoder = _pad(decoder_sequences)
        padded_labels = _pad(decoder_labels)
        return padded_encoder, padded_decoder, padded_labels


def add_SOS_EOS_tokens(sequences, tokenizer):
    """Build encoder inputs, decoder inputs and decoder labels from id sequences.

    Args:
        sequences: Iterable of token-id sequences without special tokens.
        tokenizer: Object exposing a ``word_index`` mapping that contains
            ``'<SOS>'`` and ``'<EOS>'``.

    Returns:
        Tuple ``(encoder_sequences, decoder_sequences, decoder_labels)`` where,
        for each input sequence ``seq``:
          * encoder input  = ``[SOS] + seq + [EOS]``
          * decoder input  = ``[SOS] + seq``
          * decoder labels = ``seq + [EOS]``

        Decoder input and labels are the same tokens shifted by one position.
        The previous ``seq[1:]`` slicing discarded the first real token of
        every sentence, so it never appeared as decoder input or label.
    """
    sos = tokenizer.word_index['<SOS>']
    eos = tokenizer.word_index['<EOS>']

    encoder_sequences = [[sos] + list(seq) + [eos] for seq in sequences]
    decoder_sequences = [[sos] + list(seq) for seq in sequences]
    decoder_labels = [list(seq) + [eos] for seq in sequences]
    return encoder_sequences, decoder_sequences, decoder_labels



In [None]:
def generate_data_information(train_encoder_inputs, train_decoder_inputs, train_decoder_labels,
                              val_encoder_inputs, val_decoder_inputs, val_decoder_labels):
    """Print and display two summary tables for the six padded arrays.

    The first table has one column per array holding its ``.shape``; the second
    has one row per array with ``len(array)`` and ``array.size``.
    """
    # Dataset label -> array, in the order the tables list them.
    arrays = {
        'Train Encoder Input': train_encoder_inputs,
        'Train Decoder Input': train_decoder_inputs,
        'Train Decoder Label': train_decoder_labels,
        'Val Encoder Input': val_encoder_inputs,
        'Val Decoder Input': val_decoder_inputs,
        'Val Decoder Label': val_decoder_labels,
    }

    # One-row frame: a "<label> Shape" column per array.
    df_sequence_shapes = pd.DataFrame(
        {f'{label} Shape': [array.shape] for label, array in arrays.items()}
    )

    # One row per array with its length and element count.
    df_dataset = pd.DataFrame({
        'Dataset': list(arrays.keys()),
        'Total Tokens': [len(array) for array in arrays.values()],
        'Total Parameters': [array.size for array in arrays.values()],
    })

    print("Sequence Shapes:")
    display(df_sequence_shapes)

    print("Dataset Information:")
    display(df_dataset)


In [None]:
class DataPreprocessor:
    """Turn the raw DataFrame described by ``config`` into batched tf.data datasets.

    ``config`` keys read by this class: ``'remove_list'``, ``'data_proportion'``,
    ``'column_name'``, ``'dataframe'``, ``'val_split_size'``,
    ``'max_sequence_len'``, ``'BUFFER_SIZE'`` and ``'BATCH_SIZE'``.

    Args:
        config: Mapping with the keys listed above.
        verbose_1: When true, ``preprocess`` displays the shape/size summary.
        verbose_2: Stored on the instance; not read by this class.
    """

    def __init__(self, config, verbose_1=True, verbose_2=True):
        self.config = config
        self.verbose_1 = verbose_1
        self.verbose_2 = verbose_2

    @staticmethod
    def _encode(texts, tokenizer, padder):
        """Tokenize ``texts``, add SOS/EOS and pad; returns the three padded arrays."""
        token_lists = tokenizer.tokenize(texts)
        id_sequences = tokenizer.texts_to_sequences(token_lists)
        return tuple(padder.fit_transform(*add_SOS_EOS_tokens(id_sequences, tokenizer)))

    def preprocess(self):
        """Run cleaning, sentence splitting, train/val split, encoding and batching.

        Returns:
            ``(train_dataset, val_dataset, vocab_size, tokenizer)`` where
            ``vocab_size`` is ``len(tokenizer.word_index) + 1`` and each dataset
            yields ``(encoder_input, decoder_input, decoder_target)`` batches.
        """
        cleaner = TextPreprocessor(
            remove_list=self.config['remove_list'],
            fraction=self.config['data_proportion'],
            column_name=self.config['column_name']
        )

        articles = cleaner.preprocess(self.config['dataframe'])
        # Flatten every article into its individual sentences.
        sentences = [sent for article in articles for sent in sent_tokenize(article)]

        train_text, val_text = train_test_split(
            sentences, test_size=self.config['val_split_size'], random_state=42)

        # The vocabulary is fitted on the training sentences only.
        tokenizer = TextTokenizer()
        tokenizer.fit(train_text)

        padder = PadToFixed(max_sequence_len=self.config['max_sequence_len'])
        train_arrays = self._encode(train_text, tokenizer, padder)
        val_arrays = self._encode(val_text, tokenizer, padder)

        if self.verbose_1:
            generate_data_information(*train_arrays, *val_arrays)

        batch_size = self.config['BATCH_SIZE']
        train_dataset = (
            tf.data.Dataset.from_tensor_slices(train_arrays)
            .shuffle(self.config['BUFFER_SIZE'])
            .batch(batch_size, drop_remainder=False)
        )
        # Validation data is batched but not shuffled.
        val_dataset = tf.data.Dataset.from_tensor_slices(val_arrays).batch(
            batch_size, drop_remainder=False)

        return train_dataset, val_dataset, len(tokenizer.word_index) + 1, tokenizer

    def get_max_sequence_lengths(self):
        """Return ``config['max_sequence_len']`` twice, as a 2-tuple."""
        return self.config['max_sequence_len'], self.config['max_sequence_len']




In [None]:
class MyEmbeddingLayer(tf.keras.layers.Layer):
    """Trainable embedding table looked up with ``tf.nn.embedding_lookup``.

    Args:
        vocab_size: Number of rows in the embedding matrix.
        d_model: Number of columns in the embedding matrix.
        verbose: When true, ``call`` prints the shape of the looked-up tensor.
        name: Layer name.
    """

    def __init__(self, vocab_size, d_model, verbose=False, name="my_embedding_layer", **kwargs):
        super().__init__(name=name, **kwargs)
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.verbose = verbose
        # Weight of shape [vocab_size, d_model], initialised from a random normal.
        self.embedding_matrix = self.add_weight(
            shape=[vocab_size, d_model],
            initializer='random_normal',
            trainable=True,
        )

    def call(self, inputs):
        """Return the embedding rows selected by the ids in ``inputs``."""
        looked_up = tf.nn.embedding_lookup(self.embedding_matrix, inputs)
        if self.verbose:
            print(f"Embeddings shape: {looked_up.shape}")
        return looked_up

    def get_config(self):
        """Base layer config extended with this layer's constructor arguments."""
        return {
            **super().get_config(),
            'vocab_size': self.vocab_size,
            'd_model': self.d_model,
            'verbose': self.verbose,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer by passing ``config`` to the constructor."""
        return cls(**config)



In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Adds a fixed sinusoidal position table to its input.

    The table of shape ``(1, position, d_model)`` is computed once in the
    constructor and sliced to the input's second dimension on every call.
    """

    def __init__(self, position, d_model, name="positional_encoding", **kwargs):
        super().__init__(name=name, **kwargs)
        self.position = position
        self.d_model = d_model

        self.pos_encoding = self.calculate_positional_encoding()

    def calculate_positional_encoding(self):
        """Build the float32 sin/cos table with a leading axis of size 1."""
        positions = np.arange(self.position)[:, np.newaxis]
        dimensions = np.arange(self.d_model)[np.newaxis, :]
        table = self.get_angles(positions, dimensions, self.d_model)

        # Even columns (2i) carry the sine, odd columns (2i+1) the cosine.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

    @staticmethod
    def get_angles(pos, i, d_model):
        """Return ``pos / 10000 ** (2 * (i // 2) / d_model)``."""
        exponent = (2 * (i // 2)) / np.float32(d_model)
        angle_rates = 1 / np.power(10000, exponent)
        return pos * angle_rates

    def call(self, inputs):
        """Add the first ``tf.shape(inputs)[1]`` rows of the table to ``inputs``."""
        sequence_length = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :sequence_length, :]

    def get_config(self):
        """Base layer config extended with ``position`` and ``d_model``."""
        return {
            **super().get_config(),
            'position': self.position,
            'd_model': self.d_model,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer by passing ``config`` to the constructor."""
        return cls(**config)


In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute ``softmax(q @ k^T / sqrt(depth) + mask * -1e9) @ v``.

    ``depth`` is the size of the last axis of ``k``. Every step is wrapped in a
    named Keras ``Lambda`` layer. When ``mask`` is ``None`` the masking step is
    skipped.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    make_lambda = tf.keras.layers.Lambda

    scores = make_lambda(lambda pair: tf.matmul(pair[0], pair[1], transpose_b=True),
                         name="matmul_qk")([q, k])

    key_depth = make_lambda(lambda t: tf.cast(tf.shape(t)[-1], tf.float32),
                            name="depth")(k)
    scaled_scores = make_lambda(lambda pair: pair[0] / tf.math.sqrt(pair[1]),
                                name="logits")([scores, key_depth])

    # Positions where the mask is 1 receive a large negative logit.
    if mask is not None:
        scaled_scores = make_lambda(lambda pair: pair[0] + (pair[1] * -1e9),
                                    name="logits_masked")([scaled_scores, mask])

    attention_weights = make_lambda(lambda t: tf.nn.softmax(t, axis=-1),
                                    name="attention_weights")(scaled_scores)
    weighted_values = make_lambda(lambda pair: tf.matmul(pair[0], pair[1]),
                                  name="output")([attention_weights, v])
    return weighted_values, attention_weights


def create_padding_mask(seq, identifier):
    """Return a float mask that is 1.0 where ``seq`` equals 0, with two new axes
    inserted after the first one (``x[:, newaxis, newaxis, :]``).

    ``identifier`` is appended to the names of the two Lambda layers.
    """
    flag_zeros = tf.keras.layers.Lambda(
        lambda ids: tf.cast(tf.math.equal(ids, 0), tf.float32),
        name=f"padding_mask_{identifier}",
    )
    add_axes = tf.keras.layers.Lambda(
        lambda flags: flags[:, tf.newaxis, tf.newaxis, :],
        name=f"padding_mask_expanded_{identifier}",
    )
    return add_axes(flag_zeros(seq))



def create_look_ahead_mask(size):
    """Return a ``size x size`` mask equal to 1 minus the lower-triangular part
    (diagonal included) of an all-ones matrix.
    """
    build_ones = tf.keras.layers.Lambda(lambda n: tf.ones((n, n)),
                                        name="look_ahead_mask_ones_matrix")
    # band_part(m, -1, 0) keeps the lower triangle; subtracting from 1 flips it.
    strict_upper = tf.keras.layers.Lambda(lambda m: 1 - tf.linalg.band_part(m, -1, 0),
                                          name="look_ahead_mask")
    return strict_upper(build_ones(size))


In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built from four Dense projections and
    ``scaled_dot_product_attention``.

    Args:
        d_model: Width of every projection; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        name: Layer name, also used as prefix for the Dense sub-layer names.
    """

    def __init__(self, d_model, num_heads, name="multi_head_attention", **kwargs):
        super().__init__(name=name, **kwargs)
        self.num_heads = num_heads
        self.d_model = d_model

        # The model width has to split evenly across the heads.
        assert d_model % self.num_heads == 0

        # Width handled by each individual head.
        self.depth = d_model // self.num_heads

        # Input projections for queries, keys and values.
        self.query_lin = tf.keras.layers.Dense(d_model, name=f"{name}_query_lin")
        self.key_lin = tf.keras.layers.Dense(d_model, name=f"{name}_key_lin")
        self.value_lin = tf.keras.layers.Dense(d_model, name=f"{name}_value_lin")

        # Output projection applied to the re-merged heads.
        self.final_lin = tf.keras.layers.Dense(d_model, name=f"{name}_final_lin")

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to ``(batch_size, -1, num_heads, depth)`` and swap axes 1 and 2."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Project, split into heads, attend, merge heads and project again.

        Returns:
            Tuple ``(output, attention_weights)``.
        """
        batch_size = tf.shape(q)[0]

        # Projections run in query, key, value order.
        projected_q = self.query_lin(q)
        projected_k = self.key_lin(k)
        projected_v = self.value_lin(v)

        heads_q = self.split_heads(projected_q, batch_size)
        heads_k = self.split_heads(projected_k, batch_size)
        heads_v = self.split_heads(projected_v, batch_size)

        per_head_output, attention_weights = scaled_dot_product_attention(
            heads_q, heads_k, heads_v, mask)

        # Undo the head split: swap axes 1 and 2 back, then flatten to d_model.
        merged = tf.reshape(
            tf.transpose(per_head_output, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.d_model),
        )

        return self.final_lin(merged), attention_weights

    def get_config(self):
        """Base layer config extended with ``d_model``, ``num_heads`` and ``name``."""
        return {
            **super().get_config(),
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'name': self.name,
        }

    @classmethod
    def from_config(cls, config):
        """Rebuild the layer by passing ``config`` to the constructor."""
        return cls(**config)

In [None]:
class PointWiseFeedForwardNetwork(tf.keras.Model):
    """Two Dense layers applied in sequence: ``dff`` units with ReLU, then ``d_model`` units.

    ``identifier`` is used as a suffix in both Dense layer names.
    """

    def __init__(self, d_model, dff, identifier, **kwargs):
        super().__init__(**kwargs)

        self.d_model = d_model
        self.dff = dff
        self.identifier = identifier

        self.layer1 = tf.keras.layers.Dense(
            dff, activation='relu', name=f'feed_forward_relu_{identifier}')
        self.layer2 = tf.keras.layers.Dense(
            d_model, name=f'feed_forward_output_{identifier}')

    def call(self, x):
        """Apply the ReLU layer, then the output layer."""
        hidden = self.layer1(x)
        return self.layer2(hidden)

    def get_config(self):
        """Parent config extended with ``d_model``, ``dff`` and ``identifier``."""
        base_config = super().get_config()
        base_config.update(
            d_model=self.d_model,
            dff=self.dff,
            identifier=self.identifier,
        )
        return base_config

    @classmethod
    def from_config(cls, config):
        """Rebuild the network by passing ``config`` to the constructor."""
        return cls(**config)


In [None]:
class NormAndAdd(tf.keras.layers.Layer):
    """Adds the first two elements of its input and applies the wrapped normalisation layer."""

    def __init__(self, layernorm, **kwargs):
        super().__init__(**kwargs)
        # Layer instance applied to the sum in ``call``.
        self.layernorm = layernorm

    def call(self, inputs):
        """Return ``layernorm(inputs[0] + inputs[1])``."""
        summed = inputs[0] + inputs[1]
        return self.layernorm(summed)

    def get_config(self):
        """Base layer config plus a ``class_name``/``config`` entry for the wrapped layer."""
        wrapped = {
            'class_name': self.layernorm.__class__.__name__,
            'config': self.layernorm.get_config(),
        }
        return {**super().get_config(), 'layernorm': wrapped}

    @classmethod
    def from_config(cls, config):
        """Deserialize the wrapped layer (removing it from ``config``) and rebuild."""
        wrapped_layer = tf.keras.layers.deserialize(config.pop('layernorm'))
        return cls(layernorm=wrapped_layer, **config)

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward network.

    The output of each sub-block passes through dropout and is then combined
    with that sub-block's input by a ``NormAndAdd`` layer. Sub-layers can be
    injected through the keyword arguments (as ``from_config`` does); any left
    as ``None`` are created with default settings.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, name='encoder_layer',
                 mha=None, ffn=None, norm_and_add1=None, norm_and_add2=None, **kwargs):
        super().__init__(name=name, **kwargs)

        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        if mha is None:
            mha = MultiHeadAttention(d_model, num_heads, name='MHA_encoder')
        self.mha = mha

        if ffn is None:
            ffn = PointWiseFeedForwardNetwork(d_model, dff, 'encoder')
        self.ffn = ffn

        if norm_and_add1 is None:
            norm_and_add1 = NormAndAdd(tf.keras.layers.LayerNormalization(epsilon=1e-6), name='encoder_NaA_1')
        self.norm_and_add1 = norm_and_add1

        if norm_and_add2 is None:
            norm_and_add2 = NormAndAdd(tf.keras.layers.LayerNormalization(epsilon=1e-6), name='encoder_NaA_2')
        self.norm_and_add2 = norm_and_add2

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, padding_mask):
        """Run the attention and feed-forward sub-blocks; returns the block output."""
        # Self-attention: the same tensor serves as values, keys and queries.
        attn_output, _ = self.mha(x, x, x, padding_mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.norm_and_add1([x, attn_output])  # (batch_size, input_seq_len, d_model)

        ffn_output = self.dropout2(self.ffn(out1), training=training)  # (batch_size, input_seq_len, d_model)
        return self.norm_and_add2([out1, ffn_output])  # (batch_size, input_seq_len, d_model)

    @staticmethod
    def _describe(layer):
        """Return the ``class_name``/``config`` dict used for nested sub-layers."""
        return {
            'class_name': layer.__class__.__name__,
            'config': layer.get_config(),
        }

    def get_config(self):
        """Base layer config extended with hyper-parameters and nested sub-layer configs."""
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'rate': self.rate,
            'name': self.name,
            'mha': self._describe(self.mha),
            'ffn': self._describe(self.ffn),
            'norm_and_add1': self._describe(self.norm_and_add1),
            'norm_and_add2': self._describe(self.norm_and_add2),
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Deserialize the nested sub-layers in place, then call the constructor."""
        for key in ('mha', 'ffn', 'norm_and_add1', 'norm_and_add2'):
            config[key] = deserialize_layer(config[key])
        return cls(**config)

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output, then a feed-forward network.

    The output of each sub-block passes through dropout and a ``NormAndAdd``
    layer together with that sub-block's input. Sub-layers can be injected
    through the keyword arguments (as ``from_config`` does); any left as
    ``None`` are created with default settings.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1, name='decoder_layer', mha1=None, mha2=None, ffn=None, norm_and_add1=None, norm_and_add2=None, norm_and_add3=None, **kwargs):
        super().__init__(name=name, **kwargs)

        self.d_model = d_model
        self.num_heads = num_heads
        self.dff = dff
        self.rate = rate

        if mha1 is None:
            mha1 = MultiHeadAttention(d_model, num_heads, name='MHA_decoder_1')
        self.mha1 = mha1

        if mha2 is None:
            mha2 = MultiHeadAttention(d_model, num_heads, name='MHA_decoder_2')
        self.mha2 = mha2

        if ffn is None:
            ffn = PointWiseFeedForwardNetwork(d_model, dff, 'decoder')
        self.ffn = ffn

        if norm_and_add1 is None:
            norm_and_add1 = NormAndAdd(tf.keras.layers.LayerNormalization(epsilon=1e-6), name='decoder_NaA_1')
        self.norm_and_add1 = norm_and_add1

        if norm_and_add2 is None:
            norm_and_add2 = NormAndAdd(tf.keras.layers.LayerNormalization(epsilon=1e-6), name='decoder_NaA_2')
        self.norm_and_add2 = norm_and_add2

        if norm_and_add3 is None:
            norm_and_add3 = NormAndAdd(tf.keras.layers.LayerNormalization(epsilon=1e-6), name='decoder_NaA_3')
        self.norm_and_add3 = norm_and_add3

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, dec_input, enc_output, training, look_ahead_mask, dec_padding_mask, enc_padding_mask):
        """Run the three sub-blocks.

        Returns:
            Tuple ``(output, self_attention_weights, encoder_attention_weights)``.
        """
        # Element-wise maximum of the look-ahead and decoder padding masks.
        combine = tf.keras.layers.Lambda(lambda masks: tf.maximum(masks[0], masks[1]), name="mask_combiner")
        combined_mask = combine([look_ahead_mask, dec_padding_mask])

        # Block 1: self-attention over the decoder input.
        attn1, attn_weights_block1 = self.mha1(dec_input, dec_input, dec_input, combined_mask)  # (batch_size, target_seq_len, d_model)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.norm_and_add1([attn1, dec_input])

        # Block 2: values and keys come from the encoder, queries from block 1.
        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, enc_padding_mask)  # (batch_size, target_seq_len, d_model)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.norm_and_add2([attn2, out1])  # (batch_size, target_seq_len, d_model)

        # Block 3: feed-forward network.
        ffn_output = self.dropout3(self.ffn(out2), training=training)  # (batch_size, target_seq_len, d_model)
        out3 = self.norm_and_add3([ffn_output, out2])  # (batch_size, target_seq_len, d_model)

        return out3, attn_weights_block1, attn_weights_block2

    @staticmethod
    def _describe(layer):
        """Return the ``class_name``/``config`` dict used for nested sub-layers."""
        return {
            'class_name': layer.__class__.__name__,
            'config': layer.get_config(),
        }

    def get_config(self):
        """Base layer config extended with hyper-parameters and nested sub-layer configs."""
        config = super().get_config()
        config.update({
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'rate': self.rate,
            'name': self.name,
            'mha1': self._describe(self.mha1),
            'mha2': self._describe(self.mha2),
            'ffn': self._describe(self.ffn),
            'norm_and_add1': self._describe(self.norm_and_add1),
            'norm_and_add2': self._describe(self.norm_and_add2),
            'norm_and_add3': self._describe(self.norm_and_add3),
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Deserialize the nested sub-layers in place, then call the constructor."""
        for key in ('mha1', 'mha2', 'ffn', 'norm_and_add1', 'norm_and_add2', 'norm_and_add3'):
            config[key] = deserialize_layer(config[key])
        return cls(**config)


In [None]:
from tensorflow.keras.layers import deserialize as deserialize_layer


In [None]:
def deserialize_layer(layer_dict):
    """Rebuild a layer from a ``{'class_name': ..., 'config': ...}`` dict.

    The notebook's own layer classes are handed to Keras as ``custom_objects``.
    Without that, Keras cannot resolve them by name, and ``'MultiHeadAttention'``
    would resolve to the built-in Keras layer of the same name instead of the
    class defined in this notebook.

    Classes are looked up in the module globals at call time, so the function
    works regardless of the order in which the defining cells were run; names
    not (yet) defined are simply left out.

    Args:
        layer_dict: Serialized layer description.

    Returns:
        The deserialized layer instance.
    """
    # Imported under a distinct alias so it does not shadow this function.
    from tensorflow.keras.layers import deserialize as keras_deserialize

    custom_layer_names = (
        'MyEmbeddingLayer', 'PositionalEncoding', 'MultiHeadAttention',
        'PointWiseFeedForwardNetwork', 'NormAndAdd',
        'EncoderLayer', 'DecoderLayer', 'Encoder', 'Decoder',
    )
    namespace = globals()
    custom_objects = {
        layer_name: namespace[layer_name]
        for layer_name in custom_layer_names
        if layer_name in namespace
    }
    return keras_deserialize(layer_dict, custom_objects=custom_objects)


# **Encoder and Decoder blocks**

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embedding + positional encoding + dropout followed by ``num_layers`` encoder layers.

    Args:
        num_layers: Number of encoder layers applied in ``call``.
        d_model, num_heads, dff: Hyper-parameters forwarded to each ``EncoderLayer``.
        input_vocab_size: Vocabulary size forwarded to the embedding layer.
        max_position: Number of positions forwarded to ``PositionalEncoding``.
        rate: Dropout rate.
        name: Layer name.
        embedding_layer, pos_encoding_layer, encoder_layers: Optional pre-built
            sub-layers (as supplied by ``from_config``); defaults are created
            for any that are ``None``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, max_position, rate=0.1, name='Encoder_Block',
                 embedding_layer=None, pos_encoding_layer=None, encoder_layers=None, **kwargs):
        super(Encoder, self).__init__(name=name, **kwargs)

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dff = dff
        self.input_vocab_size = input_vocab_size
        self.max_position = max_position
        self.rate = rate

        if embedding_layer is None:
            self.embedding = MyEmbeddingLayer(input_vocab_size, d_model, name='encoder_embedding')
        else:
            self.embedding = embedding_layer

        if pos_encoding_layer is None:
            self.pos_encoding = PositionalEncoding(max_position, d_model, name='encoder_encoding')
        else:
            self.pos_encoding = pos_encoding_layer

        if encoder_layers is None:
            self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate, name=f"encoder_layer_{i+1}")
                               for i in range(num_layers)]
        else:
            self.enc_layers = encoder_layers

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Embed ``x``, add positional encoding, apply dropout and the encoder layers.

        The sequence-length Lambda that used to be built here was never read
        and has been removed.
        """
        x_emb = self.embedding(x)  # (batch_size, input_seq_len, d_model)
        x_pos_enc = self.pos_encoding(x_emb)
        x_enc_output = self.dropout(x_pos_enc, training=training)

        # Only the first ``num_layers`` entries of ``enc_layers`` are applied.
        for i in range(self.num_layers):
            x_enc_output = self.enc_layers[i](x_enc_output, training, mask)

        return x_enc_output  # (batch_size, input_seq_len, d_model)

    def get_config(self):
        """Base layer config extended with hyper-parameters and nested sub-layer configs."""
        config = super(Encoder, self).get_config()
        config.update({
            'num_layers': self.num_layers,
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'input_vocab_size': self.input_vocab_size,
            'max_position': self.max_position,
            'rate': self.rate,
            'name': self.name,
            'embedding_layer': {
                'class_name': self.embedding.__class__.__name__,
                'config': self.embedding.get_config(),
            },
            'pos_encoding_layer': {
                'class_name': self.pos_encoding.__class__.__name__,
                'config': self.pos_encoding.get_config(),
            },
            'encoder_layers': [
                {
                    'class_name': layer.__class__.__name__,
                    'config': layer.get_config(),
                }
                for layer in self.enc_layers
            ],
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Deserialize the nested sub-layers in place, then call the constructor."""
        config['embedding_layer'] = deserialize_layer(config['embedding_layer'])
        config['pos_encoding_layer'] = deserialize_layer(config['pos_encoding_layer'])
        config['encoder_layers'] = [deserialize_layer(layer_config) for layer_config in config['encoder_layers']]
        return cls(**config)


In [None]:

class Decoder(tf.keras.layers.Layer):
    """Embedding + positional encoding + dropout followed by ``num_layers`` decoder layers.

    Args:
        num_layers: Number of decoder layers applied in ``call``.
        d_model, num_heads, dff: Hyper-parameters forwarded to each ``DecoderLayer``.
        target_vocab_size: Vocabulary size forwarded to the embedding layer.
        max_position: Number of positions forwarded to ``PositionalEncoding``.
        rate: Dropout rate.
        name: Layer name.
        embedding, pos_encoding, dec_layers, dropout: Optional pre-built
            sub-layers (as supplied by ``from_config``); defaults are created
            for any that are ``None``.
    """

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, max_position, rate=0.1, name='Decoder_Block',
                 embedding=None, pos_encoding=None, dec_layers=None, dropout=None, **kwargs):
        super(Decoder, self).__init__(name=name, **kwargs)

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dff = dff
        self.target_vocab_size = target_vocab_size
        self.max_position = max_position
        self.rate = rate

        if embedding is None:
            self.embedding = MyEmbeddingLayer(target_vocab_size, d_model, name='decoder_embedding')
        else:
            self.embedding = embedding

        if pos_encoding is None:
            self.pos_encoding = PositionalEncoding(max_position, d_model, name='decoder_encoding')
        else:
            self.pos_encoding = pos_encoding

        if dec_layers is None:
            self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate, name=f"decoder_layer_{i+1}")
                               for i in range(num_layers)]
        else:
            self.dec_layers = dec_layers

        if dropout is None:
            self.dropout = tf.keras.layers.Dropout(rate)
        else:
            self.dropout = dropout

    def call(self, dec_input, enc_output, training, look_ahead_mask, dec_padding_mask, enc_padding_mask):
        """Embed ``dec_input``, add positional encoding, apply dropout and the decoder layers.

        Only the final decoder output is returned. The per-layer attention
        weights were previously gathered into a dict that was never returned,
        and a sequence-length Lambda was built but never read; both have been
        removed.
        """
        dec_input_emb = self.embedding(dec_input)  # (batch_size, target_seq_len, d_model)
        dec_input_pos_enc = self.pos_encoding(dec_input_emb)
        dec_output = self.dropout(dec_input_pos_enc, training=training)

        # Only the first ``num_layers`` entries of ``dec_layers`` are applied.
        for i in range(self.num_layers):
            dec_output, _, _ = self.dec_layers[i](
                dec_output, enc_output, training, look_ahead_mask, dec_padding_mask, enc_padding_mask)

        return dec_output  # (batch_size, target_seq_len, d_model)

    def get_config(self):
        """Base layer config extended with hyper-parameters and nested sub-layer configs."""
        config = super(Decoder, self).get_config()
        config.update({
            'num_layers': self.num_layers,
            'd_model': self.d_model,
            'num_heads': self.num_heads,
            'dff': self.dff,
            'target_vocab_size': self.target_vocab_size,
            'max_position': self.max_position,
            'rate': self.rate,
            'name': self.name,
            'embedding': {
                'class_name': self.embedding.__class__.__name__,
                'config': self.embedding.get_config(),
            },
            'pos_encoding': {
                'class_name': self.pos_encoding.__class__.__name__,
                'config': self.pos_encoding.get_config(),
            },
            'dec_layers': [{
                'class_name': layer.__class__.__name__,
                'config': layer.get_config(),
            } for layer in self.dec_layers],
            'dropout': {
                'class_name': self.dropout.__class__.__name__,
                'config': self.dropout.get_config(),
            },
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Deserialize the nested sub-layers in place, then call the constructor."""
        config['embedding'] = deserialize_layer(config['embedding'])
        config['pos_encoding'] = deserialize_layer(config['pos_encoding'])
        config['dec_layers'] = [deserialize_layer(layer_config) for layer_config in config['dec_layers']]
        config['dropout'] = deserialize_layer(config['dropout'])
        return cls(**config)

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer producing target-vocabulary logits.

    The model chains an ``Encoder``, a ``Decoder`` (both defined elsewhere in
    this notebook) and a final ``Dense`` projection without activation, so the
    output is unnormalised logits over ``target_vocab_size`` tokens.

    Args:
        enc_num_layers / dec_num_layers: Number of encoder / decoder layers.
        d_model: Model width shared by encoder and decoder.
        enc_num_heads / dec_num_heads: Attention heads per encoder / decoder layer.
        enc_dff / dec_dff: Feed-forward width in encoder / decoder layers.
        input_vocab_size / target_vocab_size: Vocabulary sizes.
        pe_input / pe_target: Maximum positions for the positional encodings.
        enc_rate / dec_rate: Dropout rates forwarded to encoder / decoder.
        verbose: When True, print intermediate output shapes on the first call.
        name: Keras model name.
        encoder / decoder: Optional pre-built sub-models; when ``None`` new
            ``Encoder`` / ``Decoder`` instances are created from the
            hyperparameters above.
        **kwargs: Forwarded to ``tf.keras.Model.__init__``.
    """

    def __init__(self, enc_num_layers, dec_num_layers, d_model,
                 enc_num_heads, dec_num_heads, enc_dff, dec_dff,
                 input_vocab_size, target_vocab_size, pe_input, pe_target,
                 enc_rate=0.1, dec_rate=0.1,
                 verbose=False, name='Transformer_Block', encoder=None, decoder=None, **kwargs):
        super(Transformer, self).__init__(name=name, **kwargs)
        self.verbose = verbose
        # Python-side counter used only to limit the verbose prints to the first call.
        self.call_count = 0
        self.enc_num_layers = enc_num_layers
        self.dec_num_layers = dec_num_layers
        self.d_model = d_model
        self.enc_num_heads = enc_num_heads
        self.dec_num_heads = dec_num_heads
        self.enc_dff = enc_dff
        self.dec_dff = dec_dff
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
        self.pe_input = pe_input
        self.pe_target = pe_target
        self.enc_rate = enc_rate
        self.dec_rate = dec_rate

        if encoder is None:
            self.encoder = Encoder(enc_num_layers, d_model, enc_num_heads, enc_dff,
                                   input_vocab_size, pe_input, enc_rate, name='Encoder')
        else:
            self.encoder = encoder

        if decoder is None:
            self.decoder = Decoder(dec_num_layers, d_model, dec_num_heads, dec_dff,
                                   target_vocab_size, pe_target, dec_rate, name='Decoder')
        else:
            self.decoder = decoder

        # No activation: the loss in this notebook is configured with from_logits=True.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size, dtype='float32', name='transformer_final_layer')

    def call(self, inp, tar, training, enc_padding_mask,
            look_ahead_mask, dec_padding_mask):
        """Run encoder, decoder and the final projection.

        Args:
            inp: Encoder token ids.
            tar: Decoder token ids.
            training: Training flag forwarded to encoder and decoder.
            enc_padding_mask: Padding mask for the encoder input; also passed
                to the decoder as its last argument.
            look_ahead_mask: Causal mask for the decoder.
            dec_padding_mask: Padding mask for the decoder input.

        Returns:
            Logits of shape (batch_size, tar_seq_len, target_vocab_size).
        """
        self.call_count += 1
        show_shapes = self.verbose and self.call_count == 1

        enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        if show_shapes:
            print(f"Encoder output shape: {enc_output.shape}")

        dec_output = self.decoder(
            tar, enc_output, training, look_ahead_mask, dec_padding_mask, enc_padding_mask)
        if show_shapes:
            print(f"Decoder output shape: {dec_output.shape}")

        final_linear_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        if show_shapes:
            print(f"Final linear output shape: {final_linear_output.shape}")

        return final_linear_output

    def get_config(self):
        """Return hyperparameters plus nested encoder/decoder descriptors."""
        config = super(Transformer, self).get_config()
        config.update({
            'enc_num_layers': self.enc_num_layers,
            'dec_num_layers': self.dec_num_layers,
            'd_model': self.d_model,
            'enc_num_heads': self.enc_num_heads,
            'dec_num_heads': self.dec_num_heads,
            'enc_dff': self.enc_dff,
            'dec_dff': self.dec_dff,
            'input_vocab_size': self.input_vocab_size,
            'target_vocab_size': self.target_vocab_size,
            'pe_input': self.pe_input,
            'pe_target': self.pe_target,
            'enc_rate': self.enc_rate,
            'dec_rate': self.dec_rate,
            'verbose': self.verbose,
            'name': self.name,
            'encoder': {
                'class_name': self.encoder.__class__.__name__,
                'config': self.encoder.get_config(),
            },
            'decoder': {
                'class_name': self.decoder.__class__.__name__,
                'config': self.decoder.get_config(),
            },
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild a Transformer from a dictionary produced by ``get_config``.

        Args:
            config: Configuration dictionary; it is not modified.

        Returns:
            A new instance of ``cls``.
        """
        # Copy first: the original overwrote the caller's 'encoder'/'decoder'
        # descriptors with live objects, making the dict single-use.
        config = dict(config)
        for key in ('encoder', 'decoder'):
            # A missing/None entry simply lets __init__ build a fresh sub-model.
            if config.get(key) is not None:
                config[key] = deserialize_layer(config[key])
        return cls(**config)

In [None]:
class LossFunction:
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Target positions whose id equals 0 are treated as padding and excluded
    from the mean.
    """

    def __init__(self):
        # reduction='none' keeps the per-position losses so they can be masked.
        self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    def compute(self, real, pred):
        """Return the masked mean loss.

        Args:
            real: Target token ids; id 0 marks padding.
            pred: Logits for each target position.

        Returns:
            Scalar tensor: sum of unmasked losses divided by the number of
            unmasked positions, or 0 when every position is padding.
        """
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = self.loss_object(real, pred)

        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask

        # divide_no_nan returns 0 instead of NaN for an all-padding batch,
        # which would otherwise poison the optimizer state.
        return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``."""

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()

        # Both values are kept as float32 tensors for use in __call__.
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)
        self.d_model = tf.cast(d_model, tf.float32)

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        step = tf.cast(step, tf.float32)
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps ** -1.5)
        scale = tf.math.rsqrt(self.d_model)

        return scale * tf.math.minimum(decay_term, warmup_term)

    def get_config(self):
        """Return the constructor arguments as Python floats."""
        d_model_value = float(self.d_model.numpy())
        warmup_value = float(self.warmup_steps.numpy())
        return {"d_model": d_model_value, "warmup_steps": warmup_value}

    @classmethod
    def from_config(cls, config):
        """Recreate the schedule from a ``get_config`` dictionary."""
        return cls(**config)

In [None]:
def deserialize_layer(layer_config, custom_objects=None):
    """Create a layer from a ``{'class_name': ..., 'config': ...}`` descriptor.

    The class is resolved, in order, from:

    1. ``custom_objects`` (explicit mapping of class name to class), if given;
    2. this module's globals, so classes defined in the notebook (which are
       not attributes of ``tensorflow.keras.layers``) can be rebuilt;
    3. ``tensorflow.keras.layers``.

    Args:
        layer_config: Dict with keys ``'class_name'`` and ``'config'``.
        custom_objects: Optional dict mapping class names to classes.

    Returns:
        The object returned by ``<class>.from_config(layer_config['config'])``.

    Raises:
        AttributeError: If the class name cannot be resolved anywhere.
    """
    class_name = layer_config['class_name']

    layer_class = None
    if custom_objects and class_name in custom_objects:
        layer_class = custom_objects[class_name]
    else:
        # Only accept real classes from globals; a same-named variable must not match.
        candidate = globals().get(class_name)
        if isinstance(candidate, type) and hasattr(candidate, 'from_config'):
            layer_class = candidate

    if layer_class is None:
        # Imported lazily so resolving custom classes never needs this lookup.
        from tensorflow.keras import layers
        layer_class = getattr(layers, class_name, None)

    if layer_class is None:
        raise AttributeError(
            f"Cannot deserialize layer: class '{class_name}' was not found in "
            "custom_objects, the notebook globals, or tensorflow.keras.layers"
        )

    return layer_class.from_config(layer_config['config'])


In [None]:

class ModelCreator:
    """Wraps a Transformer in a compiled functional Keras model.

    Args:
        transformer: Callable model taking ``(enc_inputs, dec_inputs)`` plus
            the mask keyword arguments used below.
        config: Model configuration dictionary (stored, not read here).
        optimizer: Optimizer passed to ``model.compile``.
        loss_function: Object whose ``compute(real, pred)`` method is used as the loss.
    """

    def __init__(self, transformer, config, optimizer, loss_function):
        self.transformer = transformer
        self.config = config
        self.optimizer = optimizer
        self.loss_function = loss_function

    def create_and_compile(self):
        """Build the two-input functional model and compile it.

        Returns:
            A compiled ``tf.keras.Model`` with inputs
            ``[encoder_input, decoder_input]`` and the transformer output.
        """
        # Variable-length integer token sequences for both inputs.
        sequence_shape = (None,)
        enc_inputs = tf.keras.Input(shape=sequence_shape, dtype=tf.int32, name='encoder_input')
        dec_inputs = tf.keras.Input(shape=sequence_shape, dtype=tf.int32, name='decoder_input')

        # Masks are built with helpers defined elsewhere in this notebook.
        enc_padding_mask = create_padding_mask(enc_inputs, 'encoder')
        dec_padding_mask = create_padding_mask(dec_inputs, 'decoder')
        # The decoder length is only known at run time, so read it from the tensor shape.
        dec_seq_len = tf.keras.layers.Lambda(lambda x: tf.shape(x)[1], name="decoder_lam_shape")(dec_inputs)
        look_ahead_mask = create_look_ahead_mask(dec_seq_len)

        # Do NOT pass training=True here: a value given while building the
        # functional graph is baked in, which kept dropout active during
        # validation and predict(). Leaving it out lets Keras inject the
        # correct flag for fit() / evaluate() / predict().
        outputs = self.transformer(enc_inputs, dec_inputs,
                                   enc_padding_mask=enc_padding_mask,
                                   look_ahead_mask=look_ahead_mask,
                                   dec_padding_mask=dec_padding_mask)

        model = tf.keras.Model(inputs=[enc_inputs, dec_inputs], outputs=outputs)
        model.compile(optimizer=self.optimizer, loss=self.loss_function.compute, metrics=['accuracy'])

        return model

In [None]:
class ModelTrainer:
    """Fits a compiled model on (enc_inputs, dec_inputs, dec_targets) datasets."""

    def __init__(self, model, dataset, validation_data, epochs):
        self.model, self.epochs = model, epochs
        self.dataset, self.validation_data = dataset, validation_data

    @staticmethod
    def _as_inputs_and_targets(enc_inputs, dec_inputs, dec_targets):
        """Regroup one dataset element into ``((enc, dec), targets)`` for ``fit``."""
        return (enc_inputs, dec_inputs), dec_targets

    def train(self):
        """Train with early stopping and return the Keras ``History`` object."""
        # Both datasets are reshaped into the (inputs, targets) layout first.
        train_ds = self.dataset.map(self._as_inputs_and_targets)
        val_ds = self.validation_data.map(self._as_inputs_and_targets)

        # Stop after 5 epochs without improvement and keep the best weights.
        stopper = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)

        return self.model.fit(
            train_ds,
            validation_data=val_ds,
            epochs=self.epochs,
            callbacks=[stopper],
            verbose=1,
        )

    def save_model(self, path):
        """Save the wrapped model to ``path`` via ``model.save``."""
        self.model.save(path)


In [None]:
class TransformerModel:
    """End-to-end driver: preprocess data, build, train, save and inspect the model."""

    def __init__(self, preprocessing_config, model_config, epochs=5):
        self.preprocessing_config = preprocessing_config
        self.model_config = model_config
        self.epochs = epochs
        # Everything below is filled in by preprocess() / create_model().
        self.transformer = None
        self.model = None
        self.tokenizer = None
        self.vocabulary_size = None
        self.train_dataset = None
        self.val_dataset = None

    def preprocess(self):
        """Run the DataPreprocessor and copy the derived sizes into ``model_config``."""
        preprocessor = DataPreprocessor(self.preprocessing_config, verbose_1=True, verbose_2=True)
        (self.train_dataset,
         self.val_dataset,
         self.vocabulary_size,
         self.tokenizer) = preprocessor.preprocess()
        pe_input, pe_target = preprocessor.get_max_sequence_lengths()

        # The same vocabulary size is used for both input and target.
        self.model_config.update({
            "pe_input": pe_input,
            "pe_target": pe_target,
            "input_vocab_size": self.vocabulary_size,
            "target_vocab_size": self.vocabulary_size,
        })
        print("Vocabulary size:", self.vocabulary_size)

    def create_model(self):
        """Instantiate the Transformer and wrap it in a compiled Keras model."""
        self.transformer = Transformer(**self.model_config)

        loss_function = LossFunction()
        schedule = CustomSchedule(self.model_config['d_model'])
        optimizer = tf.keras.optimizers.Adam(schedule, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
        creator = ModelCreator(self.transformer, self.model_config, optimizer, loss_function)
        self.model = creator.create_and_compile()

    def summary(self):
        """Print the Keras summary of the compiled model."""
        self.model.summary()

    def train(self):
        """Train the model; return ``(history, model_trainer)``."""
        model_trainer = ModelTrainer(self.model, self.train_dataset, self.val_dataset, self.epochs)
        return model_trainer.train(), model_trainer

    def save(self, model_trainer, path):
        """Delegate saving to the trainer returned by ``train``."""
        model_trainer.save_model(path)

    def load_model(self, path):
        """Load weights from ``path`` into the already-built model."""
        self.model.load_weights(path)

    def print_layer_names(self):
        """Print the name of every top-level layer, one per line."""
        for current in self.model.layers:
            print(current.name)


In [None]:
# Configurations
# `preprocessing_config` is handed to DataPreprocessor by TransformerModel.preprocess().
# The key meanings below are inferred from their names; confirm against DataPreprocessor.
preprocessing_config = {
    'dataframe': df,  # DataFrame loaded from fake_or_real_news.csv at the top of the notebook
    'column_name': 'text',  # presumably the column holding the raw text - TODO confirm
    'data_proportion': 0.004,  # presumably the fraction of the data to use - TODO confirm
    # NOTE(review): '[email protected]' looks like an e-mail obfuscation placeholder rather
    # than a real pattern - verify that this entry is intended.
    'remove_list': ['\n', '~', '[email protected]'],
    'val_split_size': 0.2,
    'max_sequence_len': 100,  # the recorded run shows sequences of shape (N, 100)
    'BATCH_SIZE': 32,
    'BUFFER_SIZE': 2000,
}


# `model_config` is unpacked into Transformer(**model_config) by TransformerModel.create_model().
# The four zero placeholders are overwritten by TransformerModel.preprocess().
model_config = {
    "enc_num_layers": 4,
    "dec_num_layers": 4,
    "d_model": 256,  # Used for both encoder and decoder
    "enc_dff": 512,
    "dec_dff": 512,
    "enc_num_heads": 4,
    "dec_num_heads": 4,
    "pe_input": 0,  # Will be updated after preprocessing
    "pe_target": 0,  # Will be updated after preprocessing
    "input_vocab_size": 0,  # Will be updated after preprocessing
    "target_vocab_size": 0,  # Will be updated after preprocessing
    "enc_rate": 0.1,
    "dec_rate": 0.2,
    "verbose": True
}

# Number of training epochs passed to TransformerModel.
epochs = 2

In [None]:

# Create TransformerModel instance
transformer_model = TransformerModel(preprocessing_config, model_config, epochs)

# Preprocess data (also fills the vocabulary size and positional-encoding
# lengths into model_config, so it must run before create_model)
transformer_model.preprocess()

# Create model
transformer_model.create_model()

# List the top-level layers of the compiled functional model
transformer_model.print_layer_names()

# Train the model
history, model_trainer = transformer_model.train()


# Print the Keras summary of the trained model
transformer_model.summary()

Sequence Shapes:


Unnamed: 0,Train Encoder Input Shape,Train Decoder Input Shape,Train Decoder Label Shape,Val Encoder Input Shape,Val Decoder Input Shape,Val Decoder Label Shape
0,"(626, 100)","(626, 100)","(626, 100)","(157, 100)","(157, 100)","(157, 100)"


Dataset Information:


Unnamed: 0,Dataset,Total Tokens,Total Parameters
0,Train Encoder Input,626,62600
1,Train Decoder Input,626,62600
2,Train Decoder Label,626,62600
3,Val Encoder Input,157,15700
4,Val Decoder Input,157,15700
5,Val Decoder Label,157,15700


Vocabulary size: 4017
Encoder output shape: (None, None, 256)
Decoder output shape: (None, None, 256)
Final linear output shape: (None, None, 4017)
decoder_input
encoder_input
decoder_lam_shape
padding_mask_decoder
padding_mask_encoder
look_ahead_mask_ones_matrix
padding_mask_expanded_decoder
padding_mask_expanded_encoder
look_ahead_mask
Transformer_Block
Epoch 1/2
Epoch 2/2
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 encoder_input (InputLayer)     [(None, None)]       0           []                               
                                                                                                  
 decoder_lam_

In [None]:
#transformer_model.summary()

In [None]:
def print_layer_names(layer, prefix=""):
    """Recursively print a layer's name followed by the names of its sub-layers.

    Args:
        layer: Object with a ``name`` attribute and, optionally, a ``layers``
            attribute holding its sub-layers.
        prefix: String printed before the name; each nesting level adds "----".
    """
    print(prefix + layer.name)
    # Only a missing `layers` attribute means "leaf". The previous try/except
    # wrapped the whole recursion, so any AttributeError raised deeper in the
    # tree was silently swallowed and the remaining siblings were skipped.
    for sub_layer in getattr(layer, 'layers', ()):
        print_layer_names(sub_layer, prefix=prefix + "----")

print_layer_names(transformer_model.model)


model
----decoder_input
----encoder_input
----decoder_lam_shape
----padding_mask_decoder
----padding_mask_encoder
----look_ahead_mask_ones_matrix
----padding_mask_expanded_decoder
----padding_mask_expanded_encoder
----look_ahead_mask
----Transformer_Block
--------Encoder
--------Decoder
--------transformer_final_layer


In [None]:
# Save the model through the trainer returned by transformer_model.train().
# NOTE(review): the recorded output of this cell is "TypeError: ignored", i.e.
# this save call failed in the run captured in the notebook.
transformer_model.save(model_trainer, '/content/drive/MyDrive/Colab Notebooks/Machine Learning/TensorFlow/Transformer/Transformer_Weight/test_with_names_7')




TypeError: ignored

In [None]:
import h5py


In [None]:
# Open a previously saved HDF5 file read-only. The handle is left open on
# purpose: the following cell indexes into `h5f_names` again.
h5f_names = h5py.File('/content/drive/MyDrive/Colab Notebooks/Machine Learning/TensorFlow/Transformer/Transformer_Weight/test_with_names_6.h5', 'r')
# List all groups
print("Keys: %s" % h5f_names.keys())
keys = list(h5f_names.keys())
print(keys)

# Get the data
# list(...) over an entry enumerates what iterating that entry yields, so this
# prints the members stored under each top-level key.
for key in keys:
    print("\nKey name: ", key)
    data = list(h5f_names[key])
    print("Data: ", data)

In [None]:
def describe_h5_members(group):
    """Print shape, dtype and the first few elements of every member of ``group``.

    Args:
        group: Mapping-like HDF5 object exposing ``keys()`` and item access
            (here: entries of the open ``h5f_names`` file).
    """
    for key in group.keys():
        print("\nKey name: ", key)
        data = np.array(group[key])
        print("Data shape: ", data.shape)
        print("Data type: ", data.dtype)
        print("First few elements: ", data.flat[:10])


# The same dump was previously copy-pasted three times; it is now one helper
# applied to three nesting levels of the saved weights file.
SECTION_SEPARATOR = "-----------------------------------------------------------------------------------"

Transformer_Block = h5f_names['model_weights']
describe_h5_members(Transformer_Block)

print(SECTION_SEPARATOR)

Transformer_Block_Transformer_Block = h5f_names['model_weights/Transformer_Block']
describe_h5_members(Transformer_Block_Transformer_Block)


print(SECTION_SEPARATOR)

Transformer_Block_Variable_0 = h5f_names['model_weights/Transformer_Block/Transformer_Block/Encoder_Block']
describe_h5_members(Transformer_Block_Variable_0)

In [None]:
# h5f_no_names = h5py.File('/content/drive/MyDrive/Colab Notebooks/Machine Learning/TensorFlow/Transformer/Transformer_Weight/test6_no_name.h5', 'r')
# # List all groups
# print("Keys: %s" % h5f_no_names.keys())
# keys = list(h5f_no_names.keys())
# print(keys)

# # Get the data
# for key in keys:
#     print("\nKey name: ", key)
#     data = list(h5f_no_names[key])
#     print("Data: ", data)

In [None]:
# Transformer__no_names = h5f_no_names['transformer']

# for key in Transformer__no_names.keys():
#     print("\nKey name: ", key)
#     data = np.array(Transformer__no_names[key])
#     print("Data shape: ", data.shape)
#     print("Data type: ", data.dtype)
#     print("First few elements: ", data.flat[:10])

# print("-----------------------------------------------------------------------------------")

# Transformer__no_names_Transformer_Block = h5f_no_names['transformer/transformer']

# for key in Transformer__no_names_Transformer_Block.keys():
#     print("\nKey name: ", key)
#     data = np.array(Transformer__no_names_Transformer_Block[key])
#     print("Data shape: ", data.shape)
#     print("Data type: ", data.dtype)
#     print("First few elements: ", data.flat[:10])


# print("-----------------------------------------------------------------------------------")

# Transformer__no_names_Variable_0 = h5f_no_names['transformer/encoder']

# for key in Transformer__no_names_Variable_0.keys():
#     print("\nKey name: ", key)
#     data = np.array(Transformer__no_names_Variable_0[key])
#     print("Data shape: ", data.shape)
#     print("Data type: ", data.dtype)
#     print("First few elements: ", data.flat[:10])

In [None]:
def visualize_layer_weights(layer):
    """Plot a histogram of a layer's first weight array with summary statistics.

    Prints a message and returns without plotting when the layer has no
    weights or when its first weight array contains only zeros.

    Args:
        layer: Object exposing ``name`` and ``get_weights()`` (a list of arrays).
    """
    weights = layer.get_weights()

    if len(weights) == 0:
        print(f'Layer {layer.name} has no weights to visualize.')
        return

    # Flatten the weights for visualization
    weight_values = weights[0].flatten()  # 0 index for main weight matrix

    # If weights are all zeros, print message and return
    if np.all(weight_values == 0):
        print(f'Weights for layer {layer.name} contain only zeros.')
        return

    # Create the figure only now: creating it before the checks above left an
    # empty, never-shown figure open on every early return.
    fig, ax = plt.subplots(figsize=(12, 8))

    # Plot histogram of weights
    ax.hist(weight_values, bins=20)
    ax.set_title('Layer ' + layer.name + ' Weight Distribution')
    ax.set_xlabel('Weight values')
    ax.set_ylabel('Frequency')

    # Annotate min / max / mean / std in the upper-left corner of the axes.
    stats = (
        ('Min', np.min(weight_values), 0.88),
        ('Max', np.max(weight_values), 0.80),
        ('Mean', np.mean(weight_values), 0.72),
        ('Std', np.std(weight_values), 0.64),
    )
    for label, value, y_pos in stats:
        ax.annotate('{}: {:.2f}'.format(label, value), xy=(0.05, y_pos), xycoords='axes fraction')

    # Show the plot
    fig.tight_layout()
    plt.show()

# model = transformer_model.model

# visualize_layer_weights(model.layers[-1])

In [None]:
# model = transformer_model.model
# tf.keras.utils.plot_model(
#     model, to_file='model.png', show_shapes=True, show_dtype=True,
#     show_layer_names=True, rankdir='TB', expand_nested=True, dpi=96
# )


In [None]:
class HistoryPlotter:
    """Plots training vs. validation curves from a Keras ``History`` object."""

    def __init__(self, history):
        self.history = history

    def _plot_metric(self, metric, title, y_label, train_label, val_label):
        """Draw the training and ``val_``-prefixed curves of one metric."""
        fig, ax = plt.subplots(figsize=(7, 5))
        ax.plot(self.history.history[metric], label=train_label)
        ax.plot(self.history.history['val_' + metric], label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        plt.show()

    def plot_loss(self):
        """Plot training and validation loss per epoch."""
        self._plot_metric('loss', 'Model loss', 'Loss',
                          'Training loss', 'Validation loss')

    def plot_accuracy(self):
        """Plot training and validation accuracy per epoch."""
        self._plot_metric('accuracy', 'Model accuracy', 'Accuracy',
                          'Training accuracy', 'Validation accuracy')



In [None]:
# # Plot the training history
# plotter = HistoryPlotter(history)

# # Plot the loss
# plotter.plot_loss()

# # Plot the accuracy
# plotter.plot_accuracy()

## Saving the model:

# Text generator

In [None]:

class StochasticBeamSearch:
    """Text generation by beam search with stochastic beam selection.

    At every step each beam entry is extended with its ``beam_size`` most
    likely next tokens; the next beam is then *sampled* from all candidates
    with probability proportional to their sequence probability.

    Args:
        model: Object whose ``predict([enc_tokens, dec_tokens], verbose=0)``
            returns an array of shape (batch, seq_len, vocab) of logits.
        tokenizer: Object exposing ``word_index`` (word -> id) and
            ``texts_to_sequences``.
        beam_size: Number of expansions per entry and size of the sampled beam.
    """

    def __init__(self, model, tokenizer, beam_size=3):
        self.model = model
        self.tokenizer = tokenizer
        self.beam_size = beam_size

    def decode_sequence(self, sequence):
        """Turn token ids into text; unknown ids become '?', '<EOS>' is dropped."""
        index_to_word = {index: word for word, index in self.tokenizer.word_index.items()}
        words = (index_to_word.get(int(token), '?') for token in sequence)
        # Exclude the '<EOS>' token in the final output
        return ' '.join(word for word in words if word != '<EOS>')

    @staticmethod
    def _log_softmax(logits):
        """Numerically stable log-softmax of a 1-D logits vector (float64)."""
        logits = np.asarray(logits, dtype=np.float64)
        shifted = logits - np.max(logits)
        return shifted - np.log(np.sum(np.exp(shifted)))

    def predict(self, start_sentence, max_length=10):
        """Generate a continuation of ``start_sentence``.

        Args:
            start_sentence: Prompt text; must contain at least one known token.
            max_length: Maximum number of tokens to append.

        Returns:
            The decoded text of the best sequence in the final beam.

        Raises:
            ValueError: If the prompt tokenizes to an empty sequence.
        """
        sequences = self.tokenizer.texts_to_sequences([start_sentence])
        start_tokens = np.asarray(sequences[0], dtype=np.int64)
        if start_tokens.size == 0:
            raise ValueError("start_sentence contains no tokens known to the tokenizer")

        # None when the tokenizer has no '<EOS>' entry: generation then simply
        # runs for max_length steps instead of failing on the lookup.
        eos_id = self.tokenizer.word_index.get('<EOS>')

        # Each beam element is (token_sequence, cumulative_log_probability).
        beam = [(start_tokens, 0.0)]
        # Defined up front so max_length == 0 (or no candidates) still returns text.
        best_tokens = start_tokens

        for _ in range(max_length):
            all_candidates = []

            for tokens, log_prob in beam:
                batch = tokens[np.newaxis, :]  # shape [batch_size=1, seq_len]
                predictions = np.asarray(self.model.predict([batch, batch], verbose=0))

                # The model outputs logits, so convert the last position to
                # log-probabilities; taking np.log of raw logits (as before)
                # yields NaN for every negative logit.
                log_probs = self._log_softmax(predictions[0, -1, :])

                k = min(self.beam_size, log_probs.shape[0])
                # Stable sort on the negated scores: highest first, lowest index wins ties.
                top_k_tokens = np.argsort(-log_probs, kind='stable')[:k]

                for token_id in top_k_tokens:
                    # Add new token only if it's different from the last one
                    if token_id == tokens[-1]:
                        continue
                    all_candidates.append(
                        (np.append(tokens, token_id), log_prob + float(log_probs[token_id])))

            if not all_candidates:
                # Every proposal repeated the previous token; nothing to extend.
                break

            # Sampling weights proportional to exp(score). Subtracting the max
            # first avoids the underflow-to-zero (0/0) of long sequences.
            scores = np.array([score for _, score in all_candidates], dtype=np.float64)
            beam_probs = np.exp(scores - np.max(scores))
            total = beam_probs.sum()
            if not np.isfinite(total) or total <= 0:
                print("NaN values detected in beam_probs. Replacing with uniform probabilities.")
                beam_probs = np.full(len(all_candidates), 1.0 / len(all_candidates))
            else:
                beam_probs = beam_probs / total

            beam_indices = np.random.choice(len(all_candidates), size=self.beam_size, p=beam_probs)
            beam = [all_candidates[i] for i in beam_indices]

            # Track the most probable sequence of the current beam.
            best_tokens, _ = max(beam, key=lambda item: item[1])

            # Check for EOS token and stop if found
            if eos_id is not None and eos_id in best_tokens:
                break

        # Decode tokens into text and return
        return self.decode_sequence(best_tokens)


In [None]:
# beam_search = StochasticBeamSearch(transformer_model.model, transformer_model.tokenizer, beam_size=3)
# text = beam_search.predict("the sun dipped below the horizon casting a ", 50)
# print(text)
