# Project in Keras

Based on: https://github.com/rachhek/speech_recognition_using_lstm/blob/master/speech_recognition_using_lstm_experiment.ipynb


In [1]:
! pip install python_speech_features

Collecting python_speech_features
  Downloading python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: python_speech_features
  Building wheel for python_speech_features (setup.py): started
  Building wheel for python_speech_features (setup.py): finished with status 'done'
  Created wheel for python_speech_features: filename=python_speech_features-0.6-py3-none-any.whl size=5879 sha256=d4671398ce46a9806edd386b73f90aa5846d9a18f9b3781d67c79cec2dfca4c0
  Stored in directory: c:\users\48695\appdata\local\pip\cache\wheels\37\01\19\e6c69a32684ab7b2e3ea4985a571d810cf055c72600e7f9f17
Successfully built python_speech_features
Installing collected packages: python_speech_features
Successfully installed python_speech_features-0.6


In [19]:
import os
import keras
from keras import ops
from keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional,Flatten,Input
from keras.layers import SpatialDropout1D, SpatialDropout2D, SpatialDropout3D, Bidirectional
from keras.layers import Conv1D, BatchNormalization, Conv2D, MaxPooling2D, MaxPooling1D, Flatten, Dropout
from keras.layers import MultiHeadAttention, LayerNormalization, GlobalAveragePooling2D, Layer
from python_speech_features import mfcc
from python_speech_features import logfbank
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.utils import plot_model
#from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from sklearn.preprocessing import LabelEncoder,normalize
from matplotlib import pyplot
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import scipy.io.wavfile as wav
import numpy as np
import csv
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score,ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt
import time
import tensorflow as tf
import librosa
import soundfile as sf
import pickle 

metrics=['categorical_accuracy','AUC','Precision','Recall'] #można potem obliczyć F1 na podstawie recall i precision
metrics2 = ['categorical_accuracy','precision','recall']
train_path = "./train/audio/" 
val_text = "./train/validation_list.txt"
test_text = "./train/testing_list.txt"
root = "./train/"
preprocessed = "./preprocessed"

In [2]:
def show_confusion_matrix(model, test_ds, num_classes, class_names):
    true_labels = np.concatenate([y for x, y in test_ds], axis=0)
    tf_labels = tf.argmax(true_labels, axis=1).numpy()

    predictions = model.predict(test_ds)
    predicted_classes = np.argmax(predictions, axis=1)

    conf_matrix = tf.math.confusion_matrix(tf_labels, predicted_classes, num_classes=num_classes)

    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix.numpy(), display_labels=class_names)
    fig, ax = plt.subplots(figsize=(11, 9))  # Ustaw rozmiar wykresu na 10x8

    # Wyświetl macierz pomyłek z określonym rozmiarem
    disp.plot(ax=ax)  # Użyj parametru ax, aby użyć określonej osi
    plt.show()
    return

In [3]:
def show_training_validation_loss_and_metrics(model,metrics):
    f,ax=plt.subplots(2,1,figsize=(20,20)) 

    #Assigning the first subplot to graph training loss and validation loss
    ax[0].plot(model.history.history['loss'],color='b',label='Training Loss')
    ax[0].plot(model.history.history['val_loss'],color='r',label='Validation Loss')

    #Plotting the training accuracy and validation accuracy
    for metric in metrics:
        ax[1].plot(model.history.history[metric],label='Training '+metric)
        ax[1].plot(model.history.history['val_'+metric],label='Validation '+metric)
    
    plt.legend() 

# Preprocessing with spectogram (can be run only once)

In [7]:
def convert_background_noise(root_path='./train', input_folder='_background_noise_', output_folder='silence'):
    audio_path = os.path.join(root_path, 'audio')
    input_path = os.path.join(audio_path, input_folder)
    output_path = os.path.join(audio_path, output_folder)
    
    sample_rate = 16000
    sample_length = 1

    audio_files = [d for d in os.listdir(input_path)
                   if os.path.isfile(os.path.join(input_path, d)) and d.endswith('.wav')]
    samples = []

    for f in audio_files:
        path = os.path.join(input_path, f)
        s, _ = librosa.load(path, sr=sample_rate)
        samples.append(s)

    samples = np.hstack(samples)
    c = int(sample_rate * sample_length)
    r = len(samples) // c
    names = [f'recording_{i}.wav' for i in range(r-1)]

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    for i in range(r - 1):
        y = samples[c*i:c*(i+1)]
        sf.write(os.path.join(output_path, names[i]), y, sample_rate)

    val_choice = np.random.choice(names, int(0.1*len(names)), replace=False).tolist()
    with open(os.path.join(root_path, 'validation_list.txt'), 'a') as f:
        for name in val_choice:
            p = os.path.join(output_folder, name)
            p = p.replace('./', '')
            f.write(p)
            f.write('\n')

    test_choice = np.random.choice([n for n in names if n not in val_choice], int(0.1*len(names)), replace=False).tolist()
    with open(os.path.join(root_path, 'testing_list.txt'), 'a') as f:
        for name in test_choice:
            p = os.path.join(output_folder, name)
            p = p.replace('./', '')
            f.write(p)
            f.write('\n')


In [8]:
convert_background_noise(root_path=root, input_folder='_background_noise_', output_folder='silence')

In [9]:
def load_dataset(root_path_files, files_names_list_name, label_encoder=None):
    #Calculating x_test and y_test        
    test_labels = []
    test_data = []

    #test_labels.txt is a txt file with all labels for the speech samples that is required for the evaluation. We loop through it to calculate the MFCC value for each speech sample and then normalize it
    with open(os.path.join(root_path_files, files_names_list_name), newline='') as tsvfile:
        reader = csv.DictReader(tsvfile)
        reader = csv.reader(tsvfile, delimiter=' ')
        for row in reader:
            wav_file = os.path.join(root_path_files, "audio/", row[0])

            row.append(row[0].split("/")[0])
            (rate,sig) = wav.read(wav_file)

            # pad to 1s of length using pad_sequences
            sig = pad_sequences([sig], maxlen=16000, dtype='float', padding='post', truncating='post', value=0.0)

            #Getting the MFCC value from the .wav files.
            mfcc_feat = mfcc(sig,rate)
            
            scaler = MinMaxScaler(feature_range=(0,1))
            scaler = scaler.fit(mfcc_feat)

            #Normalizing the MFCC values.
            normalized = scaler.transform(mfcc_feat)
            test_data.append(normalized)
            test_labels.append(str(row[1]))
        
        if label_encoder is None:
            label_encoder_test = LabelEncoder().fit(test_labels)
        else:
            label_encoder_test = label_encoder
        vec_test = label_encoder_test.transform(test_labels)

        #One hot encoding the labels
        one_hot_labels_test = keras.utils.to_categorical(vec_test, num_classes=len(label_encoder_test.classes_))
        Y_test = one_hot_labels_test
        X_test = np.array(test_data,dtype=np.float32)
        return X_test, Y_test, label_encoder_test

In [10]:
def generate_train_txt(root_path_files, files_names_list_name):
    omit = []
    train = []
    for f in files_names_list_name:
        with open(os.path.join(root_path_files, f)) as fileobj:
            omit += [line.strip() for line in fileobj]
    for target in os.listdir(os.path.join(root_path_files, 'audio')):
        if not target.startswith('_'):
            for file in os.listdir(os.path.join(root_path_files, 'audio', target)):
                p = os.path.join(target, file)
                p = p.replace("\\","/")
                if p not in omit:
                    train.append(p)
    with open(os.path.join(root_path_files, 'training_list.txt'), 'wb') as file:
        for t in train:
            file.write(t.encode())
            file.write('\n'.encode())

In [11]:
root_path_files = root
root_saved_files = preprocessed
generate_train_txt(root_path_files, ['validation_list.txt', 'testing_list.txt'])
classes = os.listdir(root_path_files+"/audio")
print(classes)

['bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'four', 'go', 'happy', 'house', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'silence', 'six', 'stop', 'three', 'tree', 'two', 'up', 'wow', 'yes', 'zero', '_background_noise_']


In [13]:
root_path_files = root
root_saved_files = preprocessed

generate_train_txt(root_path_files, ['validation_list.txt', 'testing_list.txt'])

train_files_names_list_name = 'training_list.txt'
X_train, Y_train, label_encoder = load_dataset(root_path_files=root_path_files, files_names_list_name=train_files_names_list_name)
np.save(os.path.join(root_saved_files, 'X_train'), X_train)
np.save(os.path.join(root_saved_files, 'Y_train'), Y_train)

valid_files_names_list_name = 'validation_list.txt'
X_valid, Y_valid, _ = load_dataset(root_path_files=root_path_files, files_names_list_name=valid_files_names_list_name, 
                                   label_encoder=label_encoder)
np.save(os.path.join(root_saved_files, 'X_valid'), X_valid)
np.save(os.path.join(root_saved_files, 'Y_valid'), Y_valid)

test_files_names_list_name = 'testing_list.txt'
X_test, Y_test, _ = load_dataset(root_path_files=root_path_files, files_names_list_name=test_files_names_list_name, 
                                 label_encoder=label_encoder)
np.save(os.path.join(root_saved_files, 'X_test'), X_test)
np.save(os.path.join(root_saved_files, 'Y_test'), Y_test)

In [14]:
with open('encoder', 'wb') as f:
    pickle.dump(label_encoder, f)

In [15]:
print(label_encoder.classes_)

['bed' 'bird' 'cat' 'dog' 'down' 'eight' 'five' 'four' 'go' 'happy'
 'house' 'left' 'marvin' 'nine' 'no' 'off' 'on' 'one' 'right' 'seven'
 'sheila' 'silence' 'six' 'stop' 'three' 'tree' 'two' 'up' 'wow' 'yes'
 'zero']


# Loading data

In [None]:
#! pip install gdown
#! gdown https://drive.google.com/uc?id=1S0ZWGTnKzyaYfLUOzii_LFnrqdygLDtf
#! mkdir ./preprocessed
#! unzip preprocessed.zip -d ./preprocessed
#! mv ./preprocessed/encoder ./

In [4]:
# load the test data and labels
root_saved_files = preprocessed

X_train = np.load(os.path.join(root_saved_files,'X_train.npy'))
Y_train = np.load(os.path.join(root_saved_files, 'Y_train.npy'))

X_test = np.load(os.path.join(root_saved_files, 'X_test.npy'))
Y_test = np.load(os.path.join(root_saved_files, 'Y_test.npy'))

X_valid = np.load(os.path.join(root_saved_files, 'X_valid.npy'))
Y_valid = np.load(os.path.join(root_saved_files, 'Y_valid.npy'))

In [5]:
with open('encoder', 'rb') as f:
    encoder = pickle.load(f)
    CLASSES = encoder.classes_

In [6]:
print(CLASSES)

['bed' 'bird' 'cat' 'dog' 'down' 'eight' 'five' 'four' 'go' 'happy'
 'house' 'left' 'marvin' 'nine' 'no' 'off' 'on' 'one' 'right' 'seven'
 'sheila' 'silence' 'six' 'stop' 'three' 'tree' 'two' 'up' 'wow' 'yes'
 'zero']


# Experiments

In [7]:
n_classes = len(CLASSES)

In [8]:
def update_seed(new_random_seed):
    np.random.seed(new_random_seed)
    tf.keras.utils.set_random_seed(new_random_seed)

In [9]:
def train_model(model, train_data, val_data, test_data, lr, epochs, batch, path='checkpoint'):
    callbacks = [
        EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, mode = 'min')
    ]
    m = model()
    m.compile(optimizer=Adam(amsgrad=True, learning_rate=lr),loss='categorical_crossentropy',metrics=metrics)
    history = m.fit(train_data[0], train_data[1],
                    epochs=epochs,
                    callbacks=callbacks,
                    batch_size=batch,
                    validation_data=val_data,
                    verbose=1,
                    shuffle=True)

    datetime = time.strftime("%Y%m%d-%H%M%S")
    m.save(os.path.join(path, 'model_' + datetime + '.keras'))

    show_training_validation_loss_and_metrics(m, metrics2)

    y_prediction = m.predict(test_data[0])
    y_prediction = np.argmax(y_prediction, axis = 1)
    y_test_single_column=np.argmax(test_data[1], axis=1)
    result = confusion_matrix(y_test_single_column, y_prediction , normalize='pred')
    plt.figure(figsize=(20,20))
    labels = CLASSES
    sns.heatmap(result, annot=True, fmt='.2f', xticklabels=labels, yticklabels=labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title('Confusion matrix on test data')
    plt.show()
    
    result = confusion_matrix(y_test_single_column, y_prediction)
    plt.figure(figsize=(20,20))
    labels = CLASSES
    sns.heatmap(result, annot=True, fmt='.2f', xticklabels=labels, yticklabels=labels)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.title('Confusion matrix on test data')
    plt.show()
    
    acc_train = accuracy_score(np.argmax(train_data[1], axis=1), np.argmax(m.predict(train_data[0]), axis = 1))
    print(f"Accuracy score on train dataset: {acc_train}")
    acc_val = accuracy_score(np.argmax(val_data[1], axis=1), np.argmax(m.predict(val_data[0]), axis = 1))
    print(f"Accuracy score on validation dataset: {acc_val}")
    acc_test = accuracy_score(y_test_single_column, y_prediction)
    print(f"Accuracy score on test dataset: {acc_test}")

    return [acc_train, acc_val, acc_test]

In [10]:
def repeat_train(model, train_data, val_data, test_data, lr, epochs, batch, seeds, path='checkpoint'):
    accuracy = []
    for seed in seeds:
        print(f"Training with seed {seed}")
        p = os.path.join(path, str(seed))
        if not os.path.exists(path):
            os.mkdir(path)
        if not os.path.exists(p):
            os.mkdir(p)
        update_seed(seed)
        acc = train_model(model, train_data, val_data, test_data, lr, epochs, batch, path=p)
        accuracy.append(acc)
    with open(os.path.join(path, 'accuracy'), 'wb') as f:
        pickle.dump(accuracy, f)

## Experiments running!

In [23]:
def modelLSTM():
    model = Sequential()
    model.add(LSTM(200,input_shape=(99,13),return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(Y_test.shape[1], activation='softmax'))
    return model

In [24]:
def positional_encoding(length, depth):
  depth = depth/2

  positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
  depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

  angle_rates = 1 / (10000**depths)         # (1, depth)
  angle_rads = positions * angle_rates      # (pos, depth)

  pos_encoding = np.concatenate(
      [np.sin(angle_rads), np.cos(angle_rads)],
      axis=-1) 

  return tf.cast(pos_encoding, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
  def __init__(self, vocab_size, d_model):
    super().__init__()
    self.d_model = d_model
    self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True) 
    self.pos_encoding = positional_encoding(length=2048, depth=d_model)

  def compute_mask(self, *args, **kwargs):
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    length = tf.shape(x)[1]
    x = self.embedding(x)
    # This factor sets the relative scale of the embedding and positonal_encoding.
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x

class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()
    
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x

class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

class Decoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                             d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x

class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.encoder = Encoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # To use a Keras model with `.fit` you must pass all your inputs in the
    # first argument.
    context, x  = inputs

    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

In [34]:
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

In [82]:
class TransformerDecoder(Layer):
    def __init__(self, embed_dim, num_heads, latent_dim,):
        super().__init__()
        self.att_1 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.att_2 = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(latent_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)

        self.reshape = keras.layers.Reshape((1,32))

    def call(self, inputs, encoder_outputs):
        attn_output_1 = self.att_1(inputs, inputs)
        out1 = self.layernorm1(inputs + attn_output_1)

        encoder_outputs = self.reshape(encoder_outputs)
        attn_output_2 = self.att_2(out1, encoder_outputs)
        out2 = self.layernorm2(out1 + attn_output_2)

        ffn_output = self.ffn(out2)
        return self.layernorm3(out2 + ffn_output)

In [37]:
class TokenAndPositionEmbedding(Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [22]:
def modelTransformer():
    embedding_size=32
    num_attn_heads=2
    ff_net_dim=32
    maxlen=99 #chyba?
    vocab_size=13 #chyba?
    input_layer = keras.Input(shape=(99,13))
    x = TokenAndPositionEmbedding(maxlen, vocab_size, embedding_size)(input_layer)
    x = TransformerBlock(embedding_size, num_attn_heads, ff_net_dim)(x)
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.1)(x)
    x = Dense(20, activation='relu')(x)
    x = Dropout(0.1)(x)
    output_layer = Dense(Y_test.shape[1], activation='softmax')(x)
    model = keras.Model(inputs=input_layer, outputs=output_layer)
    return model

In [88]:
def modelTransformer_gpt():
    embedding_size=128
    num_attn_heads=8
    ff_net_dim=128
    maxlen=1287 # before 99
    vocab_size= n_classes # before 13

    # define input layers
    encoder_input_layer = keras.Input(shape=(99*13,))
    decoder_input_layer = keras.Input(shape=(99*13,))

    # Encoder
    x_enc = TokenAndPositionEmbedding(maxlen, vocab_size, embedding_size)(encoder_input_layer)
    x_enc = TransformerEncoder(embedding_size, num_attn_heads, ff_net_dim)(x_enc)
    x_enc = GlobalAveragePooling1D()(x_enc)
    x_enc = Dropout(0.1)(x_enc)

    # Decoder
    x_dec = TokenAndPositionEmbedding(maxlen, vocab_size, embedding_size)(decoder_input_layer)
    x_dec = TransformerDecoder(embedding_size, num_attn_heads, ff_net_dim)(x_dec, x_enc)
    x_dec = GlobalAveragePooling1D()(x_dec)
    x_dec = Dropout(0.1)(x_dec)

    # Concat
    combined_output = keras.layers.Concatenate()([x_enc, x_dec])

    # last dense
    x = Dense(20, activation='relu')(combined_output)
    x = Dropout(0.1)(x)

    # output layer
    output_layer = Dense(Y_test.shape[1], activation='softmax')(x)
    model = keras.Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=output_layer)
    return model

In [66]:
print(X_train.shape)
print(Y_train.shape)

(51486, 99, 13)
(51486, 31)


In [69]:
X_train_reshaped = X_train.reshape((X_train.shape[0], 99*13))

# Check the shape of the reshaped data
print("Shape of X_train:", X_train.shape)
print("Shape of X_train reshaped:", X_train_reshaped.shape)

Shape of X_train: (51486, 99, 13)
Shape of X_train reshaped: (51486, 1287)


In [89]:
model = modelTransformer_gpt()
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

In [90]:
history = model.fit(
    [X_train_reshaped, X_train_reshaped], Y_train, batch_size=32, epochs=2, validation_data=(X_valid, Y_valid)
)

Epoch 1/2
[1m1609/1609[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.0363 - loss: 3.4289

ValueError: Layer 'functional_49' expected 2 input(s). Received 1 instead.

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import tensorflow_models as tfm
from tensorflow_models.nlp import layers

print('worked?')
#tfm.nlp.layers.TransformerEncoderBlock()

worked?


In [60]:
def modelTransformerLib_enc():
    inputs = tf.keras.Input(shape=(28,10,))
    x_encoded = tfm.nlp.layers.TransformerEncoderBlock(10,32,'relu')(inputs)
    #x_decoded = tfm.nlp.layers.TransformerDecoderBlock(9,32,'relu', input_shape=(99,13,))(inputs)
    outputs = tf.keras.layers.Dense(20, activation='softmax')(x_encoded) #Y_test.shape[1]
    model = tf.keras.Model(inputs, outputs)
    return model

In [91]:
from official.nlp.modeling.models import TransformerEncoder, TransformerDecoder

In [89]:
def modelTransformerLib_enc2():
    inputs = tf.keras.Input(shape=(99,13,))
    x_encoded = official.nlp.modeling.models.TransformerEncoder(num_attention_heads=13)(inputs)
    x_decoded = official.nlp.modeling.models.TransformerDecoder(num_attention_heads=13)(inputs, x_encoded)
    outputs = tf.keras.layers.Dense(20, activation='softmax')(x_encoded) #Y_test.shape[1]
    model = tf.keras.Model(inputs, outputs)
    return model

In [90]:
model = modelTransformerLib_enc2()
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

NameError: name 'official' is not defined

In [87]:
def modelTransformerLib():
    inputs = tf.keras.Input(shape=(28,10,))
    x_encoded = tfm.nlp.layers.TransformerEncoderBlock(10,32,'relu')(inputs)
    input_shape_tmp=(28,10)
    print(tf.TensorShape(input_shape_tmp[0]).as_list())
    x_decoded = tfm.nlp.layers.TransformerDecoderBlock(13,32,'relu')(inputs, x_encoded)
    outputs = tf.keras.layers.Dense(20, activation='softmax')(x_decoded) #Y_test.shape[1]
    model = tf.keras.Model(inputs, outputs)
    return model

In [88]:
model = modelTransformerLib()
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

[28]


ValueError: as_list() is not defined on an unknown TensorShape.

In [51]:
def modelTransformer2():
    embed_dim=32
    num_heads=2
    ff_dim=32
    maxlen=99 #chyba?
    vocab_size=n_classes #chyba?
    
    input_layer = keras.Input(shape=(99,13))
    x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(input_layer)
    encoder_outputs = TransformerEncoder(embed_dim, num_heads, ff_dim)(x)
    encoder = keras.Model(input_layer, encoder_outputs)
    
    #decoder_inputs = keras.Input(shape=(maxlen,), name="decoder_inputs")
    encoded_seq_inputs = keras.Input(shape=(None, embed_dim))
    x = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(input_layer)
    x = TransformerDecoder(embed_dim, ff_dim, num_heads)(x, encoded_seq_inputs)
    x = Dropout(0.5)(x)
    decoder_outputs = Dense(vocab_size, activation="softmax")(x)
    decoder = keras.Model([input_layer, encoded_seq_inputs], decoder_outputs)
    
    decoder_outputs = decoder([input_layer, encoder_outputs])

    #output_layer = Dense(Y_test.shape[1], activation='softmax')(x)
    #model = keras.Model(inputs=inputs, outputs=decoder_outputs)
    transformer = keras.Model(input_layer, decoder_outputs)
    return transformer

In [52]:
transformer = modelTransformer2()
transformer.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

history = transformer.fit(
    X_train, Y_train, batch_size=32, epochs=2, validation_data=(X_valid, Y_valid)
)

1. The `call()` method of your layer may be crashing. Try to `__call__()` the layer eagerly on some test input first to see if it works. E.g. `x = np.random.random((3, 4)); y = layer(x)`
2. If the `call()` method is correct, then you may need to implement the `def build(self, input_shape)` method on your layer. It should create all variables used by the layer (e.g. by calling `layer.build()` on all its children layers).
Exception encountered: ''Exception encountered when calling MultiHeadAttention.call().

[1mDimension must be 5 but is 4 for '{{node multi_head_attention_36_1/transpose_1}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32](multi_head_attention_36_1/Mul, multi_head_attention_36_1/transpose_1/perm)' with input shapes: [?,99,13,2,32], [4].[0m

Arguments received by MultiHeadAttention.call():
  • query=tf.Tensor(shape=(None, 99, 13, 32), dtype=float32)
  • value=tf.Tensor(shape=(None, None, 32), dtype=float32)
  • key=None
  • query_mask=None
  • value_mask=None
  • key_mask=None
  

RuntimeError: Exception encountered when calling TransformerDecoder.call().

[1mCould not automatically infer the output shape / dtype of 'transformer_decoder_4' (of type TransformerDecoder). Either the `TransformerDecoder.call()` method is incorrect, or you need to implement the `TransformerDecoder.compute_output_spec() / compute_output_shape()` method. Error encountered:

Exception encountered when calling MultiHeadAttention.call().

[1mDimension must be 5 but is 4 for '{{node multi_head_attention_36_1/transpose_1}} = Transpose[T=DT_FLOAT, Tperm=DT_INT32](multi_head_attention_36_1/Mul, multi_head_attention_36_1/transpose_1/perm)' with input shapes: [?,99,13,2,32], [4].[0m

Arguments received by MultiHeadAttention.call():
  • query=tf.Tensor(shape=(None, 99, 13, 32), dtype=float32)
  • value=tf.Tensor(shape=(None, None, 32), dtype=float32)
  • key=None
  • query_mask=None
  • value_mask=None
  • key_mask=None
  • attention_mask=None
  • return_attention_scores=False
  • training=None
  • use_causal_mask=False[0m

Arguments received by TransformerDecoder.call():
  • args=('<KerasTensor shape=(None, 99, 13, 32), dtype=float32, sparse=False, name=keras_tensor_133>', '<KerasTensor shape=(None, None, 32), dtype=float32, sparse=None, name=keras_tensor_131>')
  • kwargs=<class 'inspect._empty'>

In [45]:
repeat_train(modelTransformer2(), (X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test), lr=0.001, epochs=3, batch=32,
             seeds=[0], path='checkpoint_transformer_one_layer')

Training with seed 0


TypeError: missing a required argument: 'inputs'

In [27]:
transformer = Transformer(
    num_layers=2,
    d_model=32,
    num_heads=2,
    dff=64,
    input_vocab_size=n_classes,
    target_vocab_size=n_classes,
    dropout_rate=0.1)

transformer.summary()
transformer.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=metrics2)

transformer.fit(X_train, Y_train, batch_size=64, epochs=3, validation_data=(X_valid,Y_valid))
#repeat_train(transformer, (X_train, Y_train), (X_valid, Y_valid), (X_test, Y_test), lr=0.001, epochs=3, batch=32,
#             seeds=[0], path='checkpoint_transformer_one_layer')

Epoch 1/3


OperatorNotAllowedInGraphError: Exception encountered when calling Transformer.call().

[1mIterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.[0m

Arguments received by Transformer.call():
  • inputs=tf.Tensor(shape=(None, 99, 13), dtype=float32)