<a href="https://colab.research.google.com/github/engmrgh/msc-degree/blob/master/2nd_semester/nlp/project/nlp_project_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install transformers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 13.3 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 49.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.0 MB/s 
Collecting xxhash
  Downlo

In [62]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
from tensorflow.keras import backend
from datetime import datetime
from datasets import Dataset, load_dataset
from transformers import GPT2Config, AutoTokenizer, TFGPT2Model, TFGPT2DoubleHeadsModel, GPT2Tokenizer

In [5]:
# Try to run on TPU if available
# Detect hardware, return appropriate distribution strategy
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
except ValueError:
    tpu = None
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [6]:
PATH_BASE = "/content/"

# Dataset

In [5]:
!git clone https://github.com/pengbaolin/SC-GPT.git

Cloning into 'SC-GPT'...
remote: Enumerating objects: 131, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 131 (delta 2), reused 0 (delta 0), pack-reused 125[K
Receiving objects: 100% (131/131), 407.12 KiB | 2.68 MiB/s, done.
Resolving deltas: 100% (40/40), done.


In [6]:
train_dfs = []
test_dfs = []

for domain in ['attraction', 'hotel', 'laptop', 'restaurant', 'taxi', 'train', 'tv']:
    train_dfs.append(pd.read_json(f'/content/SC-GPT/data/{domain}/train.json'))
    test_dfs.append(pd.read_json(f'/content/SC-GPT/data/{domain}/test.json'))

train_df = pd.concat(train_dfs)
test_df = pd.concat(test_dfs)

train_df.columns = ['dialogue act', 'response', 'response-repeated']
train_df.drop(labels=['response-repeated'], inplace=True, axis=1)
test_df.columns = ['dialogue act', 'response', 'response-repeated']
test_df.drop(labels=['response-repeated'], inplace=True, axis=1)

In [7]:
train_ds = Dataset.from_pandas(train_df)
train_ds

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__'],
    num_rows: 340
})

In [8]:
test_ds = Dataset.from_pandas(test_df)
test_ds

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__'],
    num_rows: 3310
})

In [9]:
MAX_TOKENS = 128
BACT_TOKEN = "<|dialogue_act|>"
EACT_TOKEN = "<|end_dialogue_act|>"
BRESP_TOKEN = "<|dialogue_resp|>"
ERESP_TOKEN = "<|end_dialogue_resp|>"
BOS_TOKENS = BRESP_TOKEN
EOS_TOKEN = ERESP_TOKEN
PAD_TOKEN = "<|pad|>"

# this will download and initialize the pre trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2",
    bos_token=BOS_TOKENS,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    max_length=MAX_TOKENS,
    is_split_into_words=True,
)
tokenizer.add_tokens([BACT_TOKEN, EACT_TOKEN], special_tokens=True)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


2

In [10]:
output = {}
# texts to numeric vectors of MAX_TOKENS
def tokenize_function(examples, tokenizer=tokenizer):
    # Add start and end token to each comment
    processed_examples = list()
    for act, response in zip(examples['dialogue act'], examples['response']):
        processed_examples.append(BACT_TOKEN + act + EACT_TOKEN + BRESP_TOKEN + response + ERESP_TOKEN)
    # tokenizer created input_ids and attention_mask as output
    output = tokenizer(
        processed_examples,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
    )
    # shift labels for next token prediction
    # set padding token labels to -100 which is ignored in loss computation
    bresp_token_id = tokenizer.convert_tokens_to_ids(BRESP_TOKEN)
    pad_token_id = tokenizer.pad_token_id

    output["labels"] = [x[x.index(bresp_token_id):] for x in output["input_ids"]]
    output["labels"] = [x + [pad_token_id] * (MAX_TOKENS - len(x) - 1) for x in output["labels"]]
    output["labels"] = [
        [-100 if x == pad_token_id else x for x in y] for y in output["labels"]
    ]
    # truncate input ids and attention mask to account for label shift
    output["input_ids"] = [x[:-1] for x in output["input_ids"]]
    output["attention_mask"] = [x[:-1] for x in output["attention_mask"]]
    return output

In [11]:
train_data = train_ds.map(
    tokenize_function,
    batched=True,
    num_proc=strategy.num_replicas_in_sync,
    load_from_cache_file=True,
)
print(train_data)



  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 340
})


In [12]:
test_data = test_ds.map(
    tokenize_function,
    batched=True,
    num_proc=strategy.num_replicas_in_sync,
    load_from_cache_file=True,
)
print(test_data)

  0%|          | 0/4 [00:00<?, ?ba/s]

Dataset({
    features: ['dialogue act', 'response', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 3310
})


In [13]:
train_tensor_inputs = tf.convert_to_tensor(train_data["input_ids"])
train_tensor_labels = tf.convert_to_tensor(train_data["labels"])
train_tensor_mask = tf.convert_to_tensor(train_data["attention_mask"])
train = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": train_tensor_inputs, "attention_mask": train_tensor_mask},
        train_tensor_labels,
    )
)

test_tensor_inputs = tf.convert_to_tensor(test_data["input_ids"])
test_tensor_labels = tf.convert_to_tensor(test_data["labels"])
test_tensor_mask = tf.convert_to_tensor(test_data["attention_mask"])
test = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": test_tensor_inputs, "attention_mask": test_tensor_mask},
        test_tensor_labels,
    )
)

# Model

In [14]:
# Model params
BATCH_SIZE_PER_REPLICA = 28
EPOCHS = 6
INITAL_LEARNING_RATE = 0.001
try:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
except NameError as e:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA
BUFFER_SIZE = len(train)

# prepare data for consumption
train_ds = (
    train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
)
test_ds = test.batch(BATCH_SIZE, drop_remainder=True)

In [15]:
def customized_sparse_categorical_cross_entropy(y_true, y_pred, from_logits=False, axis=-1):
    cond = (y_true != -100)
    y_pred = tf.convert_to_tensor(y_pred)
    return backend.sparse_categorical_crossentropy(
      y_true[cond], y_pred[cond], from_logits=from_logits, axis=axis)

In [17]:
# Drecreasing learning rate scheduler
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

# initialize model, use_cache=False important! else wrong shape at loss calc
with strategy.scope():
    model = TFGPT2Model.from_pretrained(
        "gpt2",
        use_cache=False,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer, loss=customized_sparse_categorical_cross_entropy)
    model.summary()

KeyboardInterrupt: ignored

In [None]:
# Stop training when validation acc starts dropping
# Save checkpoint of model after each period
now = datetime.now().strftime("%Y-%m-%d_%H%M")
# Create callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", verbose=1, patience=2, restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        PATH_BASE + "/data/models/" + now + "_GPT2-Model_{epoch:02d}_{val_loss:.4f}.h5",
        monitor="val_loss",
        save_format='tf',
        save_best_only=True,
        verbose=1,
        save_weights_only=True
    ),
]

In [None]:
# Train Model
steps_per_epoch = int(BUFFER_SIZE // BATCH_SIZE)
print(
    f"Model Params:\nbatch_size: {BATCH_SIZE}\nEpochs: {EPOCHS}\n"
    f"Step p. Epoch: {steps_per_epoch}\n"
    f"Initial Learning rate: {INITAL_LEARNING_RATE}"
)
hist = model.fit(
    train_ds,
    validation_data=train_ds,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

# 2nd Try

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
train_df = pd.read_json('/content/drive/MyDrive/soloist/soloist/examples/multiwoz/train.soloist.json')

In [19]:
train_df.head(2)

Unnamed: 0,history,kb,belief,reply,name,dp
0,[user : am looking for a place to to stay that...,kb : hotel one,belief : hotel pricerange = cheap ; type = hotel,"system : okay , do you have a specific area yo...",SNG01856.json,dp : hotel ( request ( area ) )
1,[user : am looking for a place to to stay that...,kb : hotel one,belief : hotel parking = yes ; pricerange = ch...,system : i found [value_count] [value_priceran...,SNG01856.json,dp : booking ( inform ( none = none ) )


In [20]:
train_ds = Dataset.from_pandas(train_df)
train_ds

Dataset({
    features: ['history', 'kb', 'belief', 'reply', 'name', 'dp'],
    num_rows: 56678
})

In [46]:
MAX_TOKENS = 256
BOS_TOKEN = "=>"
EOB_TOKEN = "<|end_of_belief|>"
EOKB_TOKEN = "<|end_of_knowledge_base|>"
EOR_TOKEN = "<|end_of_response|>"
EOS_TOKEN = EOR_TOKEN
PAD_TOKEN = "<|pad|>"
CLS_TOKEN = "<|CLS|>"

# this will download and initialize the pre trained tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "gpt2",
    bos_token=BOS_TOKEN,
    eos_token=EOS_TOKEN,
    pad_token=PAD_TOKEN,
    max_length=MAX_TOKENS,
    is_split_into_words=True,
)
tokenizer.add_tokens([EOB_TOKEN, EOKB_TOKEN, CLS_TOKEN], special_tokens=True) 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


3

In [47]:
output = {}
# texts to numeric vectors of MAX_TOKENS
def tokenize_function(examples, tokenizer=tokenizer):
    # Add start and end token to each comment
    processed_examples = list()
    # NOTE: kb is dummy right now!
    for history, belief, kb, reply in zip(examples['history'], examples['belief'], examples['kb'], examples['reply']):
        joined_history = ' '.join(history)
        processed_examples.append(joined_history + \
                                  BOS_TOKEN + belief + \
                                  EOB_TOKEN + kb + \
                                  EOKB_TOKEN + reply + EOS_TOKEN + CLS_TOKEN)
    # tokenizer created input_ids and attention_mask as output
    output = tokenizer(
        processed_examples,
        add_special_tokens=True,  # Only adds pad not eos and bos
        max_length=MAX_TOKENS,
        truncation=True,
        padding='max_length',
    )
    # shift labels for next token prediction
    # set padding token labels to -100 which is ignored in loss computation
    pad_token_id = tokenizer.pad_token_id

    output["labels"] = [x[1:] for x in output["input_ids"]]

    output_labels = list()
    bos_token_id, eob_token_id, eokb_token_id = \
                                            tokenizer.convert_tokens_to_ids([BOS_TOKEN, EOB_TOKEN, EOKB_TOKEN])
    for y in output['labels']:
        tmp_list = list()
        mask = True
        for x in y:
            if mask or (x == pad_token_id):
                tmp_list.append(-100)
            else:
                tmp_list.append(x)
            
            if x in {bos_token_id, eob_token_id, eokb_token_id}:
                mask = not mask
        output_labels.append(tmp_list)
    output["labels"] = output_labels

    # truncate input ids and attention mask to account for label shift
    output["input_ids"] = [x[:-1] for x in output["input_ids"]]
    output["attention_mask"] = [x[:-1] for x in output["attention_mask"]]
    return output

train_data = train_ds.map(
    tokenize_function,
    batched=True,
    num_proc=strategy.num_replicas_in_sync,
    load_from_cache_file=True,
)
print(train_data)

  0%|          | 0/57 [00:00<?, ?ba/s]

Dataset({
    features: ['history', 'kb', 'belief', 'reply', 'name', 'dp', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 56678
})


In [48]:
train_tensor_inputs = tf.convert_to_tensor(train_data["input_ids"])
train_tensor_labels = tf.convert_to_tensor(train_data["labels"])
train_tensor_mask = tf.convert_to_tensor(train_data["attention_mask"])
train = tf.data.Dataset.from_tensor_slices(
    (
        {"input_ids": train_tensor_inputs, "attention_mask": train_tensor_mask},
        train_tensor_labels,
    )
)

### Model

In [50]:
# Model params
BATCH_SIZE_PER_REPLICA = 28
EPOCHS = 6
INITAL_LEARNING_RATE = 0.001
try:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA * strategy.num_replicas_in_sync
except NameError as e:
    BATCH_SIZE = BATCH_SIZE_PER_REPLICA
BUFFER_SIZE = len(train)

# prepare data for consumption
train_ds = (
    train.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
)
# test_ds = test.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
def customized_sparse_categorical_cross_entropy(y_true, y_pred, from_logits=False, axis=-1):
    cond = (y_true != -100)
    y_pred = tf.convert_to_tensor(y_pred)
    return backend.sparse_categorical_crossentropy(
      y_true[cond], y_pred[cond], from_logits=from_logits, axis=axis)

In [56]:
# Drecreasing learning rate scheduler
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    INITAL_LEARNING_RATE,
    decay_steps=500,
    decay_rate=0.7,
    staircase=True)

# initialize model, use_cache=False important! else wrong shape at loss calc
with strategy.scope():
    model = TFGPT2DoubleHeadsModel.from_pretrained(
        "gpt2",
        use_cache=False,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    model.resize_token_embeddings(len(tokenizer))
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    model.compile(optimizer=optimizer, loss=model.compute_loss)
    model.summary()

All model checkpoint layers were used when initializing TFGPT2DoubleHeadsModel.

Some layers of TFGPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tfgpt2_double_heads_model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer (TFGPT2MainLaye  multiple                 124443648 
 r)                                                              
                                                                 
 multiple_choice_head (TFSeq  multiple                 769       
 uenceSummary)                                                   
                                                                 
Total params: 124,444,417
Trainable params: 124,444,417
Non-trainable params: 0
_________________________________________________________________


In [57]:
# Stop training when validation acc starts dropping
# Save checkpoint of model after each period
now = datetime.now().strftime("%Y-%m-%d_%H%M")
# Create callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", verbose=1, patience=2, restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        PATH_BASE + "/data/models/" + now + "_GPT2-Model_{epoch:02d}_{val_loss:.4f}.h5",
        monitor="val_loss",
        save_format='tf',
        save_best_only=True,
        verbose=1,
        save_weights_only=True
    ),
]

In [85]:
@tf.function
def train_step(batch):
    with tf.GradientTape() as grad:
        attension_mask, input_ids = batch
        outputs = model(input_ids, training=True)
        lm_loss, mc_loss = outputs[:2]

    # tr_loss = lm_loss + mc_loss
    print("hello")
    grad_model = grad.gradient(lm_loss, model.trainable_variables)
    print("So here we are")
    optimizer.apply_gradients(zip(grad_model, model.trainable_variables))
    print("wtf")

In [87]:
for epoch in range(10):
    start = time.time()

    for step, batch in enumerate(train_ds):
        # print(batch.shape)
        train_step(batch)
        print("Error is here")

    # # Produce images for the GIF as you go
    # display.clear_output(wait=True)
    # generate_and_save_images(generator,
    #                          epoch + 1,
    #                          seed)

    # # Save the model every epochs
    # if (epoch + 1) % 15 == 0:
    #   checkpoint.save(file_prefix = checkpoint_prefix)

    print ('Time for epoch {} is {} sec'.format(epoch + 1, time.time()-start))

InvalidArgumentError: ignored

In [91]:
model.__dict__.keys()

dict_keys(['_self_setattr_tracking', '_is_model_for_instrumentation', '_instrumented_keras_api', '_instrumented_keras_layer_class', '_instrumented_keras_model_class', '_trainable', '_stateful', 'built', '_input_spec', '_build_input_shape', '_saved_model_inputs_spec', '_saved_model_arg_spec', '_supports_masking', '_name', '_activity_regularizer', '_trainable_weights', '_non_trainable_weights', '_updates', '_thread_local', '_callable_losses', '_losses', '_metrics', '_metrics_lock', '_dtype_policy', '_compute_dtype_object', '_autocast', '_self_tracked_trackables', '_inbound_nodes_value', '_outbound_nodes_value', '_expects_training_arg', '_default_training_arg', '_expects_mask_arg', '_dynamic', '_initial_weights', '_auto_track_sub_layers', '_preserve_input_structure_in_config', '_outer_name_scope', '_is_graph_network', 'inputs', 'outputs', 'input_names', 'output_names', 'stop_training', 'history', 'compiled_loss', 'compiled_metrics', '_compute_output_and_mask_jointly', '_is_compiled', 'opt

In [None]:
model

In [58]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2DoubleHeadsModel.from_pretrained("gpt2")

# Add a [CLS] to the vocabulary (we should train it also!)
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})

embedding_layer = model.resize_token_embeddings(
    len(tokenizer)
)  # Update the model embeddings with the new vocabulary size

choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1

outputs = model(input_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]

All model checkpoint layers were used when initializing TFGPT2DoubleHeadsModel.

Some layers of TFGPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
mc_prediction_scores

<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[-0.8222367 , -0.91927683]], dtype=float32)>