In [None]:
%load_ext autoreload
%autoreload 2

!pip install neptune.tensorflow.keras
!pip install transformers
!pip install sentencepiece

# Mount Google Drive at /content/drive (forcing a remount if it is already mounted) and
# authenticate the Colab user. The Drive mount is required for the project modules that
# are added to sys.path further down.
from google.colab import drive, auth
drive.mount('/content/drive', force_remount=True)
auth.authenticate_user()

import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import tensorflow as tf
# Turn on XLA JIT compilation in the TensorFlow graph optimizer.
tf.config.optimizer.set_jit(True)

import neptune.new as neptune

from tensorflow.keras.utils import plot_model
from transformers import TFRobertaModel, RobertaTokenizer
from transformers import TFXLMRobertaModel, XLMRobertaTokenizer
from transformers import TFT5EncoderModel
from transformers import AdamWeightDecay

from glob import glob
from pathlib import Path

import sys
# Make the project-local helper modules (stored on the mounted Drive) importable.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/modules')
import pipeline, tfmodels, protobuf_handler, valid_analysis, apitokens

print('Libraries Imported')

In [None]:
# Initialize TPU Strategy
# NOTE(review): `check_for_tpu_status` is a project helper whose body is not visible here;
# it presumably detects the TPU and returns a tf.distribute strategy - confirm.
# The returned object is used through `.scope()` when the model is built and compiled.
tpu_strategy = pipeline.check_for_tpu_status()

In [None]:
# Model Selection
# Lookup keys of the three encoder branches (resolved against the transformer table).
c1_trans_name, c2_trans_name, c3_trans_name = 'gbert', 'gbert', 'multi'

# Names used to build the GCS paths for records, checkpoints and saved models.
model_name = 'gbert_gbert_multi_code_rep'
model_objective = 'triple_code_rep'
gcs_bucket = 'md_code_rep'

# Settings per training objective, positionally:
#   [protobuf decoder, tensor parser, loss class, checkpoint metric, custom tokens]
model_selection = {
    'triple_code_rep': [
        protobuf_handler.decode_triple_protobuf,
        protobuf_handler.parse_triple_tensor_arrays,
        tf.keras.losses.MeanSquaredError,
        'val_loss',
        ['<d>', '<c>', '[DIVIDER]', '[EMPTY]', '[EMOJI]'],
    ],
    'dual_code_rep': [
        protobuf_handler.decode_dual_protobuf,
        protobuf_handler.parse_dual_tensor_arrays,
        tf.keras.losses.MeanSquaredError,
        'val_loss',
        ['<d>', '<c>', '[DIVIDER]', '[EMPTY]', '[EMOJI]'],
    ],
}

# Narrow the table down to the active objective, then unpack its five settings.
model_selection = model_selection[model_objective]
(decode_protobuf,
 parse_tensor_arrays,
 loss_function,
 checkpoint_metric,
 custom_tokens) = model_selection

# Additional Checklist:

# CHECK TOKEN LOCATION FOR CBERT
# 1. Model Class: MDcode, MDOrder, MDExists
# 2. Samples per file
# 3. Data in GCS-Bucket
# 4. Checkpoints and Model Folder

# Training hyper-parameters (also logged to the experiment tracker).
params = dict(samples_per_file=55752,
              batch_size=32,
              learning_rate=2.5e-5,
              warmup_rate=0.04,
              epochs=4)

In [None]:
# Initialize the encoder lookup table and load one tokenizer per input branch
# Each entry: [pretrained checkpoint name, TF model class, tokenizer class]
trans_options = {
    'multi': ['Unbabel/xlm-roberta-comet-small', TFXLMRobertaModel, XLMRobertaTokenizer],
    'cbert': ['microsoft/codebert-base', TFRobertaModel, RobertaTokenizer],
    'gbert': ['microsoft/graphcodebert-base', TFRobertaModel, RobertaTokenizer],
    't5': ['Salesforce/codet5-base', TFT5EncoderModel, RobertaTokenizer],
}


def _load_branch(trans_name, extra_tokens):
  """Resolve one encoder branch and load its tokenizer.

  Args:
    trans_name: key into `trans_options`.
    extra_tokens: list of token strings handed to
      `pipeline.add_custom_tokens_to_tokenizer` together with the tokenizer.

  Returns:
    Tuple `(checkpoint name, model class, tokenizer class, tokenizer)`.
  """
  checkpoint, model_class, tokenizer_class = trans_options[trans_name]
  tokenizer = tokenizer_class.from_pretrained(checkpoint)
  tokenizer = pipeline.add_custom_tokens_to_tokenizer(tokenizer, extra_tokens)
  return checkpoint, model_class, tokenizer_class, tokenizer


(c1_trans_config, c1_model_class,
 c1_tokenizer_class, c1_tokenizer) = _load_branch(c1_trans_name, custom_tokens)

(c2_trans_config, c2_model_class,
 c2_tokenizer_class, c2_tokenizer) = _load_branch(c2_trans_name, custom_tokens)

# The third branch additionally receives the newline character as a custom token.
(c3_trans_config, c3_model_class,
 c3_tokenizer_class, c3_tokenizer) = _load_branch(c3_trans_name, custom_tokens + ['\n'])

# `selected_trans` remains bound to the most recently selected table entry.
selected_trans = trans_options[c3_trans_name]

In [None]:
# Create Train and Valid Datasets
auto = tf.data.experimental.AUTOTUNE

# Number of trailing TFRecord files held out for validation.
# (Move into train dataset later.)
num_valid_files = 8

gcs_records = f'gs://{gcs_bucket}/{model_name}/*.tfrecords'
# Sort the matches: the glob result order is not documented as stable, and the
# train/valid split must be identical when training is resumed in a new session.
tfrecord_files = sorted(tf.io.gfile.glob(gcs_records))
if len(tfrecord_files) <= num_valid_files:
  # Fail early with a readable message instead of training on an empty dataset.
  raise FileNotFoundError(
      f'Expected more than {num_valid_files} TFRecord files matching {gcs_records}, '
      f'found {len(tfrecord_files)}')

train_filepaths = tfrecord_files[:-num_valid_files]
valid_filepaths = tfrecord_files[-num_valid_files:]
num_train_samples = params['samples_per_file'] * len(train_filepaths)
num_valid_samples = params['samples_per_file'] * len(valid_filepaths)


def _repeated_batches(filepaths, num_samples):
  """Build the input pipeline shared by the train and valid datasets.

  Records are decoded and parsed with the objective-specific functions, shuffled
  with a buffer covering the whole split (fixed seed), repeated `epochs + 1` times
  and grouped into full batches only (`drop_remainder=True`).

  Args:
    filepaths: list of TFRecord file paths for this split.
    num_samples: number of samples in the split; sizes the shuffle buffer.

  Returns:
    A batched, prefetched `tf.data.Dataset`.
  """
  return (tf.data.TFRecordDataset(filepaths, num_parallel_reads=auto)
          .map(decode_protobuf, num_parallel_calls=auto)
          .map(parse_tensor_arrays, num_parallel_calls=auto)
          .shuffle(num_samples + 1, seed=1)
          .repeat(params['epochs'] + 1)
          .batch(params['batch_size'], drop_remainder=True, num_parallel_calls=auto)
          .prefetch(auto))


train_dataset = _repeated_batches(train_filepaths, num_train_samples)
valid_dataset = _repeated_batches(valid_filepaths, num_valid_samples)

In [None]:
# Set Learning Rate Decayer Parameters
# Floor division keeps the step counts as integers, which is what Keras expects for
# `steps_per_epoch` / `validation_steps`, and matches the number of full batches
# produced per pass when batching with `drop_remainder=True`.
train_steps_per_epoch = num_train_samples // params['batch_size']
validation_steps_per_epoch = num_valid_samples // params['batch_size']

# Warm-up length is a fraction (`warmup_rate`) of all training steps across every epoch.
total_steps = train_steps_per_epoch * params['epochs']
warmup_steps = int(params['warmup_rate'] * total_steps)

In [None]:
# weight_decay_exclude = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', '_norm']
# def create_groups_lr_optimizer(lr_str):
#   lr_scheduler = tfmodels.WarmupCosineDecay(params['learning_rate']*lr_str, warmup_steps, total_steps)
#   lr_optimizer = AdamWeightDecay(lr_scheduler, weight_decay_rate=0.01, exclude_from_weight_decay=weight_decay_exclude)
#   return lr_optimizer

# grouped_optimizers = [create_groups_lr_optimizer(lr_str) for lr_str in [0.5, 0.75, 1]]

# layer_groups = [model.layers[0].layers[0].layers[0].encoder.layer[i:i+4]
#                 + model.layers[1].layers[0].layers[0].encoder.layer[i:i+4]
#                 for i in range(0, 12, 4)]

# optimizers_and_layers = [(grouped_optimizers[0], layer_groups[0]),
#                          (grouped_optimizers[1], layer_groups[1]),
#                          (grouped_optimizers[2], layer_groups[2] + model.layers[2:])]
# lr_optimizer = tfa.optimizers.MultiOptimizer(optimizers_and_layers)

# if code_trans_name == 't5':
#   code_transformer = code_model_class.from_pretrained(code_trans_config,                                               
#                                                       output_hidden_states=True,
#                                                       from_pt=True,
#                                                       dropout_rate=0.2)
#   code_transformer.resize_token_embeddings(len(code_tokenizer))
#   code_transformer = tfmodels.reinit_weights_and_bias(code_transformer, 5, 't5') 

### Train Model

In [None]:
# Initialize Model
# Everything that creates variables (encoders, optimizer, metrics) is built inside the
# strategy scope obtained from `pipeline.check_for_tpu_status()`.
with tpu_strategy.scope():
  # C1 Transformer: load pretrained weights, grow the embedding matrix to cover the
  # custom tokens added to the tokenizer, then re-initialise layers via the project helper.
  # NOTE(review): the second argument of `reinit_weights_and_bias` (5) is presumably the
  # number of top encoder layers to reset - confirm in tfmodels.
  c1_transformer = c1_model_class.from_pretrained(c1_trans_config,
                                                  hidden_dropout_prob=0.12,
                                                  attention_probs_dropout_prob=0.12)
  c1_transformer.resize_token_embeddings(len(c1_tokenizer))
  c1_transformer = tfmodels.reinit_weights_and_bias(c1_transformer, 5, 'roberta')

  # C2 Transformer: same recipe as C1.
  c2_transformer = c2_model_class.from_pretrained(c2_trans_config,
                                                  hidden_dropout_prob=0.12,
                                                  attention_probs_dropout_prob=0.12)
  c2_transformer.resize_token_embeddings(len(c2_tokenizer))
  c2_transformer = tfmodels.reinit_weights_and_bias(c2_transformer, 5, 'roberta')

  # C3 Transformer: converted from PyTorch weights (`from_pt=True`), lower dropout.
  c3_transformer = c3_model_class.from_pretrained(c3_trans_config,
                                                  hidden_dropout_prob=0.1,
                                                  attention_probs_dropout_prob=0.1,
                                                  from_pt=True)
  c3_transformer.resize_token_embeddings(len(c3_tokenizer))
  # 6 layers in the distilled model, so only the top 3 are re-initialised.
  c3_transformer = tfmodels.reinit_weights_and_bias(c3_transformer, 3, 'roberta')

  model = tfmodels.TripleCodeRep(c1_transformer, c2_transformer, c3_transformer)

  # Learning-rate schedule, optimizer and tracked metrics.
  lr_scheduler = tfmodels.WarmupCosineDecayRestarts(params['learning_rate'], warmup_steps, train_steps_per_epoch)
  optimizer = AdamWeightDecay(learning_rate=lr_scheduler,
                              weight_decay_rate=0.0125,
                              exclude_from_weight_decay=['bias', 'LayerNorm', 'norm'])
  lr_tracker = tfmodels.LRTracker(optimizer)
  metrics = ['mean_squared_error', 'mean_absolute_error', lr_tracker]

  model.compile(optimizer=optimizer,
                loss=loss_function(),  # loss_function() or tfmodels.flipped_huber
                metrics=metrics,
                jit_compile=True)

  # Build the inspectable graph once and reuse it for both the summary and the plot.
  # `summary()` prints the table itself and returns None, so it is called directly
  # rather than wrapped in `display(...)`, which would render a stray "None".
  graph_model = model.build_graph()
  graph_model.summary()
  display(plot_model(graph_model))

In [None]:
# Initialize Neptune Metadata Tracker Callback
neptune_token = apitokens.neptune_token

# One tag per encoder branch (c1_/c2_/c3_ + encoder key) plus the training objective.
run_tags = [f'c{position}_{encoder}'
            for position, encoder in enumerate((c1_trans_name, c2_trans_name, c3_trans_name), start=1)]
run_tags.append(f'{model_objective}')

run = neptune.init(project="robby700/GCode",
                   name="Google AI4Code Challenge",
                   tags=run_tags,
                   api_token=neptune_token,
                   capture_hardware_metrics=False)

# Record the hyper-parameters on the run and wrap it in the project's Keras callback.
run['hyper_parameters'] = params
neptune_cbk = tfmodels.NeptuneCallback(run=run)

# Initialize Model Checkpoint Callback
# The path contains no epoch placeholder and `save_best_only` is off, so the full
# model is written to the same location after every epoch.
checkpoint_filepath = f'gs://{gcs_bucket}/checkpoints/{model_name}'
checkpoint_settings = dict(filepath=checkpoint_filepath,
                           monitor=checkpoint_metric,
                           verbose=1,
                           save_weights_only=False,
                           save_best_only=False)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_settings)

In [None]:
# Train Model
# Both datasets are repeated (`.repeat(...)` in the dataset cell), so the explicit
# step counts are what delimit one training epoch and one validation pass.
# Metrics go to Neptune and the model is checkpointed through the two callbacks.
history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    steps_per_epoch=train_steps_per_epoch, 
                    validation_steps=validation_steps_per_epoch,
                    epochs=params['epochs'],
                    callbacks=[neptune_cbk, model_checkpoint_callback])

In [None]:
# Stop Neptune Run
# Ends the tracking run that was opened with `neptune.init(...)` before training.
run.stop()

In [None]:
# Save Model
model_save_path = f'gs://{gcs_bucket}/models/{model_name}'
model_with_opt_save_path = f'gs://{gcs_bucket}/models_with_opt/{model_name}'

# Export the model twice: first without, then with the optimizer state.
for export_path, keep_optimizer in ((model_save_path, False),
                                    (model_with_opt_save_path, True)):
  tf.keras.models.save_model(model, export_path, include_optimizer=keep_optimizer)
print(f"Model and weights have been saved at {model_save_path}")

### Reload Model for Resumed Training

In [None]:
# # Load in saved models

# # Hard reset cosine for last batch
# # If you keep batch size 32, might be able to keep adam optimizer weights
# # Decay only at end, let's reset it

# model_path = f'gs://{gcs_bucket}/checkpoints/{model_name}'
# neptune_run_name = 'GCOD-226' 
# current_epoch = 2

# with tpu_strategy.scope():
#   lr_scheduler = {'AdamWeightDecay': AdamWeightDecay,
#                   'WarmupCosineDecayRestarts': tfmodels.WarmupCosineDecayRestarts,
#                   'LRTracker': tfmodels.LRTracker}
#   load_options = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')

#   model = tf.keras.models.load_model(model_path,
#                                      custom_objects=lr_scheduler,
#                                      options=load_options,
#                                      compile=True)
#   print('Loaded in model')

In [None]:
# Can we do a quick change and just straight up add the warmup back in on each runup?

In [None]:
# Reload Model with Weights
# Rebuilds the three encoders and the TripleCodeRep model from scratch, recompiles with a
# fresh schedule/optimizer, then restores weights from the export that includes the optimizer.

# Neptune run id to resume and the epoch index passed to `fit(initial_epoch=...)` later on.
neptune_run_name = 'GCOD-240'
current_epoch = 2
model_save_path = f'gs://{gcs_bucket}/models_with_opt/{model_name}'

with tpu_strategy.scope():
  # NOTE(review): every encoder here is created with `output_hidden_states=True`, which the
  # first-run "Initialize Model" cell does not pass - confirm which setting TripleCodeRep
  # expects so the restored weights belong to an identically configured model.
  # No `reinit_weights_and_bias` call here: the weights are overwritten by `load_weights` below.
  # C1 Transformer
  c1_transformer = c1_model_class.from_pretrained(c1_trans_config,
                                                  output_hidden_states=True,
                                                  hidden_dropout_prob=0.12,
                                                  attention_probs_dropout_prob=0.12)
  c1_transformer.resize_token_embeddings(len(c1_tokenizer))

  # C2 Transformer
  c2_transformer = c2_model_class.from_pretrained(c2_trans_config,
                                                  output_hidden_states=True,
                                                  hidden_dropout_prob=0.12,
                                                  attention_probs_dropout_prob=0.12)
  c2_transformer.resize_token_embeddings(len(c2_tokenizer))

  # C3 Transformer
  c3_transformer = c3_model_class.from_pretrained(c3_trans_config,
                                                  output_hidden_states=True,
                                                  hidden_dropout_prob=0.1,
                                                  attention_probs_dropout_prob=0.1,
                                                  from_pt=True)
  c3_transformer.resize_token_embeddings(len(c3_tokenizer))
  model = tfmodels.TripleCodeRep(c1_transformer, c2_transformer, c3_transformer)

  # Same schedule, optimizer settings and metrics as the first-run cell.
  lr_scheduler = tfmodels.WarmupCosineDecayRestarts(params['learning_rate'], warmup_steps, train_steps_per_epoch)                                          
  optimizer = AdamWeightDecay(learning_rate=lr_scheduler,
                              weight_decay_rate=0.0125,
                              exclude_from_weight_decay=['bias', 'LayerNorm', 'norm'])
  lr_tracker = tfmodels.LRTracker(optimizer)
  metrics = ['mean_squared_error', 'mean_absolute_error', lr_tracker]
  
  model.compile(optimizer=optimizer, 
                loss=loss_function(),  # loss_function() or tfmodels.flipped_huber
                metrics=metrics,
                jit_compile=True)
  # Restore weights from the `models_with_opt` export written by the "Save Model" cell.
  # NOTE(review): whether optimizer slot values are restored too depends on how Keras
  # resolves this path - verify before relying on resumed optimizer state.
  model.load_weights(model_save_path)
  print('Loading in model and weights')

In [None]:
# Initialize Neptune Metadata Tracker Callback
neptune_token = apitokens.neptune_token

# Re-open the existing run (identified by `neptune_run_name`) instead of creating a new one.
resume_settings = dict(project="robby700/GCode",
                       api_token=neptune_token,
                       capture_hardware_metrics=False,
                       run=neptune_run_name)
run = neptune.init(**resume_settings)
neptune_cbk = tfmodels.NeptuneCallback(run=run)

# Initialize Model Checkpoint Callback
# While resuming, the full model is only written when the monitored metric improves.
checkpoint_filepath = f'gs://{gcs_bucket}/checkpoints/{model_name}'
checkpoint_settings = dict(filepath=checkpoint_filepath,
                           monitor=checkpoint_metric,
                           verbose=1,
                           save_weights_only=False,
                           save_best_only=True)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_settings)

In [None]:
# Start training up again
# `initial_epoch=current_epoch` makes Keras continue epoch numbering from the resumed
# epoch, so training runs from `current_epoch` up to `params['epochs']`.
history = model.fit(train_dataset,
                    validation_data=valid_dataset,
                    steps_per_epoch=train_steps_per_epoch, 
                    validation_steps=validation_steps_per_epoch,
                    epochs=params['epochs'],
                    initial_epoch=current_epoch,
                    callbacks=[neptune_cbk, model_checkpoint_callback])

In [None]:
# Stop the resumed Neptune run once training has finished
run.stop()

In [None]:
# Save Model
model_save_path = f'gs://{gcs_bucket}/models/{model_name}'
model_with_opt_save_path = f'gs://{gcs_bucket}/models_with_opt/{model_name}'

# Export the model twice: first without, then with the optimizer state.
for export_path, keep_optimizer in ((model_save_path, False),
                                    (model_with_opt_save_path, True)):
  tf.keras.models.save_model(model, export_path, include_optimizer=keep_optimizer)
print(f"Model and weights have been saved at {model_save_path}")

In [None]:
# Load in weights
# weight_save_path = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/saved_models/opt_weights/{model_name}_opt_weights.npy'
# opt_weights = np.load(weight_save_path, allow_pickle=True)
# grad_vars = model.get_weights()
# zero_grads = [tf.zeros_like(w) for w in grad_vars]

# Apply zero gradients once (intended as a no-op update for Adam)
# optimizer.apply_gradients(zip(zero_grads, grad_vars))

# Set the weights of the optimizer
# model.optimizer.set_weights(opt_weights)

### Analyze Predictions for Threshold

In [None]:
# Double check all features have been parsed and prepared correctly

# Pull a single batch; `batch_features` / `batch_labels` stay defined for the next cell.
for batch_features, batch_labels in train_dataset.take(1):
  batch_report = (
      f'Train Dataset Tensor Spec:\n{train_dataset}\n',
      f'Parsed Input Ids:\n{batch_features[0]}\n',
      f'Parsed Attention Mask:\n{batch_features[1]}\n',
      f'Parsed Labels:\n{batch_labels}\n',
  )
  # One section per line group, separated exactly like consecutive print() calls.
  print(*batch_report, sep='\n')

In [None]:
# Check predictions on a single batch
# Runs the model on the batch captured by the inspection cell and prints the
# predictions next to the true labels, each with its dtype.
predictions = model.predict(batch_features, verbose=2)
print(f'\nPredictions with dtype {predictions.dtype}:\n{predictions}',
      f'\nTrue Labels wtih dtype {batch_labels.dtype}:\n{batch_labels}',
      sep='\n')

In [None]:
# Collect valid labels and run predictions for comparison, this takes a while
#
# `valid_dataset` is shuffled (reshuffling on every iteration) and repeated
# `epochs + 1` times. Collecting labels and predictions in two separate passes over it
# would pair each prediction with the label of a different sample, and would process
# every sample several times. Use a dedicated single-pass, unshuffled view of the same
# validation files instead so both passes see identical batches in identical order.
# Files are read sequentially (no parallel reads) to keep the record order fixed.
analysis_dataset = (tf.data.TFRecordDataset(valid_filepaths)
                    .map(decode_protobuf, num_parallel_calls=auto)
                    .map(parse_tensor_arrays, num_parallel_calls=auto)
                    .batch(params['batch_size'], drop_remainder=True)
                    .prefetch(auto))

# NOTE(review): `get_labels` is a project helper; it is given a batched dataset of
# (features, labels) exactly as before - confirm it iterates the dataset in order.
valid_labels = valid_analysis.get_labels(analysis_dataset)
valid_predictions = model.predict(analysis_dataset, verbose=1)

In [None]:
# Print ROC Curve and Threshold
# `plot_roc_curve` is a project helper returning two values, stored as the ROC AUC and
# the threshold reused below to binarise predictions.
# NOTE(review): how "best" is chosen is defined inside valid_analysis - confirm there.
roc_auc, best_threshold = valid_analysis.plot_roc_curve(valid_labels, valid_predictions)

In [None]:
# Calculate and plot confusion matrix at best threshold from ROC
# Predictions at or above the threshold become class 1, everything else class 0.
binary_valid_preds = np.where(valid_predictions >= best_threshold, 1, 0)

# The flattened matrix is unpacked as (tn, fp, fn, tp).
# NOTE(review): this ordering assumes `valid_analysis.confusion_matrix` follows the
# scikit-learn layout for binary labels - confirm.
tn, fp, fn, tp = valid_analysis.confusion_matrix(valid_labels, binary_valid_preds).ravel()

# Precision/recall are undefined when their denominator is zero (no predicted positives,
# or no actual positives); report NaN explicitly instead of dividing by zero.
predicted_positives = tp + fp
actual_positives = tp + fn
precision = tp / predicted_positives if predicted_positives > 0 else float('nan')
recall = tp / actual_positives if actual_positives > 0 else float('nan')

cm, cm_perc = valid_analysis.plot_confusion_matrix(valid_labels, binary_valid_preds)
print(f'At an optimal ROC threshold of {best_threshold:.2f}. Precision is {precision:.2f} and recall is {recall:.2f}.\n')