In [None]:
# only run on google runtime
!pip install tensorflow-text
!pip install tf-models-official
!pip install tensorflow-addons
!pip install scikit-learn
!pip install datasets
!pip install tqdm

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text
from official.nlp import optimization
import tensorflow_addons as tfa
import transformers
import sklearn as sk

In [None]:
# only run on google runtime
# update file paths with location to subtask data
from google.colab import drive

# Mount Google Drive, forcing a remount if it is already mounted.
drive.mount('/content/drive', force_remount=True)

# Directory holding the subtask A data, and the train/dev files inside it.
input_directory = '/content/drive/MyDrive/2023-2024 School Year/Fall Semester/Natural Language Processing/Project/Data'
raw_train_data, raw_dev_data = (
    input_directory + file_name
    for file_name in ('/subtaskA_train_monolingual.jsonl', '/subtaskA_dev_monolingual.jsonl')
)

In [9]:
import json

def extract_data(filename):
  """Read the "text" field of every record in a JSON Lines file.

  Args:
    filename: path to a UTF-8 encoded file with one JSON object per line,
      each containing a "text" key.

  Returns:
    A list with the "text" value of each record, in file order.

  Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    KeyError: if a record has no "text" key.
  """
  text = []
  with open(filename, 'r', encoding='utf-8') as f:
    # Iterate over the file lazily instead of loading every line up front.
    for line in f:
      # Blank lines (e.g. a trailing newline at the end of the file) carry
      # no record and would make json.loads raise, so skip them.
      if not line.strip():
        continue
      text.append(json.loads(line)["text"])
  return text

In [10]:
# Pull the raw "text" entries out of the train and dev files, in that order.
train_text, dev_text = map(extract_data, (raw_train_data, raw_dev_data))

In [None]:
# Tokenizers for the default GPT-2 and OPT checkpoints
gpt_default_tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
opt_default_tokenizer = transformers.AutoTokenizer.from_pretrained("facebook/opt-125m")

# Matching TensorFlow language-model heads for the same checkpoints
gpt_default_encoder = transformers.TFGPT2LMHeadModel.from_pretrained("gpt2")
opt_default_encoder = transformers.TFOPTForCausalLM.from_pretrained("facebook/opt-125m")

# Short alias used when turning logits into probabilities
softmax = tf.nn.softmax

# (tokenizer, model) pairs, GPT first and OPT second
models = list(zip(
    (gpt_default_tokenizer, opt_default_tokenizer),
    (gpt_default_encoder, opt_default_encoder),
))

In [7]:
# Reuse the end-of-sequence token as the padding token for GPT, on both the
# model config and the tokenizer (encode_text pads every batch to a fixed length).
gpt_default_encoder.config.pad_token_id = gpt_default_encoder.config.eos_token_id
gpt_default_tokenizer.pad_token = gpt_default_tokenizer.eos_token

In [8]:
from tqdm import tqdm

def encode_text(text_list, batch_size, models, max_length=512):
  """Score every text with each language model, one batch at a time.

  For each batch and each (tokenizer, model) pair, the texts are tokenized
  (padded/truncated to `max_length` tokens) and run through the model. For
  every position t, the softmax probability the model assigns at position t
  to the token actually found at position t + 1 is kept.

  Args:
    text_list: list of strings to encode.
    batch_size: number of texts per forward pass.
    models: iterable of (tokenizer, model) pairs; calling the model on the
      tokenizer output must return an object exposing `.logits`.
    max_length: token length each text is padded/truncated to. Defaults to
      512, the value that was previously hard-coded.

  Returns:
    A list with one tensor per batch. Each tensor stacks the per-model
    probabilities on the last axis, i.e. it has shape
    (texts_in_batch, sequence_length - 1, number_of_models). The list is
    empty when `text_list` is empty.
  """
  batch_encoded_text = []
  # Ceiling division: a final, shorter batch covers any remainder.
  batch_count = (len(text_list) + batch_size - 1) // batch_size
  for i in tqdm(range(batch_count)):
    batch_start = i * batch_size
    # Slicing clips at the end of the list, so the last batch may be smaller.
    batch_text = text_list[batch_start: batch_start + batch_size]
    model_output = []
    for tokenizer, model in models:
      tokenized_data = tokenizer(batch_text, padding='max_length', return_tensors="tf", max_length=max_length, truncation=True)
      outputs = model(tokenized_data)
      # Probabilities over the vocabulary at every position.
      vocab_probs = tf.nn.softmax(outputs.logits, axis=-1)
      # The prediction made at position t is scored against the token at
      # position t + 1, so drop the last prediction and the first token.
      next_token_ids = tokenized_data['input_ids'][:, 1:]
      # With batch_dims=2 the batch and position axes are paired up, so this
      # picks one vocabulary entry per (text, position).
      selected_probs = tf.gather(vocab_probs[:, :-1, :], next_token_ids, axis=2, batch_dims=2)
      model_output.append(selected_probs)
    combined_model_probs = tf.stack(model_output, axis=-1)
    batch_encoded_text.append(combined_model_probs)
  return batch_encoded_text

In [None]:
# Encode both splits in batches of 8 texts with every (tokenizer, model) pair.
train_text_encoded_batched = encode_text(text_list=train_text, batch_size=8, models=models)
dev_text_encoded_batched = encode_text(text_list=dev_text, batch_size=8, models=models)

In [19]:
# update paths with location to save encoded train and dev data
# Join the per-batch results along the first (text) axis into one array per split.
# (The train array was previously bound to the misspelled name
# `rain_text_final_encoded`, so the np.save call below raised NameError.)
train_text_final_encoded = np.concatenate(train_text_encoded_batched)
dev_text_final_encoded = np.concatenate(dev_text_encoded_batched)
np.save("/content/drive/MyDrive/2023-2024 School Year/Fall Semester/Natural Language Processing/Project/train_final_encoded_probabilities.npy", train_text_final_encoded)
np.save("/content/drive/MyDrive/2023-2024 School Year/Fall Semester/Natural Language Processing/Project/dev_final_encoded_probabilities.npy", dev_text_final_encoded)