# Preparing

## inSTAR PLATINum

In [1]:
#hugging
!pip install -q transformers
!pip install -q datasets

#keras
!pip install -q --upgrade keras-nlp
!pip install -q --upgrade keras

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.0.2 which is incompatible.

## config

In [2]:
import os

# Keras chooses its backend when it is first imported, so KERAS_BACKEND has to
# be set before the keras_nlp / keras imports below. Do not reorder these lines.
os.environ["KERAS_BACKEND"] = "torch"
import keras_nlp
import keras

import transformers

import numpy as np

import torch
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Bare last expression so the notebook displays the selected device.
DEVICE

device(type='cpu')

## random

In [3]:
# Single seed shared by every library that draws random numbers.
SEED = 42
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# only affects child processes started afterwards, not the running kernel.
os.environ['PYTHONHASHSEED']=str(SEED)
# Seed each framework explicitly: Keras, Hugging Face transformers, then PyTorch
# (default generator and the current CUDA device).
keras.utils.set_random_seed(SEED)
transformers.set_seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## models

In [4]:
# Names of the presets KerasNLP exposes for each backbone family.
gpt2_types = list(keras_nlp.models.GPT2Backbone.presets)
opt_types = list(keras_nlp.models.OPTBackbone.presets)
print(gpt2_types, opt_types, sep="\n")

# KerasNLP models: use the first listed preset of each family.
# NOTE(review): this depends on the order in which the installed keras-nlp
# version lists its presets.
GPT2 = gpt2_types[0]
OPT = opt_types[0]

# Hugging Face Hub checkpoints.
PYTHIA = "EleutherAI/pythia-160m-deduped"
LLAMA = "JackFram/llama-160m"

['gpt2_base_en', 'gpt2_medium_en', 'gpt2_large_en', 'gpt2_extra_large_en', 'gpt2_base_en_cnn_dailymail']
['opt_125m_en', 'opt_1.3b_en', 'opt_2.7b_en', 'opt_6.7b_en']


## constraints

In [5]:
# Directory (on the Google Drive mount under /content/drive) into which the
# constraint masks are written as <name>.npy files.
DIR_CONSTRAINTS_PATH = '/content/drive/My Drive/Colab Notebooks/NLP_models_qa_testing/text-generation/constraints/'

# Datasets

In [6]:
import datasets

## datasets

In [7]:
# Hugging Face Hub dataset identifiers passed to datasets.load_dataset below.
SQUAD_V2 = "squad_v2"
SQL_CONTEXT = "b-mc2/sql-create-context"
ADVERSARIAL_QA = "adversarial_qa"

In [8]:
# Load SQuAD v2; the result is indexed by split name further down.
squad2 = datasets.load_dataset(SQUAD_V2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Load the SQL create-context dataset; only its 'train' split is used further down.
sql = datasets.load_dataset(SQL_CONTEXT)

In [10]:
# Load AdversarialQA; the second argument selects the 'adversarialQA' configuration.
adversarial_qa = datasets.load_dataset(ADVERSARIAL_QA, 'adversarialQA')

## tokenizers

In [11]:
# KerasNLP tokenizers, loaded from the presets selected above.
gpt2_tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset(GPT2)
opt_tokenizer = keras_nlp.models.OPTTokenizer.from_preset(OPT)

# Hugging Face tokenizers: left padding, inputs capped at 1024 tokens.
# Pythia is pinned to a specific training-step revision.
pythia_tokenizer = transformers.AutoTokenizer.from_pretrained(
    PYTHIA,
    revision="step103000",
    cache_dir="./pythia-160m-deduped/step103000",
    padding_side='left',
    model_max_length=1024,
)
llama_tokenizer = transformers.AutoTokenizer.from_pretrained(
    LLAMA,
    cache_dir="./llama/160m",
    padding_side='left',
    model_max_length=1024,
)

def lambda_pythia(input):
    """Tokenize a batch of strings with the Pythia tokenizer.

    Args:
        input: batch of strings handed to ``batch_encode_plus``.

    Returns:
        The ``input_ids`` of the encoding: one list of token ids per string,
        without special tokens and with truncation enabled.
    """
    return pythia_tokenizer.batch_encode_plus(
        input,
        add_special_tokens=False,
        truncation=True,
    ).input_ids

def lambda_llama(input):
    """Tokenize a batch of strings with the LLaMA tokenizer.

    Args:
        input: batch of strings handed to ``batch_encode_plus``.

    Returns:
        The ``input_ids`` of the encoding: one list of token ids per string,
        without special tokens and with truncation enabled.
    """
    return llama_tokenizer.batch_encode_plus(
        input,
        add_special_tokens=False,
        truncation=True,
    ).input_ids


# Maps each model identifier to a callable that tokenizes a batch of strings.
# The KerasNLP tokenizers are used as-is; the Hugging Face ones go through the
# lambda_* wrappers, which return only the input_ids of the encoding.
lambda_tokenizers = {
    GPT2: gpt2_tokenizer,
    LLAMA: lambda_llama,
    PYTHIA: lambda_pythia,
    OPT: opt_tokenizer,
}

## data preprocessing

In [12]:
# Token budget per example; the constraint mask keeps only rows whose token
# count stays below this value minus a small margin.
MAX_TOKENS = 128

In [13]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [14]:
def get_entire_dataset(dataset, names_arr):
    """Merge the requested splits of a dataset into one DataFrame.

    Args:
        dataset: mapping from split name to an object exposing ``to_pandas()``.
        names_arr: split names to include, in the order they are concatenated.

    Returns:
        The row-wise concatenation of the splits. Each split keeps its own
        index labels, so labels can repeat across splits.
    """
    frames = []
    for split_name in names_arr:
        frames.append(dataset[split_name].to_pandas())
    return pd.concat(frames)

In [15]:
# Collapse the listed splits of each dataset into a single DataFrame.
# NOTE: the names are rebound from the loaded dataset objects to DataFrames, so
# this cell cannot be re-run without first re-running the load cells.
squad2 = get_entire_dataset(squad2, ['train','validation'])
sql = get_entire_dataset(sql, ['train'])
adversarial_qa = get_entire_dataset(adversarial_qa, ['train','validation','test'])

In [16]:
def extract_answer(answer_dict):
    """Return the first non-empty answer text, or ``None`` if there is none.

    Args:
        answer_dict: mapping whose optional ``'text'`` entry is a sequence of
            candidate answer strings.

    Returns:
        The first truthy element of ``answer_dict['text']``; ``None`` when the
        key is missing, the sequence is empty, or every element is empty.
    """
    for candidate in answer_dict.get('text', []):
        if candidate:
            return candidate
    return None

In [17]:
# Column names shared by the QA DataFrames.
ANSWER = 'answer'
CONTEXT = 'context'
QUESTION = 'question'

# Split names. NOTE(review): not referenced anywhere in the visible cells.
SPLITS = ['train', 'validation', 'test']

# For the two datasets that carry an 'answers' column, flatten it into a plain
# ANSWER column (first non-empty text, via extract_answer) and drop the rows
# that have no answer. Both DataFrames are modified in place.
for dataset in [squad2, adversarial_qa]:
  dataset[ANSWER] = dataset['answers'].apply(extract_answer)
  dataset.dropna(subset=[ANSWER], inplace=True)

In [18]:
# Colab-only: mount Google Drive at /content/drive, the location that
# DIR_CONSTRAINTS_PATH points into.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
def create_constraints_qa(dataset, tokenizers, name, batch_size=256, token_margin=4):
    """Build, save and return a per-row length mask for a QA dataset.

    Every row is rendered as
    ``"context:<context> question:<question> answer:<answer>"`` and tokenized
    with each tokenizer in turn. A row is marked ``True`` only if every
    tokenizer produces fewer than ``MAX_TOKENS - token_margin`` tokens for it.
    The mask is written to ``<name>.npy`` inside ``DIR_CONSTRAINTS_PATH``
    (created if missing).

    Args:
        dataset: DataFrame with string-valued CONTEXT, QUESTION and ANSWER
            columns.
        tokenizers: iterable of callables; each takes a list of strings and
            returns one token sequence per string.
        name: file stem of the saved mask.
        batch_size: number of texts handed to a tokenizer per call.
        token_margin: amount subtracted from MAX_TOKENS to get the length
            limit (presumably headroom for tokens added later - TODO confirm).

    Returns:
        Boolean ``np.ndarray`` with one entry per row of ``dataset``, in row
        order; the same array that is saved to disk.
    """
    token_limit = MAX_TOKENS - token_margin

    # Iterating the three columns directly avoids the per-row Series objects
    # that DataFrame.iterrows() would build.
    texts = [
        "context:" + context + " question:" + question + " answer:" + answer
        for context, question, answer in zip(
            dataset[CONTEXT], dataset[QUESTION], dataset[ANSWER]
        )
    ]

    # np.ones with an explicit dtype stays boolean even for an empty dataset.
    conditions = np.ones(len(texts), dtype=bool)

    for tokenizer in tokenizers:
        for start in range(0, len(texts), batch_size):
            stop = start + batch_size
            tokenized_texts = tokenizer(texts[start:stop])
            fits = np.array(
                [len(tokens) < token_limit for tokens in tokenized_texts],
                dtype=bool,
            )
            # A row stays True only while every tokenizer seen so far fits.
            conditions[start:stop] &= fits

    os.makedirs(DIR_CONSTRAINTS_PATH, exist_ok=True)
    # os.path.join works whether or not the directory ends with a slash.
    np.save(os.path.join(DIR_CONSTRAINTS_PATH, f'{name}.npy'), conditions)
    return conditions

In [20]:
# Compute and save the length mask for SQuAD v2 using every registered tokenizer.
# NOTE(review): this rebinds `squad2` to the function's return value, so the
# DataFrame is no longer reachable under this name afterwards.
squad2 = create_constraints_qa(squad2, lambda_tokenizers.values(), "squad2")

In [20]:
# Compute and save the length mask for the SQL dataset using every registered tokenizer.
# NOTE(review): this rebinds `sql` to the function's return value, so the
# DataFrame is no longer reachable under this name afterwards.
sql = create_constraints_qa(sql, lambda_tokenizers.values(), "sql")

In [20]:
# Compute and save the length mask for AdversarialQA using every registered tokenizer.
# NOTE(review): this rebinds `adversarial_qa` to the function's return value, so
# the DataFrame is no longer reachable under this name afterwards.
adversarial_qa = create_constraints_qa(adversarial_qa, lambda_tokenizers.values(), "adversarial_qa")