In [100]:
!pip install transformers evaluate peft python-dotenv huggingface_hub wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [101]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Imports**

In [149]:
import os
import csv
import json
import gc

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence
import evaluate
from transformers import Trainer, TrainingArguments, GPT2DoubleHeadsModel, AutoTokenizer, GPT2LMHeadModel
from tqdm import tqdm
from collections import defaultdict

import csv
import json
import pandas as pd
from collections import defaultdict
import re
import random
from random import shuffle
from sklearn.model_selection import train_test_split


In [103]:
import logging
logging.disable(logging.WARNING)

In [104]:
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Colab_Notebooks/.env")
HF = os.getenv("HF_TOKEN")
WANDB = os.getenv("WANDB_TOKEN")

!huggingface-cli login --token $HF
!wandb login $WANDB

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [147]:
MODEL_PATH = "/content/drive/MyDrive/Colab_Notebooks/Brainy/ckpt/brainy-gpt-2/checkpoint-400" ## A GPT2 Model pretrained for next-sentence prediction
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained(MODEL_PATH) # Initialize for language modelling

# Add special tokens
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
T = "[THERA]"
C = "[CLI]"
tokenizer.bos_token = "<bos>"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({"cls_token": tokenizer.cls_token, "sep_token":tokenizer.sep_token, "pad_token":tokenizer.pad_token, "bos_token":tokenizer.bos_token, "additional_special_tokens":[T, C]})
embedding_layer = model.resize_token_embeddings(len(tokenizer))

In [107]:
PATH = "/content/drive/MyDrive/Colab_Notebooks/Brainy/"

**Read in data from CSV and preprocess to appropriate format**

In [108]:
exp = re.compile(r"\[.*\]")
def speechprocess(line):
    replacements = ["I see", "I understand", "I hear you"]
    sub = replacements[random.randint(0,2)]
    #disfluencies = re.compile(r"um|uh[.,]?", re.IGNORECASE)
    speechacts = re.compile(r"hmm|mm-hmm", re.IGNORECASE)
    line = speechacts.sub(sub, line, re.IGNORECASE)
    return line.strip(" |-")

In [111]:
dataset = defaultdict(list)

with open(PATH + "therapy_data.csv", "r", encoding="utf-8") as csvf:
    csvreader = csv.DictReader(csvf)
    for i, line in enumerate(csvreader):
        text =speechprocess(line["utterance_text"]).strip("-")
        if exp.match(text):
            continue
        if line["interlocutor"] == "therapist":
            dataset["therapist_lines"].append( f"{T} " + text.replace(r"^h ", "Okay"))
        if line["interlocutor"] == "client":
            dataset["client_lines"].append(f"{C} " + text.replace(r"^h ", "Okay"))
            
                

**Datasets for Language Modelling** 

In [113]:
replies = dataset["therapist_lines"][:-40]
inputs = dataset["client_lines"]
all_inputs = [(r,i) for r,i in zip(replies,inputs)]

train, dev = train_test_split(all_inputs, random_state=0)

train_replies, train_inputs = [i[0] for i in train], [i[1] for i in train]
dev_replies, dev_inputs = [i[0] for i in dev], [i[1] for i in dev]

def get_input_data(replies,inputs):
  histories, ttids, labels, pos_ids = [], [], [], []
  for i, (th,cl) in enumerate(zip(replies, inputs)):
    if i == 0:
      continue
    if i % 3 == 0:
      prev_thera, prev_client, next_thera = tokenizer.encode(replies[i-3]), tokenizer.encode(inputs[i-2]), tokenizer.encode(replies[i-1])
      history = [tokenizer.bos_token_id] + prev_thera + prev_client + next_thera + [tokenizer.eos_token_id]
      pos_id = list(range(len(history)))
      ttid = [0]*(len(prev_thera)+1) + [1]*len(prev_client) + [0] *(len(next_thera)+1)

      tokenized_lm_target = next_thera + [tokenizer.eos_token_id]
      padding = [-100] * (len(history) - len(tokenized_lm_target))
      label = padding + tokenized_lm_target

      assert len(ttid) == len(history) == len(label) == len(pos_id)
      histories.append(history)
      ttids.append(ttid)
      labels.append(label)
      pos_ids.append(pos_id)
  return histories, ttids, labels, pos_ids

class BrainyData(Dataset):
  def __init__(self, inputs, ttids, labels,posids):
    self.input = inputs
    self.ids = ttids
    self.lm_labels = labels
    self.posids = posids
  def __len__(self):
    return len(self.input)
  def __getitem__(self, index):
    input = self.input[index]
    id = self.ids[index]
    lm_label = self.lm_labels[index]
    pos_id = self.posids[index]

    example = {"input_ids":input, "token_type_ids":id.squeeze(), "labels":lm_label, "position_ids":pos_id.squeeze()}
    return example

def convert_to_tensor(replies, inputs):
  histories, ttids, labels,posids = get_input_data(replies, inputs)
  
  input_tensors, tt_id_tensors, label_tensors, pos_id_tensors = [], [], [], []
  for h,t,l,p in zip(histories, ttids, labels,posids):
    input_tensors.append(torch.tensor(h))
    tt_id_tensors.append(torch.tensor(t))
    label_tensors.append(torch.tensor(l))
    pos_id_tensors.append(torch.tensor(p))
    

  return BrainyData(input_tensors,tt_id_tensors, label_tensors, pos_id_tensors)
  
train_data=convert_to_tensor(train_replies, train_inputs)
dev_data=convert_to_tensor(dev_replies, dev_inputs)


In [115]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)

model_name = "brainy-gpt-4.1"
SAVE_DIR = "/content/drive/MyDrive/Colab_Notebooks/Brainy/ckpt/"

training_args = TrainingArguments(
  output_dir = SAVE_DIR + f"{model_name}",
  log_level = "error",
  num_train_epochs = 10,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adamw_torch",
  weight_decay = 0.01,
  per_device_train_batch_size = 1,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "epoch",
  eval_steps = 100,
  logging_steps = 10,
  push_to_hub = False
)

trainer = Trainer(
  model = model,
  args = training_args,
  train_dataset = train_data,
  eval_dataset = dev_data,
  data_collator = data_collator,
  tokenizer = tokenizer,
)

trainer.train()

{'loss': 10.4503, 'learning_rate': 5.555555555555555e-05, 'epoch': 0.31}
{'loss': 3.9219, 'learning_rate': 0.0001111111111111111, 'epoch': 0.63}
{'loss': 2.7411, 'learning_rate': 0.00016666666666666666, 'epoch': 0.94}
{'eval_loss': 2.9260401725769043, 'eval_runtime': 9.6004, 'eval_samples_per_second': 17.708, 'eval_steps_per_second': 17.708, 'epoch': 0.97}
{'loss': 2.2945, 'learning_rate': 0.0002222222222222222, 'epoch': 1.25}
{'loss': 2.159, 'learning_rate': 0.0002777777777777778, 'epoch': 1.57}
{'loss': 2.0441, 'learning_rate': 0.0003333333333333333, 'epoch': 1.88}
{'eval_loss': 2.8131580352783203, 'eval_runtime': 8.993, 'eval_samples_per_second': 18.904, 'eval_steps_per_second': 18.904, 'epoch': 1.97}
{'loss': 1.7804, 'learning_rate': 0.0003888888888888889, 'epoch': 2.19}
{'loss': 1.7001, 'learning_rate': 0.0004444444444444444, 'epoch': 2.5}
{'loss': 1.7903, 'learning_rate': 0.0005, 'epoch': 2.82}
{'eval_loss': 2.963383674621582, 'eval_runtime': 9.2594, 'eval_samples_per_second': 18

In [None]:
model.push_to_hub("michelleyunun/brainy-lm-2")

Datasets for Classification (Next-Sentence Prediction)

In [152]:
distraction_lines = []
with open(PATH + "WikiQA-train.tsv", "r") as tsvf:
    for i, line in enumerate(tsvf.readlines()):
        if i == 0:
            continue
        if i % 5 == 0:
            distraction_lines.append(line.split("\t")[-2] + f" {tokenizer.cls_token}")
random.shuffle(distraction_lines) # shuffle to get rid of related lines

gold_replies = dataset["therapist_lines"]
train_gold, val_gold = train_test_split(gold_replies, random_state=0)
train_distract, val_distract = train_test_split(distraction_lines,  random_state=0)
test_gold, test_distract = val_gold[:100], val_distract[:300]

from copy import deepcopy

def get_mc_inputs_and_ids(gold_replies, distract_replies):
    mc_inputs = []
    mc_labels = []
    for i in range(len(gold_replies)-1):
      gold_reply = gold_replies[i]

      mc_label = random.randint(0, 1)
      mc_labels.append([mc_label])
      distractors = distraction_lines[i:i+2]
    
      distractors[mc_label] = gold_reply
      mc_input = deepcopy(distractors)

      mc_inputs.append(mc_input)

    return mc_inputs,mc_labels      

class BrainyData(Dataset):
  def __init__(self, inputs, ids, mc_labels, lm_labels, masks):
    self.input = inputs
    self.ids = ids
    self.mc_labels = mc_labels
    self.lm_labels = lm_labels
    self.masks = masks
  def __len__(self):
    return len(self.input)
  def __getitem__(self, index):
    input = self.input[index]
    id = self.ids[index]
    mc_label = self.mc_labels[index]
    mask = self.masks[index]
    lm_label = self.lm_labels[index]

    example = {"input_ids":input, "attention_mask":mask, "mc_token_ids":id.squeeze(), "mc_labels":mc_label, "labels":lm_label}
    return example

def convert_to_tensor(inputs, labels):
  inputs,labels = get_mc_inputs_and_ids(inputs, labels)
  
  input_tensors, id_tensors, label_tensors, attention_mask, lm_label,context_lens = [], [], [], [], [], []
  for inp,lab in zip(inputs,labels):
    ids = torch.tensor([[len(i.split())-1 for i in inp]])

    encoded_targets = pad_sequence([torch.tensor(tokenizer.encode(target)) for target in inp], batch_first=True, padding_value=tokenizer.pad_token_id)
    lm_target = torch.full(encoded_targets.shape,torch.tensor(-100))
    lm_target[lab] = encoded_targets[lab]

    mask=encoded_targets!=tokenizer.pad_token_id
    lm_label.append(lm_target)
    id_tensors.append(ids)
    input_tensors.append(encoded_targets)
    attention_mask.append(mask.long())
    label_tensors.append(torch.tensor(lab))

  return BrainyData(input_tensors,id_tensors, label_tensors, lm_label, attention_mask)

mc_train_data = convert_to_tensor(train_gold, train_distract)
mc_val_data = convert_to_tensor(val_gold, val_distract)
mc_test_data = convert_to_tensor(test_gold, test_distract)

In [151]:
## Load model (now finetuned for language modelling on therapy transcripts) for further tuning on NSP
model = GPT2LMHeadModel.from_pretrained("michelleyunun/brainy-lm-2")

In [None]:
model_name = "brainy-gpt-4.1"
SAVE_DIR = "/content/drive/MyDrive/Colab_Notebooks/Brainy/ckpt/"

training_args = TrainingArguments(
  output_dir = SAVE_DIR + f"{model_name}",
  log_level = "error",
  num_train_epochs = 6,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adamw_torch",
  weight_decay = 0.01,
  per_device_train_batch_size = 1,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "steps",
  eval_steps = 100,
  logging_steps = 10,
  push_to_hub = False
)

trainer = Trainer(
  model = model,
  args = training_args,
  train_dataset = mc_train_data,
  eval_dataset = mc_val_data,
  data_collator = data_collator,
  tokenizer = tokenizer,
)

trainer.train()

{'loss': 4.6136, 'learning_rate': 5.555555555555555e-05, 'epoch': 0.1}
{'loss': 3.7075, 'learning_rate': 0.0001111111111111111, 'epoch': 0.2}
{'loss': 2.8564, 'learning_rate': 0.00016666666666666666, 'epoch': 0.31}
{'loss': 2.6114, 'learning_rate': 0.0002222222222222222, 'epoch': 0.41}
{'loss': 2.4255, 'learning_rate': 0.0002777777777777778, 'epoch': 0.51}
{'loss': 2.4714, 'learning_rate': 0.0003333333333333333, 'epoch': 0.61}
{'loss': 2.2504, 'learning_rate': 0.0003888888888888889, 'epoch': 0.72}
{'loss': 2.1473, 'learning_rate': 0.0004444444444444444, 'epoch': 0.82}
{'loss': 2.1767, 'learning_rate': 0.0005, 'epoch': 0.92}
{'loss': 1.9413, 'learning_rate': 0.0004898373983739837, 'epoch': 1.02}
{'eval_loss': 2.4188029766082764, 'eval_runtime': 29.164, 'eval_samples_per_second': 17.864, 'eval_steps_per_second': 17.864, 'epoch': 1.02}
{'loss': 1.3908, 'learning_rate': 0.0004796747967479675, 'epoch': 1.13}
{'loss': 1.4738, 'learning_rate': 0.0004695121951219512, 'epoch': 1.23}
{'loss': 1.

In [117]:
gc.collect()

2958

In [118]:
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from transformers import AutoModel
brainy = GPT2LMHeadModel.from_pretrained("michelleyunun/brainy-lm-2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [144]:
inputs = tokenizer.encode("[CLI] I am feeling anxious. [THERA] When do you feel anxious? [CLI] When I'm around other people. [THERA]", return_tensors="pt")

In [146]:
# set seed to reproduce results. Feel free to change the seed though to get different results

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = brainy.generate(
    inputs,
    do_sample=True, 
    max_length=50, 
    no_repeat_ngram_size=2, 
    top_k=60, 
    top_p=0.95, 
    num_return_sequences=5
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


Output:
----------------------------------------------------------------------------------------------------
0:  I am feeling anxious.  When do you feel anxious?  When I'm around other people.  I see. So, in terms of how anxious you are, how important it is to sort of be in your head,
1:  I am feeling anxious.  When do you feel anxious?  When I'm around other people.  As you would a young woman? You would? I guess that would be the time when you might be feeling
2:  I am feeling anxious.  When do you feel anxious?  When I'm around other people.  In some ways, it makes me feel good. It makes you appreciate how much my role as your doctor plays a role for
3:  I am feeling anxious.  When do you feel anxious?  When I'm around other people.  Yeah. And how often do those people come to your house for a visit? It's like here, you know, after
4:  I am feeling anxious.  When do you feel anxious?  When I'm around other people.  And what do they bring you in? And, um, what I understand is that so