In [1]:
!pip install transformers evaluate peft python-dotenv huggingface_hub wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.2.0-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting huggingface_hub
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Imports**

In [3]:
import os
import csv
import json
import gc

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence
import evaluate
from transformers import Trainer, TrainingArguments, GPT2DoubleHeadsModel, AutoTokenizer, GPT2LMHeadModel
from tqdm import tqdm
from collections import defaultdict

import csv
import json
import pandas as pd
from collections import defaultdict
import re
import random
from random import shuffle
from sklearn.model_selection import train_test_split


In [4]:
import logging
logging.disable(logging.WARNING)

In [5]:
from dotenv import load_dotenv
load_dotenv("/content/drive/MyDrive/Colab_Notebooks/.env")
HF = os.getenv("HF_TOKEN")
WANDB = os.getenv("WANDB_TOKEN")

!huggingface-cli login --token $HF
!wandb login $WANDB

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.cache/huggingface/token
Login successful
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [6]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2") # Initialize for language modelling
# Add special tokens
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
T = "[THERA]"
C = "[CLI]"
tokenizer.bos_token = "<bos>"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({"cls_token": tokenizer.cls_token, "sep_token":tokenizer.sep_token, "pad_token":tokenizer.pad_token, "bos_token":tokenizer.bos_token, "additional_special_tokens":[T, C]})
embedding_layer = model.resize_token_embeddings(len(tokenizer))

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
PATH = "/content/drive/MyDrive/Colab_Notebooks/Brainy/"

**Read in data from CSV and preprocess to appropriate format**

In [8]:
exp = re.compile(r"\[.*\]")
def speechprocess(line):
    replacements = ["I see", "I understand", "I hear you"]
    sub = replacements[random.randint(0,2)]
    #disfluencies = re.compile(r"um|uh[.,]?", re.IGNORECASE)
    speechacts = re.compile(r"hmm|mm-hmm", re.IGNORECASE)
    line = speechacts.sub(sub, line, re.IGNORECASE)
    return line.strip(" |-")

In [9]:
dataset = defaultdict(list)

with open(PATH + "therapy_data.csv", "r", encoding="utf-8") as csvf:
    csvreader = csv.DictReader(csvf)
    for i, line in enumerate(csvreader):
        text =speechprocess(line["utterance_text"]).strip("-")
        if exp.match(text):
            continue
        if line["interlocutor"] == "therapist":
            dataset["therapist_lines"].append( f"{T} " + text.replace(r"^h ", "Okay"))
        if line["interlocutor"] == "client":
            dataset["client_lines"].append(f"{C} " + text.replace(r"^h ", "Okay"))
            
                

#Datasets for Language Modelling

In [None]:
## Split 
replies = dataset["therapist_lines"][:-40]
inputs = dataset["client_lines"]
all_inputs = [(r,i) for r,i in zip(replies,inputs)]

train, dev = train_test_split(all_inputs, random_state=0)

train_replies, train_inputs = [i[0] for i in train], [i[1] for i in train]
dev_replies, dev_inputs = [i[0] for i in dev], [i[1] for i in dev]

In [10]:

def get_input_data(replies,inputs):
  histories, ttids, labels, pos_ids = [], [], [], []
  for i, (th,cl) in enumerate(zip(replies, inputs)):
    if i == 0:
      continue
    if i % 3 == 0:
      prev_thera, prev_client, next_thera = tokenizer.encode(replies[i-3]), tokenizer.encode(inputs[i-2]), tokenizer.encode(replies[i-1])
      history = [tokenizer.bos_token_id] + prev_thera + prev_client + next_thera + [tokenizer.eos_token_id]
      pos_id = list(range(len(history)))
      ttid = [0]*(len(prev_thera)+1) + [1]*len(prev_client) + [0] *(len(next_thera)+1)

      tokenized_lm_target = next_thera + [tokenizer.eos_token_id]
      padding = [-100] * (len(history) - len(tokenized_lm_target))
      label = padding + tokenized_lm_target

      assert len(ttid) == len(history) == len(label) == len(pos_id)
      histories.append(history)
      ttids.append(ttid)
      labels.append(label)
      pos_ids.append(pos_id)
  return histories, ttids, labels, pos_ids

class BrainyData(Dataset):
  def __init__(self, inputs, ttids, labels,posids):
    self.input = inputs
    self.ids = ttids
    self.lm_labels = labels
    self.posids = posids
  def __len__(self):
    return len(self.input)
  def __getitem__(self, index):
    input = self.input[index]
    id = self.ids[index]
    lm_label = self.lm_labels[index]
    pos_id = self.posids[index]

    example = {"input_ids":input, "token_type_ids":id.squeeze(), "labels":lm_label, "position_ids":pos_id.squeeze()}
    return example

def convert_to_tensor(replies, inputs):
  histories, ttids, labels,posids = get_input_data(replies, inputs)
  
  input_tensors, tt_id_tensors, label_tensors, pos_id_tensors = [], [], [], []
  for h,t,l,p in zip(histories, ttids, labels,posids):
    input_tensors.append(torch.tensor(h))
    tt_id_tensors.append(torch.tensor(t))
    label_tensors.append(torch.tensor(l))
    pos_id_tensors.append(torch.tensor(p))
    

  return BrainyData(input_tensors,tt_id_tensors, label_tensors, pos_id_tensors)
  
train_data=convert_to_tensor(train_replies, train_inputs)
dev_data=convert_to_tensor(dev_replies, dev_inputs)

In [12]:
from transformers import DataCollatorForLanguageModeling, Trainer
data_collator = DataCollatorForLanguageModeling(tokenizer,mlm=False)


In [13]:

model_name = "brainy-gpt-4.2"
SAVE_DIR = "/content/drive/MyDrive/Colab_Notebooks/Brainy/ckpt/"

training_args = TrainingArguments(
  output_dir = SAVE_DIR + f"{model_name}",
  log_level = "error",
  num_train_epochs = 5,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adamw_torch",
  weight_decay = 0.01,
  per_device_train_batch_size = 1,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "epoch",
  eval_steps = 100,
  logging_steps = 10,
  push_to_hub = False
)

trainer = Trainer(
  model = model,
  args = training_args,
  train_dataset = train_data,
  eval_dataset = dev_data,
  data_collator = data_collator,
  tokenizer = tokenizer,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmichellejyyun[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
0,4.7982,3.792897
1,3.1538,3.096917
2,2.7794,3.025011
3,2.197,3.046166
4,1.7467,3.209731


TrainOutput(global_step=155, training_loss=4.8943736968501925, metrics={'train_runtime': 632.6343, 'train_samples_per_second': 4.039, 'train_steps_per_second': 0.245, 'total_flos': 84064587264000.0, 'train_loss': 4.8943736968501925, 'epoch': 4.85})

In [14]:
model.push_to_hub("michelleyunun/brainy-3")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/michelleyunun/brainy-3/commit/a079a2bcf913f1c645b66345ca48474f05beadc2', commit_message='Upload model', commit_description='', oid='a079a2bcf913f1c645b66345ca48474f05beadc2', pr_url=None, pr_revision=None, pr_num=None)

#Datasets for Classification (Next-Sentence Prediction)

In [15]:
## Load model (now finetuned for language modelling on therapy transcripts) for further tuning on NSP
model2 = GPT2DoubleHeadsModel.from_pretrained("michelleyunun/brainy-3")
embedding_layer = model2.resize_token_embeddings(len(tokenizer))

Downloading (…)lve/main/config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [20]:
distraction_lines = []
with open(PATH + "WikiQA-train.tsv", "r") as tsvf:
    for i, line in enumerate(tsvf.readlines()):
        if i == 0:
            continue
        if i % 5 == 0:
            distraction_lines.append(line.split("\t")[-2] + f" {tokenizer.cls_token}")
random.shuffle(distraction_lines) # shuffle to get rid of related lines

gold_replies = [" ".join(l.split()[1:]) for l in dataset["therapist_lines"]]
train_gold, val_gold = train_test_split(gold_replies, random_state=0)
train_distract, val_distract = train_test_split(distraction_lines,  random_state=0)
test_gold, test_distract = val_gold[:100], val_distract[:300]

from copy import deepcopy

def get_mc_inputs_and_ids(gold_replies, distract_replies):
    mc_inputs = []
    mc_labels = []
    for i in range(len(gold_replies)-1):
      gold_reply = gold_replies[i]

      mc_label = random.randint(0, 1)
      mc_labels.append([mc_label])
      distractors = distraction_lines[i:i+2]
    
      distractors[mc_label] = gold_reply + f" {tokenizer.cls_token}"
      mc_input = deepcopy(distractors)

      mc_inputs.append(mc_input)

    return mc_inputs,mc_labels      

class BrainyData(Dataset):
  def __init__(self, inputs, ids, mc_labels, lm_labels, masks):
    self.input = inputs
    self.ids = ids
    self.mc_labels = mc_labels
    self.lm_labels = lm_labels
    self.masks = masks
  def __len__(self):
    return len(self.input)
  def __getitem__(self, index):
    input = self.input[index]
    id = self.ids[index]
    mc_label = self.mc_labels[index]
    mask = self.masks[index]
    lm_label = self.lm_labels[index]

    example = {"input_ids":input, "attention_mask":mask, "mc_token_ids":id.squeeze(), "mc_labels":mc_label, "labels":lm_label}
    return example

def convert_to_tensor(inputs, labels):
  inputs,labels = get_mc_inputs_and_ids(inputs, labels)
  
  input_tensors, id_tensors, label_tensors, attention_mask, lm_label,context_lens = [], [], [], [], [], []
  for inp,lab in zip(inputs,labels):
    ids = torch.tensor([[len(tokenizer.encode(i))-1 for i in inp]])

    encoded_targets = pad_sequence([torch.tensor(tokenizer.encode(target)) for target in inp], batch_first=True, padding_value=tokenizer.pad_token_id)
    lm_target = torch.full(encoded_targets.shape,torch.tensor(-100))
    lm_target[lab] = encoded_targets[lab]

    mask=encoded_targets!=tokenizer.pad_token_id
    lm_label.append(lm_target)
    id_tensors.append(ids)
    input_tensors.append(encoded_targets)
    attention_mask.append(mask.long())
    label_tensors.append(torch.tensor(lab))

  return BrainyData(input_tensors,id_tensors, label_tensors, lm_label, attention_mask)

mc_train_data = convert_to_tensor(train_gold, train_distract)
mc_val_data = convert_to_tensor(val_gold, val_distract)
mc_test_data = convert_to_tensor(test_gold, test_distract)

In [24]:
model_name = "brainy-gpt-4.2"
SAVE_DIR = "/content/drive/MyDrive/Colab_Notebooks/Brainy/ckpt/"

training_args = TrainingArguments(
  output_dir = SAVE_DIR + f"{model_name}",
  log_level = "error",
  num_train_epochs = 6,
  learning_rate = 5e-4,
  lr_scheduler_type = "linear",
  warmup_steps = 90,
  optim = "adamw_torch",
  weight_decay = 0.01,
  per_device_train_batch_size = 1,
  per_device_eval_batch_size = 1,
  gradient_accumulation_steps = 16,
  evaluation_strategy = "steps",
  eval_steps = 100,
  logging_steps = 10,
  push_to_hub = False
)

trainer = Trainer(
  model = model2,
  args = training_args,
  train_dataset = mc_train_data,
  eval_dataset = mc_val_data,
  data_collator = data_collator,
  tokenizer = tokenizer,
)

trainer.train()

{'loss': 9.3486, 'learning_rate': 5.555555555555555e-05, 'epoch': 0.1}
{'loss': 4.5747, 'learning_rate': 0.0001111111111111111, 'epoch': 0.2}
{'loss': 3.4962, 'learning_rate': 0.00016666666666666666, 'epoch': 0.31}
{'loss': 3.2528, 'learning_rate': 0.0002222222222222222, 'epoch': 0.41}
{'loss': 3.2852, 'learning_rate': 0.0002777777777777778, 'epoch': 0.51}
{'loss': 3.2394, 'learning_rate': 0.0003333333333333333, 'epoch': 0.61}
{'loss': 3.0877, 'learning_rate': 0.0003888888888888889, 'epoch': 0.72}
{'loss': 3.0843, 'learning_rate': 0.0004444444444444444, 'epoch': 0.82}
{'loss': 3.134, 'learning_rate': 0.0005, 'epoch': 0.92}
{'loss': 3.0016, 'learning_rate': 0.0004898373983739837, 'epoch': 1.02}
{'eval_loss': 2.710951089859009, 'eval_runtime': 24.6384, 'eval_samples_per_second': 21.146, 'eval_steps_per_second': 21.146, 'epoch': 1.02}
{'loss': 2.3304, 'learning_rate': 0.0004796747967479675, 'epoch': 1.13}
{'loss': 2.3034, 'learning_rate': 0.0004695121951219512, 'epoch': 1.23}
{'loss': 2.1

TrainOutput(global_step=582, training_loss=1.5950657256280434, metrics={'train_runtime': 2198.2437, 'train_samples_per_second': 4.266, 'train_steps_per_second': 0.265, 'train_loss': 1.5950657256280434, 'epoch': 5.96})

In [30]:
model2.push_to_hub("bird-watching-society-of-greater-clare/brainy-llm")

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/bird-watching-society-of-greater-clare/brainy-llm/commit/183caca96defa96b597f6c48682cbbd7354ee9e0', commit_message='Upload model', commit_description='', oid='183caca96defa96b597f6c48682cbbd7354ee9e0', pr_url=None, pr_revision=None, pr_num=None)

# Generating Responses with Fine-tuned and Vanilla GPT-2

In [26]:

brainy = GPT2LMHeadModel.from_pretrained("michelleyunun/brainy-lm-3")
vanilla = GPT2LMHeadModel.from_pretrained("gpt2")
embedding_layer = vanilla.resize_token_embeddings(len(tokenizer))

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [40]:
inputs = tokenizer.encode("[THERA] Nice to meet you. How can I help you? [CLI] I've been anxious and depressed. [THERA] Tell", return_tensors="pt")

In [41]:
# set seed to reproduce results. Feel free to change the seed though to get different results

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = brainy.generate(
    inputs,
    do_sample=True, 
    max_length=100, 
    top_k=50, 
    top_p=0.95, 
    num_return_sequences=3
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


Output:
----------------------------------------------------------------------------------------------------
0:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell me a little bit about that. It sounds like part of you really does want to quit? So, um, how do you plan to go from here? What-how do you-whatIX would like us to do? How about- how about our friendship? All that could make a big difference. So, how do you think that could go? So you looked a
1:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell me, how's-how's the anxiety affecting you? How's the depression affecting you? What's the best way to move forward from here? What are the top five things that you could do to move forward in life? What do you wanna do with that? Where do you wanna go from here? What do you wanna do with that? How do you wanna look forward
2:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell me a little bit about how t

In [42]:
# set seed to reproduce results. Feel free to change the seed though to get different results

# set top_k = 50 and set top_p = 0.95 and num_return_sequences = 3
sample_outputs = vanilla.generate(
    inputs,
    do_sample=True, 
    max_length=100, 
    no_repeat_ngram_size=2, 
    top_k=60, 
    top_p=0.95, 
    num_return_sequences=5
)

print("Output:\n" + 100 * '-')
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))


Output:
----------------------------------------------------------------------------------------------------
0:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell about I About I About
 The a A It What m My o O M
1:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell The 11 18 5 9 12 $ I 4 9 I 4 6 10 6 11 12 8 7 17 5 8 7 17 7 8 17
2:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell the the you over about me I do don my you I about me what
 I what

3:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell how about $
 - I A ( ) -- The S My the,   \ // C That. `
 I The I `
4:  Nice to meet you. How can I help you?  I've been anxious and depressed.  Tell My I my,   18 about $ 5 12 19 I about 18 $ 12 15 17 20 1 1 4 4 5 7 11 11
