In [None]:
!pip install transformers

In [None]:
# Path of the dialogue CSV read by the data-loading cell
# (despite the name, this is a file path, not a directory).
DATA_DIR = "./data/dialogue.csv"

In [None]:
import torch
import tqdm
import pandas as pd
import numpy as np
from torch.optim import Adam
from transformers import AutoTokenizer
from huggingface_hub import notebook_login
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [None]:
# Choose the compute device for training and inference:
# CUDA when present, otherwise Apple MPS, otherwise plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print('Using device:', device)
if device == 'cuda':
    # Report which GPU (index 0) will be used.
    print(torch.cuda.get_device_name(0))

In [None]:
# loading and shuffling data
# A fixed random_state makes the shuffle reproducible, so a
# Restart & Run All sees the rows in the same order every time.
df = (
    pd.read_csv(DATA_DIR)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

In [None]:
# preprocessing data
class Preprocessor(Dataset):
  """Dataset of tokenized dialogue strings shaped like `<s> question <bot>: answer <e>`.

  Args:
    dataframe: table providing a "questions" and an "answers" column.
    tokenizer: callable invoked once on the full list of strings with
      `max_length`, `truncation=True`, `padding="max_length"` and
      `return_tensors="pt"`; its result must expose 'input_ids' and
      'attention_mask'.
    max_length: fixed length every sequence is padded or truncated to
      (default 30, the previously hard-coded value). Longer pairs are
      truncated, which can cut off the trailing `<e>` marker.

  Each item is an `(input_ids, attention_mask)` pair for one dialogue string.
  """

  def __init__(self, dataframe, tokenizer, max_length=30):
    # The dataframe is kept untouched for inspection.
    self.data = dataframe

    # Pairs with a missing question or answer are skipped: pandas represents
    # them as NaN/None, which cannot be concatenated with a string. str()
    # also lets purely numeric cells through instead of raising.
    self.lst = [
      "<s> "+str(question)+" <bot>: "+str(answer)+" <e>"
      for question, answer in zip(self.data["questions"], self.data["answers"])
      if pd.notna(question) and pd.notna(answer)
    ]

    self.tokenized_lst = tokenizer(
      self.lst,
      max_length=max_length,
      truncation=True,
      padding="max_length",
      return_tensors="pt",
    )
    self.input_ids = self.tokenized_lst['input_ids']
    self.attention_mask = self.tokenized_lst['attention_mask']

  def __len__(self):
    """Number of usable (non-missing) dialogue pairs."""
    return len(self.lst)

  def __getitem__(self, idx):
    """Return the `(input_ids, attention_mask)` pair stored at position `idx`."""
    return (self.input_ids[idx], self.attention_mask[idx])

In [None]:
# loading and modifying tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<pad>", "bos_token": "<s>", "eos_token": "<e>"})
tokenizer.add_tokens(["<bot>:"])

# loading pretrained model
pretrained_model = GPT2LMHeadModel.from_pretrained("gpt2")
# The tokens added above enlarged the vocabulary, so the embedding matrix is
# resized to match the tokenizer.
pretrained_model.resize_token_embeddings(len(tokenizer))

# The checkpoint's configuration still carries the stock GPT-2 special-token
# ids. Point it (and the generation config, when this transformers version
# has one) at the tokenizer's new <pad>/<s>/<e> ids so that generation and
# the saved/pushed model agree with the tokenizer.
_special_token_ids = {
  "pad_token_id": tokenizer.pad_token_id,
  "bos_token_id": tokenizer.bos_token_id,
  "eos_token_id": tokenizer.eos_token_id,
}
for _cfg in (pretrained_model.config, getattr(pretrained_model, "generation_config", None)):
  if _cfg is None:
    continue
  for _name, _token_id in _special_token_ids.items():
    setattr(_cfg, _name, _token_id)

pretrained_model = pretrained_model.to(device)

In [None]:
# Build the tokenized dataset from the shuffled dataframe, report its size,
# and preview the first formatted training strings.
data = Preprocessor(df, tokenizer)
print(f"dataset length: {len(data)}")
data.lst[:50]

In [None]:
# training configuration
epochs = 100
# NOTE(review): lr=1e-3 is high for fine-tuning a pretrained transformer —
# confirm this value was chosen deliberately.
optimizer = Adam(pretrained_model.parameters(), lr=1e-3)
# shuffle=True draws the samples in a new order every epoch; without it every
# epoch replays exactly the same sequence of batches.
train_data = DataLoader(data, batch_size=64, shuffle=True)

In [None]:
# defining prediction function
def predict(query, max_new_tokens=50):
  """Generate the bot's reply to `query` with the fine-tuned model.

  Args:
    query: the user's message (plain text, without any special markers).
    max_new_tokens: upper bound on the number of tokens generated after the
      prompt. Without an explicit bound, generate() falls back to its small
      default total length, prompt included, which truncates replies.

  Returns:
    The decoded sequence as a string: the prompt followed by the generated
    continuation, special tokens included.
  """
  prompt = tokenizer("<s> "+query+" <bot>: ", return_tensors="pt")
  inp_ids = prompt["input_ids"].to(device)
  att_masks = prompt["attention_mask"].to(device)

  # Generate in eval mode (dropout off), then restore whatever mode the
  # caller had, so calling this from inside a training loop is safe.
  was_training = pretrained_model.training
  pretrained_model.eval()
  try:
    out = pretrained_model.generate(
      inp_ids,
      attention_mask=att_masks,
      max_new_tokens=max_new_tokens,
      # Stop at the tokenizer's own end marker ("<e>") and pad with its pad
      # token rather than the ids stored in the stock GPT-2 config.
      eos_token_id=tokenizer.eos_token_id,
      pad_token_id=tokenizer.pad_token_id,
    )
  finally:
    if was_training:
      pretrained_model.train()

  return tokenizer.decode(out[0])

In [None]:
# training loop
for i in tqdm.tqdm(range(epochs)):
  # from_pretrained() hands the model back in eval mode; switch to train mode
  # so dropout is active while the weights are being updated.
  pretrained_model.train()
  for inp_id, att_mask in train_data:
    inp_id = inp_id.to(device)
    att_mask = att_mask.to(device)
    # Positions where the attention mask is 0 are padding. Labelling them
    # -100 excludes them from the loss, so the model is not trained to
    # predict padding.
    labels = inp_id.masked_fill(att_mask == 0, -100)
    optimizer.zero_grad()
    loss = pretrained_model(inp_id, attention_mask=att_mask, labels=labels).loss
    loss.backward()
    optimizer.step()
  # saving model after each epoch
  torch.save(pretrained_model.state_dict(), "model_state.pt")
  # Sample a reply in eval mode; train mode is re-enabled at the top of the
  # next epoch.
  pretrained_model.eval()
  print(predict("Hi, how are you today?"))

In [None]:
# prompting
# Interactive chat: each line typed is answered by the model. An empty line,
# "quit" or "exit" (or Ctrl-C / end of input) ends the session, so the cell
# can finish instead of blocking the notebook forever.
while True:
  try:
    query = input()
  except (EOFError, KeyboardInterrupt):
    break
  if query.strip().lower() in {"", "quit", "exit"}:
    break
  print(predict(query))

In [None]:
# pushing to hub
# Log in to the Hugging Face Hub from inside the notebook before the upload
# cell runs (presumably prompts for an access token — TODO confirm).
notebook_login()

In [None]:
# Upload the fine-tuned model and its tokenizer to the same Hub repository,
# model first.
hub_repo = "saul-gpt2-mk2"
pretrained_model.push_to_hub(hub_repo)
tokenizer.push_to_hub(hub_repo)

In [None]:
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

# Reload the published checkpoint for inference. AutoModelForCausalLM restores
# the language-modelling head; plain AutoModel would load only the bare
# transformer backbone, which cannot produce token predictions.
model = AutoModelForCausalLM.from_pretrained("parthsolanke/saul-gpt2-mk2")
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("parthsolanke/saul-gpt2-mk2")