In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m62.8 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Fetch the pretrained "gpt2" checkpoint: first its tokenizer, then the
# language-model-head network.
tokenizer, model = (
    GPT2Tokenizer.from_pretrained("gpt2"),
    GPT2LMHeadModel.from_pretrained("gpt2"),
)

In [None]:
# Register the padding and sequence-boundary markers; the call returns how many
# tokens were added to the vocabulary.
special_tokens = {
    "pad_token": "<pad>",
    "bos_token": "<startofstring>",
    "eos_token": "<endofstring>",
}
num_added_tokens = tokenizer.add_special_tokens(special_tokens)

# Cell output: the encoding of a sample sentence.
tokenizer("Hey, How are you?")

In [None]:
# Report the number of newly registered tokens, then resize the model's token
# embeddings to the tokenizer's current vocabulary size.
print('We have added', num_added_tokens, 'tokens')
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

In [None]:
# Map a list of token ids back to their token strings.
sample_ids = [10814, 11, 1374, 389, 345, 30]
tokenizer.convert_ids_to_tokens(sample_ids)

['Hey', ',', 'ĠHow', 'Ġare', 'Ġyou', '?']

In [None]:
# batch_encode_plus expects a *batch* (a list of texts). Given a bare string it
# treats every character as its own text, so wrap the sentence in a list.
tokenizer.batch_encode_plus(["Hey, How are you?"])

In [None]:
# Add two ordinary (non-special) tokens, report how many were new, and resize
# the model's token embeddings to the tokenizer's new length.
extra_tokens = ['new_tok1', 'my_new-tok2']
num_added_toks = tokenizer.add_tokens(extra_tokens)
print('We have added', num_added_toks, 'tokens')
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Calling the tokenizer with no text raises an error and stops "Run All".
# Encode a sentence containing the two tokens added above instead, to inspect
# how they are encoded.
tokenizer("Testing new_tok1 and my_new-tok2")

In [None]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    """Prompt/response training samples built from a JSON chat log.

    The file at ``path`` must contain a list of records, each with a
    ``"dialog"`` list whose items have a ``"text"`` field. All utterances are
    flattened in file order and every utterance is paired with the one that
    follows it, producing strings of the form::

        " <startofstring> {utterance} <BOT> : {next utterance}<endofstring>"

    The samples are tokenized once, up front, with the module-level
    ``tokenizer`` and served as ``(input_ids, attention_mask)`` tuples.

    NOTE(review): because utterances are flattened across records, the last
    utterance of one dialog is paired with the first utterance of the next.
    NOTE(review): samples longer than ``max_length`` tokens are truncated,
    which can cut off the trailing ``<endofstring>`` marker.

    Args:
        path: Location of the JSON file.
        max_length: Token length every sample is padded/truncated to.

    Raises:
        ValueError: If the file yields fewer than two utterances, so no pair
            can be formed.
    """

    def __init__(self, path: str, max_length: int = 40):
        # Context manager so the handle is closed even if parsing fails.
        with open(path, "r", encoding="utf-8") as fh:
            self.data = json.load(fh)

        utterances = [
            turn["text"]
            for record in self.data
            for turn in record["dialog"]
        ]

        # zip() against the shifted list stops at the last complete pair, so no
        # try/except is needed to detect the end, and malformed entries raise
        # instead of silently cutting the dataset short.
        self.X = [
            " <startofstring> " + prompt + " <BOT> : " + reply + "<endofstring>"
            for prompt, reply in zip(utterances, utterances[1:])
        ]
        if not self.X:
            raise ValueError(
                "need at least two utterances in %r to build a training pair" % (path,)
            )
        print(self.X[0])

        # padding="max_length" is the supported spelling of the deprecated
        # pad_to_max_length=True.
        self.X_encoded = tokenizer(
            self.X,
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        self.input_ids = self.X_encoded["input_ids"]
        self.attention_mask = self.X_encoded["attention_mask"]

    def __len__(self):
        """Return the number of prompt/response samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for sample ``idx``."""
        return (self.input_ids[idx], self.attention_mask[idx])

# Register the padding and sequence-boundary markers used by the chat format.
tokenizer.add_special_tokens({
    "pad_token": "<pad>",
    "bos_token": "<startofstring>",
    "eos_token": "<endofstring>"
})

# The prompts are built with the literal separator " <BOT> : ". Token matching
# is case-sensitive, so the previously registered "<bot>:" never occurred in
# the text; register the string that is actually emitted.
tokenizer.add_tokens(["<BOT>"])



In [None]:
# Build the dataset once as a quick check; the constructor prints the first
# formatted sample.
ChatData("/content/gpt2.json")

In [None]:
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

# Train on the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Every token added to the tokenizer needs an embedding row before training.
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)

# Keep the Dataset and the DataLoader under separate names; `chatData` stays
# the loader because later cells pass it to train().
chat_dataset = ChatData("/content/gpt2.json")
# Shuffle so batches are not always consecutive utterances from the same
# stretch of the file.
chatData = DataLoader(chat_dataset, batch_size=64, shuffle=True)

# 1e-3 is far too aggressive for fine-tuning all weights of a pretrained
# transformer and tends to wreck the pretrained representations; use a
# learning rate in the usual fine-tuning range instead.
optim = Adam(model.parameters(), lr=5e-5)

In [None]:
# Check what `chatData` now refers to (it was rebound to the DataLoader above).
type(chatData)

In [None]:
def train(chatData, model, optim, epochs=10, save_path="model_state.pt"):
    """Fine-tune ``model`` as a causal language model on ``chatData``.

    Args:
        chatData: Iterable yielding ``(input_ids, attention_mask)`` batches.
        model: Model whose forward pass accepts ``attention_mask`` and
            ``labels`` and returns an object with a ``.loss`` attribute.
        optim: Optimizer built over ``model``'s parameters.
        epochs: Number of passes over ``chatData``.
        save_path: File the model's ``state_dict`` is written to at the end of
            every epoch (overwritten each time).
    """
    # Make sure dropout etc. are in training mode regardless of caller state.
    model.train()
    # Move batches to wherever the model's weights live.
    run_device = next(model.parameters()).device

    for _ in tqdm.tqdm(range(epochs)):
        for X, a in chatData:
            X = X.to(run_device)
            a = a.to(run_device)

            # Padding positions must not contribute to the loss; -100 is the
            # label value the cross-entropy loss ignores.
            labels = X.clone()
            labels[a == 0] = -100

            optim.zero_grad()
            loss = model(X, attention_mask=a, labels=labels).loss
            loss.backward()
            optim.step()
        torch.save(model.state_dict(), save_path)

# Put the model in training mode before the training cell below is run.
model.train()

In [None]:
# Run the fine-tuning loop on the chat DataLoader with the optimizer built above.
train(chatData,model,optim)

100%|██████████| 10/10 [06:43<00:00, 40.34s/it]


In [None]:
def infer(input, max_new_tokens=40):
    """Generate a reply to ``input`` and return the decoded sequence.

    The text is wrapped in the same " <startofstring> ... <BOT> : " prompt
    format used for training, encoded with the module-level ``tokenizer`` and
    completed by the module-level ``model`` on ``device``.

    Args:
        input: The user's message. (The name shadows the builtin; it is kept
            for backward compatibility and is no longer rebound inside.)
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded text of the whole sequence: prompt, generated reply and
        any special tokens.
    """
    prompt = " <startofstring> " + input + " <BOT> : "
    encoded = tokenizer(prompt, return_tensors="pt")
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)

    # Generate with dropout disabled, then restore whatever mode the model was
    # in so a later training run is unaffected.
    was_training = model.training
    model.eval()
    try:
        output = model.generate(
            X,
            attention_mask=a,
            max_new_tokens=max_new_tokens,
            # Use the tokenizer's own markers so generation stops at
            # <endofstring> rather than falling back to the model config's
            # original end-of-text id.
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    finally:
        model.train(was_training)
    return tokenizer.decode(output[0])

In [None]:
# One-off check: print the decoded output for a single prompt.
print(infer("hey"))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> hey <BOT> : рш <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [None]:
# Interactive chat loop. Submit an empty line (or interrupt / send EOF) to
# stop, so the cell can finish instead of blocking the notebook forever.
while True:
    try:
        inp = input()
    except (EOFError, KeyboardInterrupt):
        break
    if not inp.strip():
        break
    print(infer(inp))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> Hi <BOT> : рш <endofstring> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> How are you? <BOT> : рш <endofstring> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> Do you read? <BOT> : рш <endofstring> <pad> <pad> <pad> <pad> <pad>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> You have libraray? <BOT> :  I am a teacher, but


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> I love I phone. I just bought new one <BOT> : iphone <endofstring>


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> Pls give precise reponses <BOT> :  [Off Pepe] Fortunately I
