# Colbert-AI v2.0

##### ***Using PyTorch, Transformers and OpenAI's GPT-2***

*Installing Transformers*

In [None]:
!pip install -q transformers

*importing all the required modules*

In [None]:
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os

#### **Choosing a Model**
##### Transformers provides GPT-2 in 4 sizes
![Image by Jay Alammar from post The Illustrated GPT-2](https://i.imgur.com/yrIxPVX.png)

**Model Names**:
- `gpt2` (124M Model, often called GPT-2 small)
- `gpt2-medium` (345M Model)
- `gpt2-large` (774M Model)
- `gpt2-xl` (1558M Model)

*In our case we wanted a relatively light model, so we used `gpt2-medium`*

In [None]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the pretrained GPT-2 medium tokenizer and language-model head,
# then move the model onto the selected device.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

`choose_from_top`:
- Function that first selects the top-N tokens from the probability list and then samples one token according to the renormalized probabilities of those N tokens

`generate_text`:
- At each prediction step, GPT2 model needs to know all of the previous sequence elements to predict the next one. Below is a function that will tokenize the starting input text, and then in a loop, one new token is predicted at each step and is added to the sequence, which will be fed into the model in the next step. In the end, the token list is decoded back into a text.

In [None]:
def choose_from_top(probs, n=10):
    """Sample one index from the ``n`` most probable entries of ``probs``.

    The ``n`` largest probabilities are selected, renormalized so they sum
    to 1, and a single index is drawn at random according to that
    renormalized distribution.

    Args:
        probs: 1-D array-like of non-negative probabilities (one per token id).
        n: Number of top candidates to sample from. If ``n`` is larger than
            the number of entries in ``probs``, all entries are candidates.

    Returns:
        The chosen index into ``probs`` as a plain Python ``int``.

    Raises:
        ValueError: If ``probs`` is empty or ``n`` is smaller than 1.
    """
    probs = np.asarray(probs)
    if probs.size == 0:
        raise ValueError("probs must contain at least one probability")
    if n < 1:
        raise ValueError("n must be at least 1")

    # Clamp so that asking for more candidates than exist does not make
    # argpartition fail with an out-of-bounds kth.
    k = min(n, probs.shape[0])
    top_ids = np.argpartition(probs, -k)[-k:]

    # Renormalize in float64 so the probabilities handed to the sampler sum
    # to 1 as accurately as possible, even for float32 softmax outputs.
    top_probs = probs[top_ids].astype(np.float64)
    top_probs = top_probs / top_probs.sum()

    picked = np.random.choice(k, p=top_probs)
    return int(top_ids[picked])

def generate_text(input_str, text_len = 100):
    """Generate a continuation of ``input_str`` and print the full text.

    The prompt is tokenized with the module-level ``tokenizer``. Then, for
    ``text_len`` steps, the module-level ``model`` scores the whole sequence
    so far, the next token is sampled from the 10 most probable candidates
    via ``choose_from_top``, and that token is appended to the sequence.
    The final token sequence (prompt plus generated tokens) is decoded and
    printed.

    Side effect: the model is switched to evaluation mode and left there.

    Args:
        input_str: Prompt text to continue; must encode to at least one token.
        text_len: Number of tokens to generate.

    Raises:
        ValueError: If ``input_str`` encodes to no tokens.
    """
    prompt_ids = tokenizer.encode(input_str)
    if not prompt_ids:
        raise ValueError("input_str must encode to at least one token")

    cur_ids = torch.tensor(prompt_ids, dtype=torch.long, device=device).unsqueeze(0)
    model.eval()
    with torch.no_grad():
        for _ in range(text_len):
            # Only the logits are needed for sampling, so no labels are
            # passed and no loss is computed; the first output is the logits.
            logits = model(cur_ids)[0]
            next_token_probs = torch.softmax(logits[0, -1], dim=0)
            next_token_id = choose_from_top(next_token_probs.to('cpu').numpy(), n=10)
            # Append the sampled token as a (1, 1) tensor on the same device.
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

    # Index the single batch row instead of squeeze(), which would collapse
    # a one-token sequence to a scalar.
    output_text = tokenizer.decode(cur_ids[0].tolist())
    print(output_text)

### Generating The Text

In [None]:
# Print a sampled continuation of a fixed prompt, using the default length.
generate_text(input_str="Donald Trump visits India and ")

## **Fine-tuning GPT-2 on Captions Dataset from YouTube**


In [None]:
class Text_Corpus(Dataset):
    """Dataset of text segments read from a single delimiter-separated file.

    The corpus file is split on the ``<|endoftext|>`` marker. Every
    non-blank segment becomes one item, wrapped with the marker on both
    sides. Items are plain strings; tokenization is left to the caller.

    Args:
        dataset_path: Directory that contains the corpus file.
        file_name: Name of the corpus file inside ``dataset_path``.
    """

    def __init__(self, dataset_path = '/content/drive/My Drive', file_name = 'captions.txt'):
        super().__init__()
        corpus_path = os.path.join(dataset_path, file_name)
        self.end_of_text_token = "<|endoftext|>"

        # Read with an explicit encoding so the result does not depend on
        # the platform's default locale.
        with open(corpus_path, encoding="utf-8") as f:
            data = f.read()

        # Skip blank segments (e.g. the empty string left after a trailing
        # delimiter); they would otherwise become samples that consist of
        # nothing but two end-of-text markers.
        self.token_list = [
            self.end_of_text_token + segment + self.end_of_text_token
            for segment in data.split(self.end_of_text_token)
            if segment.strip()
        ]

    def __len__(self):
        """Return the number of text segments in the corpus."""
        return len(self.token_list)

    def __getitem__(self, item):
        """Return the wrapped text segment at position ``item``."""
        return self.token_list[item]

*Loading the dataset from `Text_Corpus`*

In [None]:
# Read the corpus from its default location. The printed count is the number
# of text segments in the dataset (its __len__), one per delimiter-separated chunk.
dataset = Text_Corpus()
print("Number of Tokens Found:", len(dataset))
# Serve one segment per batch, in shuffled order.
data_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=1)

*Assigning Parameters (Epochs, Batch Size, etc.)*

In [None]:
# Hyperparameters for the fine-tuning run.
EPOCHS = 30            # passes over the data loader
BATCH_SIZE = 1         # forward/backward passes accumulated per optimizer step
MAX_SEQ_LEN = 550      # maximum number of tokens in one training sequence
LEARNING_RATE = 1e-5   # learning rate handed to the optimizer
WARMUP_STEPS = 10_000  # scheduler steps spent warming the learning rate up

### *Training the Model for 30 Epochs*

In [None]:
# Fine-tune the model on the corpus. Short texts are packed together into
# sequences of at most MAX_SEQ_LEN tokens before each forward/backward pass.
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The linear schedule needs a real step budget: with num_training_steps=-1
# the learning-rate multiplier is clamped to 0 as soon as warmup ends, which
# silently stops all learning. At most one forward pass happens per loader
# item and one optimizer step per BATCH_SIZE passes, so this is an upper
# bound on the number of optimizer steps.
max_training_steps = max(1, EPOCHS * len(data_loader) // BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=max_training_steps,
)

text_count = 0    # forward/backward passes since the last optimizer step
sum_loss = 0.0    # running loss since the last report
batch_count = 0   # optimizer steps since the last report

# Packed sequence under construction; it carries over between epochs.
tmp_text_tens = None

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started " + '=' * 30)
    for text in data_loader:

        text_tens = torch.tensor(tokenizer.encode(text[0])).unsqueeze(0).to(device)

        # Texts that are too long on their own are skipped entirely.
        if text_tens.size()[1] > MAX_SEQ_LEN:
            continue

        # The first usable text just starts the packed sequence.
        if tmp_text_tens is None:
            tmp_text_tens = text_tens
            continue

        # Keep packing while the combined sequence still fits. The first
        # token of the new text is dropped so the boundary marker is not
        # duplicated.
        if tmp_text_tens.size()[1] + text_tens.size()[1] <= MAX_SEQ_LEN:
            tmp_text_tens = torch.cat([tmp_text_tens, text_tens[:, 1:]], dim=1)
            continue

        # The packed sequence is full: train on it and start a new one
        # from the current text.
        work_text_tens = tmp_text_tens
        tmp_text_tens = text_tens

        outputs = model(work_text_tens, labels=work_text_tens)
        loss = outputs[0]
        # Scale so that gradients accumulated over BATCH_SIZE passes are
        # averaged rather than summed (no effect when BATCH_SIZE is 1).
        (loss / BATCH_SIZE).backward()
        # Accumulate a plain float rather than a device tensor.
        sum_loss += loss.item()
        text_count += 1

        if text_count == BATCH_SIZE:
            text_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer owns all model parameters, so this clears every
            # gradient; a separate model.zero_grad() is not needed.
            optimizer.zero_grad()

        if batch_count == 1000:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

### **Generating 20 Samples**

In [None]:
# Generate and print 20 samples, each at most 250 new tokens long.
model.eval()

# Encode the loop invariants once instead of on every sample / every step.
# NOTE(review): "<|startoftext|>" is not defined anywhere in this file;
# presumably the prompt mirrors markers used inside captions.txt -- confirm.
prompt_ids = tokenizer.encode("<|startoftext|>START:")
end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))

with torch.no_grad():

    for text_idx in range(20):

        cur_ids = torch.tensor(prompt_ids).unsqueeze(0).to(device)

        for i in range(250):
            # Only the logits are needed for sampling, so no labels are
            # passed and no loss is computed; the first output is the logits.
            logits = model(cur_ids)[0]
            softmax_logits = torch.softmax(logits[0, -1], dim=0)
            # The first two tokens are drawn from the top 15 candidates
            # (presumably for variety); later tokens from the top 3.
            n = 15 if i < 2 else 3
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n)
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)
            # Stop this sample once the end-of-text marker has been produced.
            if next_token_id in end_of_text_ids:
                break

        output_text = tokenizer.decode(cur_ids[0].tolist())
        # NOTE(review): str.capitalize() also lower-cases everything after
        # the first character -- confirm that is intended.
        print(f"SAMPLE {text_idx}: {output_text.capitalize()} \n")

## **Contributors**
- [Abbas Mohammed](https://github.com/iam-abbas) *(iam-abbas on github)*
- [Shubham Rao](https://github.com/cshubhamrao) *(cshubhamrao on github)*

**Mentions:-**
- [Martins Frolovs](https://towardsdatascience.com/teaching-gpt-2-a-sense-of-humor-fine-tuning-large-transformer-models-on-a-single-gpu-in-pytorch-59e8cec40912)