In [5]:
import numpy as np 
import pandas as pd
import torch
import logging
from tqdm import tqdm
import math
import argparse
import os

In [6]:
!git clone https://github.com/huggingface/transformers
!pip install transformers/
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
fatal: destination path 'transformers' already exists and is not an empty directory.
/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Processing ./transformers
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.27.0.dev0-py3-none-any.whl size=6553454 sha256=7f21f6f7c02184fd38e65321ce1c622075d4046bf12536dafc56c0ae

In [16]:
parser = argparse.ArgumentParser()
parser.add_argument('--seed', type=int, default=88888)
parser.add_argument("--model_name", default="gpt2", type=str)
parser.add_argument("--max_seq_length", default=512, type=int)
parser.add_argument("--train_batch_size", default=4, type=int)
parser.add_argument("--valid_batch_size", default=4, type=int)
parser.add_argument("--num_train_epochs", default=1, type=int)
parser.add_argument("--warmup", default=0.1, type=float)
parser.add_argument("--learning_rate", default=5e-5, type=float)
parser.add_argument("--input_text_path", default='/kaggle/input/rocstories', type=str)
args, _ = parser.parse_known_args()


In [23]:
DATAPATH=args.input_text_path
def combinetext(prompt, story):
    fp=open(os.path.join(DATAPATH,prompt),encoding='utf8')
    fs=open(os.path.join(DATAPATH,story),encoding='utf8')
    prompts=fp.readlines()
    stories=fs.readlines()
    assert len(prompts)==len(stories)
    combine=[]
    for i in range(len(prompts)):
        combine.append(prompts[i].rstrip()+' <sep> '+" ".join(stories[i].split()[:300]))
    return combine

def cleanpunctuation(s):
    for p in '!,.:;?':
        s=s.replace(' '+p,p)
    s=s.replace(' '+'n\'t','n\'t')
    s=s.replace(' '+'\'s','\'s')
    s=s.replace(' '+'\'re','\'re')
    s=s.replace(' '+'\'ve','\'ve')
    s=s.replace(' '+'\'ll','\'ll')
    s=s.replace(' '+'\'am','\'am')
    s=s.replace(' '+'\'m','\'m')
    s=s.replace(' '+'\' m','\'m')
    s=s.replace(' '+'\'m','\'m')
    s=s.replace(' '+'\' ve','\'ve')
    s=s.replace(' '+'\' s','\'s')
    s=s.replace('<newline>','\n')
    return s   

train_text=combinetext('train.src', 'train.tgt')
train_text=list(map(cleanpunctuation,train_text))
valid_text=combinetext('valid.src', 'valid.tgt')
valid_text=list(map(cleanpunctuation,valid_text))
test_text=combinetext('test.src', 'test.tgt')
test_text=list(map(cleanpunctuation,test_text))

In [24]:
len(train_text)

176688

In [25]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token=tokenizer.eos_token

inputs_train = tokenizer(train_text, padding=True,truncation=True,max_length=args.max_seq_length)
inputs_valid = tokenizer(valid_text, padding=True,truncation=True,max_length=args.max_seq_length)
inputs_test = tokenizer(test_text, padding=True,truncation=True,max_length=args.max_seq_length)


In [26]:
def create_labels(inputs):
    labels=[]
    for ids,attention_mask in zip(inputs['input_ids'],inputs['attention_mask']):
        label=ids.copy()
        real_len=sum(attention_mask)
        padding_len=len(attention_mask)-sum(attention_mask)
        label[:]=label[:real_len]+[-100]*padding_len
        labels.append(label)
    inputs['labels']=labels
    
create_labels(inputs_train)
create_labels(inputs_valid)
create_labels(inputs_test)

In [27]:
print(inputs_train['input_ids'][6])
print(inputs_train['attention_mask'][6])
print(inputs_train['labels'][6])



[24724, 373, 1762, 379, 257, 47519, 13, 1279, 325, 79, 29, 6451, 11, 257, 6491, 12828, 276, 510, 284, 262, 3753, 13, 339, 2540, 22187, 546, 703, 890, 465, 2057, 373, 2263, 13, 4048, 1422, 470, 760, 703, 284, 6324, 13, 45120, 11, 607, 30521, 263, 35018, 290, 49566, 262, 582, 866, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[24724, 373, 1762, 379, 257, 47519, 13, 1279, 325, 79, 29, 6451, 11, 257, 6491, 12828, 276, 510, 284, 262, 3753, 13, 339, 2540, 22187, 5

In [28]:
class StoryDataset:
    def __init__(self, inputs):
        self.ids = inputs['input_ids']
        self.attention_mask = inputs['attention_mask']
        self.labels=inputs['labels']

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, item):

        return [torch.tensor(self.ids[item], dtype=torch.long),
                torch.tensor(self.attention_mask[item], dtype=torch.long),
                torch.tensor(self.labels[item], dtype=torch.long)]
            

In [29]:
train_batch_size=args.train_batch_size
valid_batch_size=args.valid_batch_size

traindata=StoryDataset(inputs_train)
train_dataloader = torch.utils.data.DataLoader(
    traindata,
    shuffle=False,
    batch_size=train_batch_size)

validdata=StoryDataset(inputs_valid)
valid_dataloader = torch.utils.data.DataLoader(
    validdata,
    shuffle=False,
    batch_size=valid_batch_size)

In [16]:
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [17]:
num_train_epochs = args.num_train_epochs
training_steps_per_epoch=len(train_dataloader)
total_num_training_steps = int(training_steps_per_epoch*num_train_epochs)
weight_decay=0
learning_rate=args.learning_rate
adam_epsilon=1e-8
warmup_steps=int(total_num_training_steps*args.warmup)
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_num_training_steps
)



In [18]:
print("***** Running training *****")
print("  Total_num_training_step = {}".format(total_num_training_steps))
print("  Num Epochs = {}".format(num_train_epochs))
print(f"  Train_batch_size per device = {train_batch_size}")
print(f"  Valid_batch_size per device = {valid_batch_size}")
model.to('cuda')
for epoch in range(num_train_epochs):
    print(f"Start epoch{epoch+1} of {num_train_epochs}")
    train_loss=0
    epoch_iterator = tqdm(train_dataloader,desc='Iteration')
    model.train()
    model.zero_grad()    
    for _, inputs in enumerate(epoch_iterator):        
        d1,d2,d3=inputs
        d1=d1.to('cuda')
        d2=d2.to('cuda')
        d3=d3.to('cuda')
        output = model(input_ids=d1, attention_mask=d2,labels=d3)
        batch_loss=output[0]
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        train_loss+=batch_loss.item()
        epoch_iterator.set_description('(batch loss=%g)' % batch_loss.item())
        del batch_loss
    print(f'Average train loss per example={train_loss/training_steps_per_epoch} in epoch{epoch+1}')    
    print(f'Starting evaluate after epoch {epoch+1}')
    eval_loss=[]    
    model.eval()    
    for inputs in tqdm(valid_dataloader, desc="eval"):
        d1,d2,d3=inputs
        d1=d1.to('cuda')        
        d2=d2.to('cuda')
        d3=d3.to('cuda')
        with torch.no_grad():
            output = model(input_ids=d1, attention_mask=d2,labels=d3)
            batch_loss=output[0]
        eval_loss+=[batch_loss.cpu().item()]
        del batch_loss
    eval_loss=np.mean(eval_loss)
    perplexity=math.exp(eval_loss)
    print(f'Average valid loss per example={eval_loss} in epoch{epoch+1}')    
    print(f'Perplextiy for valid dataset in epoch{epoch+1} is {perplexity}')

***** Running training *****
  Total_num_training_step = 44172
  Num Epochs = 1
  Train_batch_size per device = 4
  Valid_batch_size per device = 4
Start epoch1 of 1


(batch loss=2.60208): 100%|██████████| 44172/44172 [1:48:27<00:00,  6.79it/s]


Average train loss per example=2.6197360202927826 in epoch1
Starting evaluate after epoch 1


eval: 100%|██████████| 2454/2454 [01:27<00:00, 28.13it/s]

Average valid loss per example=2.4752597598011357 in epoch1
Perplextiy for valid dataset in epoch1 is 11.884793903870971





In [7]:
# def generate_story(prompt,target,k=0,p=0.9,output_length=50,temperature=1,num_return_sequences=1,repetition_penalty=1.0):
# #     print("====prompt====\n")
# #     print(prompt+"\n")
# #     print('====target story is as below===\n')
# #     print(target+"\n")
#     encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt")
# #     print("ENcoded : ", encoded_prompt)
#     model.to('cpu')
#     model.eval()
#     output_sequences = model.generate(
#         input_ids=encoded_prompt,
#         max_length=output_length,
#         temperature=temperature,
#         top_k=k,
#         top_p=p,
#         repetition_penalty=repetition_penalty,
#         do_sample=True,
#         num_return_sequences=num_return_sequences
#     )
# #     print(output_sequences)
#     if len(output_sequences.shape) > 2:
#         output_sequences.squeeze_()
#     text = ""
#     for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
# #         print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
#         generated_sequence = generated_sequence.tolist()
# #         print(generated_sequence)
#         # Decode text
#         text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
#         # Remove all text after eos token
# #         text = text[: text.find(tokenizer.eos_token)]
#     return text

In [30]:
test_prompt = []
f = open('/kaggle/input/rocstories/test.src')
for i in f.readlines():
    test_prompt.append(i)
f.close()
test_target = []
f = open('/kaggle/input/rocstories/test.tgt')
for i in f.readlines():
    test_target.append(i)
f.close()

In [32]:
f = open('final_ans.txt','w+')
testdata = StoryDataset(inputs_test)
test_dataloader = torch.utils.data.DataLoader(
    testdata,
    shuffle=False,
    batch_size=valid_batch_size)

# Set model to evaluation mode
model_pred.to('cuda')
model_pred.eval()
for inputs in tqdm(test_dataloader, desc="eval"):
    d1,d2,d3=inputs
    d1=d1.to('cuda')        
    d2=d2.to('cuda')
    d3=d3.to('cuda')
    generated = model_pred.generate(input_ids=d1, max_length=50)
    for g in generated:
        text = tokenizer.decode(g, skip_special_tokens=True)
        f.write(text+"\n")
f.close()

eval:   0%|          | 0/1228 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 88, but `max_length` is set to 50. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
eval:   0%|          | 1/1228 [00:03<1:08:46,  3.36s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detecte

In [65]:
model.save_pretrained('/kaggle/working/gpt2_roc')

In [3]:
!zip -r gpt2_roc.zip /kaggle/working/gpt2_roc

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
  adding: kaggle/working/gpt2_roc/ (stored 0%)
  adding: kaggle/working/gpt2_roc/pytorch_model.bin (deflated 9%)
  adding: kaggle/working/gpt2_roc/config.json (deflated 51%)
  adding: kaggle/working/gpt2_roc/generation_config.json (deflated 24%)


In [4]:
from IPython.display import FileLink
FileLink(r'gpt2_roc.zip')

In [None]:
f = open('final_ans.txt')
arr = []
for i in f.readlines():
    i = i.split('<sep>')
    arr.append(i[1].lstrip())
f.close()
f = open('gpt2_res.txt','w+')
for i in arr:
    f.write(i)
f.close()

In [9]:
tokenizer_pred = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer_pred.pad_token=tokenizer_pred.eos_token
model_pred = GPT2LMHeadModel.from_pretrained('gpt2_roc')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [33]:
!pip install bleu

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)
Collecting bleu
  Downloading bleu-0.3.tar.gz (5.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting efficiency
  Downloading efficiency-1.1.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: bleu, efficiency
  Building wheel for bleu (setup.py) ... [?25ldone
[?25h  Created wheel for bleu: filename=bleu-0.3-py3-none-any.whl size=5801 sha256=113291bbd38b587a892e5880e45c6bf717db9944f5f8bc4e14c6cb90bd9a1cf3
  Stored in directory: /root/.cache/pip/wheels/90/95/f7/3b9dd43fae308b83b018fb3c1b8647d622b3401c23a7ebc41e
  Building wheel for efficiency (setup.py) ... [?25ldone
[?25h  Created wheel for efficiency: filename=efficiency-1.1-py3-none-any.whl size=22158 sha256=4060ca32ffb718805e7c182a1a52625f0326172d182147ea1877447740bc9fca
  Stored in directory: /root/.cache/pip/wheels/eb/c2/8e/2bc261126dcc968d98baacf12e6af4108e78652c

In [66]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.util import ngrams

def compute_scores(file1_path, file2_path):
    with open(file1_path, 'r', encoding='utf-8') as f1, open(file2_path, 'r', encoding='utf-8') as f2:
        ref_sentences = [line for line in f1]
        hyp_sentences = [line for line in f2]
    
    # preprocess sentences
    ref_sentences = [nltk.word_tokenize(sent.lower()) for sent in ref_sentences]
    hyp_sentences = [nltk.word_tokenize(sent.lower()) for sent in hyp_sentences]

    # compute BLEU-1 and BLEU-2 scores
    bleu_1 = 0
    bleu_2 = 0
    for i in range(len(ref_sentences)):
        if(i%100==0):
            print(i, " done")
        bleu_1 += sentence_bleu(ref_sentences[i], hyp_sentences[i], weights=(1,0,0,0))
        bleu_2 += sentence_bleu(ref_sentences[i], hyp_sentences[i], weights=(0,1,0,0))

    bleu_1 /= len(ref_sentences)
    bleu_2 /= len(ref_sentences)

    # compute distinct-1 and distinct-2 scores
    def distinct_n(n, sentences):
        ngrams_set = set()
        ngram_count = 0
        for sent in sentences:
            sent_ngrams = list(ngrams(sent, n))
            ngram_count += len(sent_ngrams)
            ngrams_set.update(sent_ngrams)
        return len(ngrams_set) / ngram_count

    distinct_1 = distinct_n(1, hyp_sentences)
    distinct_2 = distinct_n(2, hyp_sentences)
    return bleu_1, bleu_2, distinct_1, distinct_2

bleu_1, bleu_2, distinct_1, distinct_2 = compute_scores('/kaggle/input/rocstories/test.tgt', '/kaggle/working/gpt2_res.txt')
print(f'BLEU-1: {bleu_1:.4f}')
print(f'BLEU-2: {bleu_2:.4f}')
print(f'Distinct-1: {distinct_1:.4f}')
print(f'Distinct-2: {distinct_2:.4f}')