<a href="https://colab.research.google.com/github/manikandannp/MLCodes/blob/main/Document%20Summarization_with_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Document summarization with GPT-2 (tried with gpt2, gpt2-medium, gpt2-large & gpt2-xl)

### Importing required packages

In [None]:
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline, set_seed, GPT2Model
import warnings
warnings.filterwarnings('ignore')

### Load the data

In [None]:
def read_txt(file_path, encoding="utf-8"):
    """Return the entire contents of a text file as a single string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's default encoding.

    Returns:
        The decoded file contents as a ``str``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

# Read the training and validation documents into memory as raw text.
# (A previous assignment of file_path to a third file was overwritten before
# it was ever read, so it has been dropped.)

file_path = '/content/agreement train.txt'
train_file = read_txt(file_path)
file_path = '/content/agreement validation.txt'
test_file = read_txt(file_path)

### Clean the training and validation text

In [None]:
# Squeeze every run of consecutive newlines down to a single newline,
# then trim whitespace from both ends of the training text.
train_file = re.sub(r'\n+', '\n', train_file)
train_file = train_file.strip()
# Cleaned training text, written to disk in a later cell.
train_text = train_file

In [None]:
# Clean the validation text: squeeze runs of consecutive newlines down to a
# single newline, then trim whitespace from both ends.
test_file = re.sub(r'\n+', '\n', test_file)
test_file = test_file.strip()
# Cleaned validation text, written to disk in a later cell.
val_text = test_file

In [None]:
# Save the cleaned training and validation text as files in the working
# directory. The encoding is given explicitly so the bytes written do not
# depend on the platform's default encoding.
with open("train.txt", "w", encoding="utf-8") as t:
    t.write(train_text)

with open("val.txt", "w", encoding="utf-8") as v:
    v.write(val_text)

### Load GPT2Tokenizer

In [None]:
# Set up the tokenizer
set_seed(42)  # seed the random number generators

# GPT-2 checkpoints that were tried (only one can be active at a time):
#   "gpt2"        - 124M parameters
#   "gpt2-medium" - 355M parameters
#   "gpt2-large"  - 774M parameters
#   "gpt2-xl"     - 1.5B parameters
checkpoint = 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

In [None]:
# Tokenize a sample string with the GPT-2 tokenizer; the bare expression on
# the last line displays the result as the cell output
sample_ids = tokenizer("Hello world")
sample_ids

{'input_ids': [15496, 995], 'attention_mask': [1, 1]}

In [None]:
# Map the sample's token ids back to their token strings and display them
sample_tokens = tokenizer.convert_ids_to_tokens(sample_ids['input_ids'])
sample_tokens

['Hello', 'Ġworld']

In [None]:
# Join the token strings back into text to confirm the round trip
# reproduces the original sample
tokenizer.convert_tokens_to_string(sample_tokens)

'Hello world'

### Tokenize text data

In [None]:
# Build token datasets from the saved text files; every example is a block
# of 128 token ids.
# NOTE(review): TextDataset appears to be deprecated in recent transformers
# releases - confirm before upgrading the library.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=128,
)
val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="val.txt",
    block_size=128,
)

In [None]:
# Number of examples in the training and validation datasets
len(train_dataset), len(val_dataset)

(38, 9)

In [None]:
# Shape of the first example in each dataset (one block of token ids,
# not a batch size)
train_dataset[0].shape, val_dataset[0].shape

(torch.Size([128]), torch.Size([128]))

In [None]:
# Collator that assembles training batches: masked-language-modeling is
# switched off (mlm=False) and batches are returned as PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    return_tensors="pt",
)

### Load Model

In [None]:
# Load the pretrained language-model weights for the same checkpoint the
# tokenizer was loaded from
model = GPT2LMHeadModel.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

**Note: The training times for the different GPT-2 models on a GPU for this dataset are as follows:**

* **GPT-2 : ~20 minutes for 100 epochs**

* **GPT-2 Medium:  ~1 hour for 100 epochs**

* **GPT-2 Large: Ran out of memory**

### Fine-tune Model

In [None]:
# Set up the training arguments

# Directory that receives checkpoints and, after training, the saved
# model and tokenizer.
model_output_path = "/content/gpt_model"

# NOTE(review): no evaluation strategy is configured here, so the validation
# set handed to the Trainer is presumably never evaluated during training
# under the library defaults - confirm.
training_args = TrainingArguments(
    output_dir=model_output_path,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,  # also try with 2
    per_device_eval_batch_size=4,   # also try with 2
    num_train_epochs=100,
    save_steps=1_000,
    save_total_limit=2,
    logging_dir='./logs',
)

In [None]:
# Fine-tune the model on the training dataset
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Store the fine-tuned model and the tokenizer files in the same directory
# so both can be reloaded from one path; the last call's return value is
# shown as the cell output.
trainer.save_model(model_output_path)
tokenizer.save_pretrained(model_output_path)

Step,Training Loss
500,0.1324
1000,0.0071


Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


('/content/gpt_model/tokenizer_config.json',
 '/content/gpt_model/special_tokens_map.json',
 '/content/gpt_model/vocab.json',
 '/content/gpt_model/merges.txt',
 '/content/gpt_model/added_tokens.json')

### Test Model with user input prompts

In [None]:
def generate_response(model, tokenizer, prompt, max_length=100):
    """Generate a continuation of ``prompt`` and return it as decoded text.

    Args:
        model: Language model exposing ``generate``. If it also exposes a
            ``device`` attribute, the inputs are moved to that device first.
        tokenizer: Tokenizer matching ``model``; must provide ``encode``,
            ``decode`` and ``eos_token_id``.
        prompt: Text the model should continue.
        max_length: Forwarded unchanged to ``model.generate`` as
            ``max_length``.
            NOTE(review): this limit presumably counts the prompt tokens as
            well as the generated ones - confirm against the library docs.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")  # 'pt' -> PyTorch tensor

    # Keep the inputs on the same device as the model; otherwise generation
    # fails when the model lives on a GPU while the inputs stay on the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # A single unpadded prompt: every position is a real token, so the
    # attention mask is all ones.
    attention_mask = torch.ones_like(input_ids)

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        # Reuse the end-of-sequence id as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Load the fine-tuned model and tokenizer back from the directory they were
# saved to, under new names

my_model = GPT2LMHeadModel.from_pretrained(model_output_path)
my_tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Prompt 1: a free-form question about the agreement, generated with the
# default max_length
prompt = "what defines WITNESS in the agreement?"  # Replace with your desired prompt
response = generate_response(my_model, my_tokenizer, prompt)
print("Generated response:", response)

Generated response: what defines WITNESS in the agreement?
The SELLER has agreed to get consent deed duly executed to this Sale transaction from his wife/her husband, sons and daughters on or before date of registration of Sale Deed and assured that they all join to execute sale deed in favour of the purchaser.
It is hereby expressly provided and agreed by the parties here to that both parties are entitled to enforce specific performance of the agreement against each other in case of breach of any conditions mentioned in this Agreement


In [None]:
# Prompt 3: a few-shot prompt with three worked question/answer samples and
# a fourth sample whose answer is left blank for the model to complete.
# (An earlier wording of the samples started each question with "Who is the".)

prompt = """
Extract answer only from the document.
Sample1: SELLER for purchaser Mrs. UMA P ? Answer: Mr. MANIKANDAN s/o. PURUSHOTHAMAN
Sample2: SELLER for purchaser Mr. BENGALURU ? Answer: Mr. KANNAN s/o. MANIKAM
Sample3: SELLER for purchaser Mr. MANIKANDANNNP ? Answer: Mr. SHANMUGAM s/o. NAIKUM
Sample4: SELLER for purchaser Mr. AMMANJI ? Answer:
"""

# max_length is raised above the function's default of 100 for this prompt
response = generate_response(my_model, my_tokenizer, prompt, max_length=150)
print("Generated response:", response)

Generated response: 
Extract answer only from the document.
Sample1: SELLER for purchaser Mrs. UMA P? Answer: Mr. MANIKANDAN s/o. PURUSHOTHAMAN
Sample2: SELLER for purchaser Mr. BENGALURU? Answer: Mr. KANNAN s/o. MANIKAM
Sample3: SELLER for purchaser Mr. MANIKANDANNNP? Answer: Mr. SHANMUGAM s/o. NAIKUM
Sample4: SELLER for purchaser Mr. AMMANJI? Answer:
SCHEDULE
IN WITNESS WHEREOF the SELLER and the PURCHASER have signed this
