# ***FINE TUNING GPT2 ON CUSTOM DATA***

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset directly
dataset = load_dataset("text", data_files="/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Data/dojdataset.txt", split="train")  # Replace "your_data.txt" with your file name

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add special tokens
special_tokens = {
    'additional_special_tokens': ['<|startoftext|>', '|bot|', '<|endoftext|>']
}
tokenizer.add_special_tokens(special_tokens)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for dynamic padding
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Trained LLM model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    prediction_loss_only=True,
    evaluation_strategy="no",
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Trained LLM model")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Trained LLM model")

# Testing the model (optional)
def generate_answer(question):
    input_text = f"<|startoftext|> {question} |bot|"
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example
question = "What are the number of cases registered as of January,2023?"
print(generate_answer(question))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Step,Training Loss
10,3.7964
20,3.2373
30,2.9222
40,2.6012
50,2.2988
60,2.3959
70,2.1564
80,2.0958
90,1.9573


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 What are the number of cases registered as of January,2023?  The number of registered cases is currently at 1,000,000. 


# ***TRAINED GPT2 MODEL INFO***

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Path to your model directory
model_path = "/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Trained LLM model"

# Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained(model_path)  # No safetensors keyword here
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# Check if the model was loaded properly
print(model)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50260, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50260, bias=False)
)


# ***TESTING PRE TRAINED MODEL***

In [2]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Trained LLM model"  # Path to your fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Define a pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)

# Test Queries
def generate_response(query):
    input_text = f"<|startoftext|> {query} |bot|"
    output = generator(input_text, max_length=100, num_return_sequences=1)
    response = output[0]['generated_text'].split("|bot|")[1].split("<|endoftext|>")[0].strip()
    return response

# Example Query
query = "Can you explain what the Tele-Law service is?"
response = generate_response(query)
print(f"Query: {query}")
print(f"Response: {response}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Query: Can you explain what the Tele-Law service is?
Response: It provides legal advice on domestic disputes involving domestic violence, public safety issues such as domestic terrorism, and financial matters. The service is available for free in over 20 cities around the country, providing free legal help and advice to domestic claimants, especially in cases involving child marriage, dowry, forced marriage, child marriage, and dowry fraud.


# ***MAIN QUERY MODEL***


In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset directly
dataset = load_dataset("text", data_files="/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Data/querydata.txt", split="train")  # Replace "your_data.txt" with your file name

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add special tokens
special_tokens = {
    'additional_special_tokens': ['<|startoftext|>', '|bot|', '<|endoftext|>']
}
tokenizer.add_special_tokens(special_tokens)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator for dynamic padding
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Main Query Model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    prediction_loss_only=True,
    evaluation_strategy="no",
    report_to="none"
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Main Query Model")
tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Main Query Model")

# Testing the model (optional)
def generate_answer(question):
    input_text = f"<|startoftext|> {question} |bot|"
    inputs = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Example
question = "What are the number of cases registered as of January,2023?"
print(generate_answer(question))


Generating train split: 0 examples [00:00, ? examples/s]

model.safetensors:  61%|######1   | 336M/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/120 [00:00<?, ? examples/s]



Step,Training Loss
10,4.4666
20,2.723
30,2.7742
40,2.2415
50,2.2171
60,2.1174
70,1.9535
80,1.8876
90,1.8647


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 What are the number of cases registered as of January,2023?  model1 


In [3]:
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel

# Load fine-tuned model and tokenizer
model_path = "/content/drive/MyDrive/Colab Notebooks/SIH- DOJ CHATBOT/Main Query Model"  # Path to your fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Define a pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id)

# Test Queries
def generate_response(query):
    input_text = f"<|startoftext|> {query} |bot|"
    output = generator(input_text, max_length=100, num_return_sequences=1)
    response = output[0]['generated_text'].split("|bot|")[1].split("<|endoftext|>")[0].strip()
    return response

# Example Query
query = "What are total number of civil cases registered in 2016 India?"
response = generate_response(query)
print(f"Query: {query}")
print(f"Response: {response}")


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Query: What are total number of civil cases registered in 2016 India?
Response: model2
