In [None]:
import os

# Enumerate GPUs in PCI bus order and expose only physical GPU 1 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})

In [None]:
# %pip installs into the environment of the running kernel; a bare `!pip` may hit another interpreter.
%pip install huggingface_hub ipywidgets
%pip install -U "huggingface_hub[cli]"


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face login; the training arguments further down set push_to_hub=True.
notebook_login()    

In [None]:
# Per-step training losses; appended to inside the manual Accelerate training loop.
record = []
# Loss histories; after training, the recorded losses are appended here as one list of floats.
x = []

In [None]:
# %pip targets the running kernel's environment.
%pip install rouge_score

In [None]:
# %pip targets the running kernel's environment.
%pip install indic-nlp-library

In [None]:
# %pip targets the running kernel's environment.
%pip install datasets transformers accelerate evaluate sentencepiece

In [None]:
# NOTE: sentencepiece is already part of the previous install cell; kept for standalone use.
%pip install sentencepiece

In [None]:
import torch

# Report every CUDA device PyTorch can see, or say that there is none.
if not torch.cuda.is_available():
    print("CUDA is not available. Make sure you have a GPU and PyTorch with CUDA support installed.")
else:
    # How many devices are visible to this process
    num_gpus = torch.cuda.device_count()
    print("Number of available GPUs:", num_gpus)

    # One line per device: its index followed by its name
    for gpu_id in range(num_gpus):
        gpu_name = torch.cuda.get_device_name(gpu_id)
        print("GPU", gpu_id, ":", gpu_name)


In [None]:
# NOTE(review): `!export` runs in a throwaway subshell, so this line does not change the
# kernel's environment. GPU visibility is set through os.environ in the first cell.
!export CUDA_VISIBLE_DEVICES=0

In [None]:
from torch import cuda
import torch

# Pick the compute device once: 'cuda' when a GPU is usable, otherwise 'cpu'.
device = 'cuda' if cuda.is_available() else 'cpu'
if device == 'cuda':
    # Index 0 is relative to CUDA_VISIBLE_DEVICES, i.e. the first *visible* GPU.
    # Guarded because set_device fails on machines without CUDA.
    torch.cuda.set_device(0)

In [None]:
import torch

# Show the index of the CUDA device that is currently selected, if there is one.
if not torch.cuda.is_available():
    print("CUDA is not available. Make sure you have a GPU and PyTorch with CUDA support installed.")
else:
    gpu_number = torch.cuda.current_device()
    print("GPU number:", gpu_number)


## Getting the model and datasets

In [None]:
from datasets import load_dataset

# Load the 'hi' (Hindi) configuration of the IndicSentenceSummarization dataset.
dataset = load_dataset("ai4bharat/IndicSentenceSummarization", 'hi')

In [None]:
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AlbertTokenizer, AutoTokenizer

# IndicBART tokenizer: slow implementation, no lower-casing, accents kept.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)

# Or use tokenizer = AlbertTokenizer.from_pretrained("ai4bharat/IndicBART", do_lower_case=False, use_fast=False, keep_accents=True)

# Pretrained IndicBART sequence-to-sequence checkpoint that is fine-tuned below.
model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")

In [None]:

# Or use model = MBartForConditionalGeneration.from_pretrained("ai4bharat/IndicBART")

# Look up the ids of the special tokens; they are reused later as generation arguments.
bos_id = tokenizer._convert_token_to_id_with_added_voc("<s>")
eos_id = tokenizer._convert_token_to_id_with_added_voc("</s>")
pad_id = tokenizer._convert_token_to_id_with_added_voc("<pad>")
# To get lang_id use any of ['<2as>', '<2bn>', '<2en>', '<2gu>', '<2hi>', '<2kn>', '<2ml>', '<2mr>', '<2or>', '<2pa>', '<2ta>', '<2te>']

# First tokenize the input and outputs. The format below is how IndicBART was trained so the input should be "Sentence </s> <2xx>" where xx is the language code. Similarly, the output should be "<2yy> Sentence </s>".
inp = tokenizer("I am a boy </s> <2en>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[  466,  1981,    80, 25573, 64001, 64004]])

out = tokenizer("<2hi> मैं  एक लड़का हूँ </s>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[64006,   942,    43, 32720,  8384, 64001]])
# Note that if you use any language other than Hindi or Marathi, you should convert its script to Devanagari using the Indic NLP Library.
# Teacher-forced forward pass: the decoder input is the target without its last token,
# and the labels are the target without its first token (the same sequence shifted by one).
model_outputs=model(input_ids=inp, decoder_input_ids=out[:,0:-1], labels=out[:,1:])

# Inspect the full target ids, the labels, and (as the cell's displayed value) the decoder inputs.
print(out)
print(out[:,1:])
out[:,0:-1] ## This is not label smoothed.

# For logits
#model_outputs.logits


In [None]:
# Show the integer id the tokenizer maps "<pad>" to.
print(pad_id)

In [None]:

# Generation demo. Note that decoding starts from the target-language tag
# (decoder_start_token_id), here "<2en>".

model.eval() # Set dropouts to zero

model_output = model.generate(
    inp,
    use_cache=True,
    num_beams=4,
    max_length=20,
    min_length=1,
    early_stopping=True,
    pad_token_id=pad_id,
    bos_token_id=bos_id,
    eos_token_id=eos_id,
    decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2en>"),
)

# Turn the first generated sequence back into a string
decoded_output = tokenizer.decode(
    model_output[0],
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print(decoded_output) # I am a boy
# Note that if your output language is not Hindi or Marathi, you should convert its script from Devanagari to the desired language using the Indic NLP Library.


# Preprocess the dataset

In [None]:
# Drop the 'id' and 'url' columns; the cells below only read 'input' and 'target'.
# NOTE(review): `dataset` is overwritten, so re-running this cell likely fails because the columns are already gone.
dataset = dataset.remove_columns(['id', 'url'])

In [None]:
# Context length of the model
from transformers import AutoConfig

# Load the model configuration
config = AutoConfig.from_pretrained("ai4bharat/IndicBART")

# Number of position embeddings in the configuration.
# NOTE(review): this value is only printed; preprocessing uses its own hard-coded max_input_length.
max_length = config.max_position_embeddings

print("Maximum sequence length:", max_length)

In [None]:
def add_words(example):
    """Wrap a batch of texts in the tagged format used for IndicBART.

    Every entry of ``example['input']`` gets ``"  </s> <2hi>"`` appended, and every
    entry of ``example['target']`` is wrapped as ``"<2hi> ... </s>"``. The batch
    dict is modified in place and also returned.
    """
    tagged_inputs = []
    for text in example['input']:
        tagged_inputs.append(text + "  </s> <2hi>")
    example['input'] = tagged_inputs

    tagged_targets = []
    for summary in example['target']:
        tagged_targets.append("<2hi> " + summary + " </s>")
    example['target'] = tagged_targets

    return example

# Tag every split in batches; the tagged copy is kept under a new name.
mod_dataset = dataset.map(add_words, batched=True)


In [None]:
max_input_length = 1024


def preprocess_function(examples):
    """Tokenize a batch of tagged inputs and targets.

    Args:
        examples: batch dict with "input" and "target" lists of strings. The
            texts already carry their "</s>" and language tags, so no special
            tokens are added here.

    Returns:
        The tokenizer output for the inputs, plus a "labels" entry holding the
        token ids of the targets.

    Sequences are truncated to ``max_input_length`` but deliberately NOT padded.
    Padding here would put pad-token ids into the labels, and the loss would be
    computed on them. Padding is left to the data collator at batch time.
    """
    model_inputs = tokenizer(
        examples["input"],
        max_length=max_input_length,
        add_special_tokens=False,
        truncation=True,
    )
    labels = tokenizer(
        examples["target"],
        max_length=max_input_length,
        add_special_tokens=False,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize every split in batches.
tokenized_dataset = mod_dataset.map(preprocess_function, batched=True)

In [None]:
# Drop the original text columns and token_type_ids from the tokenized dataset.
tokenized_dataset = tokenized_dataset.remove_columns(
    mod_dataset["train"].column_names + ["token_type_ids"]
)

In [None]:
# The raw and tagged datasets are not referenced again below; drop the names.
del dataset
del mod_dataset

# Evaluation with ROUGE

In [None]:
import evaluate

# ROUGE metric object; used by compute_metrics and by the manual evaluation loop.
rouge_score = evaluate.load("rouge")

In [None]:
import numpy as np
import indicnlp.tokenize.sentence_tokenize as tok

def compute_metrics(eval_pred):
    """Compute ROUGE scores (scaled to 0-100) for a batch of generated summaries.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element is the
            token-id array.

    Returns:
        Dict mapping each ROUGE key to its score multiplied by 100 and rounded
        to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_token_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded; swap it for the pad token.
    predictions = np.where(predictions != -100, predictions, pad_token_id)
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(tok.sentence_split(pred.strip(), lang='hi', delim_pat='auto')) for pred in decoded_preds]
    decoded_labels = ["\n".join(tok.sentence_split(label.strip(), lang='hi', delim_pat='auto')) for label in decoded_labels]

    # The default ROUGE tokenizer keeps only ASCII letters and digits, so Devanagari
    # text would be reduced to nothing. Split on whitespace instead.
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
# Smoke-test the ROUGE metric on toy data.
# `datasets.load_metric` is deprecated (and absent from recent `datasets` releases);
# `evaluate.load` is its replacement.
import evaluate

# Load the ROUGE metric
rouge_metric = evaluate.load("rouge")

# Example data: reference summaries and generated summaries
references = ["Reference summary 1", "Reference summary 2"]
hypotheses = ["Generated summary 1", "Generated summary 2"]

# Compute ROUGE scores with the metric object loaded in this cell
rouge_scores = rouge_metric.compute(predictions=hypotheses, references=references)

# Print the scores
print(rouge_scores)


## Creating a subset

In [None]:
from datasets import DatasetDict, Dataset

# Fraction of every split that is kept for quick experiments.
split_size = 0.1
sub_train_size = int(tokenized_dataset["train"].num_rows * split_size)
sub_test_size = int(tokenized_dataset["test"].num_rows * split_size)
sub_val_size = int(tokenized_dataset["validation"].num_rows * split_size)

# Shuffle with a fixed seed, then keep the first N rows. `select` keeps the rows as a
# Dataset instead of round-tripping them through an in-memory Python dict.
train_subset = tokenized_dataset["train"].shuffle(seed=42).select(range(sub_train_size))
test_subset = tokenized_dataset["test"].shuffle(seed=42).select(range(sub_test_size))
val_subset = tokenized_dataset["validation"].shuffle(seed=42).select(range(sub_val_size))


In [None]:
# Show the summary of the training subset.
print(train_subset)

In [None]:
# Bundle the three subsets under the usual split names; the last line displays the result.
subset_dataset = DatasetDict()
subset_dataset["train"] = train_subset
subset_dataset["test"] = test_subset
subset_dataset["validation"] = val_subset
subset_dataset

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator shared by the Trainer and by the manual DataLoaders below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wandb

In [None]:
# %pip targets the running kernel's environment.
%pip install wandb
    

In [None]:
import wandb

In [None]:
# Start the Weights & Biases run that the training cells log to.
run = wandb.init(
    project="hindi-summariser",
    entity="parvp",
    config={
        'learning_rate': 5e-5,
        'architecture': 'IndicBART',
        # Stored as a number (it was the string "8") so it is recorded as a numeric hyperparameter.
        "epochs": 8,
    },
)


In [None]:
def wandb_callback():
    """Placeholder for a Weights & Biases callback.

    The original cell held only an unfinished ``def`` header, which is a syntax
    error. Nothing in the notebook calls this function yet.

    Raises:
        NotImplementedError: always, until the callback is implemented.
    """
    raise NotImplementedError("wandb_callback has not been implemented yet")

In [None]:
# Drops only the local name `run`.
# NOTE(review): this presumably does not end the wandb run (later cells still call wandb.log) — confirm.
del run

In [None]:
# Troubleshooting reference for errors when starting a wandb run:
# https://docs.wandb.ai/guides/track/tracking-faq#initstarterror-error-communicating-with-wandb-process-

## Generation Config

In [None]:
# Display the configuration dictionary of the IndicBART checkpoint.
model.config.get_config_dict("ai4bharat/IndicBART")

In [None]:
from transformers import GenerationConfig
# Decoding settings; attached to the model below and passed to generate() in the manual evaluation loop.
generation_config = GenerationConfig(
    max_new_tokens=512, 
    do_sample=True, 
    top_k=50,
    use_cache=True,
    num_beams=4,
    min_length=5,
    pad_token_id=pad_id,
    bos_token_id=bos_id, 
    eos_token_id=eos_id, 
    # Decoding starts from the Hindi language tag.
    decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>")   
)

# NOTE(review): save_pretrained appears to take a directory, so "config.json" is probably a folder name here rather than a file — confirm.
generation_config.save_pretrained("config.json")



In [None]:
# Load the configuration back from the path it was saved to
loaded_config = GenerationConfig.from_pretrained("config.json")
print(loaded_config)
# Attach the reloaded settings to the model
model.generation_config = loaded_config


In [None]:
# Confirm which generation settings the model now carries.
print(model.generation_config)

In [None]:
# Load the checkpoint's configuration a second time and compare it with the `config` object loaded earlier.
bruh = AutoConfig.from_pretrained("ai4bharat/IndicBART")
print(config == bruh)

# Training

In [None]:
from transformers import Seq2SeqTrainingArguments

batch_size = 8
num_train_epochs = 8
# Show the training loss with every epoch. logging_steps counts optimizer steps, not
# examples, so divide the size of the split the Trainer is actually given (the subset)
# by the batch size.
# NOTE(review): assumes a single device and no gradient accumulation.
logging_steps = max(1, len(subset_dataset["train"]) // batch_size)
model_name = "IndicBART"

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-IndicSentenceSummarisation",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
    report_to="wandb",
    run_name="first"
)


In [None]:
from transformers import Seq2SeqTrainer

# Trainer over the 10% subset. No metric function is attached, so evaluation
# is run without compute_metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=subset_dataset["train"],
    eval_dataset=subset_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=None,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# List the names of the special-token attributes defined on the tokenizer.
# (The original `print(tokenizer.)` was an unfinished expression and a syntax error.)
tokenz = tokenizer.SPECIAL_TOKENS_ATTRIBUTES
print(tokenz)

In [None]:
# Evaluate on the Trainer's eval_dataset (the validation subset).
trainer.evaluate()

In [None]:
# Release cached CUDA memory, then print how many bytes are still allocated.
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated())

In [None]:
# Repeat the forward-pass demo with the fine-tuned model.
inp = tokenizer("I am a boy </s> <2en>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[  466,  1981,    80, 25573, 64001, 64004]])

out = tokenizer("<2hi> मैं  एक लड़का हूँ </s>", add_special_tokens=False, return_tensors="pt", padding=True).input_ids # tensor([[64006,   942,    43, 32720,  8384, 64001]])
# Note that if you use any language other than Hindi or Marathi, you should convert its script to Devanagari using the Indic NLP Library.
# Tensor.to() returns a new tensor rather than moving in place, so the result must be
# assigned. The model's own device is used so the inputs always match the model.
inp = inp.to(model.device)
out = out.to(model.device)

model_outputs=model(input_ids=inp, decoder_input_ids=out[:,0:-1], labels=out[:,1:])

In [None]:
string = "मुंबई इंडियंस ने आईपीएल 2024 के 14वें मैच में पहले बैटिंग करते हुए 20 ओवर में 9 विकेट खोकर 125 रन बनाए। राजस्थान रॉयल्स के गेंदबाजों के सामने मुंबई का टॉप-ऑर्डर फ्लॉप रहा। मुंबई के तीन बल्लेबाजों को ट्रेंट बोल्ट ने शून्य पर पवेलियन भेजा। हार्दिक और तिलक वर्मा के बीच अर्धशतकीय साझेदारी बनी। तिलक के बल्ले से 32 रन और हार्दिक ने 34 रन की पारी खेली। बोल्ट के अलावा राजस्थान के लिए चहल ने भी 3 विकेट चटकाए। इसके जवाब में 126 रन का पीछा करते हुए राजस्थान की टीम ने 15.3 ओवर में ही लक्ष्य हासिल कर लिया। राजस्थान की टीम की तरफ से रियाग पराग ने नाबाद 54 रन की पारी खेली, जिसमें 5 चौके और 3 छक्के शामिल रहे।"
# Encode the article in the "text </s> <2hi>" format and place it on the model's device.
inp = tokenizer(
    f"{string} </s> <2hi>",
    add_special_tokens=False,
    return_tensors="pt",
    padding=True,
).input_ids.to(model.device)

# Generate one Hindi summary with sampling combined with beam search.
model_output = model.generate(
    inp,
    use_cache=True,
    min_length=30,  # increase min length
    num_return_sequences=1,
    # sampling settings
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=1.0,
    # beam-search settings
    num_beams=5,
    repetition_penalty=2.0,
    length_penalty=1.0,
    early_stopping=True,
    # special tokens
    pad_token_id=pad_id,
    bos_token_id=bos_id,
    eos_token_id=eos_id,
    decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"),
)


In [None]:
# Decode the first generated sequence to text; the last line displays it.
decoded_output=tokenizer.decode(model_output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
decoded_output

In [None]:
x = tokenized_dataset["train"]

In [None]:
x["input_ids"]

# Using Accelerator

In [None]:
# Switch the subset's output format to "torch" for the DataLoaders below.
subset_dataset.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

batch_size = 8
# Training batches are shuffled; both loaders batch with the shared seq2seq collator.
train_dataloader = DataLoader(
    subset_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
# Validation batches keep their order.
eval_dataloader = DataLoader(
    subset_dataset["validation"], collate_fn=data_collator, batch_size=batch_size
)

In [None]:
# NOTE(review): this cell is an exact duplicate of the previous one and simply rebuilds
# the same two DataLoaders; it can be deleted.
from torch.utils.data import DataLoader

batch_size = 8
train_dataloader = DataLoader(
    subset_dataset["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=batch_size,
)
eval_dataloader = DataLoader(
    subset_dataset["validation"], collate_fn=data_collator, batch_size=batch_size
)

In [None]:
from torch.optim import AdamW

# Optimizer for the manual training loop (learning rate 1e-5).
optimizer = AdamW(model.parameters(), lr=1e-5)

In [None]:
from accelerate import Accelerator

# Let Accelerate wrap the model, optimizer and both dataloaders; the wrapped
# objects replace the originals under the same names.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)
print(optimizer)

In [None]:
from transformers import get_scheduler

num_train_epochs = 20
# The scheduler is stepped once per batch, so the total step count is
# epochs x batches-per-epoch.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
import indicnlp.tokenize.sentence_tokenize as tok

def postprocess_text(preds, labels):
    """Put decoded predictions and references into one-sentence-per-line form.

    Args:
        preds: list of decoded prediction strings.
        labels: list of decoded reference strings.

    Returns:
        ``(preds, labels)``: each string is stripped and split into Hindi
        sentences, which are then joined with newlines.

    The original body read and returned the globals ``decoded_preds`` and
    ``decoded_labels``, so its arguments were ignored and the unsplit text was
    returned. This version works on its parameters only.
    """
    preds = ["\n".join(tok.sentence_split(pred.strip(), lang='hi', delim_pat='auto')) for pred in preds]
    labels = ["\n".join(tok.sentence_split(label.strip(), lang='hi', delim_pat='auto')) for label in labels]

    return preds, labels

In [None]:
from tqdm.auto import tqdm
import torch
import numpy as np

# Manual training loop: one pass over the training batches per epoch, followed by
# generation-based ROUGE evaluation on the validation batches.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        # Record a detached copy so the history does not hold on to each step's autograd graph.
        record.append(loss.detach())
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                # Inputs are padded per batch; the mask keeps pad positions out of attention.
                attention_mask=batch["attention_mask"],
                generation_config=generation_config
            )

            # Pad to a common length across processes before gathering
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
            )

            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()

            # Replace -100 in the labels as we can't decode them
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            decoded_preds = tokenizer.batch_decode(
                generated_tokens, skip_special_tokens=True
            )
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)

            decoded_preds, decoded_labels = postprocess_text(
                decoded_preds, decoded_labels
            )

            rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics. The default ROUGE tokenizer keeps only ASCII letters and digits,
    # which would discard Devanagari text, so split on whitespace instead.
    result = rouge_score.compute(tokenizer=lambda text: text.split())
    wandb.log(result)
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    print(f"Epoch {epoch}:", result, f"Loss: {loss}")

    # Save and upload
    """accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )"""

In [None]:
# Convert every recorded loss tensor to a Python float and store the run as one list.
x.append([y.cpu().detach().item() for y in record])

In [None]:
# Array of the most recently stored loss run; the last line displays its element count.
z = np.array(x[-1])
z = np.transpose(z)
z.size

In [None]:
import matplotlib.pyplot as plt

# 1-based index on the x-axis, one entry per recorded loss
epochs = range(1, len(record) + 1)

# Draw the loss curve through the explicit figure/axes interface
fig, ax = plt.subplots()
ax.plot(epochs, z, 'b', label='Training loss')
ax.set_title('Training Loss')
ax.set_xlabel('Steps')
ax.set_ylabel('Loss')
ax.legend()
plt.show()


In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    """Format a byte count with binary (1024-based) unit prefixes.

    By Fred Cirera, https://stackoverflow.com/a/1094933/1870254, modified.

    Args:
        num: the size to format.
        suffix: unit suffix appended after the prefix (default 'B').

    Returns:
        A string such as "1.5 KiB". Values too large for 'Zi' are reported in 'Yi'.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    idx = 0
    while idx < len(prefixes):
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, prefixes[idx], suffix)
        num = num / 1024.0
        idx += 1
    return "%.1f %s%s" % (num, 'Yi', suffix)

# Debug helper: list every name in the current namespace with its sys.getsizeof value,
# largest first.
for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(
                          locals().items())), key= lambda x: -x[1]):
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))


In [None]:
import sys

# Debug helper: the same listing as the previous cell, unsorted and in raw bytes.
# The items are copied into a list first because the loop itself adds names to the namespace.
local_vars = list(locals().items())
for var, obj in local_vars:
    print(var, sys.getsizeof(obj))
