## Helpful Code for Training Models with Hugging Face

> Install and Import Libraries

In [None]:
!pip install evaluate, rouge_score

In [None]:
import os
import sys
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import Dataset
import accelerate   # for distributed training
from diffusers import StableDiffusionPipeline
import evaluate     # custom evaluation script
import torchmetrics
import arxiv  

# Huggingface Transformers
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM, 
    Trainer, 
    TrainingArguments,
    DataCollatorForSeq2Seq
)

> Data Collection

In [None]:
# Define the search query
search_text = "Deep Learning for Ageing Research"

# Search for papers on arXiv, ordered by relevance to the query
search = arxiv.Search(query=search_text, max_results=50, sort_by=arxiv.SortCriterion.Relevance)

# Collect the results through a Client: Search.results() is deprecated in
# recent releases of the arxiv package in favour of Client.results(search).
client = arxiv.Client()
result_list = [
    {
        "title": result.title,
        "published": result.published,
        "abstract": result.summary,
        "url": result.pdf_url,
        "categories": result.categories,
    }
    for result in client.results(search)
]

# Save the results to a JSON file (optional).
# `published` is a datetime, which json cannot encode on its own; default=str
# writes it as text while result_list itself keeps the datetime objects.
with open('arxiv_papers.json', 'w', encoding='utf-8') as f:
    json.dump(result_list, f, indent=4, default=str)


> Prepare the Dataset

In [None]:
# Build one record with 'document' and 'summary' keys per collected paper.
# For demonstration, the abstract is used as both the document and the summary;
# in practice, replace "summary" with actual reference summaries if available.
train_data = [
    {"document": paper["abstract"], "summary": paper["abstract"]}
    for paper in result_list
]

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(pd.DataFrame(train_data))

# Split the dataset into training and evaluation sets.
# A fixed seed keeps the split identical across kernel restarts, so evaluation
# scores stay comparable between runs.
SPLIT_SEED = 42
split_dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']


> Tokenization

In [None]:

# Sequence-length caps (in tokens) used when truncating inputs and targets
max_input_length, max_target_length = 512, 128

# Checkpoint name shared by the tokenizer and the model
model_name = "facebook/bart-large-cnn"

# Fetch the tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with "document" and "summary" entries, as
            passed by ``Dataset.map`` when called with ``batched=True``.

    Returns:
        The tokenizer output for the documents (truncated to
        ``max_input_length``) with an added "labels" entry holding the token
        ids of the summaries (truncated to ``max_target_length``).
    """
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_input_length,
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the target side.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches with the same function (train first, then eval)
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)


> Data Collator

In [None]:
# Set up a data collator that pads dynamically: each batch is padded to the
# length of its longest sample instead of to one fixed global length.
#
# No `model=` is passed. The argument is optional, and supplying
# `AutoModelForSeq2SeqLM.from_pretrained(model_name)` here loaded a second full
# copy of the checkpoint only to hand it to the collator. When a batch carries
# `labels` but no `decoder_input_ids`, BART derives the decoder inputs from the
# labels inside its forward pass.
# NOTE(review): if label smoothing is enabled later, pass the training model
# via `model=` so the collator can prepare `decoder_input_ids` itself.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
)


> Define Training Arguments

In [None]:
# Probe CUDA once and reuse the answer for the device and the fp16 switch
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using device: {device}")

# Trainer configuration, grouped by concern.
# NOTE(review): with only a few dozen training examples, 3 epochs at batch
# size 2 likely finish in well under 500 steps, so step-based evaluation and
# checkpointing may never trigger during training — confirm the cadence.
training_args = TrainingArguments(
    # What to run and where to write checkpoints
    output_dir="./models",
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    # Batch sizes
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluation and checkpoint cadence
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    # Logging
    logging_dir="./logs",
    logging_steps=10,
    # Mixed precision only when a GPU is present
    fp16=cuda_available,
    # Best-checkpoint selection (higher ROUGE-2 is better)
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)


> Initialize the Trainer

In [None]:
# Instantiate the pre-trained checkpoint, then place it on the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# ROUGE scorer used by compute_metrics
rouge = evaluate.load("rouge")

# rouge = torchmetrics.text.ROUGEScore()  # Initialize TorchMetrics ROUGE


# Define a compute_metrics function
def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for one evaluation pass.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be a
            2-D array of token ids, a 3-D array of logits (reduced with an
            argmax over the last axis), or a tuple whose first item is one of
            those. ``labels`` holds token ids, with -100 marking positions
            that must not be decoded.

    Returns:
        Dict mapping every key reported by the ROUGE metric to its score
        multiplied by 100.
    """
    import re  # local import keeps this cell self-contained

    predictions, labels = eval_pred

    # A plain Trainer hands over raw model outputs instead of generated ids;
    # for seq2seq models this can be a tuple with the logits in first place.
    # NOTE(review): the argmax below yields teacher-forced predictions. For
    # ROUGE on truly generated summaries, use Seq2SeqTrainer with
    # predict_with_generate=True (2-D ids pass through here unchanged).
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 cannot be decoded, so swap in the pad token id
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    def _one_sentence_per_line(text):
        # Rouge expects a newline after each sentence. Split on whitespace that
        # follows sentence-ending punctuation (tokenizer.tokenize would split
        # into subword pieces, not sentences).
        return "\n".join(re.split(r"(?<=[.!?])\s+", text.strip()))

    decoded_preds = [_one_sentence_per_line(pred) for pred in decoded_preds]
    decoded_labels = [_one_sentence_per_line(label) for label in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    def _as_percent(score):
        # `evaluate` returns plain floats; older metric implementations return
        # aggregate objects exposing `.mid.fmeasure`. Accept both.
        mid = getattr(score, "mid", None)
        value = mid.fmeasure if mid is not None else score
        return float(value) * 100

    return {key: _as_percent(value) for key, value in result.items()}

# Gather everything the Trainer needs, then build it in a single call
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_eval,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)


> Train and Evaluate the Model

In [None]:
# Fine-tune the model
train_output = trainer.train()

# Score the fine-tuned model on the evaluation split; as the last expression
# of the cell, the metrics are shown as the cell output
eval_metrics = trainer.evaluate()
eval_metrics


In [None]:
# Persist the fine-tuned weights and the tokenizer files
model.save_pretrained("model-saved")  # Save the model
tokenizer.save_pretrained("tokenizer-saved")  # Save the tokenizer


# Load the saved model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("model-saved").to(device)
# Tokenizers are not torch modules and have no .to() method; only the tensors
# they produce are moved to a device.
tokenizer = AutoTokenizer.from_pretrained("tokenizer-saved")

> Visualizing AI-Generated Images, Audio, and Video

In [None]:
from IPython.display import display, Markdown, Audio, Image, Video
import matplotlib.pyplot as plt
import librosa
import librosa.display
import cv2

# --- Text: render the (placeholder) generated text as Markdown ---
generated_text = "Once upon a time, in a land far away..."
text_block = Markdown(f"### **Generated Text:**\n{generated_text}")
display(text_block)

# --- Image: show the generated picture at a fixed display size ---
generated_image_path = "generated_image.png"
image_view = Image(filename=generated_image_path, width=400, height=300)
display(image_view)

# --- Audio: embed a player without auto-playing ---
generated_audio_path = "generated_audio.mp3"
audio_player = Audio(filename=generated_audio_path, autoplay=False)
display(audio_player)

# --- Audio waveform: load the samples and plot amplitude over time ---
waveform, sample_rate = librosa.load(generated_audio_path)
plt.figure(figsize=(14, 5))
librosa.display.waveshow(waveform, sr=sample_rate)
plt.title("AI-Generated Audio Waveform")
plt.xlabel("Time (seconds)")
plt.ylabel("Amplitude")
plt.show()

# --- Video: embed the clip in the notebook ---
generated_video_path = "generated_video.mp4"
video_player = Video(filename=generated_video_path, embed=True, width=640, height=480)
display(video_player)

# --- Video preview: grab the first frame and show it as an image ---
video_capture = cv2.VideoCapture(generated_video_path)
frame_ok, first_frame = video_capture.read()

if not frame_ok:
    print("Failed to read the video.")
else:
    # OpenCV decodes frames as BGR; matplotlib expects RGB
    first_frame_rgb = cv2.cvtColor(first_frame, cv2.COLOR_BGR2RGB)
    plt.imshow(first_frame_rgb)
    plt.axis('off')
    plt.title("First Frame of the AI-Generated Video")
    plt.show()

video_capture.release()


TypeError: a bytes-like object is required, not 'NoneType'

<IPython.core.display.Video object>

In [33]:
from IPython.display import Audio, display

# Build a player for a remote audio file (auto-plays once rendered), then show it
audio_url = "https://www.soundjay.com/ambient/sounds/boarding-accouncement-1.mp3"
remote_audio = Audio(url=audio_url, autoplay=True)
display(remote_audio)


