### Step:1 Extract the text PDFs

In [1]:
from pathlib import Path
import PyPDF2

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.
    :param pdf_path: Path to the PDF file.
    :return: Extracted text as a string.
    """
    text = ""
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

In [3]:
def extract_text_from_pdfs(input_dir, output_dir):
    """
    Extracts text from all PDF files in a directory and saves them to text files.
    :param input_dir: Directory containing PDF files.
    :param output_dir: Directory to save extracted text files.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for pdf_file in input_path.glob("*.pdf"):
        print(f"Processing {pdf_file.name}")
        text = extract_text_from_pdf(pdf_file)
        output_file = output_path / f"{pdf_file.stem}.txt"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Extracted text saved to {output_file}")

In [4]:
# Usage
extract_text_from_pdfs("D:\\src_git\\LP\\LP\\projects\\paper_summarize\\", "D:\\src_git\\LP\\LP\\projects\\paper_summarize\\summary\\" )

Processing kafka.pdf
Extracted text saved to D:\src_git\LP\LP\projects\paper_summarize\summary\kafka.txt
Processing unikernels.pdf
Extracted text saved to D:\src_git\LP\LP\projects\paper_summarize\summary\unikernels.txt


### Step2: Prepare the dataset

In [5]:
import os
import json

In [6]:
def prepare_dataset(input_folder, output_file, task="summarization"):
    """
    Prepares a dataset for training a model.
    :param input_folder: Folder containing text files extracted from PDFs.
    :param output_file: Output JSON file for the dataset.
    :param task: Task type - 'summarization' or 'qa' (question-answering).
    """
    data = []
    for file in os.listdir(input_folder):
        if file.endswith(".txt"):
            with open(os.path.join(input_folder, file), "r", encoding="utf-8") as f:
                text = f.read()
            
            if task == "summarization":
                # Splitting into chunks (modify as per requirement)
                chunks = [text[i:i+500] for i in range(0, len(text), 500)]
                for chunk in chunks:
                    data.append({
                        "input_text": chunk,
                        "summary": "Provide a concise summary of the above text."
                    })
            elif task == "qa":
                # Example questions and answers for training (manual curation needed)
                data.append({
                    "context": text[:500],
                    "question": "What is the main topic of the document?",
                    "answer": "Provide the primary topic."
                })
    
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"Dataset saved to {output_file}")

In [7]:
# Example usage:
prepare_dataset("D:\\src_git\\LP\\LP\\projects\\paper_summarize\\summary\\", "summarization_dataset.json", task="summarization")
prepare_dataset("D:\\src_git\\LP\\LP\\projects\\paper_summarize\\summary\\", "qa_dataset.json", task="qa")

Dataset saved to summarization_dataset.json
Dataset saved to qa_dataset.json


### Step3: Train a hugging face model

In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

In [12]:
def train_model(dataset_path, output_dir, model_name="t5-small", task="summarization"):
    """
    Trains a Hugging Face model for summarization or question-answering.
    :param dataset_path: Path to the training dataset (JSON format).
    :param output_dir: Directory to save the trained model.
    :param model_name: Base model to fine-tune.
    :param task: Task type - 'summarization' or 'qa'.
    """
    # Load dataset
    dataset = load_dataset('json', data_files={"data": dataset_path})["data"]

    # Split dataset into train and test
    dataset_split = dataset.train_test_split(test_size=0.2, seed=42)  # 80% train, 20% test

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Tokenize data
    def preprocess_function(examples):
        inputs = examples["input_text"] if task == "summarization" else examples["context"] + " " + examples["question"]
        targets = examples["summary"] if task == "summarization" else examples["answer"]
        model_inputs = tokenizer(inputs, max_length=512, truncation=True)
        labels = tokenizer(targets, max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_data = dataset_split.map(preprocess_function, batched=True)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_steps=500,
        save_total_limit=2,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_data["train"],
        eval_dataset=tokenized_data["test"],
        tokenizer=tokenizer,
    )

    # Train the model
    trainer.train()
    trainer.save_model(output_dir)
    print(f"Model saved to {output_dir}")

In [13]:
# Example usage:
train_model("summarization_dataset.json", "trained_model_summarization", task="summarization")

Generating data split: 225 examples [00:00, 44998.97 examples/s]
Map: 100%|██████████| 180/180 [00:00<00:00, 4990.18 examples/s]
Map: 100%|██████████| 45/45 [00:00<00:00, 2566.75 examples/s]
  trainer = Trainer(
  0%|          | 0/69 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
                                               
 33%|███▎      | 23/69 [00:13<00:22,  2.09it/s]

{'eval_loss': 3.8300540447235107, 'eval_runtime': 0.783, 'eval_samples_per_second': 57.474, 'eval_steps_per_second': 7.663, 'epoch': 1.0}


                                               
 67%|██████▋   | 46/69 [00:25<00:10,  2.30it/s]

{'eval_loss': 2.967337131500244, 'eval_runtime': 0.784, 'eval_samples_per_second': 57.394, 'eval_steps_per_second': 7.653, 'epoch': 2.0}


                                               
100%|██████████| 69/69 [00:38<00:00,  1.78it/s]

{'eval_loss': 2.699805498123169, 'eval_runtime': 0.7693, 'eval_samples_per_second': 58.497, 'eval_steps_per_second': 7.8, 'epoch': 3.0}
{'train_runtime': 38.8011, 'train_samples_per_second': 13.917, 'train_steps_per_second': 1.778, 'train_loss': 3.7667722840240034, 'epoch': 3.0}
Model saved to trained_model_summarization





### Load the trained model

In [15]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def load_model(model_dir):
    """
    Loads a trained model and tokenizer from a directory.
    :param model_dir: Path to the directory containing the saved model.
    :return: Loaded tokenizer and model.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
    return tokenizer, model

# Example usage
tokenizer, model = load_model("trained_model_summarization")

### Test the model

In [19]:
def test_model(tokenizer, model, input_text, max_length=128):
    """
    Generates a prediction from the trained model.
    :param tokenizer: Loaded tokenizer.
    :param model: Loaded model.
    :param input_text: Input text to summarize or query.
    :param max_length: Maximum length of the generated output.
    :return: Generated summary or answer.
    """
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(inputs, max_length=max_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
input_text = """Log  processing  has  become  a  critical  component  of  the  data \
pipeline for consumer internet companies. We introduce Kafka, a \
distributed messaging system that we developed for collecting and \
delivering high volumes of log data with low latency. Our system \
incorporates  ideas  from  existing  log  aggregators  and  messaging \ 
systems,  and  is  suitable  for  both  offline  and  online  message \
consumption.  We  made  quite  a  few  unconventional  yet  practical \
design choices in Kafka to make our system efficient and scalable. \
Our experimental results show that Kafka has superior \
performance  when  compared  to  two  popular  messaging  systems. \ 
We  have  been  using  Kafka  in  production  for  some  time  and  it  is \ 
processing hundreds of gigabytes of new data each day"""
output = test_model(tokenizer, model, input_text)
print(f"Model Output: {output}")

  input_text = """Log  processing  has  become  a  critical  component  of  the  data \


Model Output: .  We have been using Kafka in production for some time and it is  processing hundreds of gigabytes of new data each day.


### Save the model for future use.

In [20]:
def save_model(tokenizer, model, save_dir):
    """
    Saves the trained model and tokenizer to a directory.
    :param tokenizer: Tokenizer to save.
    :param model: Model to save.
    :param save_dir: Directory to save the model and tokenizer.
    """
    tokenizer.save_pretrained(save_dir)
    model.save_pretrained(save_dir)
    print(f"Model saved to {save_dir}")

# Example usage
save_model(tokenizer, model, "final_trained_model")

Model saved to final_trained_model


### Share and test the saved model

In [22]:
# Reloading locally saved model
reloaded_tokenizer, reloaded_model = load_model("final_trained_model")

# Testing the reloaded model
input_text = """Log  processing  has  become  a  critical  component  of  the  data \
pipeline for consumer internet companies. We introduce Kafka, a \
distributed messaging system that we developed for collecting and \
delivering high volumes of log data with low latency. Our system \
incorporates  ideas  from  existing  log  aggregators  and  messaging \ 
systems,  and  is  suitable  for  both  offline  and  online  message \
consumption.  We  made  quite  a  few  unconventional  yet  practical \
design choices in Kafka to make our system efficient and scalable. \
Our experimental results show that Kafka has superior \
performance  when  compared  to  two  popular  messaging  systems. \ 
We  have  been  using  Kafka  in  production  for  some  time  and  it  is \ 
processing hundreds of gigabytes of new data each day"""
output = test_model(reloaded_tokenizer, reloaded_model, input_text)
print(f"Reloaded Model Output: {output}")


  input_text = """Log  processing  has  become  a  critical  component  of  the  data \


Reloaded Model Output: .  We have been using Kafka in production for some time and it is  processing hundreds of gigabytes of new data each day.


### Archiving the model for sharing.

In [23]:
import shutil
import os

def archive_model(model_dir, archive_name):
    """
    Archives a model directory into a .tar.gz file.
    :param model_dir: Directory containing the model files.
    :param archive_name: Name of the archive file (without extension).
    """
    archive_path = f"{archive_name}.tar.gz"
    shutil.make_archive(archive_name, 'gztar', model_dir)
    print(f"Model archived as {archive_path}")

# Example usage
archive_model("final_trained_model", "final_model_archive")


Model archived as final_model_archive.tar.gz
