Notebook for uploading PDF, extracting all Text and Pre-Processing using a 1B or 3B model

In [41]:
#!pip install PyPDF2
#!pip install rich ipywidgets

In [14]:
pdf_path = './2402.13116v3.pdf'
DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
#DEFAULT_MODEL = "meta-llama/Llama-3.2-1B-Instruct" <- Don't think this would be necessary

In [31]:
from difflib import HtmlDiff
from IPython.display import HTML, display

In [49]:
# Import necessary libraries
import PyPDF2
from typing import Optional
import os
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer

from tqdm.notebook import tqdm
import warnings

warnings.filterwarnings('ignore')

In [9]:
def validate_pdf(file_path: str) -> bool:
    if not os.path.exists(file_path):
        print(f"Error: File not found at path: {file_path}")
        return False
    if not file_path.lower().endswith('.pdf'):
        print("Error: File is not a PDF")
        return False
    return True

In [10]:
def extract_text_from_pdf(file_path: str, max_chars: int = 100000) -> Optional[str]:
    if not validate_pdf(file_path):
        return None
    
    try:
        with open(file_path, 'rb') as file:
            # Create PDF reader object
            pdf_reader = PyPDF2.PdfReader(file)
            
            # Get total number of pages
            num_pages = len(pdf_reader.pages)
            print(f"Processing PDF with {num_pages} pages...")
            
            extracted_text = []
            total_chars = 0
            
            # Iterate through all pages
            for page_num in range(num_pages):
                # Extract text from page
                page = pdf_reader.pages[page_num]
                text = page.extract_text()
                
                # Check if adding this page's text would exceed the limit
                if total_chars + len(text) > max_chars:
                    # Only add text up to the limit
                    remaining_chars = max_chars - total_chars
                    extracted_text.append(text[:remaining_chars])
                    print(f"Reached {max_chars} character limit at page {page_num + 1}")
                    break
                
                extracted_text.append(text)
                total_chars += len(text)
                print(f"Processed page {page_num + 1}/{num_pages}")
            
            final_text = '\n'.join(extracted_text)
            print(f"\nExtraction complete! Total characters: {len(final_text)}")
            return final_text
            
    except PyPDF2.PdfReadError:
        print("Error: Invalid or corrupted PDF file")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {str(e)}")
        return None


In [11]:
# Get PDF metadata
def get_pdf_metadata(file_path: str) -> Optional[dict]:
    if not validate_pdf(file_path):
        return None
    
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata = {
                'num_pages': len(pdf_reader.pages),
                'metadata': pdf_reader.metadata
            }
            return metadata
    except Exception as e:
        print(f"Error extracting metadata: {str(e)}")
        return None

In [12]:
# Extract metadata first
print("Extracting metadata...")
metadata = get_pdf_metadata(pdf_path)
if metadata:
    print("\nPDF Metadata:")
    print(f"Number of pages: {metadata['num_pages']}")
    print("Document info:")
    for key, value in metadata['metadata'].items():
        print(f"{key}: {value}")

# Extract text
print("\nExtracting text...")
extracted_text = extract_text_from_pdf(pdf_path)

# Display first 500 characters of extracted text as preview
if extracted_text:
    print("\nPreview of extracted text (first 500 characters):")
    print("-" * 50)
    print(extracted_text[:500])
    print("-" * 50)
    print(f"\nTotal characters extracted: {len(extracted_text)}")

# Optional: Save the extracted text to a file
if extracted_text:
    output_file = 'extracted_text.txt'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    print(f"\nExtracted text has been saved to {output_file}")

Extracting metadata...

PDF Metadata:
Number of pages: 44
Document info:
/Author: 
/CreationDate: D:20240311015030Z
/Creator: LaTeX with hyperref
/Keywords: 
/ModDate: D:20240311015030Z
/PTEX.Fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5
/Producer: pdfTeX-1.40.25
/Subject: 
/Title: 
/Trapped: /False

Extracting text...
Processing PDF with 44 pages...
Processed page 1/44
Processed page 2/44
Processed page 3/44
Processed page 4/44
Processed page 5/44
Processed page 6/44
Processed page 7/44
Processed page 8/44
Processed page 9/44
Processed page 10/44
Processed page 11/44
Processed page 12/44
Processed page 13/44
Processed page 14/44
Processed page 15/44
Processed page 16/44
Reached 100000 character limit at page 17

Extraction complete! Total characters: 100016

Preview of extracted text (first 500 characters):
--------------------------------------------------
1
A Survey on Knowledge Distillation of Large
Language Models
Xiaohan Xu1, M

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"

SYS_PROMPT = """
You are a world class text pre-processor, here is the raw data from a PDF, please parse and return it in a way that is crispy and usable to send to a podcast writer.

The raw data is messed up with new lines, Latex math and you will see fluff that we can remove completely. Basically take away any details that you think might be useless in a podcast author's transcript.

Remember, the podcast could be on any topic whatsoever so the issues listed above are not exhaustive

The goal is to use this in a podcast research transcript so a lot of the emails, citations, and things like that can be removed-please be smart with what you remove and be creative ok?

Remember DO NOT START SUMMARIZING THIS, YOU ARE ONLY CLEANING UP THE TEXT AND RETURNING AS IS

Be very smart and aggressive with removing details, you will get a running portion of the text and keep returning the processed text.

ALWAYS start your response directly with processed text and NO ACKNOWLEDGEMENTS about my questions ok?
Here is the text:
"""

In [22]:
accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained(
    DEFAULT_MODEL,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL, use_safetensors=True)
model, tokenizer = accelerator.prepare(model, tokenizer)

In [50]:
def process_chunk(text_chunk, chunk_num):
    """Process a chunk of text and return both input and output for verification"""
    conversation = [
        {"role": "system", "content": SYS_PROMPT},
        {"role": "user", "content": text_chunk},
    ]
    
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    
    with torch.no_grad():
        output = model.generate(
            **inputs,
            temperature=0.7,
            top_p=0.9,
            max_new_tokens=512
        )
    
    processed_text = tokenizer.decode(output[0], skip_special_tokens=True)[len(prompt):].strip()
    
    # Print chunk information for monitoring
    #print(f"\n{'='*40} Chunk {chunk_num} {'='*40}")
    print(f"INPUT TEXT:\n{text_chunk[:500]}...")  # Show first 500 chars of input
    print(f"\nPROCESSED TEXT:\n{processed_text[:500]}...")  # Show first 500 chars of output
    print(f"{'='*90}\n")
    
    return processed_text

In [51]:
INPUT_FILE = "./extracted_text.txt"  # Replace with your file path
CHUNK_SIZE = 1000  # Adjust chunk size if needed

# Read the file
with open(INPUT_FILE, 'r', encoding='utf-8') as file:
    text = file.read()

# Calculate number of chunks
num_chunks = (len(text) + CHUNK_SIZE - 1) // CHUNK_SIZE

# Cell 6: Process the file with ordered output
# Create output file name
output_file = f"clean_{os.path.basename(INPUT_FILE)}"

In [None]:
processed_text = ""  # Initialize complete processed text
with open(output_file, 'w', encoding='utf-8') as out_file:
    for chunk_num in tqdm(range(num_chunks), desc="Processing chunks"):
        # Get chunk with overlap
        start_idx = chunk_num * CHUNK_SIZE
        end_idx = start_idx + CHUNK_SIZE
        
        chunk = text[start_idx:end_idx]
        
        # Process chunk and append to complete text
        processed_chunk = process_chunk(chunk, chunk_num)
        processed_text += processed_chunk + "\n"
        
        # Write chunk immediately to file
        out_file.write(processed_chunk + "\n")
        
        # Force flush the file to disk
        out_file.flush()

Processing chunks:   0%|          | 0/101 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
1
A Survey on Knowledge Distillation of Large
Language Models
Xiaohan Xu1, Ming Li2, Chongyang Tao3, Tao Shen4, Reynold Cheng1, Jinyang Li1,
Can Xu5, Dacheng Tao6, Tianyi Zhou2
1The University of Hong Kong2University of Maryland3Microsoft
4University of Technology Sydney5Peking University6The University of Sydney
{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu
ckcheng@cs.hku.hk jl0725@connect.hku.hk
Abstract —In the era of Large Language Models (LLMs), Knowledge Distillati...

PROCESSED TEXT:
Tao Shen4, Reynold Cheng1, Jinyang Li1,
Can Xu5, Dacheng Tao6, Tianyi Zhou2
1The University of Hong Kong2University of Maryland3Microsoft
4University of Technology Sydney5Peking University6The University of Sydney
{shawnxxh,chongyangtao,hishentao }@gmail.com {minglii,tianyi }@umd.edu
ckcheng@cs.hku.hk...



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
ed knowledge to smaller models and its utility in model compression and self-
improvement. Our survey is meticulously structured around three foundational pillars: algorithm ,skill, and verticalization – providing
a comprehensive examination of KD mechanisms, the enhancement of specific cognitive abilities, and their practical implications
across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and KD, illustrating how
DA emerges as a powerfu...

PROCESSED TEXT:
ulously structured around three foundational pillars: algorithm, skill, and verticalization – providing a comprehensive examination of knowledge distillation mechanisms, the enhancement of specific cognitive abilities, and their practical implications across diverse fields. Crucially, the survey navigates the intricate interplay between data augmentation (DA) and knowledge distillation, illustrating how DA emerges as a powerful paradigm within the knowledge disti

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
on and
proposing future research directions. By bridging the gap between proprietary and open-source LLMs, this survey underscores the
potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal
terms that regulate the use of LLMs, ensuring ethical and lawful application of KD of LLMs. An associated Github repository is available
at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs.
Index Terms —Large lang...

PROCESSED TEXT:
ce LLMs, this survey underscores the potential for more accessible, efficient, and powerful AI solutions. Most importantly, we firmly advocate for compliance with the legal terms that regulate the use of LLMs, ensuring ethical and lawful application of knowledge distillation. An associated Github repository is available at https://github.com/Tebmer/Awesome-Knowledge-Distillation-of-LLMs.

Index Terms —Large language models, knowledge distillation, data augmentati

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 have un-
locked new realms of possibility, from generating human-
like text to offering sophisticated problem-solving capa-
bilities. The core significance of these LLMs lies in their
emergent abilities (Wei et al., 2022a,b; Xu et al., 2024a), a
phenomenon where the models display capabilities beyond
their explicit training objectives, enabling them to tackle a
diverse array of tasks with remarkable proficiency. Their
deep understanding of context, nuance, and the intrica-
cies of human languag...

PROCESSED TEXT:
ergent** abilities, a phenomenon where the models display capabilities beyond their explicit training objectives, enabling them to tackle a diverse array of tasks with remarkable proficiency. Their **deep** understanding of context, nuance, and intricacies of human language enables them to excel in a wide array of applications, from creative content generation to complex problem-solving....



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
g to revolutionize industries,
augment human creativity, and redefine our interaction with
technology.
Despite the remarkable capabilities of proprietary LLMs
like GPT-4 and Gemini, they are not without their shortcom-
ings, particularly when viewed in light of the advantages
offered by open-source models. A significant drawback is
their limited accessibility and higher cost (OpenAI et al.,
2023). These proprietary models often come with substantial
usage fees and restricted access, making them ...

PROCESSED TEXT:
ogy.
Despite the remarkable capabilities of proprietary LLMs like GPT-4 and Gemini, they are not without their shortcomings, particularly when viewed in light of the advantages offered by open-source models.
A significant drawback is their limited accessibility and higher cost (OpenAI et al., 2023). These models often come with substantial usage fees and restricted access, making them less attainable for individuals and smaller organizations.
In terms of data pri

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
straints of accessibility, cost, and adaptability
thus present significant challenges in leveraging the full
potential of proprietary LLMs.
In contrast to proprietary LLMs, open-source modelsarXiv:2402.13116v3  [cs.CL]  8 Mar 2024
2
like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al.,
2023a) bring several notable advantages. One of the primary
benefits of open-source models is their accessibility and
adaptability. Without the constraints of licensing fees or
restrictive usage policies, t...

PROCESSED TEXT:
aging the full
potential of proprietary LLMs. In contrast to proprietary LLMs, open-source models
arXiv:2402.13116v3  [cs.CL]  8 Mar 2024
2
like LLaMA (Touvron et al., 2023) and Mistral (Jiang et al., 2023a) bring several notable advantages. One of the primary benefits of open-source models is their accessibility and adaptability. Without the constraints of licensing fees or restrictive usage policies, these models are more readily available to a broader range of

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
y stemming from their relatively
limited scale and resources compared to their proprietary
counterparts. One of the most significant limitations is
the smaller model scale, which often results in lower per-
formance on real-world tasks with a bunch of instruc-
tions (Zheng et al., 2023a). These models, with fewer pa-
rameters, may struggle to capture the depth and breadth
of knowledge embodied in larger models like GPT-4. Ad-
ditionally, the pre-training investment in these open-source
models is...

PROCESSED TEXT:
parts. One of the most significant limitations is the smaller model scale, which often results in lower performance on real-world tasks with a bunch of instructions (Zheng et al., 2023a). These models, with fewer parameters, may struggle to capture the depth and breadth of knowledge embodied in larger models like GPT-4. Additionally, the pre-training investment in these open-source models is typically less substantial. This reduced investment can lead to a narrow

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
ized applications. This
limitation becomes particularly evident when these models
are compared to the highly fine-tuned proprietary LLMs,
which are often tailored to excel in a wide array of complex
scenarios (OpenAI et al., 2023).
Primarily, recognizing the disparities between propri-
etary and open-source LLMs, KD techniques have surged
as a means to bridge the performance gap between these
models (Gou et al., 2021; Gupta and Agrawal, 2022). Knowl-
edge distillation, in this context, involves ...

PROCESSED TEXT:

When it comes to recognizing the differences between proprietary and open-source LLMs, it becomes particularly evident when comparing them to highly fine-tuned proprietary models. These models are often tailored to excel in a wide array of complex scenarios.

**The Limitation of Proprietary Models**
------------------------------------

Recognizing the disparities between proprietary and open-source LLMs is crucial. The fine-tuning process for proprietary models

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
t al., 2021) has emerged as a
prevalent paradigm to achieve knowledge distillation of
LLMs, where a small seed of knowledge is used to prompt
the LLM to generate more data with respect to a specific
skill or domain (Taori et al., 2023). Secondly, KD still retains
its fundamental role in compressing LLMs, making them
more efficient without significant loss in performance. (Gu
et al., 2024; Agarwal et al., 2024). More recently, the strategy
of employing open-source LLMs as teachers for their own
s...

PROCESSED TEXT:
ed of knowledge is used to prompt the LLM to generate more data with respect to a specific skill or domain. Secondly, KD retains its fundamental role in compressing LLMs, making them more efficient without significant loss in performance. (Gu et al., 2024; Agarwal et al., 2024). More recently, the strategy of employing open-source LLMs as teachers for their own self-improvement has emerged as a promising approach, enhancing their capabilities significantly. Figur

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:

via self-generated knowledge.
A key aspect of the knowledge distillation is the en-
hancement of skills such as advanced context following
(e.g., in-context learning (Huang et al., 2022a) and in-
struction following (Taori et al., 2023)), improved align-
ment with user intents (e.g., human values/principles (Cui
et al., 2023a), and thinking patterns like chain-of-thought
(CoT) (Mukherjee et al., 2023)), and NLP task specialization
(e.g., semantic understanding (Ding et al., 2023a), and code
gen...

PROCESSED TEXT:
standing
and in-context learning
improved alignment with user intents
human values and principles
thinking patterns like chain-of-thought
and NLP task specialization
semantic understanding
code generation
these skills are crucial for a wide range of applications
from casual conversations to complex problem-solving
in specialized domains
in vertical domains like healthcare
law
science
where accuracy and contextual knowledge are paramount
knowledge distillation ena

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
rom the
proprietary models that have been extensively trained and
fine-tuned in these areas.
The benefits of knowledge distillation in the era of
LLMs are multifaceted and transformative (Gu et al., 2024).
Through a suite of distillation techniques, the gap between
proprietary and open-source models is significantly nar-
rowed (Chiang et al., 2023; Xu et al., 2023a) and even
filled (Zhao et al., 2023a). This process not only streamlines
computational requirements but also enhances the environ-
m...

PROCESSED TEXT:
of knowledge distillation in the era of LLMs are multifaceted and transformative.
Through a suite of distillation techniques, the gap between proprietary and open-source models is significantly narrowed.
This process streamlines computational requirements and enhances environmental sustainability of AI operations.
Open-source models become more proficient with lesser computational overhead.
Furthermore, knowledge distillation fosters an accessible and equitable A

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 growth across various industries
and research domains.
The escalating need for a comprehensive survey on the
knowledge distillation of LLMs stems from the rapidly
evolving landscape of AI (OpenAI et al., 2023; Team et al.,
2023) and the increasing complexity of these models. As AI
continues to penetrate various sectors, the ability to effi-
ciently and effectively distill knowledge from proprietary
LLMs to open-source ones becomes not just a technical
aspiration but a practical necessity. This ...

PROCESSED TEXT:
ey on the
knowledge distillation of LLMs stems from the rapidly evolving landscape of AI
and the increasing complexity of these models
as ai continues to penetrate various sectors the ability to efficiently and effectively distill knowledge from proprietary lls to open-source ones becomes not just a technical aspiration but a practical necessity...



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
eRankOptimizationy,1y,2y3y1y2y3≻≻rank……
DataCuration
X,YrawdatasynthesizefeedbackFeedback
input
outputSelf-Knowledge
outputinputinput
YlabelLabelingExpansion
X,YdemonstrationsexpandFeature
featureinput,outputextractSec.4Sec.5
Sec.3.1Sec.3.2
Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated
as ‘Sec.’ in this figure. RM S(·)denotes the student reward model.
the growing demand for more accessible, cost-effective, and
adaptable ...

PROCESSED TEXT:
utSelf-Knowledge
labelingExpansion
X,YdemonstrationsexpandFeature
featureinput,outputextractSec.4Sec.5
Sec.3.1Sec.3.2
Fig. 2: An overview of this survey on knowledge distillation of large language models. Note that ‘Section’ is abbreviated
as ‘Sec.’ in this figure. RM S(·)denotes the student reward model.
the growing demand for more accessible, cost-effective, and
adaptable AI solutions that can cater to a diverse range
of applications and users. A survey in this

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
posing direc-
tions for future research.
Survey Organization. The remainder of this survey is orga-
nized into several comprehensive sections, each designed to
offer a deep dive into the multifaceted aspects of knowledge
distillation within the realm ofLLMs. Following this intro-
duction, §2 provides a foundational overview of knowledge
distillation, comparing traditional techniques with those
emerging in the era of LLMs and highlighting the role of
data augmentation (DA) in this context. §3 del...

PROCESSED TEXT:
s organized into several comprehensive sections, each designed to offer a deep dive into the multifaceted aspects of knowledge distillation within the realm of LLMs. Following this introduction, §2 provides a foundational overview of knowledge distillation, comparing traditional techniques with those emerging in the era of LLMs and highlighting the role of data augmentation (DA) in this context. §3 delves into the approaches to elicit knowledge from teacher LLMs 

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
guage understanding (NLU), genera-
tion (NLG), information retrieval, recommendation systems,
and the evaluation of text generation. In §5, we ventureinto domain-specific vertical distillation, showcasing how
knowledge distillation techniques are applied within spe-
cialized fields such as law, healthcare, finance, and science,
illustrating the practical implications and transformative
impact of these approaches. The survey suggests open
problems in §6, identifying current challenges and gaps in...

PROCESSED TEXT:
lves the process of identifying, selecting, and combining the most relevant knowledge from a dataset to create a distilled representation of the original information....



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
large, complex model (teacher) to a
smaller, more efficient model (student) (Gou et al., 2021).
This technique is pivotal in mitigating the challenges posed
by the computational demands and resource constraints of
deploying large-scale models in practical applications.
Historically, knowledge distillation techniques, prior to
the era of LLMs, primarily concentrated on transferring
knowledge from complex, often cumbersome neural net-
works to more compact and efficient architectures (Sanh
et al.,...

PROCESSED TEXT:
ue is pivotal in mitigating the challenges posed
by the computational demands and resource constraints of
deploying large-scale models in practical applications.
Historically, knowledge distillation techniques, prior
to the era of LLMs, primarily concentrated on transferring
knowledge from complex, often cumbersome neural net-
works to more compact and efficient architectures (Sanh
et al., 2019; Kim and Rush, 2016)....



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 (Chenglin et al., 2023)
ExpansionSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Code Alpaca (Chaudhary, 2023)
Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a),
WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b)
CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a),
Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), WaveCoder (Yu et al., 2024)
ZeroGen (Ye et al...

PROCESSED TEXT:
Code Alpaca (Chaudhary, 2023)
Self-Align (Sun et al., 2024b), WizardLM (Xu et al., 2023a), WizardCoder (Luo et al., 2023a),
WizardMath (Luo et al., 2023b), AugGPT (Dai et al., 2023a), TDG (He et al., 2023b)
CurationUltraChat (Ding et al., 2023b), Phi-1 (Gunasekar et al., 2023), Phi-1.5 (Li et al., 2023a),
Phi-2 (Mar, 2023), Magicoder (Wei et al., 2023), WaveCoder (Yu et al., 2024)
ZeroGen (Ye et al., 2022), SunGen (Gao et al., 2023a), InPars (Bonifacio et al., 20

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
2024)
Self-KnowledgeSelf-Instruct (Wang et al., 2022a), Self-Align (Sun et al., 2024b), RLCD (Yang et al., 2024a),
ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), ReST (Gulcehre et al., 2023),
Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022)
DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), WizardLM (Xu et al., 2023a),
Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et a...

PROCESSED TEXT:
et al., 2024a), 
ImpDistill (Jung et al., 2023), LMSI (Huang et al., 2023a), 
ReST (Gulcehre et al., 2023), Self-Rewarding (Yuan et al., 2024a), Baize (Xu et al., 2023b), 
STaR (Zelikman et al., 2022), 
DistillationSupervised Fine-TuningAlpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023), 
WizardLM (Xu et al., 2023a), 
Self-Instruct (Wang et al., 2022a), Baize (Xu et al., 2023b), STaR (Zelikman et al., 2022), 
Divergence and SimilarityDistilGPT (Sanh et al.

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
kill
DistillationContext FollowingInstruction FollowingSelf-Instruct (Wang et al., 2022a), Alpaca (Taori et al., 2023), Vicuna (Chiang et al., 2023),
WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023),
WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a),
Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b),
CAMEL (Li et al., 2023b), OpenChat (Wang et al., 2023c), Zephyr (Tunstall et al., 2023),...

PROCESSED TEXT:
et al., 2023), Vicuna (Chiang et al., 2023),
WizardLM (Xu et al., 2023a), Orca (Mukherjee et al., 2023), Orca 2 (Mitra et al., 2023),
WizardMath (Luo et al., 2023b), Llama-GPT4 (Peng et al., 2023a),
Multi-turn DialogueVicuna (Chiang et al., 2023), Baize (Xu et al., 2023b), UltraLLaMA (Ding et al., 2023b),
CAMEL (Li et al., 2023b), OpenChat (Wang et al., 2023c), Zephyr (Tunstall et al., 2023),
RAG Capbility KARD (Kang et al., 2023a), SAIL (Luo et al., 2023c), Self

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
2023), UltraFeedback (Cui et al., 2023a),
ValueCAI (Bai et al., 2022a), Align Honesty (Yang et al., 2023a), SANDBOX (Liu et al., 2023b),
Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a)
AgentTool UsingToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023),
ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a),
Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (...

PROCESSED TEXT:
, 2023a), SANDBOX (Liu et al., 2023b),
Self-Align (Sun et al., 2024b), UltraFeedback (Cui et al., 2023a), RLCD (Yang et al., 2024a)
AgentToolformer (Schick et al., 2023), Graph-ToolFormer (Zhang, 2023), Gorilla (Patil et al., 2023),
ToolAlpaca (Tang et al., 2023a), ToolLLM (Qin et al., 2023a), CRAFT (Yuan et al., 2023a),
Confucius (Gao et al., 2023b), MLLM-Tool (Wang et al., 2024), α-UMi (Shen et al., 2024),
PlanningFireAct (Chen et al., 2023b), AgentTuning (Zeng

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
OMP (Xu et al., 2024b), MaRio (Ramnath et al., 2023),
ID (Jung et al., 2023), GPT-3 Labeling (Wang et al., 2021b), BioGPT (Guo et al., 2023a),
ChatGPT NMT (Yang and Nicolai, 2023),
Information RetrievalQUILL (Srinivasan et al., 2022), Promptgator (Dai et al., 2023b), InPars (Bonifacio et al., 2022),
AugTriever (Meng et al., 2023), (Sun et al., 2023a), RankVicuna (Pradeep et al., 2023a),
RankZephyr (Pradeep et al., 2023b), ExaRanker (Ferraretto et al., 2023),
Recommendation NDR (Mysore et al., 20...

PROCESSED TEXT:
l., 2021b), BioGPT (Guo et al., 2023a),
ChatGPT NMT (Yang and Nicolai, 2023),
Information RetrievalQUILL (Srinivasan et al., 2022), Promptgator (Dai et al., 2023b), InPars (Bonifacio et al., 2022),
AugTriever (Meng et al., 2023), (Sun et al., 2023a), RankVicuna (Pradeep et al., 2023a),
RankZephyr (Pradeep et al., 2023b), ExaRanker (Ferraretto et al., 2023),
Recommendation NDR (Mysore et al., 2023), InstrcutRec (Zhang et al., 2023b), ONCE (Liu et al., 2023c),
Text

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
lti-ModalityLLaVA (Liu et al., 2023e), SVIT (Zhao et al., 2023b), LVIS-Instruct4V (Wang et al., 2023e), Shikra (Chen et al., 2023c),
LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b),
Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e),
Verticalization
DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen ...

PROCESSED TEXT:
23e), Shikra (Chen et al., 2023c),
LSKD (Park et al., 2023), DetGPT (Pi et al., 2023; Zhao et al., 2023c), LRV (Liu et al., 2023f), NExT-GPT (Wu et al., 2023b),
Valley (Luo et al., 2023d), ILuvUI (Jiang et al., 2023d), StableLLaVA (Li et al., 2023c), PointLLM (Xu et al., 2023e),
Verticalization
DistillationLaw (Huang et al., 2023b; Cui et al., 2023b); Medical & Healthcare (Zhang et al., 2023c; Chen et al., 2023d); Finance (Zhang and Yang, 2023)...



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
network to mimic the
output of a larger teacher network, often through techniques
like soft target training, where the student learns from
the softened softmax output of the teacher. Please refer to
the survey (Gou et al., 2021) for more details on general
knowledge distillation techniques in AI and DL.
In contrast, the advent of LLMs has revolutionized
the knowledge distillation landscape. The current era of
knowledge distillation in LLMs shifts the focus from mere
architecture compression to t...

PROCESSED TEXT:
training, where the student learns from the softened softmax output of the teacher.
refer to the survey (Gou et al., 2021) for more details on general knowledge distillation techniques in AI and DL.
In contrast, the advent of LLMs has revolutionized
the knowledge distillation landscape.
The current era of knowledge distillation in LLMs shifts the focus from mere
architecture compression to the more nuanced process of knowledge elicitation and transfer
Taori et al

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
r reduce the model size , the current focus in LLM-based
knowledge distillation is to extract and transfer the rich,
nuanced understanding that these models have developed.
The key to this modern approach lies in heuristic and
carefully designed prompts, which are used to elicit specific
knowledge (Ding et al., 2023b) or capabilities (Chaudhary,
2023) from the LLMs. These prompts are crafted to tap
into the LLM’s understanding and capabilities in various
domains, ranging from natural language un...

PROCESSED TEXT:
tract nuanced understanding from LLMs."...



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
their explicit training objectives.
Furthermore, this era of knowledge distillation also em-
phasizes the transfer of more abstract qualities such as
reasoning patterns (Mitra et al., 2023), preference align-
ment (Cui et al., 2023a), and value alignment (Sun et al.,
2024b). This is in stark contrast to the earlier focus on output
replication (Taori et al., 2023), indicating a shift towards
a more holistic and comprehensive transfer of cognitive
capabilities. The current techniques involve not j...

PROCESSED TEXT:
ch as 
reasoning patterns, preference alignment, and value alignment. This shift towards a more holistic and comprehensive transfer of cognitive capabilities. The current techniques involve not just the replication of outputs, but also the emulation of thought processes and decision-making patterns of the teacher model. This involves complex strategies like chain-of-thought prompting, where the student model is trained to learn the reasoning process of the teache

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 al., 2022) emerges as a critical paradigm integral
to the process of knowledge distillation. Unlike traditional
DA techniques such as paraphrasing (Gangal et al., 2022) orback-translation (Longpre et al., 2019), which primarily aim
at expanding the training dataset in a somewhat mechanical
manner. DA within the context of LLMs focuses on the
generation of novel, context-rich training data tailored to
specific domains and skills. This innovation is driven by the
unique capabilities of LLMs to ge...

PROCESSED TEXT:
e traditional
DA techniques such as paraphrasing (Gangal et al., 2022) or back-translation (Longpre et al., 2019), which primarily aim
at expanding the training dataset in a somewhat mechanical
manner. DA within the context of LLMs focuses on the
generation of novel, context-rich training data tailored to
specific domains and skills. This innovation is driven by the
unique capabilities of LLMs to generate coherent, diverse,
and intricate data samples that closely

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
capability gap between proprietary and open-
source models. Through DA, LLMs are prompted to create
targeted, high-quality datasets that are not merely larger in
volume but are also rich in diversity and specificity. This
approach enables the distillation process to be more effec-
tive, ensuring that the distilled models not only replicate
the teacher model’s output behavior but also embody its
deep-seated understanding and cognitive strategies.
The significance and necessity of DA for achieving...

PROCESSED TEXT:
e targeted, high-quality datasets that are not merely larger in volume but also rich in diversity and specificity. This approach enables the distillation process to be more effective, ensuring that the distilled models replicate the teacher model’s output behavior and embody its deep-seated understanding and cognitive strategies.

DA is a key factor in achieving knowledge discovery in the LLM era, as it enables the distilled models to acquire and refine capabilit

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
ssible approach to harnessing
the power of LLMs. It empowers open-source models with
the ability to approximate the contextual adeptness, ethical
alignment, and deep semantic insights characteristic of their
proprietary counterparts, thereby democratizing access to
advanced AI capabilities and fostering innovation across a
broader spectrum of applications and users.
2.3 Survey Scope
Building on the discussions introduced earlier, this survey
aims to comprehensively explore the landscape of knowl...

PROCESSED TEXT:
fields, including but not limited to
- Developing AI models that can learn and improve over time
- Enhancing human-AI collaboration and interaction
- Improving language understanding and generation capabilities
- Enabling the creation of high-quality, explainable AI models
2.3.1 Survey Scope
This survey aims to investigate the current state of knowledge distillation within the context of LLMs
in three primary areas:
- KD Algorithms
- Skill Distillation
- Vertical

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
ions and methodologies of knowledge distillation. It
includes an in-depth exploration of the processes involved
in constructing knowledge from teacher models (e.g., pro-
prietary LLMs) and integrating this knowledge into student
models (e.g., open-source LLMs). Under the umbrella of
‘knowledge ’, we delve into strategies such as labeling (Hsieh
et al., 2023), expansion (Taori et al., 2023), curation (Gu-
nasekar et al., 2023), feature understanding (Agarwal et al.,
6
2024), feedback mechanisms (...

PROCESSED TEXT:
onstructing knowledge 
from teacher models (e.g., proprietary LLMs) and integrating this 
knowledge into student models (e.g., open-source LLMs)
strategies such as labeling, expansion, curation, feature 
understanding, feedback mechanisms, and self-knowledge generation 
under the umbrella of 'knowledge' we delve into
strategies such as supervised fine-tuning, divergence minimization, 
reinforcement learning techniques, and rank optimization strategies
knowledge d

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
This analysis
aims to illuminate how these algorithms facilitate the trans-
fer of knowledge, ensuring that open-source models can
replicate and, in some cases, surpass the capabilities of their
proprietary counterparts.
Skill Distillation. This facet examines the specific compe-
tencies and capabilities enhanced through KD. It encom-
passes detailed discussions on context following (Taori et al.,
2023; Luo et al., 2023c), with subtopics like instruction
following and retrieval-augmented generat...

PROCESSED TEXT:
models to replicate and potentially surpass proprietary counterparts in various capabilities.

**Distillation**
This facet focuses on the specific strengths and capabilities enhanced through Knowledge Distillation. It includes discussions on context and following (Taori et al., 2023; Luo et al., 2023c), instruction following and retrieval-augmented generation (RAG) capabilities.

**Alignment**
In the realm of alignment, a survey investigates thinking patterns, pe

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
, text generation evaluation, and code gen-
eration. Finally, the survey addresses multi-modality (Liu
et al., 2023e; Zhao et al., 2023b), exploring how KD enhances
LLMs’ ability to interpret and integrate multiple forms of
input, enriching their utility and applicability across various
contexts.
Verticalization Distillation. This section assesses the ap-
plication of KD across diverse vertical domains, offering
insights into how distilled LLMs can be tailored for spe-
cialized fields such as La...

PROCESSED TEXT:
iu et al., 2023; Zhao et al., 2023b), exploring how KD enhances LLM's ability to interpret and integrate multiple forms of input, enriching their utility and applicability across various contexts.
Verticalization Distillation. This section assesses the application of KD across diverse vertical domains, offering insights into how distilled LLMs can be tailored for specialized fields such as Law (LAW, 2023), Medical & Healthcare (Wang et al., 2023a), Finance (Zhang

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
roader AI and ML ecosystem.
By navigating through these facets, this survey en-
deavors to provide an extensive and nuanced analysis of
knowledge distillation in the era of LLMs. It serves as a
guide for researchers, practitioners, and enthusiasts in the
field, shedding light on current methodologies, challenges,
and opportunities for innovation in this rapidly evolving
domain.
Declaration. This survey represents our earnest effort to
provide a comprehensive and insightful overview of knowl-
edg...

PROCESSED TEXT:
n extensive and nuanced analysis of knowledge distillation in the era of LLMs. It serves as a guide for researchers, practitioners, and enthusiasts in the field, shedding light on current methodologies, challenges, and opportunities for innovation in this rapidly evolving domain.
Declaration. This survey represents our earnest effort to provide a comprehensive and insightful overview of knowledge distillation techniques applied to LLMs, focusing on algorithms, sk

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 their impacts
across a range of applications.
2.4 Distillation Pipeline in LLM Era
SeedKnowledgeSkill/Domain
TeacherLLMKnowledgeElicitationStudentModelDistillationAlgorithmsteer
driveGeneratedKnowledgeLearningObjectivetrain
Fig. 4: An illustration of a general pipeline to distill knowl-
edge from a large language model to a student model.
The general distillation pipeline of LLMs is a structured
and methodical process aimed at transferring knowledge
from a sophisticated teacher model to a less ...

PROCESSED TEXT:
skill or domain to be learned.**

their impacts
across a range of applications.

2.4 Distillation Pipeline in LLM Era
SeedKnowledgeSkill/Domain
TeacherLLMKnowledgeElicitationStudentModelDistillationAlgorithmsteer
driveGeneratedKnowledgeLearningObjectivetrain
Fig. 4: An illustration of a general pipeline to distill knowl-
edge from a large language model to a student model.
The general distillation pipeline of LLMs is a structured
and methodical process aimed at t

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
lves directing the teacher LLM towards a
specific target skill or domain. This is achieved through care-
fully crafted instructions or templates that guide the LLM’s
focus. These instructions are designed to elicit responses
that demonstrate the LLM’s proficiency in a particular area,
be it a specialized domain like healthcare or law, or a skill
such as reasoning or language understanding. The objective
here is to utilize the teacher LLM’s extensive training and
nuanced capabilities to generate ...

PROCESSED TEXT:
the LLM’s focus
elicit responses that demonstrate the LLM’s proficiency
in a particular area
specialized domain like healthcare or law
or a skill
such as reasoning or language understanding
objective
to utilize the teacher LLM’s extensive training and nuanced capabilities
to generate outputs that are rich in the specific knowledge or skills desired for the student model
seed knowledge
typically comprises
a small dataset or specific data clues relevant to the elic

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 seed knowledge is crucial as it provides a
foundation upon which the teacher model can build and
expand, thereby creating more comprehensive and in-depth
knowledge examples.
III. Generation of Distillation Knowledge. In response
to the seed knowledge and steering instructions, the teacher
LLM generates knowledge examples. These examples are
predominantly in the form of question-and-answer (QA)
dialogues or narrative explanations, aligning with the nat-
ural language processing/understanding cap...

PROCESSED TEXT:
ately creating a more comprehensive and in-depth knowledge base.
III. Distillation of Knowledge. The teacher LLM generates knowledge examples, primarily in the form of question-and-answer dialogues or narrative explanations, aligning with the language processing capabilities of the model.
In certain specialized cases, the outputs may include logits or hidden features, although this is less common due to the complexity of the data form.
The generated knowledge exa

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
ge examples to train the student
model. This training is guided by a loss function that aligns
with the learning objectives. The loss function quantifies
the student model’s performance in replicating or adapting
the knowledge from the teacher model. By minimizing this
loss, the student model learns to emulate the target skills or
domain knowledge of the teacher, thereby acquiring similar
capabilities. The process involves iteratively adjusting the
student model’s parameters to reduce the discre...

PROCESSED TEXT:
xt (o) and combined with the seed knowledge (s) to produce the output (o')...



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
ch the LLM can
explore to generate novel knowledge, Parse( o, s)stands for
to parse the distillation example ( e.g., (x, y)) from the
teacher LLM’s output o(plus the input sin some cases),
andpTrepresents the teacher LLM with parameters θT.
Given the datasets D(kd)
Ibuilt for distillation, we then define
a learning objective as
L=X
ILI(D(kd)
I;θS), (2)
whereP
Idenotes there could be multiple tasks or skills
being distilled into one student model, LI(·;·)stands for a
specific learning objective, ...

PROCESSED TEXT:
on example ( e.g., (x, y)) from the
teacher LLM’s output o(plus the input sin some cases),
andpTrepresents the teacher LLM with parameters θT.
Given the datasets D(kd)
Ibuilt for distillation, we then define
a learning objective as
L=X
ILI(D(kd)
I;θS), (2)
where...



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
 LLMs (Eq.1), and ‘Distillation,’
centered on injecting this knowledge into student models
(Eq.2). We will elaborate on these two processes in the
subsequent sections.
3.1 Knowledge
This section focuses on the approaches to elicit knowledge
from teacher LLMs. According to the manners to acquire
knowledge, we divided them into Labeling ,Expansion ,DataCuration ,Feature ,Feedback , and Self-Knowledge . Figure 5
shows an illustration of these knowledge elicitation meth-
ods.
3.1.1 Labeling
Labeling...

PROCESSED TEXT:
aborate on these two processes in the
subsequent sections.
3.1 Knowledge
This section focuses on the approaches to elicit knowledge
from teacher LLMs. According to the manners to acquire
knowledge, we divided them into Labeling,Expansion,DataCuration,Feature,Feedback, and Self-Knowledge. Figure 5
shows an illustration of these knowledge elicitation methods....



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


INPUT TEXT:
lable through the
predefined Iandc. This process can be formulated as
follows:
D(lab)={x, y|x∼ X, y∼pT(y|I⊕c⊕x)}. (3)
Input xcould be sourced from existing NLP task
datasets, which serve as typical reservoirs for distillation
efforts. Numerous works have sought to harness the capa-
bilities of powerful LLMs as teachers for annotating dataset
samples across a range of tasks. For instance, efforts in
natural language understanding involve using LLMs to cat-
egorize text (Gilardi et al., 2023; Ding...

PROCESSED TEXT:
, y∼pT(y|I⊕c⊕x)}. (3)
Input xcould be sourced from existing NLP task
datasets, which serve as typical reservoirs for distillation
efforts. Numerous works have sought to harness the
capabilities of powerful LLMs as teachers for annotating dataset
samples across a range of tasks. For instance, efforts in
natural language understanding involve using LLMs to
categorize text (Gilardi et al., 2023; Ding et al., 2023a; He et al.,
2023a), while in natural language genera

In [None]:
print(f"\nProcessing complete!")
print(f"Input file: {INPUT_FILE}")
print(f"Output file: {output_file}")
print(f"Total chunks processed: {num_chunks}")

# Preview the beginning and end of the complete processed text
print("\nPreview of final processed text:")
print("\nBEGINNING:")
print(processed_text[:1000])
print("\n...\n\nEND:")
print(processed_text[-1000:])