### Notes: If ImportError occurs, it's probably due to the huggingface-hub. 
> pip install huggingface-hub==0.25.0
> 
> Reference: https://replicate.com/blog/how-to-prompt-llama

## Import

In [1]:
import torch
from transformers import pipeline
import re
import os

from langchain import HuggingFacePipeline, PromptTemplate
from transformers import pipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
import os
from tqdm import tqdm
import pandas as pd
import itertools, csv, json


DEVICE = "cuda:0" if torch.cuda.is_available() else "CPU"
HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN') #in terminal: export HUGGING_FACE_TOKEN="YOUR_TOKEN"


## Load Model

In [3]:
'''
Possible Models:
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
- meta-llama/Llama-3.1-70B-Instruct
'''
model_id = "meta-llama/Llama-3.2-3B-Instruct" 

# Quanitisize your model dtype (for sparsity)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Set token using ENV variable
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGING_FACE_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=HUGGING_FACE_TOKEN,
    quantization_config=bnb_config
)

augmentation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Data

In [19]:
directory_path = "/home/dongkyu/exported_docs"

In [20]:
import os
import re

def preprocess_text(text):
    """
    Cleans and preprocesses text for LLaMA dataset.

    Args:
        text (str): The raw text to preprocess.

    Returns:
        str: Cleaned and normalized text.
    """
    # Normalize whitespace and remove excessive newlines
    text = re.sub(r'\s+', ' ', text)  # Replace all whitespace (including newlines) with a single space
    text = re.sub(r'\n+', '\n', text.strip())  # Replace multiple newlines with a single newline
    # Remove any unwanted characters or patterns (URLs, etc.)
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    return text.strip()

def load_txt_files_to_text_format(directory, output_file):
    """
    Converts `.txt` files from a nested directory structure into a single `.txt` file with preprocessing.

    Args:
        directory (str): Root directory containing `.txt` files (including subdirectories).
        output_file (str): Path to save the combined and cleaned text file.

    Returns:
        None
    """
    with open(output_file, 'w', encoding='utf-8') as output_f:
        # Recursively traverse the directory
        for root, _, files in os.walk(directory):
            for file_name in sorted(files):
                if file_name.endswith('.txt'):  # Only process `.txt` files
                    file_path = os.path.join(root, file_name)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                        cleaned_text = preprocess_text(content)
                        # Write the cleaned text to the output file
                        output_f.write(cleaned_text + '\n')  # Add a newline between entries

    print(f"Processed `.txt` files from nested directories saved to {output_file}.")

# Example usage
directory = directory_path  # Replace with your root directory with subdirectories
output_file = "raw_text.txt"  # Output file path
load_txt_files_to_text_format(directory, output_file)


Processed `.txt` files from nested directories saved to finetune.txt.


In [21]:
def read_text_file_lines(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.readlines()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

file_path = "raw_text.txt"
lines = read_text_file_lines(file_path)


In [22]:
lines[0]

'Title: (SLAM) Navigating While Mapping\uf0c1 URL:  Section: getting_started/index.html -------------------------------------------------------------------------------- ## Overview\uf0c1 This document explains how to use Nav2 with SLAM. The following steps show ROS 2 users how to generate occupancy grid maps and use Nav2 to move their robot around. This tutorial applies to both simulated and physical robots, but will be completed here on a physical robot. Before completing this tutorial, completing theGetting Startedis highly recommended especially if you are new to ROS and Navigation2. In this tutorial we’ll be using SLAM Toolbox. More information can be found in theROSCon talk for SLAM Toolbox ## Requirements\uf0c1 You must install Navigation2, Turtlebot3, and SLAM Toolbox. If you don’t have them installed, please followGetting Started. SLAM Toolbox can be installed via: or from built from source in your workspace with: ## Tutorial Steps\uf0c1 ## 0- Launch Robot Interfaces\uf0c1 For 

# Generate
- '<|pad|>'
- '<|begin_of_text|>'
- '<|eot_id|>'

In [43]:
def write_instruction(note, pipe, max_new_tokens=100, num_beams=1, temperature=1.0):
    messages = [
    {
        "role": "system",
        "content": ("You are a helpful assistant. Your task is to make a fine-tuning dataset. Given a ANSWER, infer the QUERY. Think carefully and write the appropriate QUERY that would derive this ANSWER. Write only the query. Example: [ANSWER] 4 [/ANSWER] [QUERY] What is 2+2? [/QUERY]"),
    },
    {
        "role": "user",
        "content": (
            "[ANSWER]: {noteA} [/ANSWER]".format(noteA= note)
            ),
    }
]

    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    answer = outputs[0]["generated_text"][-1]
    return answer


In [44]:
def write_instruction_answer(note, pipe, max_new_tokens=800, num_beams=1, temperature=1.0):
    messages = [
    {
        "role": "system",
        "content": ("You are a helpful assistant.Your task is to make a fine-tuning dataset. Your response should be within [CORE] [/CORE], do not add explanations \
        Given a CONTEXT, select the most informative CORE. Example: [CONTEXT] Have you heard of ROS? The Robot Operating System (ROS) is a set of software libraries and tools for building robot applications. ROS is used in many occasions. [/CONTEXT] [CORE] Robot Operating System (ROS) is a set of software libraries and tools for building robot applications. [/CORE]"),
    },
    {
        "role": "user",
        "content": (
            "[CONTEXT]: {noteA} [/CONTEXT]".format(noteA= note)
            ),
    }
]

    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    answer = outputs[0]["generated_text"][-1]
    return answer


In [None]:
def generate_fine_tuning_dataset(lines, pipe, output_csv_path, max_new_tokens=100, num_beams=1, temperature=1.0):
    """
    Generates a fine-tuning dataset with queries and input lines, then saves it to a CSV file.
    
    Args:
        lines (list): A list of text strings (answers).
        pipe: The pipeline function for generating queries.
        output_csv_path (str): Path to save the resulting CSV file.
        max_new_tokens (int): Maximum number of new tokens to generate.
        num_beams (int): Beam search parameter for the model.
        temperature (float): Sampling temperature for the model.
    """
    results = []

    for line in lines:
        query = write_instruction(
                note=line,
                pipe=pipe,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                temperature=temperature
            )

        query= query['content']
        results.append({"query": query, "input": line})
        for _ in range(2):
            quiz = write_instruction_answer(
                    note=line,
                    pipe=pipe,
                    max_new_tokens=max_new_tokens,
                    num_beams=num_beams,
                    temperature=temperature
                )
    
            quiz= quiz['content']
            print("QUIZ: ", quiz)
            query = write_instruction(
                note=quiz,
                pipe=pipe,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                temperature=temperature
            )
            query= query['content']
            print("QUERY: ", query)
            results.append({"query": query, "input": quiz})
    # Save results to CSV
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["query", "input"])
        writer.writeheader()
        writer.writerows(results)

    print(f"Dataset saved to {output_csv_path}")

# Example usage
output_csv_path = "generated_dataset.csv"
generate_fine_tuning_dataset(lines, augmentation_pipeline, output_csv_path)


# Make into ShareGPT Format

In [2]:
df= pd.read_csv('generated_dataset.csv')

In [3]:
def clean_special_tokens(df):
    """
    Removes special tokens [QUERY], [/QUERY], [ANSWER], and [/ANSWER] from the 'query' and 'input' columns in a DataFrame.
    
    Args:
        df (pd.DataFrame): The input DataFrame with 'query' and 'input' columns.

    Returns:
        pd.DataFrame: A new DataFrame with cleaned columns.
    """
    # Define the list of special tokens to remove
    special_tokens = ["[QUERY]", "[/QUERY]", "[ANSWER]", "[/ANSWER]", "[TEXT]", "[/TEXT]", "[QUIZ]", "[/QUIZ]", "[CONTEXT]", "[/CONTEXT]", "[CORE]", "[/CORE]"]
    
    # Remove special tokens from both columns
    for token in special_tokens:
        df['query'] = df['query'].str.replace(token, "", regex=False)
        df['input'] = df['input'].str.replace(token, "", regex=False)
    # Optionally strip leading and trailing whitespace
    df['query'] = df['query'].str.strip()
    df['input'] = df['input'].str.strip()

    return df

In [4]:
cleaned_df = clean_special_tokens(df)

In [5]:
cleaned_df

Unnamed: 0,query,input
0,What is the purpose of the SLAM Toolbox in ROS 2?,Title: (SLAM) Navigating While Mapping URL: ...
1,What is the purpose of this document?,: This document explains how to use Nav2 with ...
2,What is the purpose of using Nav2 with SLAM?,: Title: (SLAM) Navigating While Mapping URL:...
3,How to load and use the STVL costmap plugin in...,Title: (STVL) Using an External Costmap Plugin...
4,What is pluginlib plugin STVL?,STVL is a demonstrative pluginlib plugin that ...
...,...,...
1330,What is eProsima Fast DDS?,: eProsima Fast DDS installation instructions\...
1331,How to install eProsima Fast DDS?,eProsima Fast DDS URL: Section: Installation...
1332,How to install ROS 2 on macOS?,Title: macOS (source) URL: Section: Installa...
1333,What is the minimum macOS version supported?,: Title: macOS (source) URL: Section: Instal...


In [6]:
def save_dataframe_to_json(df, output_path):
    """
    Save a DataFrame with 'query' and 'input' columns into a JSON file,
    where each row is represented as a list of dictionaries for 'system', 'user', and 'assistant'.

    Args:
        df (pd.DataFrame): Input DataFrame with 'query' and 'input' columns.
        output_path (str): Path to save the JSON file.

    Returns:
        None
    """
    data = []
    for _, row in df.iterrows():
        conversation = [
            {"role": "system", "content": "You are an helpful agent."},
            {"role": "user", "content": row['query']},
            {"role": "assistant", "content": row['input']}
        ]
        data.append(conversation)

    # Save to JSON file
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)

In [7]:
save_dataframe_to_json(cleaned_df, 'output.json')