### Notes: If ImportError occurs, it's probably due to the huggingface-hub. 
> pip install huggingface-hub==0.25.0
> 
> Reference: https://replicate.com/blog/how-to-prompt-llama

## Import

In [1]:
import torch
from transformers import pipeline
import re
import os
import gc


from langchain import HuggingFacePipeline, PromptTemplate
from transformers import pipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
import os
from tqdm import tqdm
import pandas as pd
import itertools, csv, json


DEVICE = "cuda" if torch.cuda.is_available() else "CPU"
HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN') #in terminal: export HUGGING_FACE_TOKEN="YOUR_TOKEN"


## Load Model

In [2]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,4"  # or "0,1" for multiple GPUs

In [3]:
'''
Possible Models:
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
- meta-llama/Llama-3.1-70B-Instruct
'''
#model_id = "meta-llama/Llama-3.2-3B-Instruct" 
model_id = "meta-llama/Llama-3.1-70B-Instruct"

# Quanitisize your model dtype (for sparsity)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Set token using ENV variable
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGING_FACE_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=HUGGING_FACE_TOKEN,
    quantization_config=bnb_config
)

augmentation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

# Data

In [4]:
directory_path = "/home/dongkyu/exported_docs"

In [5]:
import os
import re

def preprocess_text(text):
    """
    Cleans and preprocesses text for LLaMA dataset.

    Args:
        text (str): The raw text to preprocess.

    Returns:
        str: Cleaned and normalized text.
    """
    # Normalize whitespace and remove excessive newlines
    text = re.sub(r'\s+', ' ', text)  # Replace all whitespace (including newlines) with a single space
    text = re.sub(r'\n+', '\n', text.strip())  # Replace multiple newlines with a single newline
    # Remove any unwanted characters or patterns (URLs, etc.)
    text = re.sub(r'http[s]?://\S+', '', text)  # Remove URLs
    return text.strip()

def load_txt_files_to_text_format(directory, output_file):
    """
    Converts `.txt` files from a nested directory structure into a single `.txt` file with preprocessing.

    Args:
        directory (str): Root directory containing `.txt` files (including subdirectories).
        output_file (str): Path to save the combined and cleaned text file.

    Returns:
        None
    """
    with open(output_file, 'w', encoding='utf-8') as output_f:
        # Recursively traverse the directory
        for root, _, files in os.walk(directory):
            for file_name in sorted(files):
                if file_name.endswith('.txt'):  # Only process `.txt` files
                    file_path = os.path.join(root, file_name)
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                        cleaned_text = preprocess_text(content)
                        # Write the cleaned text to the output file
                        output_f.write(cleaned_text + '\n')  # Add a newline between entries

    print(f"Processed `.txt` files from nested directories saved to {output_file}.")

# Example usage
directory = directory_path  # Replace with your root directory with subdirectories
output_file = "raw_text.txt"  # Output file path
load_txt_files_to_text_format(directory, output_file)


Processed `.txt` files from nested directories saved to raw_text.txt.


In [6]:
def read_text_file_lines(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.readlines()
    except FileNotFoundError:
        print(f"Error: The file at {file_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

file_path = "raw_text.txt"
lines = read_text_file_lines(file_path)


In [7]:
lines[0]

'Title: (SLAM) Navigating While Mapping\uf0c1 URL:  Section: getting_started/index.html -------------------------------------------------------------------------------- ## Overview\uf0c1 This document explains how to use Nav2 with SLAM. The following steps show ROS 2 users how to generate occupancy grid maps and use Nav2 to move their robot around. This tutorial applies to both simulated and physical robots, but will be completed here on a physical robot. Before completing this tutorial, completing theGetting Startedis highly recommended especially if you are new to ROS and Navigation2. In this tutorial we’ll be using SLAM Toolbox. More information can be found in theROSCon talk for SLAM Toolbox ## Requirements\uf0c1 You must install Navigation2, Turtlebot3, and SLAM Toolbox. If you don’t have them installed, please followGetting Started. SLAM Toolbox can be installed via: or from built from source in your workspace with: ## Tutorial Steps\uf0c1 ## 0- Launch Robot Interfaces\uf0c1 For 

# Generate
- '<|pad|>'
- '<|begin_of_text|>'
- '<|eot_id|>'

In [8]:
def write_instruction(note, pipe, max_new_tokens=100, num_beams=1, temperature=1.0):
    messages = [
    {
        "role": "system",
        "content": ("You are a helpful assistant. Your task is to make a fine-tuning dataset. Given a ANSWER, infer the QUERY. Think carefully and write the appropriate QUERY that would derive this ANSWER. Write only the query. Example: [ANSWER] 4 [/ANSWER] [QUERY] What is 2+2? [/QUERY]"),
    },
    {
        "role": "user",
        "content": (
            "[ANSWER]: {noteA} [/ANSWER]".format(noteA= note)
            ),
    }
]

    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    answer = outputs[0]["generated_text"][-1]
    return answer


In [9]:
def write_instruction_answer(note, pipe, max_new_tokens=800, num_beams=1, temperature=1.0):
    messages = [
    {
        "role": "system",
        "content": ("You are a helpful assistant.Your task is to make a fine-tuning dataset. Your response should be within [CORE] [/CORE], do not add explanations \
        Given a CONTEXT, select the most informative CORE. Example: [CONTEXT] Have you heard of ROS? The Robot Operating System (ROS) is a set of software libraries and tools for building robot applications. ROS is used in many occasions. [/CONTEXT] [CORE] Robot Operating System (ROS) is a set of software libraries and tools for building robot applications. [/CORE]"),
    },
    {
        "role": "user",
        "content": (
            "[CONTEXT]: {noteA} [/CONTEXT]".format(noteA= note)
            ),
    }
]

    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    answer = outputs[0]["generated_text"][-1]
    return answer


In [10]:
def write_instruction_qa(note, pipe, max_new_tokens=800, num_beams=1, temperature=1.0):
    messages = [
    {
        "role": "system",
        "content": ("You are a helpful assistant. Your task is to make pairs of question and answers from a given context. Only write the answers, no explanations.\
        Given a CONTEXT, make question-answer pairs. The answer should be provided in [Q][/Q]--[A][/A] format."),
    },
    {
        "role": "user",
        "content": (
            "[CONTEXT]: {noteA} [/CONTEXT]".format(noteA= note)
            ),
    }
]

    outputs = pipe(messages, max_new_tokens=max_new_tokens)
    answer = outputs[0]["generated_text"][-1]
    return answer

def parse_qna(text):
    """
    Parses text containing question and answer pairs in the format:
    [Q] <question> [/Q]--[A] <answer> [/A]

    Args:
        text (str): Input text containing Q&A pairs.

    Returns:
        list: A list of dictionaries containing 'question' and 'answer'.
    """
    # Define regex patterns to extract questions and answers
    q_pattern = r"\[Q\](.*?)\[/Q\]"
    a_pattern = r"\[A\](.*?)\[/A\]"
    
    # Find all question and answer matches
    questions = re.findall(q_pattern, text, re.DOTALL)
    answers = re.findall(a_pattern, text, re.DOTALL)
    
    # Combine into a list of dictionaries
    qna_pairs = [{"question": q.strip(), "answer": a.strip()} for q, a in zip(questions, answers)]
    
    return qna_pairs

In [12]:
def generate_fine_tuning_dataset(lines, pipe, output_csv_path, max_new_tokens=1200, num_beams=1, temperature=1.0):
    """
    Generates a fine-tuning dataset with queries and input lines, then saves it to a CSV file.
    
    Args:
        lines (list): A list of text strings (answers).
        pipe: The pipeline function for generating queries.
        output_csv_path (str): Path to save the resulting CSV file.
        max_new_tokens (int): Maximum number of new tokens to generate.
        num_beams (int): Beam search parameter for the model.
        temperature (float): Sampling temperature for the model.
    """
    results = []

    #for line in lines:
    lines = lines[325:]
    for line in tqdm(lines, desc="Generating dataset"):
        torch.cuda.empty_cache()
        '''
        query = write_instruction(
                note=line,
                pipe=pipe,
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                temperature=temperature
            )

        query= query['content']
        results.append({"query": query, "input": line})
        '''
        for _ in range(1):
            qa = write_instruction_qa(
                    note=line,
                    pipe=pipe,
                    max_new_tokens=max_new_tokens,
                    num_beams=num_beams,
                    temperature=temperature
                )
            
            qa= qa['content']
            qna_pairs= parse_qna(qa)
            
            print("QNA: ", qna_pairs)
            for qna in qna_pairs:
                results.append({"query": qna['question'], "input": qna['answer']})
    # Save results to CSV
    with open(output_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["query", "input"])
        writer.writeheader()
        writer.writerows(results)

    print(f"Dataset saved to {output_csv_path}")

# Example usage
output_csv_path = "generated_dataset.csv"
generate_fine_tuning_dataset(lines, augmentation_pipeline, output_csv_path)


Generating dataset:   0%|                                                                                                                                                                                                                                                              | 0/120 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Generating dataset:   1%|██                                                                                                                                                                                                                                                  | 1/120 [00:40<1:20:55, 40.81s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


QNA:  [{'question': 'What is the recommended prerequisite for installing the webots_ros2 package?', 'answer': 'Understanding basic ROS principles covered in the beginner Tutorials, particularly Creating a workspace and Creating a package.'}, {'question': 'How do you install the webots_ros2 package from the latest up-to-date sources from Github?', 'answer': 'By running the command "git clone --recurse-submodules src/webots_ros2" in a terminal.'}, {'question': 'What is the purpose of the webots_ros2 package?', 'answer': 'The webots_ros2 package provides an interface between ROS 2 and Webots.'}, {'question': 'What is the environment variable used to specify the location of Webots?', 'answer': 'ROS2_WEBOTS_HOME'}, {'question': "What happens if Webots couldn't be found in the default installation paths?", 'answer': 'webots_ros2 will show a window offering the automatic installation of the latest compatible version of Webots.'}, {'question': 'How do you launch the webots_ros2_universal_robot

Generating dataset:   2%|████                                                                                                                                                                                                                                                | 2/120 [01:45<1:48:22, 55.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


QNA:  [{'question': 'What is the goal of this tutorial?', 'answer': 'Install the webots_ros2 package and run simulation examples on Windows.'}, {'question': 'What is the recommended prerequisite for this tutorial?', 'answer': 'Understanding basic ROS principles, particularly Creating a workspace and Creating a package.'}, {'question': 'What is the webots_ros2 package?', 'answer': 'The webots_ros2 package provides an interface between ROS 2 and Webots.'}, {'question': 'What is Webots?', 'answer': 'Webots is a prerequisite to use the webots_ros2 package, and it can be installed by following the installation procedure or building it from sources.'}, {'question': 'How does ROS 2 find Webots installations?', 'answer': 'ROS 2 looks for Webots at the following locations (in this order): ROS2_WEBOTS_HOME environment variable, WEBOTS_HOME environment variable, and default installation paths for a compatible version.'}, {'question': 'What is the first task in the tutorial?', 'answer': 'Install W

Generating dataset:   2%|██████                                                                                                                                                                                                                                              | 3/120 [02:43<1:49:28, 56.14s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


QNA:  [{'question': 'What is the goal of this tutorial?', 'answer': 'Install the webots_ros2 package and run simulation examples on macOS.'}, {'question': 'What is the recommended tutorial level for this tutorial?', 'answer': 'Advanced'}, {'question': 'How long will it take to complete this tutorial?', 'answer': '10 minutes'}, {'question': 'What is the webots_ros2 package?', 'answer': 'The webots_ros2 package provides an interface between ROS 2 and Webots.'}, {'question': 'What is necessary to install in order to use the webots_ros2 package in the virtual machine?', 'answer': 'Webots should be installed natively on macOS.'}, {'question': 'What is the solution based on UTM virtual machines?', 'answer': 'It provides an improved user experience with ROS 2 compared to native macOS installation, as it runs ROS in a Linux environment.'}, {'question': 'What is the purpose of the shared folder?', 'answer': 'The shared folder allows the script to transfer the world and other resource files from

Generating dataset:   3%|████████▏                                                                                                                                                                                                                                           | 4/120 [04:48<2:41:15, 83.41s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


QNA:  [{'question': 'How to enable multicast for DDS communication?', 'answer': 'Enable multicast by running `ros2 multicast receive` in one terminal and `ros2 multicast send` in another. If no response is received, update the firewall configuration to allow multicast using `ufw`.'}, {'question': 'How to verify if the multicast flag is enabled for the network interface?', 'answer': 'Use the `ifconfig` tool and look for `MULTICAST` in the flags section.'}, {'question': 'What to do if `rclpy` fails to import due to missing C extension libraries?', 'answer': 'Compare the libraries present in the directory with the one mentioned in the error message and ensure that the same Python interpreter is used as the one used to build the binary.'}, {'question': 'How to fix the internal compiler error on a memory-constrained platform like Raspberry PI?', 'answer': 'Build single-threaded by prefixing the build invocation with `MAKEFLAGS=-j1`.'}, {'question': 'What to do if `ros1_bridge` requires 4Gb 

Generating dataset:   4%|██████████▏                                                                                                                                                                                                                                         | 5/120 [05:15<2:00:45, 63.00s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


QNA:  [{'question': 'What is the recommended method for installing ROS 2 on Linux?', 'answer': 'Installing from packages'}, {'question': 'What is the difference between installing from binary packages and building from source?', 'answer': 'Binary packages are for general use and provide an already-built install of ROS 2, while building from source is meant for developers looking to alter or explicitly omit parts of ROS 2’s base'}, {'question': 'Which platforms are supported for building ROS 2 from source?', 'answer': 'Ubuntu Linux 22.04, Windows 10, RHEL-9/Fedora, macOS'}, {'question': 'What is the recommended method for installing ROS 2 on Windows?', 'answer': 'Binary archive'}, {'question': 'Why is installing from packages recommended on Linux?', 'answer': 'It installs necessary dependencies automatically and also updates alongside regular system updates'}, {'question': "What should you do if you don't have root access on Linux?", 'answer': 'Use the binary archive'}, {'question': 'Wh

Generating dataset:   5%|████████████▏                                                                                                                                                                                                                                       | 6/120 [06:17<1:58:58, 62.62s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


QNA:  [{'question': 'What is the purpose of the interface definition language (IDL) in ROS 2?', 'answer': 'To describe interfaces and automatically generate source code for the interface type in several target languages.'}, {'question': 'What are the three types of interfaces used in ROS applications?', 'answer': 'Topics, services, or actions.'}, {'question': 'What is the purpose of.msg files in ROS?', 'answer': 'To describe the fields of a ROS message.'}, {'question': 'What is the format of a field definition in a.msg file?', 'answer': 'fieldtype fieldname'}, {'question': 'What are the two parts of a.srv file?', 'answer': 'A request and a response.'}, {'question': 'What is the purpose of constants in ROS?', 'answer': 'To define a value that can never be changed programmatically.'}, {'question': 'What is the format of a constant definition in ROS?', 'answer': 'constanttype CONSTANTNAME=constantvalue'}, {'question': 'What is the purpose of actions in ROS?', 'answer': 'To provide a long-

Generating dataset:   5%|████████████▏                                                                                                                                                                                                                                       | 6/120 [06:23<2:01:28, 63.94s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 448.00 MiB. GPU 0 has a total capacity of 47.51 GiB of which 208.31 MiB is free. Including non-PyTorch memory, this process has 47.30 GiB memory in use. Of the allocated memory 45.30 GiB is allocated by PyTorch, and 1.49 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

# Make into ShareGPT Format

In [None]:
df_old= pd.read_csv('generated_dataset_early.csv')

In [18]:
df= pd.read_csv('generated_dataset.csv')

In [35]:
df

Unnamed: 0,query,input
0,What is the recommended prerequisite for this ...,"Completing the Getting Started tutorial, espec..."
1,What toolbox is used in this tutorial for SLAM?,SLAM Toolbox
2,What is the command to install SLAM Toolbox?,sudo apt install ros-<ros2-distro>-slam-toolbo...
3,What launch file is used to bring up the turtl...,tb3_simulation_launch.py
4,What is the command to launch Navigation2 with...,ros2 launch nav2_bringup navigation.launch.py
...,...,...
1891,What is the Rolling distribution of ROS 2?,The Rolling distribution of ROS 2 is the rolli...
1892,How often is a new ROS 2 distribution released?,A new ROS 2 distribution is released yearly on...
1893,What is the difference between the Rolling dis...,The Rolling distribution is continuously updat...
1894,What happens to packages released into the Rol...,Packages released into the Rolling distributio...


In [19]:
def clean_special_tokens(df):
    """
    Removes special tokens [QUERY], [/QUERY], [ANSWER], and [/ANSWER] from the 'query' and 'input' columns in a DataFrame.
    
    Args:
        df (pd.DataFrame): The input DataFrame with 'query' and 'input' columns.

    Returns:
        pd.DataFrame: A new DataFrame with cleaned columns.
    """
    # Define the list of special tokens to remove
    special_tokens = ["[QUERY]", "[/QUERY]", "[ANSWER]", "[/ANSWER]", "[TEXT]", "[/TEXT]", "[QUIZ]", "[/QUIZ]", "[CONTEXT]", "[/CONTEXT]", "[CORE]", "[/CORE]"]
    
    # Remove special tokens from both columns
    for token in special_tokens:
        df['query'] = df['query'].str.replace(token, "", regex=False)
        df['input'] = df['input'].str.replace(token, "", regex=False)
    # Optionally strip leading and trailing whitespace
    df['query'] = df['query'].str.strip()
    df['input'] = df['input'].str.strip()

    return df

In [36]:
cleaned_df = clean_special_tokens(df)

In [37]:
cleaned_df

Unnamed: 0,query,input
0,What is the recommended prerequisite for this ...,"Completing the Getting Started tutorial, espec..."
1,What toolbox is used in this tutorial for SLAM?,SLAM Toolbox
2,What is the command to install SLAM Toolbox?,sudo apt install ros-<ros2-distro>-slam-toolbo...
3,What launch file is used to bring up the turtl...,tb3_simulation_launch.py
4,What is the command to launch Navigation2 with...,ros2 launch nav2_bringup navigation.launch.py
...,...,...
1891,What is the Rolling distribution of ROS 2?,The Rolling distribution of ROS 2 is the rolli...
1892,How often is a new ROS 2 distribution released?,A new ROS 2 distribution is released yearly on...
1893,What is the difference between the Rolling dis...,The Rolling distribution is continuously updat...
1894,What happens to packages released into the Rol...,Packages released into the Rolling distributio...


In [38]:
def save_dataframe_to_json(df, output_path):
    """
    Save a DataFrame with 'query' and 'input' columns into a JSON file,
    where each row is represented as a list of dictionaries for 'system', 'user', and 'assistant'.

    Args:
        df (pd.DataFrame): Input DataFrame with 'query' and 'input' columns.
        output_path (str): Path to save the JSON file.

    Returns:
        None
    """
    data = []
    for _, row in df.iterrows():
        print(type(row['input']))
        conversation = [
            {"role": "system", "content": "You are an helpful agent."},
            {"role": "user", "content": row['query']},
            {"role": "assistant", "content": row['input']}
        ]
        data.append(conversation)

    # Save to JSON file
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)

In [39]:
import pandas as pd
import json
import math

def save_dataframe_to_json(df, output_path):
    """
    Save a DataFrame with 'query' and 'input' columns into a JSON file,
    where each row is represented as a list of dictionaries for 'system', 'user', and 'assistant'.

    Args:
        df (pd.DataFrame): Input DataFrame with 'query' and 'input' columns.
        output_path (str): Path to save the JSON file.

    Returns:
        None
    """
    data = []

    for _, row in df.iterrows():
        # Ensure 'query' and 'input' are valid JSON-compatible values
        query = row['query'] if not (isinstance(row['query'], float) and math.isnan(row['query'])) else None
        user_input = row['input'] if not (isinstance(row['input'], float) and math.isnan(row['input'])) else None

        # Construct the conversation
        conversation = [
            {"role": "system", "content": "You are a helpful agent."},
            {"role": "user", "content": query},
            {"role": "assistant", "content": user_input}
        ]
        data.append(conversation)

    # Save to JSON file
    with open(output_path, 'w') as f:
        json.dump(data, f, indent=4)

# Example usage
# df = pd.DataFrame({'query': ["Hello", "How are you?"], 'input': ["Hi there!", None]})
# save_dataframe_to_json(df, 'output.json')


In [40]:
save_dataframe_to_json(cleaned_df, 'output.json')