In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file into the process environment.
# This has to run before any client that reads credentials from the
# environment is created (presumably the Groq API key used by ChatGroq — confirm).
load_dotenv()

True

In [None]:
from datasets import load_dataset
# Read the local JSON instruction dataset and expose it as the "train" split.
dataset = load_dataset(
    "json",
    data_files="/home/srvadm001/nucleo-ia/finetuning/Fine-tuning/llama3_7b/snptee-instruction-dataset.json",
    split="train",
)

In [3]:
from langchain_community.document_loaders.pdf import PyPDFLoader
import glob
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import re
import json
import utils
import os

class FineTuning:
    """Build an instruction-tuning dataset from a folder of PDF papers.

    Every page of every PDF is sent to a Groq-hosted chat model together with
    the system prompt ``utils.instruction_for_creating_triples``. The model's
    free-text answers are parsed into ``Instruction`` / ``Input`` / ``Output``
    records and written to ``./arvix_instruction_dataset.json``.
    """

    def __init__(self, folder_path='/home/paulo/Python_projects/llama3_7b_fine_tunning/pdfs'):
        """Collect the input files and build the prompt/model chain.

        Args:
            folder_path: Directory holding the PDF files. Defaults to the
                location originally hard-coded in this class, so existing
                ``FineTuning()`` calls behave as before.
        """
        self.folder_path = folder_path
        # '*' matches every entry in the folder: each one is later handed to
        # PyPDFLoader, so the folder is expected to contain only PDFs.
        self.pdf_files = glob.glob(os.path.join(self.folder_path, '*'))

        # temperature=0 keeps the generation as repeatable as the model allows;
        # max_retries lets the client retry failed requests up to 30 times.
        # NOTE(review): credentials are presumably read from the environment
        # (see the load_dotenv() call in this notebook) — confirm.
        chat = ChatGroq(
            temperature=0,
            model="llama3-70b-8192",
            max_retries=30
        )

        system = utils.instruction_for_creating_triples
        human = "{text}"
        prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

        # Runnable pipeline: fill the prompt template, then call the chat model.
        self.chain = prompt | chat

    def loading_pdfs(self):
        """Load every file in ``self.pdf_files`` into ``self.loaded_pdfs``.

        The result is a list with one entry per file; each entry is the list
        of per-page documents returned by ``PyPDFLoader(...).load()``.
        """
        self.loaded_pdfs = [PyPDFLoader(pdf).load() for pdf in self.pdf_files]

    def creating_instruction_dataset(self):
        """Ask the model for instruction triples, one request per PDF page.

        Populates:
            self.list_instructions_unstructured: raw model answers, in
                processing order.
            self.list_pages_processed: ``(paper, page)`` tuples for the pages
                that were answered successfully.

        A page whose request fails is reported on stdout and skipped, so a
        single failure does not discard the answers already collected.
        """
        # Allow calling this step on its own: load the PDFs on demand instead
        # of failing with AttributeError when loading_pdfs() was not run first.
        if not hasattr(self, 'loaded_pdfs'):
            self.loading_pdfs()

        self.list_instructions_unstructured = []
        self.list_pages_processed = []

        for pdf_paper in self.loaded_pdfs:
            for pdf_page in pdf_paper:
                # Resolve the metadata once, outside the f-strings. Nesting
                # the same quote type inside an f-string only parses on
                # Python 3.12+, and .get() keeps the error report itself from
                # raising when a key is missing.
                page = pdf_page.metadata.get('page')
                source = pdf_page.metadata.get('source')
                try:
                    answer = self.chain.invoke({"text": pdf_page.page_content})
                except Exception as e:
                    # Deliberate best-effort: report and move on to the next page.
                    print(f"Error on paper {source} on page {page}: {e}.")
                else:
                    self.list_instructions_unstructured.append(answer)
                    self.list_pages_processed.append((pdf_paper, pdf_page))
                    print(f"Page {page} from paper {source} processed.")

    @staticmethod
    def formating_instruction_dataset(text):
        """Parse one model answer into a list of instruction records.

        The answer is expected to contain ``Instruction: ...`` and
        ``Input: ...`` fields (each captured up to the end of its line) and
        ``Output: ...`` fields (each captured up to the next ``Triple`` marker
        or the end of the text, trailing whitespace included).

        Args:
            text: Raw text returned by the model for a single page.

        Returns:
            A list of dicts with the keys ``"Instruction"``, ``"Input"`` and
            ``"Output"``. Fields are paired by position; when the three fields
            do not occur the same number of times, the surplus matches are
            dropped. An answer without any complete triple yields ``[]``.
        """
        instruction_pattern = re.compile(r'Instruction: (.*?)\n', re.DOTALL)
        input_pattern = re.compile(r'Input: (.*?)\n', re.DOTALL)
        output_pattern = re.compile(r'Output: (.*?)(?=Triple|\Z)', re.DOTALL)

        instructions = instruction_pattern.findall(text)
        inputs = input_pattern.findall(text)
        outputs = output_pattern.findall(text)

        # zip() stops at the shortest list. Indexing all three lists by
        # len(instructions) raised IndexError whenever the model left out an
        # Input or Output, which aborted the whole dataset build.
        return [
            {"Instruction": instruction, "Input": input_, "Output": output}
            for instruction, input_, output in zip(instructions, inputs, outputs)
        ]

    def return_instruction_dataset(self):
        """Run the full pipeline and return the flattened instruction dataset.

        Loads the PDFs, queries the model for every page, parses each answer,
        and writes the flattened list of records to
        ``./arvix_instruction_dataset.json``.

        Returns:
            The flat list of ``{"Instruction", "Input", "Output"}`` dicts,
            also stored in ``self.list_of_dicionaries``. The per-page lists
            are kept in ``self.list_triples``.
        """
        self.loading_pdfs()
        self.creating_instruction_dataset()

        # One list of records per successfully processed page.
        self.list_triples = [
            self.formating_instruction_dataset(instruction_element.content)
            for instruction_element in self.list_instructions_unstructured
        ]

        # Flatten the per-page lists into a single list of records.
        list_of_dicionaries = [
            instruction_set
            for triple_ in self.list_triples
            for instruction_set in triple_
        ]

        # Explicit encoding so the file does not depend on the platform default.
        with open("./arvix_instruction_dataset.json", "w", encoding="utf-8") as f:
            json.dump(list_of_dicionaries, f)

        self.list_of_dicionaries = list_of_dicionaries
        return self.list_of_dicionaries

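# Example usage — runs the whole pipeline (one model request per PDF page) and
# writes ./arvix_instruction_dataset.json:
# finetuning = FineTuning()
# triples = finetuning.return_instruction_dataset()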

In [4]:
# Build the pipeline object (its constructor also assembles the prompt | chat
# chain) and load the files found in its PDF folder. Nothing in this cell
# calls chain.invoke, so no page is sent to the model here.
finetuning = FineTuning()
finetuning.loading_pdfs()
# Display the result: one inner list per file, one Document per page.
finetuning.loaded_pdfs

[[Document(metadata={'source': '/home/paulo/Python_projects/llama3_7b_fine_tunning/pdfs/2407.02987.pdf', 'page': 0}, page_content='LoRA-Guard : Parameter-Efficient Guardrail Adaptation for Content\nModeration of Large Language Models\nHayder Elesedy Pedro M. Esperança Silviu Vlad Oprea Mete Ozay\nSamsung R&D Institute UK (SRUK), United Kingdom\nCorrespondence: {p.esperanca, m.ozay}@samsung.com\nAbstract\nGuardrails have emerged as an alternative\nto safety alignment for content moderation\nof large language models (LLMs). Exist-\ning model-based guardrails have not been\ndesigned for resource-constrained computa-\ntional portable devices, such as mobile phones,\nmore and more of which are running LLM-\nbased applications locally. We introduce\nLoRA-Guard , a parameter-efficient guardrail\nadaptation method that relies on knowledge\nsharing between LLMs and guardrail mod-\nels. LoRA-Guard extracts language features\nfrom the LLMs and adapts them for the con-\ntent moderation task using 

In [15]:
# Concatenate the text of every page of the first loaded PDF into a single
# string, separating consecutive pages with one space.
texto = ' '.join(doc.page_content for doc in finetuning.loaded_pdfs[0])
texto

'LoRA-Guard : Parameter-Efficient Guardrail Adaptation for Content\nModeration of Large Language Models\nHayder Elesedy Pedro M. Esperança Silviu Vlad Oprea Mete Ozay\nSamsung R&D Institute UK (SRUK), United Kingdom\nCorrespondence: {p.esperanca, m.ozay}@samsung.com\nAbstract\nGuardrails have emerged as an alternative\nto safety alignment for content moderation\nof large language models (LLMs). Exist-\ning model-based guardrails have not been\ndesigned for resource-constrained computa-\ntional portable devices, such as mobile phones,\nmore and more of which are running LLM-\nbased applications locally. We introduce\nLoRA-Guard , a parameter-efficient guardrail\nadaptation method that relies on knowledge\nsharing between LLMs and guardrail mod-\nels. LoRA-Guard extracts language features\nfrom the LLMs and adapts them for the con-\ntent moderation task using low-rank adapters,\nwhile a dual-path design prevents any perfor-\nmance degradation on the generative task. We\nshow that LoRA-Gu

In [22]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

import pickle

# Configure the splitter: target chunk size of 1000 characters with a
# 50-character overlap, cutting only at ".\n" boundaries; the separator itself
# is not kept in the chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
    separators=['.\n'],
    keep_separator=False,
)

# Break the concatenated paper text into chunks
chunks = text_splitter.split_text(texto)

# Show every chunk under a 1-based header, followed by five newlines of spacing
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk}\n", end='\n' * 5)

# Persist the list of chunks with pickle (binary file, not JSON)
with open('./list_text.pkl', "wb") as file:
    pickle.dump(chunks, file)

Chunk 1:
LoRA-Guard : Parameter-Efficient Guardrail Adaptation for Content
Moderation of Large Language Models
Hayder Elesedy Pedro M. Esperança Silviu Vlad Oprea Mete Ozay
Samsung R&D Institute UK (SRUK), United Kingdom
Correspondence: {p.esperanca, m.ozay}@samsung.com
Abstract
Guardrails have emerged as an alternative
to safety alignment for content moderation
of large language models (LLMs). Exist-
ing model-based guardrails have not been
designed for resource-constrained computa-
tional portable devices, such as mobile phones,
more and more of which are running LLM-
based applications locally. We introduce
LoRA-Guard , a parameter-efficient guardrail
adaptation method that relies on knowledge
sharing between LLMs and guardrail mod-
els. LoRA-Guard extracts language features
from the LLMs and adapts them for the con-
tent moderation task using low-rank adapters,
while a dual-path design prevents any perfor-
mance degradation on the generative task. We
show that LoRA-Guard outperform

In [23]:
!pip install datasets

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Using cached filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Using cached pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Using cached tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.

In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm
