### Notes: If ImportError occurs, it's probably due to the huggingface-hub. 
> pip install huggingface-hub==0.25.0

## Import

In [1]:
import torch
from transformers import pipeline

from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, TextStreamer, pipeline, BitsAndBytesConfig, AutoModelForCausalLM
import os

DEVICE = "cuda:0" if torch.cuda.is_available() else "CPU"
HUGGING_FACE_TOKEN = os.environ.get('HUGGING_FACE_TOKEN') #in terminal: export HUGGING_FACE_TOKEN="YOUR_TOKEN"


## Load Model

In [11]:
'''
Possible Models:
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.2-11B-Vision-Instruct
'''
model_id = "meta-llama/Llama-3.2-3B-Instruct" 

# Quanitisize your model dtype (for sparsity)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Set token using ENV variable
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HUGGING_FACE_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=HUGGING_FACE_TOKEN,
    quantization_config=bnb_config
)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [12]:
prompt = f"What's your name?"
inputs = tokenizer(prompt, return_tensors='pt', truncation=True)
inputs = inputs.to('cpu')  # Ensure inputs are on CPU

output = model.generate(
    **inputs,
    max_new_tokens=50,
    num_beams=1,
    do_sample=False,
    temperature=1
)

answer = tokenizer.decode(output[0], skip_special_tokens=True)

print(answer)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


What's your name? I'm not sure if I should be calling you by a name or not. I'm not sure if I'm allowed to know your name.
I'm not sure if I should be asking you this, but I'm curious. I'm a large


In [13]:
loader = PyPDFDirectoryLoader("/home/dongkyu/RAG/knowledge") #The Knowledge Base Folder
docs = loader.load()
len(docs) # this should give you number of pages

13

In [14]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(docs)
len(texts)


51

In [15]:
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-base",
    model_kwargs={"device": DEVICE}
)

db = Chroma.from_documents(texts, embeddings, persist_directory="db")


load INSTRUCTOR_Transformer
max_seq_length  512


In [16]:
DEFAULT_SYSTEM_PROMPT = """
Based on the information in this document provided in context, answer the question as accurately as possible in 1 or 2 lines. If the information is not in the context,
respond with "I don't know" or a similar acknowledgment that the answer is not available.
""".strip()


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    return f"""
[INST] <<SYS>>
{system_prompt}
<</SYS>>

{prompt} [/INST]
""".strip()

SYSTEM_PROMPT = "Use the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines."

template = generate_prompt(
    """
{context}

Question: {question}
""",
    system_prompt=SYSTEM_PROMPT,
)

prompt = PromptTemplate(template=template, input_variables=["context", "question"])


In [17]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)

llm = HuggingFacePipeline(pipeline=text_pipeline)

In [18]:
ask = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=False,
    chain_type_kwargs={"prompt": prompt},
)

result = ask("Give me a TLDR of this document")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


 

Here is a brief summary:

Plasticity loss refers to the degradation of an agent's ability to learn and adapt to new information over time. This phenomenon was previously overlooked but has been gaining attention in recent years due to its implications for continual learning and deep reinforcement learning.

In both domains, plasticity loss can be attributed to the gradual erosion of neural connections between neurons, leading to decreased performance on subsequent tasks. The causes of plasticity loss are multifaceted, including changes in task distributions, reduced novelty, and increased repetition.

Researchers have proposed various methods to mitigate plasticity loss, such as regularization techniques, knowledge distillation, and transfer learning. However, these approaches often come with trade-offs, and there is still much to be learned about how to effectively address plasticity loss in different contexts.

Overall, addressing plasticity loss remains a pressing challenge in bo

In [19]:
result

{'query': 'Give me a TLDR of this document',
 'result': "[INST] <<SYS>>\nUse the following pieces of context to answer the question at the end. Do not provide commentary or elaboration more than 1 or 2 lines.\n<</SYS>>\n\n\nthe continual learning literature has primarily focused on reducing catastrophic forgetting [Goodfellow et al., 2013,\nKirkpatrick et al., 2017], more recently, the issue of plasticity loss has gained significant attention [Dohare et al.,\n2021, 2023, Abbas et al., 2023]. Dohare et al. [2021] demonstrated that loss of plasticity sometimes becomes evident\nonly after training for long sequences of tasks. Therefore, in continual learning, mitigating plasticity loss becomes\nespecially important as agents encounter many tasks, or more generally a non-stationary data stream, over a long\nlifetime.\nReinforcement Learning. Plasticity loss has also gained significant attention in the deep reinforcement learning\n(RL) literature [Igl et al., 2020, Kumar et al., 2020, Nikis