# RAG

# Installing requirements

In [1]:
!pip install mwparserfromhell --quiet
!pip install datasets --quiet
!pip install torch  transformers accelerate bitsandbytes pypdf chromadb sentence-transformers pydantic --quiet
!pip install llama-index llama-index-embeddings-huggingface llama-index-llms-huggingface llama-index-readers-file llama-index-vector-stores-chroma llama-index-llms-anthropic --quiet
!pip install rouge-score



# Loading Dataset

Wikipédia szedetet fogok használni datasetnek, de annak

In [2]:
from datasets import load_dataset


dataset =load_dataset("wikipedia", "20220301.simple",trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# SET up model

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
import sys
import chromadb
from llama_index.core import VectorStoreIndex, download_loader, ServiceContext, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core.storage.storage_context import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
from llama_index.core import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM
from pathlib import Path
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
)
from llama_index.core.response.pprint_utils import pprint_response
from llama_index.core.postprocessor import SentenceTransformerRerank








Itt létrehozom a modelt (A zephyr 7b beta modellt választottam).
Használok quantifikációt, hogy könnyebben fusson a colab gépen.
Ezenkívül a paramétereket olyanra állítottam, hogy pontosabb, fókuszált válaszokat adjon.

In [4]:
import torch
from transformers import BitsAndBytesConfig
from llama_index.core.prompts import PromptTemplate
from llama_index.llms.huggingface import HuggingFaceLLM

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)


llm = HuggingFaceLLM(
    model_name="HuggingFaceH4/zephyr-7b-beta",
    tokenizer_name="HuggingFaceH4/zephyr-7b-beta",
    query_wrapper_prompt=PromptTemplate("<|system|>\n</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
    context_window=3900,
    max_new_tokens=256,
    model_kwargs={"quantization_config": quantization_config},
    # tokenizer_kwargs={},
    generate_kwargs={"temperature": 0.5, "top_k": 25, "top_p": 0.8},

    device_map="auto",
)



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Beállítom az embedding modellt és a változókat.

In [5]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.chunk_size = 1024
Settings.chunk_overlap = 50

Így válaszol a LLM RAG nélkül.

In [6]:
llm.complete("What happened on 20 of April?")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


CompletionResponse(text='Here are a few significant events that occurred on April 20 in history:\n\n1. Columbine High School massacre: On this day in 1999, two students, Eric Harris and Dylan Klebold, entered Columbine High School in Littleton, Colorado, and carried out a shooting rampage that left 12 students and one teacher dead before they committed suicide.\n\n2. Exxon Valdez oil spill: On April 20, 1989, the Exxon Valdez, an oil tanker, ran aground in Prince William Sound, Alaska, spilling over 10 million gallons of crude oil into the ocean and causing significant environmental damage.\n\n3. NASA launches Space Shuttle Discovery: On April 20, 1990, NASA launched the Space Shuttle Discovery on mission STS-31, carrying the Hubble Space Telescope into orbit.\n\n4. Death of Kurt Cobain: On this day in 1994, the lead singer and guitarist of the grunge band Nirvana, Kurt Cobain, was found dead in his Seattle home.\n\n', additional_kwargs={}, raw={'model_output': tensor([[    1,   523, 2

# Dataset feldolgozása
Az adatokat átalakítom Document formára, és utána elmentjük a vectorstoreindexbe őket. (A folyamat közben fel lesznek darabolva, és beágyazva, hogy lehessen később keresni belőle)

Az adatokat ChromeVectorStoreban tároljuk, hogy ha el akarjuk menteni/módosítani, nagy mennyiségű adatnál hasznos.

In [7]:
from llama_index.core import Document
from llama_index.core import VectorStoreIndex


documents = [
    Document(text=f"{row['title']}\n{row['text']}", id = f"doc_id_{i}")
    for i, row in enumerate(dataset["train"])
]

In [8]:
#Gyorsabb futás érdekében csak az adatok részét használjuk
documents = documents[:10000]

In [28]:
client = chromadb.PersistentClient(path="./test")
collection = client.get_or_create_collection(name="firstcollection5")

In [26]:
# Set up ChromaVectorStore and load in data
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [27]:


# Create the VectorStoreIndex from the documents
index = VectorStoreIndex.from_documents(
    documents, show_progress=True,  verbose = True, storage_context =storage_context)

Parsing nodes:   0%|          | 0/10000 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2048 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/2021 [00:00<?, ?it/s]

#Eredmény

Itt látszani fog, hogyan használja a RAG-t. Felrakjuk a kérdést, ez alapján a kérdéshez közel álló indexeket előveszi és berakja a contextbe, ami alapján válaszol az LLM.

In [39]:
query="What is the similiraity between December and April?"

query_engine =index.as_query_engine(similarity_top_k=5)
response = query_engine.query(query)
pprint_response(response)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Final Response: December and April share a unique similarity in that
they both begin and end on the same day of the week in a given year,
particularly in common years. In these years, December starts and ends
on the same day of the week as September, and April starts and ends on
the same day of the week as October. This means that the first and
last days of December and April are exactly 35 weeks (245 days) apart.
This similarity is not always present, as in leap years, July also
shares this characteristic with January. However, in common years
immediately before other common years, January starts on the same day
of the week as April and July of the following year, and in leap years
and years immediately before that, September and December of the
following year. January finishes on the same day of the week as July
of the following year, and in leap years and years immediately before
that, April and December of the following year. This unique similarity
is not shared by any other months

Itt látható a context amiből válaszol. Pár context fölösleges

In [40]:
response

Response(response='December and April share a unique similarity in that they both begin and end on the same day of the week in a given year, particularly in common years. In these years, December starts and ends on the same day of the week as September, and April starts and ends on the same day of the week as October. This means that the first and last days of December and April are exactly 35 weeks (245 days) apart. This similarity is not always present, as in leap years, July also shares this characteristic with January. However, in common years immediately before other common years, January starts on the same day of the week as April and July of the following year, and in leap years and years immediately before that, September and December of the following year. January finishes on the same day of the week as July of the following year, and in leap years and years immediately before that, April and December of the following year. This unique similarity is not shared by any other mon

Itt reranking segítségével leszűkitjük a contextet (ebben az esetben 1-re, amitől a példa jól látszik, de valós esetben nem kell ilyen kicsire csökkenteni.)

In [14]:
rerank = SentenceTransformerRerank( model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=1)

In [15]:
query_engine = index.as_query_engine(similarity_top_k=5, node_postprocessors=[rerank] )
response = query_engine.query(query)
response

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Response(response='December and April both end on the same day of the week. This is because December has 31 days and ends on the same calendar date as April, which is exactly 35 weeks (245 days) apart. This means that if December 31 falls on a Tuesday, for example, then April 30 will also fall on a Tuesday. This pattern repeats every year, regardless of leap years.', source_nodes=[NodeWithScore(node=TextNode(id_='2fc89856-d027-41eb-b281-7ff66ee76b33', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='9dcdaa7d-8671-4abf-87da-401646a0d4ee', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='123ce736a765ded1a48581ad7ccd8bcac1337ed6f81f505a31ce243b2833a87c'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b7f1cd0a-4c7f-4514-b9a2-c7cfb7bb49ed', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='ed65d1204aabebf228306973511f22574e4002cba35be37a6d6fbd28d7bc41e4')},

Itt létrehozzuk a chatbotot, adunk promptot neki, hogy milyen magatartást/választ várunk tőle.

In [16]:
from llama_index.core.memory import ChatMemoryBuffer
memory = ChatMemoryBuffer.from_defaults(token_limit=1500)
chat_engine = index.as_chat_engine(chat_mode="context", verbose=True, memory=memory,
    system_prompt=(
        "You are a chatbot, you have to answer the questions asked. Only use the context provided, dont use any previously known information, do not hallucinate."
    ),
    node_postprocessors=[rerank])

#Chatbot
És itt lehet beszélgetni a chatbottal, ami emlékezik a beszélgetés egy részére, és a "bye"-al ki lehet lépni, ami után elfelejti az eddigi beszégetést.

In [17]:
print("If you want to leave the conversation, say bye")
while True:
  user_input = input("Enter your query: ")
  if user_input.lower() == "bye":
    break
  response = chat_engine.chat(user_input)
  pprint_response(response)
chat_engine.reset()

If you want to leave the conversation, say bye
Enter your query: What is the similiraity between December and April?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Final Response: December and April both end on the same day of the
week. This is because each month's last day is exactly 35 weeks (245
days) apart, which results in the same weekday for the final day of
both months.
Enter your query: And february?


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Final Response: February also ends on the same day of the week as
January in common years, as each other's last days are exactly 4 weeks
(28 days) apart. However, in leap years, February is the only month to
begin and end on the same day of the week. Additionally, February
begins on the same day of the week as March and November in common
years, and on the same day of the week as August in leap years.
Enter your query: bye


#Válaszok értékelése:
1.: A felhasználóktól meg lehet kérdezni hogy elégedettek voltak-e a programmal. Figyelni kell rá, hogy a felhasználók gyakrabban jeleznek vissza valamiről, ami rosszul működik, mintha valami jól.

2.: Lehet egy verifikációs adathalmazt tartani elvárt válaszokkal, és ezt össze lehet hasonlítani a kapott válasszal. (pl: ROUGE,  Recall-Oriented Understudy for Gisting Evaluation, vagy embedding alapján.)

3.: Pár, az üzlethez értő szakember leteszteli olyan kérdésekkel, amik szerintük gyakran előfordulnak.

In [18]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score('The quick brown fox jumps over the lazy dog',
                      'The quick brown dog jumps on the log.')
scores

{'rouge1': Score(precision=0.75, recall=0.6666666666666666, fmeasure=0.7058823529411765),
 'rougeL': Score(precision=0.625, recall=0.5555555555555556, fmeasure=0.5882352941176471)}

#Elérhetővé tétel
Ez a program egy API ként futna belső szerveren,hogy könnyen lehessen kezelni, hogy ki és mennyi ideig férhet hozzá. Az API elérhető lehet webes felületről/alkalmazásba beépítve/mobilról is akár, (sőt kiegészíthető hangfelismerés és felolvasással is, üzleti igény függő). Mivel a chatbot használata költséges, ezért csak visszaigazolt felhasználóval szabad használni, és előfizetéstől függően változik a limit és priority ha sokan használják.

#Továbbfejlesztés
A használt modelleket/database-t letölteni és onnan betölteni.

A kód osztályokba és függvényekbe szervezése. Könnyebb bemutatni a működését ahogyan most van egy notebookból, de később össze kell szervezni könnyebb felhasználhatóság/fejlesztés/módosítás miatt.


In [37]:
import pandas as pd
from datetime import datetime


data = {
    "Question": ["What is the similiraity between December and April?", "And february?"],
    "Response": ["December and April both end on the same day of the week. This is because each month's last day is exactly 35 weeks (245 days) apart, which results in the same weekday for the final day of both months.",
                 """February also ends on the same day of the week as
January in common years, as each other's last days are exactly 4 weeks
(28 days) apart. However, in leap years, February is the only month to
begin and end on the same day of the week. Additionally, February
begins on the same day of the week as March and November in common
years, and on the same day of the week as August in leap years."""],
}

dashboard = pd.DataFrame(data)


dashboard.head()


Unnamed: 0,Question,Response
0,What is the similiraity between December and A...,December and April both end on the same day of...
1,And february?,February also ends on the same day of the week...


In [38]:
dashboard.to_csv("dashboard_input_table.csv", index=False)