In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import BitsAndBytesConfig

In [3]:
quantizations_config = BitsAndBytesConfig(load_in_4bit=True,
                                          bnb_4bit_compute_dtype=torch.float16)

In [4]:
from transformers.utils import is_flash_attn_2_available

In [5]:
## teste se a gpu aguenta um score de > 8
if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8:
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"

In [6]:
attn_implementation

'sdpa'

In [7]:
is_flash_attn_2_available()

False

In [8]:
model_checkpoint = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

In [9]:
torch.cuda.get_device_capability(0)[0]

8

In [10]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_checkpoint)

In [11]:
use_quantization_config = True

In [13]:
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_checkpoint,
                                              #   torch_dtype= torch.float16,
                                                 quantization_config=quantizations_config,
                                              #   low_cpu_mem_usage = False,
                                                 attn_implementation=attn_implementation,
                                                 )

`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.64s/it]


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
device

device(type='cuda')

In [16]:
llm_model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((409

In [17]:
def get_num_parametros(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

In [18]:
get_num_parametros(model=llm_model)

4540600320

In [19]:
## Carregando o retrieval

In [22]:
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import util, SentenceTransformer
import torch
import textwrap

In [23]:
dataset = pd.read_pickle("Dataset/Embedded_chunks.pkl")
    
## carregando o modelo
model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")

## convertendo nossa lista de embeddings para tensor
embeddings = torch.stack(list(dataset["embedding"]))

## Convertendo o dataset para lista de dicionários
dataset = dataset.to_dict(orient="records")

In [None]:
## RAG is in the order!!

In [24]:
import retrieval

In [25]:
def formatador_prompt(query: str,
                      context_items: list[dict])-> str:
    context = "- "+ "\n- ".join(item["paragrafo_chunk"] for item in context_items)

    prompt = f"""

    Based on the following context items, please answer the query:
    Context items:
    {context}

    Query: {query}

    Answer:

    """
    return prompt

In [30]:
def prompting(query: str):
    _, indices = retrieval.busca_contexto_relevante(query=query,
                                               embeddings=embeddings,
                                               model=model,
                                               numero_de_contextos=10)
    context_items = [dataset[i] for i in indices]
    return formatador_prompt(query=query,context_items=context_items)

In [31]:
prompt = prompting(query="Describe the V's of big data")

In [32]:
inputs_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

In [33]:
outputs = llm_model.generate(**inputs_ids,
                             temperature = 0.5,
                             do_sample= True,
                             max_new_tokens = 512)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [39]:
output_text = tokenizer.decode(outputs[0])

In [40]:
print(output_text)

<｜begin▁of▁sentence｜>

    Based on the following context items, please answer the query:
    Context items:
    - In simple words, big data can be combined as three Vs.The three Vs are volume, variety and velocity.These three can be explained as follows.Volume refers to the large amount of data which is being generated at a constant rate in different industries.So, it simply is large amounts of data.The second ‘V’, variety, means that the data can be of Big Data—A New Technology Trend and Factors … 261
- different types such as documents, images, databases.The third ‘V’ is velocity.As stated previously, the amount of data being generated is very high.For example, the number of images uploaded on Facebook last year is more than the total images in previous years [23].Therefore, it is important to consider this factor thoroughly while working on big data implementation.Apart from that, it is also important to see the speed of retrieval of data as well from these large data sets.However,

In [41]:
output_text = output_text.replace(prompt,"").replace("<｜begin▁of▁sentence｜>","").replace("</think>","").replace("<think>","").replace("<｜end▁of▁sentence｜>","")

In [43]:
print(output_text)

 The three Vs of big data are Volume, Variety, and Velocity. They characterize the unique aspects of big data. Volume refers to the large amount of data generated, Variety refers to the diverse types of data, and Velocity refers to the speed at which data is generated and processed. These Vs distinguish big data from traditional data, ensuring that organizations recognize the challenges and opportunities associated with managing and analyzing big data.


The three Vs of big data are Volume, Variety, and Velocity. These characteristics define the unique aspects of big data, distinguishing it from traditional data management practices. 

- **Volume** refers to the large amount of data generated, often measured in terabytes, petabytes, or even larger units like exabytes and zettabytes.
- **Variety** indicates the diverse types of data, which can include text, images, documents, audio, video, and more, reflecting the broad range of data sources.
- **Velocity** describes the rapid speed at 