## Dataset:

In [None]:
# Import necessary libraries
import random
import re

import pandas as pd
from tqdm.auto import tqdm

# Global variables
ACCESS_LOG_FILE_PATH = "drive/MyDrive/access.log" # Our access data path
# Each named group below becomes one key of a parsed log record:
# ip, timestamp, method, url, protocol, status, size, referrer, user_agent, other.
# The pattern requires a numeric size, so lines whose size field is not made of
# digits will not match.
LOG_PATTERN = re.compile( # Log pattern in our access.log data
  r'(?P<ip>\S+) - - \[(?P<timestamp>.*?)\] "(?P<method>\S+) (?P<url>\S+) '
  r'(?P<protocol>\S+)" (?P<status>\d{3}) (?P<size>\d+) "(?P<referrer>.*?)" '
  r'"(?P<user_agent>.*?)" "(?P<other>.*?)"'
)
DELETED_KEYS = ["url" ,"embedding", "referrer", "other"] # (Optional) keys passed to drop_keys for removal
NUM_EXAMPLES = 1000 # Take the first 1000 examples

def read_access_log_file(access_log_file_path, log_pattern, num_examples=1000):
  """
  Parse the first lines of an access log into a list of dictionaries.

  Args:
    access_log_file_path: Path of the access log file to read.
    log_pattern: Regular expression (string or already-compiled pattern) with
      named groups; each matching line yields one dict of group name -> text.
    num_examples: Maximum number of lines to read from the start of the file.
      Lines that do not match the pattern are skipped, so fewer records than
      this may be returned.

  Returns:
    A list with one dictionary per matching line, in file order.
  """
  # re.compile accepts both a pattern string and a compiled pattern
  log_pattern = re.compile(log_pattern)
  access_logs = []

  # Decode explicitly as UTF-8 instead of the platform default, and replace
  # undecodable bytes so one malformed request line cannot abort the whole read.
  with open(access_log_file_path, "r", encoding="utf-8", errors="replace") as file:
    for _ in tqdm(range(num_examples), desc="Processing lines"):
      line = file.readline()
      if not line:
        # End of file reached before num_examples lines were read
        break
      match = log_pattern.match(line)
      if match:
        access_logs.append(match.groupdict())

  return access_logs

def drop_keys(access_logs_dict, deleted_keys):
  """
  Remove the given keys from every log record, in place.

  A key that a record does not contain is skipped instead of raising
  KeyError, so the function can be called before the 'embedding' key has been
  added and can safely be run more than once on the same records.

  Args:
    access_logs_dict: List of log record dictionaries to modify.
    deleted_keys: Iterable of keys to remove from each record.
  """
  for key in deleted_keys:
    # One pass (and one progress bar) per key
    for record in tqdm(access_logs_dict):
      record.pop(key, None)

def extract_browser_and_os(user_agent):
  """
  Derive a browser label and an operating-system label from a user-agent string.

  For each of the two lookups the first (leftmost) known token found in the
  string is used. 'MSIE' and 'Trident' are reported as 'Internet Explorer',
  'Windows NT <x.y>' is reported as 'Windows <x.y>' and 'Mac OS X' as 'Mac OS'.

  Returns:
    A (browser, os) tuple; 'unknown' / 'unknown OS' when no token is found.
  """
  browser_hit = re.search(r'\b(Chrome|Firefox|Safari|Opera|Edge|MSIE|Trident|Googlebot|bing|AhrefsBot)\b', user_agent)
  if browser_hit is None:
    browser_info = "unknown"
  else:
    browser_token = browser_hit.group(0)
    browser_info = "Internet Explorer" if browser_token in ("MSIE", "Trident") else browser_token

  os_hit = re.search(r'\b(Windows NT|Android|iPhone|iPad|Mac OS X|Linux|Windows Phone|Macintosh)\b', user_agent)
  if os_hit is None:
    return browser_info, "unknown OS"

  # Make the operating-system description more readable
  os_info = os_hit.group(0)
  if os_info == "Windows NT":
    nt_version = re.search(r'Windows NT (\d+\.\d+)', user_agent)
    os_info = f"Windows {nt_version.group(1)}" if nt_version else "Windows"
  elif os_info == "Mac OS X":
    os_info = "Mac OS"

  return browser_info, os_info

# Status code -> phrase used in the generated context sentence
_STATUS_PHRASES = {
  '200': "successfully accessed the page",
  '404': "encountered an error while accessing the page",
  '302': "found a result in the page",
  '301': "the page user tried to access was moved permanently",
}

# Substrings of the user agent that mark a request as coming from a mobile device
_MOBILE_KEYWORDS = ("Mobile", "Android", "iPhone", "iPad", "Windows Phone")


def _describe_action(url):
  """Return the phrase describing what a request for `url` did (always a string)."""
  if '/static/images/' in url:
    if "amp" in url:
      if 'blog.png' in url:
        return "accessed a blog image"
      if 'instagram.png' in url:
        return "accessed an Instagram image"
      if 'telegram.png' in url:
        return "accessed a Telegram image"
      # "amp" matched but no known icon did (e.g. ".../stamp.png"): fall through
      # to the generic image handling instead of leaving the action undefined.
    if '/static/images/guarantees/' in url:
      guarantee_type = url.split('/static/images/guarantees/')[1].split('.')[0]
      guarantee_type = guarantee_type.replace('-', ' ')
      return f"viewed a guarantee image for {guarantee_type}"
    return "loaded a static image"

  if '/static/' in url:
    if 'css' in url:
      return "accessed a CSS file"
    if 'js' in url:
      return "accessed a JavaScript file"
    if 'png' in url:
      return "accessed an image"
    # Any other static asset (fonts, icons, ...) still needs a description
    return "accessed a static file"

  if '/image/' in url:
    product_id_match = re.search(r'/image/(\d+)', url)
    product_id = product_id_match.group(1) if product_id_match else "unknown"
    return f"viewed a product image with ID {product_id}"

  # '/product/' is a substring of '/m/product/', so this also covers mobile URLs
  if '/product/' in url:
    product_id = url.split('/product/')[1].split('/')[0]
    return f"viewed product with ID {product_id}"

  # '/filter/' is a substring of '/m/filter/', so this also covers mobile URLs
  if '/filter/' in url:
    filter_params = url.split('/filter/')[1].replace('%2C', ',')
    return f"applied filter parameters: {filter_params}"

  if '/settings/logo' in url:
    return "accessed logo settings"

  if '/m/article/' in url:
    article_id = url.split('/m/article/')[1]
    return f"viewed article with ID {article_id}"

  if '/m/browse/' in url:
    browse_term = url.split('/m/browse/')[1].replace('-', ' ')
    return f"searched for {browse_term}"

  if '/ajaxFilter/' in url:
    filter_params = url.split('/ajaxFilter/')[1].split('?')[0]
    page_match = re.search(r'page=(\d+)', url)
    page_number = page_match.group(1) if page_match else "unknown"
    return f"applied filter parameters {filter_params} on page {page_number}"

  if '/m/updateVariation' in url:
    return "updated product variation"
  if 'site/ping' in url:
    return "pinged the site"
  if '/search' in url:
    return "searched a word in the site"
  return "accessed a page"


def convert_to_context(access_logs_dict):
  """
  Add a natural-language 'context' sentence and a 'log_index' to every record.

  The sentence combines the request method, a phrase for the status code, an
  action derived from the URL, a mobile/desktop flag derived from the user
  agent, and the browser / operating system reported by
  extract_browser_and_os. Records are modified in place; nothing is returned.

  Args:
    access_logs_dict: List of log record dictionaries. The 'url',
      'user_agent', 'method' and 'status' keys are read when present.
  """
  for idx, item in enumerate(access_logs_dict):
    url = item.get('url', '')
    user_agent = item.get('user_agent', '')

    # Computed per record by a helper that always returns a value, so a URL
    # that matches no rule can never inherit the previous record's action.
    action = _describe_action(url)
    status_info = _STATUS_PHRASES.get(item.get('status'), "unknown status")

    is_mobile = any(keyword in user_agent for keyword in _MOBILE_KEYWORDS)
    device_info = "on a mobile device" if is_mobile else "on a desktop device"
    browser_info, os_info = extract_browser_and_os(user_agent)
    user_agent_info = f"using {browser_info} on {os_info}"

    context = (
        f"Using the {item.get('method', 'unspecified')} method, "
        f"{status_info}. The user {action} {device_info} and was {user_agent_info}."
    )

    item['log_index'] = idx
    item["context"] = context

def dict_to_df(access_logs_dict):
  """
  Build a Pandas DataFrame with one row per log record.
  """
  return pd.DataFrame(data=access_logs_dict)

# Read the log file and process it
# (read_access_log_file already returns a list, so no extra list() copy is needed)
access_logs_dict = read_access_log_file(ACCESS_LOG_FILE_PATH, LOG_PATTERN, NUM_EXAMPLES)
convert_to_context(access_logs_dict)
access_logs_dict_df = dict_to_df(access_logs_dict)
# Preview up to two random records; the sample size is capped so a log with
# fewer than two parsed lines does not raise ValueError.
random.sample(access_logs_dict, k = min(2, len(access_logs_dict)))

# Optionally drop unnecessary keys
# drop_keys(access_logs_dict, DELETED_KEYS)

Processing lines:   0%|          | 0/1000 [00:00<?, ?it/s]

[{'ip': '5.209.200.218',
  'timestamp': '22/Jan/2019:03:58:22 +0330',
  'method': 'GET',
  'url': '/settings/logo',
  'protocol': 'HTTP/1.1',
  'status': '200',
  'size': '4120',
  'referrer': 'https://www.zanbil.ir/m/filter/b99%2Cp4510%2Cstexists%2Ct116',
  'user_agent': 'Mozilla/5.0 (Linux; Android 5.1.1; SM-G361H Build/LMY48B) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.91 Mobile Safari/537.36',
  'other': '-',
  'log_index': 632,
  'context': 'Using the GET method, successfully accessed the page. The user accessed logo settings on a mobile device and was using Chrome on Linux.'},
 {'ip': '204.18.198.248',
  'timestamp': '22/Jan/2019:03:56:59 +0330',
  'method': 'GET',
  'url': '/image/60550/productModel/200x200',
  'protocol': 'HTTP/1.1',
  'status': '200',
  'size': '5447',
  'referrer': 'https://www.zanbil.ir/m/filter?f=p12,b185',
  'user_agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148

## Embeddings:

In [None]:
!pip install -q sentence_transformers
# Import necessary libraries and data
from sentence_transformers import util, SentenceTransformer
import torch
import numpy as np
import random
from tqdm.auto import tqdm

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Load the sentence-embedding model onto the selected device
embedding_model = SentenceTransformer(
  model_name_or_path = "all-mpnet-base-v2",
  device = device,
)

def embed_context(access_logs_dict):
  """
  Embed every record's 'context' sentence and store it under 'embedding'.

  All sentences are passed to the model in a single encode() call so the model
  can process them in batches instead of running one forward pass per record.
  Records are modified in place.

  Args:
    access_logs_dict: List of log record dictionaries that contain 'context'.
  """
  contexts = [item["context"] for item in access_logs_dict]
  if not contexts:
    # Nothing to embed
    return

  vectors = embedding_model.encode(contexts, show_progress_bar = True)
  for item, vector in zip(access_logs_dict, vectors):
    item["embedding"] = vector

def convert_np_array(access_logs_dict):
  """
  Replace each record's 'embedding' value with a NumPy array built from it.
  """
  for record in tqdm(access_logs_dict):
    record.update(embedding = np.array(record["embedding"]))

def random_samples(access_logs_dict, num_examples):
  """
  Print `num_examples` records drawn at random (without replacement) from the data.
  """
  picked = random.sample(access_logs_dict, num_examples)
  print("Random examples from the data :\n", picked)

embed_context(access_logs_dict)
convert_np_array(access_logs_dict)
random_samples(access_logs_dict, 1)
access_logs_dict_df = dict_to_df(access_logs_dict)

# Stack the per-record vectors into one float32 matrix (one row per record).
# This stays in NumPy: the previous torch round-trip through "cuda" produced
# the same CPU array but crashed on machines without a GPU.
embeddings = np.stack(access_logs_dict_df["embedding"].tolist(), axis=0).astype("float32")

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Random examples from the data :
 [{'ip': '91.99.72.15', 'timestamp': '22/Jan/2019:03:56:20 +0330', 'method': 'GET', 'url': '/product/32798/63266/%DB%8C%D8%AE%DA%86%D8%A7%D9%84-%D9%81%D8%B1%DB%8C%D8%B2%D8%B1-%D8%B3%DB%8C%D9%86%D8%AC%D8%B1-%D9%85%D8%AF%D9%84-pearl-SR7', 'protocol': 'HTTP/1.1', 'status': '200', 'size': '40250', 'referrer': '-', 'user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.92 Safari/537.36', 'other': '-', 'log_index': 20, 'context': 'Using the GET method, successfully accessed the page. The user viewed product with ID 32798 on a desktop device and was using Chrome on Linux.', 'embedding': array([ 3.80550027e-02, -2.94137299e-02,  2.41061281e-02,  1.58750396e-02,
        1.21096506e-04, -1.23835690e-02,  5.09412400e-02,  4.86864075e-02,
        7.81349652e-03, -3.51880863e-02, -4.37673330e-02, -2.93808710e-02,
        3.66677227e-03,  5.68401907e-03,  1.66021213e-02,  1.23956045e-02,
       -3.26324184e-03,  1.479320

## FAISS Vector Database:

In [None]:
!pip install faiss-gpu
# Import necessary libraries and data
import faiss
import numpy as np

# Creating global variables
# Size of the second axis of the embedding matrix; passed to FAISS as the vector dimension
VECTOR_DIM = embeddings.shape[1]

def to_cpu(embeddings):
  """
  Return the embeddings as a CPU-resident NumPy array.

  A torch tensor (on any device) is detached, moved to the CPU and converted
  to NumPy. Anything else - in this notebook, the NumPy matrix built above -
  is returned unchanged.
  """
  if isinstance(embeddings, torch.Tensor):
    return embeddings.detach().cpu().numpy()

  return embeddings

def create_faiss_index(embedding_vectors, vector_dimension):
  """
  Build a flat L2 FAISS index of the given dimension, add the vectors to it,
  print how many vectors it holds and return it.
  """
  flat_l2_index = faiss.IndexFlatL2(vector_dimension)
  flat_l2_index.add(embedding_vectors)

  stored = flat_l2_index.ntotal
  print(f"Total number of vectors in the index is {stored}")
  return flat_l2_index

# Build the FAISS index over the embedding matrix; `index` is the module-level
# object that retrieve_relevant_resources searches.
embedding_vectors = to_cpu(embeddings)
index = create_faiss_index(embedding_vectors, VECTOR_DIM)

Total number of vectors in the index is 1000


## Retrieve Data:

### sentence_transformers.util.dot_score Data Retrieval:

In [None]:
# Importing necessary libraries and data
from time import perf_counter as timer

def retrieve_relevant_resources_dot_score(query: str,
                                          embeddings: np.ndarray = embeddings,
                                          model: SentenceTransformer = embedding_model,
                                          n_resources_to_return: int = 5,
                                          print_time: bool = True):
  """
  Embeds a query with model and returns top 'n_resources_to_return' scores and indices from embeddings.

  Args:
    query: Free-text query to embed.
    embeddings: Matrix of record embeddings to score against (one row per record).
    model: Embedding model used to encode the query.
    n_resources_to_return: Number of best matches wanted; capped at the number
      of available embeddings.
    print_time: Whether to print how long the scoring took.

  Returns:
    (scores, indices) as returned by torch.topk over the (1, n) score matrix,
    so both have a leading dimension of 1.
  """
  # Embed the query
  query_embedding = model.encode(query, convert_to_tensor = True).cpu().numpy().astype('float32')

  if query_embedding.ndim == 1:
    query_embedding = query_embedding.reshape(1, -1)

  # Get dot scores
  start_time = timer()
  dot_scores = util.dot_score(a = query_embedding, b = embeddings)
  end_time = timer()

  if print_time:
    print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

  # torch.topk raises when k exceeds the number of candidates, so cap it
  k = min(n_resources_to_return, dot_scores.shape[-1])
  scores, indices = torch.topk(input = dot_scores, k = k)

  return scores, indices

def print_top_results_and_scores_dot_score(query: str,
                                 embeddings: np.ndarray = embeddings,
                                 access_logs_dict: list[dict] = access_logs_dict,
                                 n_resources_to_return: int = 5,
                                 print_time: bool = True):
  """
  Finds relevant passages given a query and prints them out along with their scores.

  Args:
    query: Free-text query.
    embeddings: Matrix of record embeddings to score against. Defaults to the
      notebook-level matrix, like the FAISS variant of this function.
    access_logs_dict: Records the returned indices refer to.
    n_resources_to_return: Number of results to print.
    print_time: Whether the retrieval step prints its timing.
  """
  scores, indices = retrieve_relevant_resources_dot_score(query = query, embeddings = embeddings, n_resources_to_return = n_resources_to_return, print_time = print_time)

  # Both tensors have a leading dimension of 1 (a single query)
  scores = scores.tolist()[0]
  indices = indices.tolist()[0]

  # Loop through zipped together scores and indices from torch.topk
  for score, idx in zip(scores, indices):
    print(f"Score : {score:.4f}")
    print("LOG Context :")
    print(f"{access_logs_dict[idx]['context']}")
    print(f"LOG ID : {idx}")
    print(f"LOG IP : {access_logs_dict[idx]['ip']}")
    print("********************************")

### Faiss Data Retrieval:

In [None]:
def retrieve_relevant_resources(query: str,
                                embeddings: np.ndarray = embeddings,
                                model: SentenceTransformer = embedding_model,
                                n_resources_to_return: int = 5,
                                print_time: bool = True):
  """
  Embeds a query with model and returns top 'n_resources_to_return' distances and indices from embeddings.

  The search runs against the module-level FAISS `index`; `embeddings` is only
  used to report how many vectors were searched.

  Args:
    query: Free-text query to embed.
    embeddings: Embedding matrix the index was built from (used for the log line).
    model: Embedding model used to encode the query.
    n_resources_to_return: Number of nearest neighbours wanted; capped at the
      number of vectors stored in the index.
    print_time: Whether to print how long the search took.

  Returns:
    (distances, indices), each with a leading dimension of 1.
  """

  # Embed the query
  query_embedding = model.encode(query, convert_to_tensor=True).cpu().numpy().astype("float32")

  if query_embedding.ndim == 1:
    query_embedding = query_embedding.reshape(1, -1)

  # FAISS pads missing neighbours with index -1, which callers would then use
  # as a (valid!) negative list index - never ask for more than is stored.
  k = min(n_resources_to_return, index.ntotal)
  if k <= 0:
    return np.empty((1, 0), dtype = "float32"), np.empty((1, 0), dtype = "int64")

  # Get the scores and time it
  start_time = timer()
  distances, indices = index.search(query_embedding, k = k)
  end_time = timer()

  if print_time:
    print(f"[INFO] Time taken to get distances on {len(embeddings)} embeddings: {end_time - start_time:.5f} seconds.")

  return distances, indices

# A function to print the results
def print_top_results_and_scores(query: str,
                                 embeddings: np.ndarray = embeddings,
                                 access_logs_dict: list[dict] = access_logs_dict,
                                 n_resources_to_return: int = 5,
                                 print_time: bool = True):
  """
  Finds relevant passages given a query and prints them out along with their distances.

  Args:
    query: Free-text query.
    embeddings: Embedding matrix forwarded to the retrieval step.
    access_logs_dict: Records the returned indices refer to.
    n_resources_to_return: Number of results to print.
    print_time: Whether the retrieval step prints its timing.
  """

  distances, indices = retrieve_relevant_resources(query=query,
                                                   embeddings = embeddings,
                                                   model=embedding_model,
                                                   n_resources_to_return=n_resources_to_return,
                                                   print_time=print_time)

  # `log_id` rather than `index`, so the FAISS index global is not shadowed
  for distance, log_id in zip(distances[0], indices[0]):
    if log_id < 0:
      # FAISS uses -1 for "no neighbour found"; there is no record to show
      continue
    print(f"Distance : {distance:.4f}")
    print("LOG Context :")
    print(f"{access_logs_dict[log_id]['context']}")
    print(f"LOG ID : {log_id}")
    print(f"LOG IP : {access_logs_dict[log_id]['ip']}")
    print("********************************")

In [None]:
# Run the same query through both retrieval back ends and print the top 5 hits
# of each, so the FAISS (L2 distance) and dot-score rankings can be compared.
query = "Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?"

print("FAISS RESULTS:")
print_top_results_and_scores(query = query,
                             embeddings = embeddings,
                             access_logs_dict = access_logs_dict,
                             n_resources_to_return = 5,
                             print_time = True)

print("\n"*3)

print("DOT SCORE RESULTS:")
print_top_results_and_scores_dot_score(query = query,
                                       embeddings = embeddings,
                                       access_logs_dict = access_logs_dict,
                                       n_resources_to_return = 5,
                                       print_time = True)

FAISS RESULTS:
[INFO] Time taken to get distances on 1000 embeddings: 0.00127 seconds.
Distance : 0.5019
LOG Context :
Using the GET method, successfully accessed the page. The user viewed a product image with ID 1 on a mobile device and was using Chrome on Linux.
LOG ID : 383
LOG IP : 31.56.96.51
********************************
Distance : 0.5094
LOG Context :
Using the GET method, successfully accessed the page. The user viewed a product image with ID 18781 on a mobile device and was using Chrome on Linux.
LOG ID : 854
LOG IP : 31.56.96.51
********************************
Distance : 0.5126
LOG Context :
Using the GET method, successfully accessed the page. The user viewed a product image with ID 33631 on a mobile device and was using Chrome on Linux.
LOG ID : 338
LOG IP : 5.78.198.52
********************************
Distance : 0.5126
LOG Context :
Using the GET method, successfully accessed the page. The user viewed a product image with ID 33631 on a mobile device and was using Chrom

## LLM:

In [None]:
!pip install -q bitsandbytes accelerate

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) N
Token is valid (permission: fineGr

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig

In [None]:
# 1. Quantization config (only applied when use_quantization_config is True)
quantization_config = BitsAndBytesConfig(load_in_4bit = True,
                                         bnb_4bit_compute_dtype = torch.float16)
use_quantization_config = False

# 2. Attention mechanism setup: use FlashAttention 2 when it is installed and
#    the GPU reports compute capability 8.x or newer, otherwise PyTorch SDPA
if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8:
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"

# 3. Model ID
model_id = "google/gemma-7b-it"

# 4. Instantiate tokenizer with the saved Hugging Face token for gated model access
#    (`token` replaces the deprecated `use_auth_token` argument)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = model_id, token = True)

# 5. Instantiate model with quantization and memory settings.
#    The attention implementation has to be chosen at load time; assigning it
#    to the config of an already-loaded model does not switch the implementation.
llm_model = AutoModelForCausalLM.from_pretrained(
  pretrained_model_name_or_path = model_id,
  torch_dtype = torch.float16,
  quantization_config = quantization_config if use_quantization_config else None,
  low_cpu_mem_usage = False,
  attn_implementation = attn_implementation,
  token = True,
)

# 6. Move to GPU if not using quantization
if not use_quantization_config:
  llm_model.to(device)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
!nvidia-smi

Thu Aug 22 14:53:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   73C    P0              34W /  72W |  21895MiB / 23034MiB |     12%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Generate Prompt and Output, Apply Chat Template:

In [None]:
# Question sent to the LLM on its own (no retrieved logs are attached here)
INPUT_TEXT = "Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?"

# Create the prompt template for instruction-tuned model
# (a single-turn conversation: one "user" message holding the question)
DIALOGUE_TEMPLATE = [
    {"role": "user",
     "content": INPUT_TEXT}
]

# Count the parameter elements of a model
def get_model_num_params(model: torch.nn.Module):
  """
  Return the total number of elements across all parameters of `model`.
  """
  total = 0
  for parameter in model.parameters():
    total += parameter.numel()
  return total

# Render a conversation through the tokenizer's chat template
def generate_prompt(tokenizer, template, input_text):
  """
  Return the chat-template rendering of `template` as text (not token ids),
  with the generation prompt appended. `input_text` is accepted but not used;
  the text comes from `template`.
  """
  return tokenizer.apply_chat_template(
    conversation = template,
    tokenize = False,
    add_generation_prompt = True,
  )

# A function to generate output with respect to the prompt
def generate_output_tokens(tokenizer, prompt, device, model):
  """
  Tokenize an already chat-templated prompt and generate up to 256 new tokens.

  Args:
    tokenizer: Tokenizer used to turn the prompt into model inputs.
    prompt: Prompt text produced by the chat template.
    device: Device the model inputs are moved to.
    model: Model whose generate() method is called.

  Returns:
    The first (and only) generated sequence: prompt tokens followed by the
    newly generated tokens.
  """
  # The chat template has already written the special tokens (including <bos>)
  # into the prompt text, so the tokenizer must not add them a second time -
  # otherwise the model input starts with a duplicated <bos>.
  input_ids = tokenizer(prompt,
                        return_tensors = "pt",
                        add_special_tokens = False).to(device)

  # Generate outputs from local LLM
  output_tokens = model.generate(**input_ids,
                              max_new_tokens = 256)

  return output_tokens[0]

# Turn generated token ids back into text
def decode_output_tokens(tokenizer, output_tokens):
  """
  Return the tokenizer's decoding of `output_tokens`.
  """
  return tokenizer.decode(output_tokens)

In [None]:
# Count every parameter element of the loaded LLM and print the total
num_params = get_model_num_params(model = llm_model)
print(num_params)

8537680896


In [None]:
# Render the single-turn conversation with the chat template and show both forms
prompt = generate_prompt(
  tokenizer = tokenizer,
  template = DIALOGUE_TEMPLATE,
  input_text = INPUT_TEXT,
)
print("Input text : \n", INPUT_TEXT)
print("Prompt (formatted):\n", prompt)

Input text : 
 Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?
Prompt (formatted):
 <bos><start_of_turn>user
Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?<end_of_turn>
<start_of_turn>model



In [None]:
# Generate token ids for the formatted prompt and show the raw tensor
output_tokens = generate_output_tokens(
  tokenizer = tokenizer,
  prompt = prompt,
  device = device,
  model = llm_model,
)

print("Input text : \n", INPUT_TEXT)
print(f"Model output (tokens): \n{output_tokens}")

Input text : 
 Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?
Model output (tokens): 
tensor([     2,      2,    106,   1645,    108,  13033,  27365,   2177,  14104,
          2370,    578,  14491,    573,   2602,  13292,    578,  21840,    476,
          3225,   2416,   2177,  23133,    611,  21415, 235336,    107,    108,
           106,   2516,    108,  21404, 235269,   1517, 235303, 235256,    573,
          3448, 235292,    109,    688,  51799,  66058,    109, 235287,   5231,
         14618,  27365,  66058,   3766,  27365,    877,   1500,    573,  24493,
         14104,   3853,   1644,    577,    573,   6934,    577,   4412,    573,
          3225,   2416, 235269,   3359,    573,   3853,  23122, 235269,   3590,
         23122, 235269,    578,    573,  23241,    576,    573,   3853,    578,
          3590, 235265,    108, 235287,   5231,  12434,  27365,  66058,   1927,
          1104,    708,   1089,  22978,  10266,   

In [None]:
# Decode the generated token ids back into text and show it
output = decode_output_tokens(tokenizer = tokenizer, output_tokens = output_tokens)

print("Input text : \n", INPUT_TEXT)
print(f"Model output (decoded): \n{output}")

Input text : 
 Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?
Model output (decoded): 
<bos><bos><start_of_turn>user
Which logs using GET method and loaded the page successfully and viewed a product image using Chrome on Linux?<end_of_turn>
<start_of_turn>model
Sure, here's the answer:

**Logs:**

* **Network logs:** These logs will show the HTTP GET request made to the server to load the product image, including the request headers, response headers, and the timing of the request and response.
* **Image cache logs:** If the image is being cached by Chrome, the cache logs may show the cached image data and the time it was last cached.

**Page load success:**

* **Network logs:** If the GET request for the product image is successful, the response status code should be 200.
* **Image display:** If the image is displayed correctly, it should be visible in the browser window.

**Additional notes:**

* The specific logs that a

## Augmentation:

In [None]:
# Remove the keys listed in DELETED_KEYS from every log record, in place
drop_keys(access_logs_dict, DELETED_KEYS)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
def index_logs(access_logs_dict):
  """
  Return copies of the given logs, each prefixed with a 1-based "Number" key.

  The input records are not modified. If a record already has a "Number" key,
  the record's own value is kept.
  """
  return [
    {"Number": position, **entry}
    for position, entry in enumerate(access_logs_dict, start=1)
  ]

In [None]:
query = "Which logs viewed a product image using Chrome on Windows? What is the total size of the logs?"
print(f"Query: {query}")

# Fetch the 7 nearest records for the query from the FAISS index
distances, indices = retrieve_relevant_resources(query = query, embeddings = embeddings, n_resources_to_return = 7)

# Look up the matching records and number them 1..n for the prompt
retrieved_logs = [access_logs_dict[idx] for idx in indices[0].tolist()]
context = index_logs(retrieved_logs)

print(f"Number of retrieved resources : {len(context)}")

Query: Which logs viewed a product image using Chrome on Windows? What is the total size of the logs?
[INFO] Time taken to get distances on 1000 embeddings: 0.00145 seconds.
Number of retrieved resources : 7


In [None]:
def prompt_formatter(query: str,
                     access_logs_list: list[dict]) -> str:
  """
  Build the chat-templated RAG prompt for a query and its retrieved logs.

  Each log is rendered as one sentence (ID, IP, request time, size, browser and
  operating system, context), the sentences are placed after two worked
  examples, and the result is wrapped in the tokenizer's chat template.

  Args:
    query: The user's question.
    access_logs_list: Retrieved log records; each must provide the 'Number',
      'ip', 'timestamp', 'size', 'user_agent' and 'context' keys.

  Returns:
    The prompt text, ready to be tokenized.
  """

  def describe(log: dict) -> str:
    # Spell the user agent out as "<browser> on <os>" rather than embedding the
    # raw tuple returned by extract_browser_and_os (e.g. "('Chrome', 'Linux')").
    browser_info, os_info = extract_browser_and_os(log['user_agent'])
    return (
      f"The LOG with the ID {log['Number']} belongs to a user with IP address {log['ip']} "
      f"with request time {log['timestamp']} and size {log['size']}, "
      f"{browser_info} on {os_info} as user agent. Context={log['context']}"
    )

  formatted_logs = ',\n'.join(describe(log) for log in access_logs_list)

  # Both examples use the same "User query: ... / Answer: ..." layout as the
  # real query at the end, so the model sees one consistent pattern.
  base_prompt = f"""Based on the following web traffic logs, please answer the query.
  Give yourself room to think by extracting relevant passages from the logs before answering the query.
  Don't return the thinking, only return the answer.
  Make sure your answers are as explanatory as possible.
  Here are some examples:
  Example 1:
  User query: Which logs viewed a picture of a product using Chrome browser and what is the total request size of these logs?
  Answer: The logs with IDs 1,2,3,4 and 5 viewed a picture of a product using Chrome browser and the total request size is: 1024 + 54 + 65 + 12124 + 25 = 13292
  Example 2:
  User query: Which logs used a mobile device to access the website using Firefox browser on Linux and what are their IP adresses?
  Answer: The logs used a mobile device to access the website using Firefox browser on Linux are the logs with IDs 1,2,3,4 and 5 and their IP adresses are:
  192.168.1.1
  254.256.32.15
  58.68.47.12
  56.98.2.4
  124.266.32.1
  \nNow use the following log entries to answer the user query:
  {formatted_logs}
  \nUser query: {query}
  Answer:"""

  # Create prompt template for instruction-tuned model
  dialogue_template = [
    {"role": "user",
    "content": base_prompt}
  ]

  # Apply the chat template
  prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)

  return prompt

In [None]:
# Format our prompt
prompt = prompt_formatter(query, context)

print(prompt)

<bos><start_of_turn>user
Based on the following web traffic logs, please answer the query.
  Give yourself room to think by extracting relevant passages from the logs before answering the query.
  Don't return the thinking, only return the answer.
  Make sure your answers are as explanatory as possible.
  Here are some examples:
  Example 1:
  User query: Which logs viewed a picture of a product using Chrome browser and what is the total request size of these logs?
  Answer: The logs with IDs 1,2,3,4 and 5 viewed a picture of a product using Chrome browser and the total request size is: 1024 + 54 + 65 + 12124 + 25 = 13292
  Example 2:
  Which logs used a mobile device to access the website using Firefox browser on Linux and what are their IP adresses?
  Answer: The logs used a mobile device to access the website using Firefox browser on Linux are the logs with IDs 1,2,3,4 and 5 and their IP adresses are:
  192.168.1.1
  254.256.32.15
  58.68.47.12
  56.98.2.4
  124.266.32.1
  
Now use 

In [None]:
import textwrap

def print_wrapped(text, wrap_length = 80):
  """
  Print `text` re-flowed so that no line is longer than `wrap_length` characters.
  """
  print("\n".join(textwrap.wrap(text, wrap_length)))

In [None]:
%%time

# The chat template already wrote the special tokens (including <bos>) into the
# prompt, so the tokenizer must not add them again. The inputs are moved to the
# device the model is on rather than a hard-coded "cuda".
input_ids = tokenizer(prompt,
                      return_tensors = "pt",
                      add_special_tokens = False).to(llm_model.device)

# Generate on output tokens
outputs = llm_model.generate(**input_ids,
                             temperature = 0.7, # from 0 to 1 and the lower the value, the more deterministic text, the higher the value, the more creative text.
                             do_sample = True, # whether or not to use sampling,
                             max_new_tokens = 512)

# Turn output tokens into texts
output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
# Strip the echoed prompt and the begin/end markers so only the answer is shown
print(f"RAG answer:\n{output_text.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Query: Which logs viewed a product image using Chrome on Windows? What is the total size of the logs?
RAG answer:
Sure, here is the answer to the user query:

The logs with IDs 1, 2, 3, 4, 5, 6, and 7 viewed a product image using Chrome on Windows. The total size of the logs is:

17046 + 22177 + 11 + 11 + 36441 + 11 + 11 = 82016

Therefore, the answer to the user query is:

**The logs with IDs 1, 2, 3, 4, 5, 6, and 7 viewed a product image using Chrome on Windows and the total size of the logs is 82016.**
CPU times: user 13.2 s, sys: 56.8 ms, total: 13.2 s
Wall time: 13.2 s


### Functionizing our LLM answering feature:

In [None]:
def ask(query: str,
        temperature: float = 0.7,
        max_new_tokens: int = 256,
        format_answer_text = True,
        return_answer_only = True):
  """
  Takes a query, finds relevant resources/content and generates an answer to the query based on the relevant resources

  Args:
    query: The user's question.
    temperature: Sampling temperature passed to generate().
    max_new_tokens: Maximum number of tokens to generate.
    format_answer_text: When True, the echoed prompt and the <bos>/<eos>
      markers are removed from the decoded text.
    return_answer_only: When True only the answer text is returned; otherwise
      a (answer text, context items) tuple is returned.
  """
  ## RETRIEVAL
  # Get just the scores and indices of top related results
  distances, indices = retrieve_relevant_resources(query = query,
                                                embeddings = embeddings)

  # Pair every hit with its distance and drop FAISS' "-1 = no neighbour"
  # placeholders, which would otherwise silently select the last record.
  hits = [(idx, distance)
          for idx, distance in zip(indices[0].tolist(), distances[0])
          if idx >= 0]

  # Create a list of context items
  context_items = index_logs([access_logs_dict[idx] for idx, _ in hits])

  # Add distance to context item
  for item, (_, distance) in zip(context_items, hits):
    item["distance"] = distance

  ## AUGMENTATION
  # Create the prompt and format it with context items
  prompt = prompt_formatter(query = query,
                            access_logs_list = context_items)

  ## GENERATION
  # Tokenize the prompt. The chat template already wrote the special tokens
  # (including <bos>) into the text, so they must not be added again; inputs go
  # to whichever device the model is on rather than a hard-coded "cuda".
  input_ids = tokenizer(prompt,
                        return_tensors = "pt",
                        add_special_tokens = False).to(llm_model.device)

  # Generate an output of tokens
  output_tokens = llm_model.generate(**input_ids,
                               temperature = temperature,
                               do_sample = True,
                               max_new_tokens = max_new_tokens)

  # Decode the tokens into text
  output_text = tokenizer.decode(output_tokens[0])

  # Format the answer
  if format_answer_text:
    output_text = output_text.replace(prompt, '').replace("<bos>", '').replace("<eos>", '')

  # Only return the answer without context items
  if return_answer_only:
    return output_text

  return output_text, context_items

In [None]:
def print_context_items(context_items: list[dict]):
  """
  Print a "Context Items:" heading, a blank line, then one context item per line.
  """
  lines = ["Context Items:\n"]
  lines.extend(str(entry) for entry in context_items)
  print("\n".join(lines))

In [None]:
# Example queries
query_list = [
  "Which logs viewed a product image using Chrome browser on Windows? What is the total size of the logs?",
  "Which logs accessed a CSS file on Linux? What are the IP adresses of the logs?",
  "Which users accessed a product on a desktop device? What are the user agents of these logs?",
  "Which logs accessed a Telegram image? What are the timestamps and operating systems of these logs?",
  "Which logs viewed a product image on a mobile device? What are the ID numbers of the viewed products?",
  "Which logs using Firefox browser to access a product? What are the methods and user agents of these logs??"
]

# Answer questions for each query
# (return_answer_only = False makes ask() return the retrieved context items as
# well, so they can be printed next to the generated answer)
for query in query_list:
  output_text, context_items = ask(query = query,
                    temperature = 0.7,
                    max_new_tokens = 512,
                    format_answer_text = True,
                    return_answer_only = False)
  print("Query : ", query)
  print_context_items(context_items)
  print("RAG answer :\n", output_text)
  print("***************************")

[INFO] Time taken to get distances on 1000 embeddings: 0.00113 seconds.
Query :  Which logs viewed a product image using Chrome browser on Windows? What is the total size of the logs?
Context Items:

{'Number': 1, 'ip': '89.47.79.75', 'timestamp': '22/Jan/2019:03:59:10 +0330', 'method': 'GET', 'protocol': 'HTTP/1.1', 'status': '200', 'size': '17046', 'user_agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'log_index': 986, 'context': 'Using the GET method, successfully accessed the page. The user viewed a product image with ID 11671 on a desktop device and was using Chrome on Windows 6.3.', 'distance': 0.94028515}
{'Number': 2, 'ip': '89.47.79.75', 'timestamp': '22/Jan/2019:03:59:10 +0330', 'method': 'GET', 'protocol': 'HTTP/1.1', 'status': '200', 'size': '22177', 'user_agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 'log_index': 987, '