In [1]:
%%capture
!pip install -q vllm \
    -U --no-index --find-links /kaggle/input/vllm-wheel/vllm-wheel-files
!pip install -q langchain-text-splitters ipywidgets \
    -U --no-index --find-links /kaggle/input/text-splitters/text-splitter

## Dependencies

In [2]:
import os
import gc
import re
import ast
from time import time

import torch
import random
import numpy as np
import pandas as pd

from vllm import LLM, SamplingParams
from langchain_text_splitters import RecursiveCharacterTextSplitter

from torch.cuda.amp import autocast
from threading import Thread

from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

# Switch off all three PyTorch scaled-dot-product-attention backends
# (flash, math fallback, memory-efficient) for the whole process.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_math_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

# Only a warning: execution continues even when no CUDA device is visible.
if not torch.cuda.is_available():
    print("Sorry - GPU required!")

## Hyperparameters

In [3]:
@dataclass
class constants:
    """Filesystem locations and label names shared across the notebook.

    The attributes are intentionally left unannotated: they are plain class
    attributes read as ``constants.NAME``. Annotating ``TARGETS`` would turn it
    into a dataclass field with a mutable list default, which ``@dataclass``
    rejects.
    """
    # Directory of the Phi-3-mini-4k-instruct checkpoint handed to vLLM.
    MODEL_PATH = "/kaggle/input/model/transformers/phi-3-mini-4k-instruct/1"
    # CSV read for inference; the commented alternative points at the training file.
    TEST_PATH = "/kaggle/input/lmsys-chatbot-arena/test.csv"
    # TEST_PATH = "/kaggle/input/lmsys-chatbot-arena/train.csv"
    # Where the final predictions are written.
    SUBMISSION_PATH = "submission.csv"
    # Output column names, in the order the probability triples are produced.
    TARGETS = ["winner_model_a", "winner_model_b", "winner_tie"]

In [4]:
@dataclass
class config:
    """Tunable settings for prompt building and generation.

    All values are plain, unannotated class attributes read as ``config.NAME``.
    """
    # Budgets used by the visible notebook code.
    MAX_TOKENS = 4096       # forwarded to vLLM as max_model_len
    MAX_NEW_TOKENS = 32     # forwarded to SamplingParams as max_tokens
    MAX_TEXT_SIZE = 4500    # chunk_size given to the text splitter in chunk_text
    BATCH_SIZE = 50         # default batch size of get_llm_response

    # Flags that no visible cell reads.
    SPREAD_MAX_LENGTH = False
    MAX_LENGTH = False
    USE_CACHE = True

In [5]:
# Device handles for the first and second CUDA GPU. Creating the objects does
# not touch the hardware; in the visible code they are only passed through as
# the (unused) `device` argument of the inference helpers.
device0, device1 = torch.device('cuda', 0), torch.device('cuda', 1)

In [6]:
def prompt_template(query:str, response_a:str, response_b:str)->str:
    """Build the judging instructions for one query and its two candidate responses.

    The returned text asks the model to rate five criteria (relevance,
    accuracy, clarity, logical flow, responsiveness) with one of
    ``A`` / ``B`` / ``AB`` / ``NA`` each, and to answer with a single
    comma-separated list of exactly five ratings.

    Args:
        query: The user query, interpolated verbatim after ``- Query:``.
        response_a: Candidate response A, interpolated verbatim.
        response_b: Candidate response B, interpolated verbatim.

    Returns:
        The complete instruction text, ending with ``Output:`` and a newline.
    """
    return f'''Act as a discriminator assistant to evaluate and distinguish between two responses for a given query.
    
### Overview:
You will be provided with a query and two responses. Your task is to compare the responses based on various criteria and determine which one is superior in each aspect.

The criteria are:
1. Relevance: Which response directly addresses the query better and provides more pertinent information?
2. Accuracy: Which response contains more reliable and factual information based on credible sources?
3. Clarity: Which response is more straightforward, easy to understand, and free of ambiguity?
4. Logical Flow: Which response presents information in a more organized and coherent manner?
5. Responsiveness: Which response better addresses specific details and aspects of the query?

For each criterion, assign one of the following ratings:
A: Response A is superior
B: Response B is superior
AB: Both responses are equally optimal
NA: Neither response is satisfactory

Output:
Always provide output as a comma-separated list of ratings, corresponding to the order of criteria listed above.

For example: 
Output: A,A,NA,AB,B

where the 1st posn, A means response A is more relevant than the B,
The 2nd posn, A means response A is more accurate compared to B,
then Third, NA means neither of A or B have clarity in their response, 
and so on...

###IMPORTANT NOTES:
- Do not include any explanations or additional text. Simply provide the comma seperated ratings.
- The order of ratings should match the order of criteria listed above.
- There must be exactly 5 ratings in the output.

### Task:
- Query: {query}
- Response A: {response_a}
- Response B: {response_b}
Output:
'''

In [7]:
# Chat wrapper using the <|system|>/<|user|>/<|assistant|> markers: a fixed
# system turn, the user turn filled in through the positional `{0}` slot,
# and an open assistant turn for the model to complete.
model_prompt_template = (
    '<|system|>\n'
    'You are a helpful assistant.<|end|>\n'
    '<|user|>\n'
    '{0}<|end|>\n'
    '<|assistant|>\n'
)

In [8]:
# bnb_config = BitsAndBytesConfig(
# #     load_in_4bit=True,
# #     bnb_4bit_compute_dtype=torch.float16,
# #     bnb_4bit_use_double_quant=False
# )

In [9]:
# Keyword arguments unpacked into the vLLM `LLM(...)` constructor.
params = dict(
    # gpu_memory_utilization=0.99,
    # tensor_parallel_size=1,
    max_model_len=config.MAX_TOKENS,
    enable_prefix_caching=False,
    dtype=torch.float16,
    trust_remote_code=True,
)

In [10]:
# Plain keyword settings for generation; unpacked into vLLM's SamplingParams
# in the following cell.
sampling_params = dict(
    # One completion per prompt.
    n=1,
    best_of=1,
    # Penalties left at their neutral values.
    presence_penalty=0.0,
    frequency_penalty=0.0,
    repetition_penalty=1.0,
    # Sampling: temperature 0.7 with nucleus top_p 0.95; top_k / min_p disabled.
    temperature=0.7,
    top_p=0.95,
    top_k=-1,
    min_p=0.0,
    # Beam search is off, so the two beam-related knobs below are inert defaults.
    use_beam_search=False,
    length_penalty=1.0,
    early_stopping=False,
    # No custom stop conditions; generation ends on EOS or the token budget.
    stop=[],
    stop_token_ids=[],
    include_stop_str_in_output=False,
    ignore_eos=False,
    max_tokens=config.MAX_NEW_TOKENS,
    # No log-probabilities requested.
    logprobs=None,
    prompt_logprobs=None,
    # Detokenisation options.
    skip_special_tokens=True,
    spaces_between_special_tokens=True,
)

In [11]:
# Turn the settings dict into the SamplingParams object used by generate().
# The same name is rebound from dict to object, so guard on the type: without
# it, re-running this cell alone would try to `**`-unpack a SamplingParams
# instance and raise TypeError.
if isinstance(sampling_params, dict):
    sampling_params = SamplingParams(**sampling_params)

## Defining Models and Functions

In [12]:
def cleaner():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory.

    Returns nothing; it is called purely for its side effects.
    """
    # Collect first so that tensors owned by unreachable objects are freed
    # before the allocator's cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()


# Start from a clean slate before the model is loaded.
cleaner()

In [13]:
# Single vLLM engine for the checkpoint at constants.MODEL_PATH, configured by
# `params`. No explicit device is passed (the per-device variant is kept
# commented out below).
model_0 = LLM(constants.MODEL_PATH, **params)

# model_1 = LLM(constants.MODEL_PATH,
#               device=device1,
#               **params
#               )     

INFO 08-05 15:01:04 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/kaggle/input/model/transformers/phi-3-mini-4k-instruct/1', speculative_config=None, tokenizer='/kaggle/input/model/transformers/phi-3-mini-4k-instruct/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/kaggle/input/model/transformers/phi-3-mini-4k-instruct/1, use_v2_block_manager=False, enable_prefix_caching=False)
INFO 08-05 15:01:04 selector.py:151

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 08-05 15:01:48 model_runner.py:692] Loading model weights took 7.1183 GB
INFO 08-05 15:01:50 gpu_executor.py:102] # GPU blocks: 978, # CPU blocks: 682
INFO 08-05 15:01:53 model_runner.py:980] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 08-05 15:01:53 model_runner.py:984] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 08-05 15:02:08 model_runner.py:1181] Graph capturing finished in 15 secs.


In [14]:
# Run garbage collection and empty the CUDA cache again now that the engine has been constructed.
cleaner()

In [15]:
def chunk_text(text):
    """Truncate ``text`` to its first chunk of at most ``config.MAX_TEXT_SIZE``.

    The text is split with a ``RecursiveCharacterTextSplitter`` that prefers
    paragraph, line and sentence boundaries, and only the first chunk is kept;
    everything after it is discarded.

    Args:
        text: The text to truncate. ``None`` is treated as empty text.

    Returns:
        The first chunk, or ``""`` when there is nothing to return.
    """
    if text is None:
        return ""

    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config.MAX_TEXT_SIZE,  # desired chunk size
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )

    # Split the text
    chunks = text_splitter.split_text(text)
    # The splitter can return no chunks at all (e.g. empty or whitespace-only
    # text); indexing [0] unconditionally would raise IndexError and abort
    # prompt building for the whole frame.
    return chunks[0] if chunks else ""

In [16]:
def make_prompt(queries, responses_a, responses_b):
    """Build one chat-formatted judging prompt per (query, response A, response B).

    Each piece of text is first truncated with ``chunk_text``, then placed into
    the judging instructions from ``prompt_template``, and finally wrapped in
    ``model_prompt_template``.

    Args:
        queries: Iterable of query texts.
        responses_a: Iterable of response-A texts, aligned with ``queries``.
        responses_b: Iterable of response-B texts, aligned with ``queries``.

    Returns:
        A list of prompt strings, as long as the shortest of the three inputs.
    """
    prompts = []
    for query, response_a, response_b in zip(queries, responses_a, responses_b):
        judge_request = prompt_template(
            chunk_text(query),
            chunk_text(response_a),
            chunk_text(response_b),
        )
        prompts.append(model_prompt_template.format(judge_request))

    return prompts

In [17]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def get_llm_response(df, model, device, batch_size=config.BATCH_SIZE):
    """Generate ratings for every prompt in ``df`` and convert them to probabilities.

    Args:
        df: Frame with a ``'prompt'`` column of ready-to-send prompt strings.
        model: Object exposing ``generate(prompts, sampling_params)`` (the vLLM engine).
        device: Accepted for call-site compatibility; not used by this function.
        batch_size: Number of prompts sent to ``model.generate`` per call.

    Returns:
        A dict with ``'labels'`` (raw generated text per row) and
        ``'predictions'`` (the ``compute_predctions`` result per row), both in
        the row order of ``df``.
    """
    raw_texts = []
    probabilities = []
    cleaner()

    n_rows = len(df)
    for batch_start in range(0, n_rows, batch_size):
        cleaner()
        batch_stop = min(batch_start + batch_size, n_rows)
        batch_prompts = df.iloc[batch_start:batch_stop]['prompt'].to_list()
        # Generation uses the module-level `sampling_params`.
        with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
            generations = model.generate(batch_prompts, sampling_params)
            batch_texts = [generation.outputs[0].text for generation in generations]
        cleaner()

        raw_texts += batch_texts
        for batch_text in batch_texts:
            probabilities.append(compute_predctions(batch_text))

    # Return results if needed
    return {'labels': raw_texts, 'predictions': probabilities}

In [18]:
def llm_responses(df, model, device, results, index):
    """Thread target: run ``get_llm_response`` and store its result.

    A thread target's return value is discarded, so the output is written
    into the caller-supplied ``results`` mapping under ``index`` instead.
    """
    response = get_llm_response(df, model, device)
    results[index] = response

In [19]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    The maximum is subtracted before exponentiating so large inputs do not
    overflow; this does not change the result.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    total = weights.sum()
    return weights / total

In [20]:
def compute_predctions(response: str) -> list[float]:
    """Convert a comma-separated rating string into three probabilities.

    The response is split on commas and each piece is stripped of surrounding
    whitespace. Pieces equal to ``'A'``, ``'B'`` or ``'AB'`` are counted; any
    other piece (including ``'NA'``) is ignored. The three counts go through
    ``softmax`` and are rounded to three decimals.

    Args:
        response: Raw generated text, e.g. ``"A,A,NA,AB,B"``.

    Returns:
        ``[p_A, p_B, p_AB]`` as a list of floats.
    """
    tally = dict.fromkeys(('A', 'B', 'AB'), 0)
    for piece in response.split(','):
        rating = piece.strip()
        if rating in tally:
            tally[rating] += 1

    ordered_counts = [tally['A'], tally['B'], tally['AB']]
    rounded = np.round(softmax(ordered_counts), 3)
    return rounded.tolist()

In [21]:
def _parse_turns(raw):
    """Decode one serialized list cell into a Python list of turns.

    The cell is parsed as JSON. If that fails (for example a single-quoted
    Python list), it is parsed with ``ast.literal_eval``, which only accepts
    literals. Neither parser can execute code, unlike ``eval``.

    A JSON ``null`` turn becomes the string ``'null'`` so that every turn is
    text for the downstream prompt builder.
    """
    import json  # local import: json is not among the notebook's top-level imports

    try:
        turns = json.loads(raw)
    except ValueError:
        turns = ast.literal_eval(raw)
    # A cell that is just `null` is treated as a single null turn.
    if turns is None:
        turns = [None]
    return ['null' if turn is None else turn for turn in turns]


def expand_dataframe(row):
    """Explode one multi-turn row into one row per conversation turn.

    ``row['prompt']``, ``row['response_a']`` and ``row['response_b']`` each
    hold a serialized list of turns. The i-th output row is a copy of ``row``
    whose three text fields are replaced by the i-th turn of each list; all
    other fields (e.g. ``id``) are carried over unchanged.

    SECURITY: these cells come from an external CSV, so they are parsed with
    ``json.loads`` / ``ast.literal_eval`` rather than ``eval``, which would run
    arbitrary expressions embedded in the data.

    Args:
        row: A pandas Series with ``prompt``, ``response_a`` and ``response_b``.

    Returns:
        A DataFrame with one row per turn. If the three lists differ in length,
        the extra turns of the longer lists are dropped (``zip`` semantics).

    Raises:
        ValueError or SyntaxError: if a cell is neither valid JSON nor a
            Python literal.
    """
    prompts = _parse_turns(row['prompt'])
    response_as = _parse_turns(row['response_a'])
    response_bs = _parse_turns(row['response_b'])

    new_rows = []
    for prompt, response_a, response_b in zip(prompts, response_as, response_bs):
        new_row = row.copy()
        new_row['prompt'] = prompt
        new_row['response_a'] = response_a
        new_row['response_b'] = response_b
        new_rows.append(new_row)

    return pd.DataFrame(new_rows)

## Create Prompts


In [22]:
# Load the raw rows. The prompt/response columns hold serialized lists of
# turns, which expand_dataframe parses in the next step.
df_ = pd.read_csv(constants.TEST_PATH)
# df_ = df_[500:1000]

In [23]:
%%time
# Expand every conversation into one row per turn and stack the per-row
# frames into a single frame with a fresh 0..n-1 index.
split_df = pd.concat(
    (expand_dataframe(conversation) for _, conversation in df_.iterrows()),
    ignore_index=True,
)

CPU times: user 6.17 ms, sys: 294 µs, total: 6.46 ms
Wall time: 12.4 ms


In [24]:
%%time
# Inference frame: the conversation id of every turn next to its fully
# formatted prompt (several rows can share an id).
data = pd.DataFrame({
    "id": split_df["id"],
    "prompt": make_prompt(split_df["prompt"], split_df["response_a"], split_df["response_b"]),
})

CPU times: user 4.74 ms, sys: 184 µs, total: 4.92 ms
Wall time: 7.56 ms


In [25]:
# Number of per-turn rows, followed by a preview of the assembled prompts.
print(len(data))
data.head()

4


Unnamed: 0,id,prompt
0,136060,<|system|>\nYou are a helpful assistant.<|end|...
1,211333,<|system|>\nYou are a helpful assistant.<|end|...
2,1233961,<|system|>\nYou are a helpful assistant.<|end|...
3,1233961,<|system|>\nYou are a helpful assistant.<|end|...


In [26]:
# Run garbage collection and empty the CUDA cache before starting generation.
cleaner()

## Inference in a Worker Thread

In [27]:
%%time
# Wall-clock start; the elapsed time is reported by the final print.
st = time()

N_SAMPLES = data.shape[0]
# NOTE: despite its name, `half` equals the full row count here. The two-GPU
# split (sub2 / model_1 / t1) is commented out, so sub1 holds every prompt.
half = round(N_SAMPLES)
sub1 = data[0:half].copy()
# sub2 = data[half:N_SAMPLES].copy()

# Filled by the worker thread: results[index] receives get_llm_response's dict.
results = {}

t0 = Thread(target=llm_responses, args=(sub1, model_0, device0, results, 0))
# t1 = Thread(target=llm_responses, args=(sub2, model_1, device1, results, 1))

t0.start()
# t1.start()

# Block until generation has finished.
# NOTE(review): an exception raised inside the thread is not re-raised by
# join(); it would only show up later as a missing results[0].
t0.join()
# t1.join()

print(f"Processing complete. Total time: {time() - st:.2f} seconds")

Processed prompts: 100%|██████████| 4/4 [00:02<00:00,  1.99it/s, est. speed input: 2047.99 toks/s, output: 19.94 toks/s]

Processing complete. Total time: 2.49 seconds
CPU times: user 2.47 s, sys: 35.2 ms, total: 2.51 s
Wall time: 2.49 s





In [28]:
# results

In [29]:
# Copy the per-row probability triples produced by the worker (index 0).
predictions = results[0]['predictions'].copy()
# predictions.extend(results[1]['predictions'].copy())
# Attach them to `data`. This relies on the worker returning one triple per
# row in row order (sub1 is a full, in-order copy of `data`).
data['predictions'] = predictions

In [30]:
# Sanity check: number of per-turn predictions now stored on `data`.
len(data['predictions'])

4

In [31]:
def sum_lists(list_of_lists):
    """Add equally ordered lists position by position.

    Args:
        list_of_lists: Iterable of numeric sequences.

    Returns:
        A list whose i-th entry is the sum of every i-th element, rounded to
        three decimals. Its length is that of the shortest input sequence, and
        it is empty when no sequences are given.
    """
    totals = []
    for column in zip(*list_of_lists):
        totals.append(round(sum(column), 3))
    return totals

# Group by 'id' and sum the 'predictions' lists, then round
result_df = data.groupby('id')['predictions'].apply(sum_lists).reset_index()
# Re-normalise each summed triple with softmax so the three values of an id
# sum to ~1, each rounded to three decimals.
# NOTE(review): the per-turn triples are already softmax outputs, so this
# second softmax flattens the distribution; confirm that is intended.
result_df['predictions'] = result_df['predictions'].apply(
    lambda x: [round(p, 3) for p in softmax(x)]
)

## Submission

In [32]:
# Start the submission frame from the id column only.
sub_df = result_df[["id"]].copy()

# Spread each [A, B, AB] triple over the target columns in constants.TARGETS
# order, i.e. winner_model_a, winner_model_b, winner_tie.
sub_df[constants.TARGETS] = result_df['predictions'].to_list()
sub_df.to_csv(constants.SUBMISSION_PATH, index=False)
sub_df.head()


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.252,0.497,0.252
1,211333,0.297,0.465,0.238
2,1233961,0.709,0.148,0.143
