Evaluation of Llama 2 Without Fine-Tuning

In [2]:
!pip install fsspec==2024.10.0
!pip cache purge

[0mFiles removed: 0


In [1]:
!pip install -q -U datasets
!pip install -q -U torch auto-gptq transformers optimum
!pip install -q -U peft trl einops accelerate xformers bitsandbytes
! pip install -q -U rouge_score
! pip install -q -U langchain

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m46.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[0m

In [2]:
!pip install evaluate
from evaluate import load
!pip install langchain_community

[0mCollecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
[0mCollecting langchain_community
  Downloading langchain_community-0.3.11-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.7.0-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3.23.1-py3-none-any.

### Imports

In [3]:
import pandas as pd
import json
import torch
import os
import time

from datasets import load_dataset, Dataset, load_from_disk
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, AutoPeftModelForCausalLM
from transformers import GPTQConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer
from time import perf_counter
from rich import print

from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

import textwrap

### Global Constants

In [4]:
# Hugging Face Hub identifier of the GPTQ-quantized Llama 2 7B checkpoint that
# this notebook evaluates as-is (no fine-tuning, per the notebook title).
model_id = "TheBloke/Llama-2-7B-GPTQ"

In [5]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset and pickle paths defined
# in DATA_PATH live underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Root folder on the mounted Drive that holds all Text2SQL data.
DATA_PATH ="/content/drive/MyDrive/Text2SQL/Data/"
# Sub-folder (relative to DATA_PATH) passed to load_from_disk for the train/test dataset.
DS_DIR = "sql_train_test"
# Sub-folder (relative to DATA_PATH) containing the test-set pickles; keeps its trailing slash
# because paths are built by plain string concatenation.
PKL_DIR = "test/"
# Pickled test DataFrame that is read as the evaluation input.
PKL_FILE ="sql_test.pkl"
# File name under which the test DataFrame with the model's results is saved.
LLMRES_PKL_FILE="sql_test_Llama2.pkl"

### Common Functions

In [7]:
def tokenize_function(example):
    """Tokenize the ``sentence1``/``sentence2`` pair of ``example`` with truncation.

    Uses the module-level ``tokenizer``, which must exist by the time this is called.

    NOTE(review): the dataset displayed in this notebook exposes the columns
    ``response``, ``question``, ``context`` and ``text`` -- not ``sentence1`` /
    ``sentence2`` -- and no cell shown here calls this helper. Confirm whether
    it is still needed.
    """
    first_sentence = example["sentence1"]
    second_sentence = example["sentence2"]
    return tokenizer(first_sentence, second_sentence, truncation=True)

In [8]:
# ROUGE metric loaded through the `evaluate` library.
rouge = load("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 between decoded predictions and decoded labels.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids`` token-id arrays
            (presumably a ``transformers.EvalPrediction`` -- confirm against
            the caller).

    Returns:
        dict: Scores rounded to 4 decimals. If the metric returns an aggregate
        object with a ``mid`` attribute, the keys are ``rouge2_precision``,
        ``rouge2_recall`` and ``rouge2_fmeasure``. If it returns a plain number
        (the format produced by ``evaluate``'s ROUGE implementation, which only
        reports the F-measure), only ``rouge2_fmeasure`` is present.

    Note:
        ``pred.label_ids`` is modified in place: entries equal to -100 are
        overwritten with the tokenizer's pad token id.
    """
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Decode to text, dropping special tokens.
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    # -100 is not a decodable token id (presumably the loss ignore-index);
    # swap it for the pad id so batch_decode can process the labels.
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge2 = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"]

    # Older metric implementations return an aggregate with low/mid/high
    # Score tuples; `evaluate` returns a bare float F-measure. Accessing
    # `.mid` unconditionally raised AttributeError on the latter.
    mid_score = getattr(rouge2, "mid", None)
    if mid_score is None:
        return {"rouge2_fmeasure": round(float(rouge2), 4)}

    return {
        "rouge2_precision": round(mid_score.precision, 4),
        "rouge2_recall": round(mid_score.recall, 4),
        "rouge2_fmeasure": round(mid_score.fmeasure, 4),
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [9]:
def parse(text):
    """Extract the model's answer from a generated prompt+completion string.

    Returns the stripped text that follows the first ``### Response:`` marker,
    cut off at the first ``### End`` marker that appears after it (or running
    to the end of ``text`` when there is no such marker). Returns ``None``
    when ``### Response:`` does not occur at all.
    """
    start_marker = '### Response:'
    end_marker = '### End'

    _, found_start, after_start = text.partition(start_marker)
    if not found_start:
        # No response section in the text.
        return None

    # Everything before the first end marker; the whole remainder if absent.
    answer, _, _ = after_start.partition(end_marker)
    return answer.strip()

In [10]:
def gen_eval_res(test_df):
  """Run every prompt in ``test_df`` through the global ``pipe`` and store the results.

  For each row, the ``text`` field is sent to ``pipe``, the generated text is
  reduced with ``parse``, and the elapsed time (generation plus parsing, as
  measured by ``perf_counter``) is recorded. A progress line is printed every
  10 rows.

  ``test_df`` is modified in place: the columns ``eval_res`` (parsed output)
  and ``eval_time`` (seconds) are added. Nothing is returned.
  """
  parsed_outputs = []
  durations = []
  for row_no, row in enumerate(test_df.itertuples(index=False), start=1):
    started = perf_counter()
    generated = pipe(row.text)[0]['generated_text']
    answer = parse(generated)
    durations.append(perf_counter() - started)
    parsed_outputs.append(answer)
    if row_no % 10 == 0:
      print(f"*** Row {row_no} Done ***")

  test_df['eval_res'] = parsed_outputs
  test_df['eval_time'] = durations

### Load and Check Data

In [11]:
# Load the saved dataset from Drive; the cell below shows it has 'train' and 'test' splits.
dataset = load_from_disk(DATA_PATH + DS_DIR)

In [12]:
# Test set as a pandas DataFrame; this is the frame the evaluation loop iterates over.
# NOTE(review): pd.read_pickle can execute arbitrary code -- only load pickles you created yourself.
test_df = pd.read_pickle(DATA_PATH + PKL_DIR + PKL_FILE)

In [13]:
# Show the feature names and row counts of both splits.
display(dataset['train'])
display(dataset['test'])

Dataset({
    features: ['response', 'question', 'context', 'text', '__index_level_0__'],
    num_rows: 4086
})

Dataset({
    features: ['response', 'question', 'context', 'text', '__index_level_0__'],
    num_rows: 454
})

In [35]:
# Peek at the first two rows and the overall shape of the test frame.
# NOTE(review): the saved output comes from execution count 35, i.e. after the
# evaluation cells ran, so it already contains the eval_res / eval_time columns
# that gen_eval_res adds. A fresh top-to-bottom run will presumably show fewer
# columns at this point -- re-run in order before sharing.
display(test_df.head(2))
display(test_df.shape)

Unnamed: 0,response,question,context,__index_level_0__,text,eval_res,eval_time
0,"SELECT T2.name, COUNT(*) FROM race AS T1 JOIN ...",Show the name of track and the number of races...,"CREATE TABLE track (name VARCHAR, track_id VAR...",429,### Instruction:\n You are a powerful text-...,1. SELECT COUNT(*) FROM race WHERE track_id = ...,4.275261
1,"SELECT T3.Shop_Name, T2.Carrier FROM stock AS ...",Show names of shops and the carriers of device...,"CREATE TABLE shop (Shop_Name VARCHAR, Shop_ID ...",2907,### Instruction:\n You are a powerful text-...,"SELECT s.shop_name AS 'Shop Name',s.shop_id as...",13.616137


(454, 7)

In [15]:
# Load the GPTQ checkpoint with 4-bit weights and the exllama kernels disabled.
# `use_exllama=False` is the supported spelling of the deprecated
# `disable_exllama=True` flag (see the deprecation warning this cell used to emit).
quantization_config_loading = GPTQConfig(bits=4, use_exllama=False)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token so tokenizer.pad_token_id is defined
# (compute_metrics relies on it), and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# device_map="auto" lets the library decide where to place the model weights.
model = AutoModelForCausalLM.from_pretrained(
          model_id,
          quantization_config=quantization_config_loading,
          device_map="auto"
        )

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


config.json:   0%|          | 0.00/784 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/3.90G [00:00<?, ?B/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Some weights of the model checkpoint at TheBloke/Llama-2-7B-GPTQ were not used when initializing LlamaForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'model.layers.10.self_attn.q_proj.bias', 'model

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [16]:
# Model memory footprint divided by 1e9 (i.e. in GB, assuming get_memory_footprint() reports bytes).
print(model.get_memory_footprint()/1e9)

In [17]:
# Text-generation pipeline around the quantized model and its tokenizer.
# Sampling is enabled but kept tightly constrained (low temperature, top-4
# candidates); up to 180 new tokens are generated per prompt and repeated
# tokens are penalized.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    top_k=4,
    temperature=0.1,
    repetition_penalty=1.2,
    max_new_tokens=180,
)

# LangChain wrapper around the same pipeline.
# NOTE(review): `llm` is not referenced by any other cell visible in this notebook.
llm = HuggingFacePipeline(pipeline=pipe)

  llm = HuggingFacePipeline(pipeline=pipe)


#### Check Model on Sample Test Question

In [18]:
# Index label of the test row used for the single-example spot check below.
ID = 10

In [19]:
# Full prompt (instruction, question, table context, empty response slot) for the sample row.
test_df['text'][ID]

'### Instruction:\n    You are a powerful text-to-SQL model.     Your job is to answer questions about a database.     You are given a question and context regarding one or more tables.\n\n    You must output the SQL query that answers the question.\n\n    ### Input:\n    Show ids for all students who have advisor 1121.\n    ### Context:\n    CREATE TABLE Student (StuID VARCHAR, Advisor VARCHAR)\n    ### Response:\n    '

In [20]:
# Reference SQL for the sample row, to compare against the model output below.
display(test_df['response'][ID])

'SELECT StuID FROM Student WHERE Advisor = 1121'

In [21]:
# Spot check on one example before the full run: generate from the prompt,
# then keep only what follows the '### Response:' marker.
text = test_df['text'][ID]
llm_res = pipe(text)[0]['generated_text']
parse_res = parse(llm_res)
print(parse_res)

In [22]:
# Run the model over the whole test frame and report the total wall-clock time
# in seconds. gen_eval_res adds the eval_res / eval_time columns to test_df in place.
eval_start_time = perf_counter()
gen_eval_res(test_df)
eval_end_time = perf_counter()
print('Evaluation Took: ', round(eval_end_time-eval_start_time, 2))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [31]:
# Make sure the output folder for the evaluation results exists before pickling.
# The path constants (DATA_PATH, PKL_DIR, LLMRES_PKL_FILE) come from the
# "Global Constants" section and `os` from the imports cell, so they are not
# redefined here. The previous `open(file_path, "wb")` placeholder was removed:
# it truncated any existing results file to zero bytes, and `to_pickle`
# creates the file on its own.
target_dir = os.path.join(DATA_PATH, PKL_DIR)
os.makedirs(target_dir, exist_ok=True)
file_path = os.path.join(target_dir, LLMRES_PKL_FILE)

In [32]:
# Persist the test frame together with the model outputs and timings.
# The target path is assembled from the notebook's path constants and resolves
# to /content/drive/MyDrive/Text2SQL/Data/test/sql_test_Llama2.pkl.
test_df.to_pickle(DATA_PATH + PKL_DIR + LLMRES_PKL_FILE)

In [34]:
import os

# Sanity check that the results pickle was written, and report its size.
file_path = "/content/drive/MyDrive/Text2SQL/Data/test/sql_test_Llama2.pkl"
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    print(f"File exists: {file_path}")
    print(f"File size: {os.path.getsize(file_path)} bytes")


In [33]:
# Read the saved results back to confirm the pickle round-trips, and show the first two rows.
# NOTE(review): this cell's execution count (33) is lower than the preceding
# file-check cell's (34), so the cells were not run in the order shown.
test_df2=pd.read_pickle('/content/drive/MyDrive/Text2SQL/Data/test/sql_test_Llama2.pkl')
display(test_df2.head(2))

Unnamed: 0,response,question,context,__index_level_0__,text,eval_res,eval_time
0,"SELECT T2.name, COUNT(*) FROM race AS T1 JOIN ...",Show the name of track and the number of races...,"CREATE TABLE track (name VARCHAR, track_id VAR...",429,### Instruction:\n You are a powerful text-...,1. SELECT COUNT(*) FROM race WHERE track_id = ...,4.275261
1,"SELECT T3.Shop_Name, T2.Carrier FROM stock AS ...",Show names of shops and the carriers of device...,"CREATE TABLE shop (Shop_Name VARCHAR, Shop_ID ...",2907,### Instruction:\n You are a powerful text-...,"SELECT s.shop_name AS 'Shop Name',s.shop_id as...",13.616137
