In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/rebuttal

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy

In [None]:
from datasets import load_dataset

# Load the dataset published under the "awalesushil/DBLP-QuAD" identifier and
# show its structure.
raw_datasets = load_dataset("awalesushil/DBLP-QuAD")
print(raw_datasets)

# Pull out the three splits by name. `.get` is used, so (assuming dict-like
# semantics of the returned object) an absent split yields None instead of
# raising.
train_dataset, eval_dataset, test_dataset = (
    raw_datasets.get(split_name)
    for split_name in ("train", "validation", "test")
)

In [5]:
def clean(st):
    """Normalize the spacing of a SPARQL string.

    Newlines become spaces, every ``?`` gets a space in front of it, braces
    are surrounded by spaces, backslash-escaped single quotes are unescaped,
    surrounding whitespace is stripped and runs of spaces are collapsed to a
    single space.

    Args:
        st: The text to normalize.

    Returns:
        The normalized text.
    """
    # Applied in this exact order; each pair is (old, new).
    substitutions = (
        ("\n", " "),
        ("?", " ?"),
        ("{", " { "),
        ("}", " } "),
        ("\\'", "'"),
    )
    for old, new in substitutions:
        st = st.replace(old, new)

    # Splitting on a single space produces empty chunks for every extra space
    # in a run; dropping them and re-joining collapses each run to one space.
    # Only the space character is collapsed — tabs etc. are left untouched.
    return " ".join(chunk for chunk in st.strip().split(" ") if chunk)


In [None]:
# Load the 4-bit quantized Mistral base model, the tokenizer and the fine-tuned
# PEFT adapter. Produces the module-level `tokenizer` and `ft_model` objects
# that the generation loop uses.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig

from google.colab import userdata
# Read the value stored under the Colab secret named 'mistral'; it is used as
# the Hugging Face Hub token below, so it never appears in the notebook.
token = userdata.get('mistral')

from huggingface_hub import login
login(token=token)

base_model_id = "mistralai/Mistral-7B-v0.1"
# NOTE(review): fine_tuned_model_id is not referenced anywhere in the visible
# code — confirm whether another cell needs it.
fine_tuned_model_id = "finetuned_mistral_7B"
# Alternative (disabled) base checkpoint:
# base_model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Directory on the mounted Drive from which the adapter and tokenizer are loaded.
adapter_path = "/content/drive/MyDrive/rebuttal/finetuned_mistral_7b"
# NOTE(review): `config` is loaded but not used below; the base model is taken
# from the hard-coded base_model_id rather than from this config — verify that
# the two agree.
config = PeftConfig.from_pretrained(adapter_path)

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # base checkpoint the adapter is applied to
    quantization_config=bnb_config,  # quantization settings defined above
    device_map="auto",
    # NOTE(review): trust_remote_code allows code shipped with the checkpoint
    # repository to run — confirm it is actually required for this model.
    trust_remote_code=True,
    # use_auth_token=True
)

# Load the tokenizer from the adapter directory and reuse the EOS token as the
# padding token.
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
tokenizer.pad_token = tokenizer.eos_token

# Wrap the quantized base model with the fine-tuned adapter weights.
ft_model = PeftModel.from_pretrained(base_model, adapter_path)



In [7]:
import json

def save_json(filename,data):
    """Write ``data`` to ``filename`` as a single line of JSON.

    The data is serialized *before* the file is opened: opening in ``"w"``
    mode truncates the file immediately, so serializing afterwards would wipe
    an existing file whenever ``data`` turns out not to be JSON-serializable.

    Args:
        filename: Path of the file to (over)write.
        data: Any JSON-serializable object.

    Raises:
        TypeError: If ``data`` cannot be serialized; the existing file, if
            any, is left untouched in that case.
    """
    serialized = json.dumps(data)
    with open(filename, "w", encoding="utf-8") as json_file:
        # Trailing newline kept for compatibility with previously written files.
        json_file.write(serialized + "\n")


def load_json(file__name):
    """Read and parse a UTF-8 encoded JSON file.

    Args:
        file__name: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not hold valid JSON.
    """
    try:
        # The context manager closes the handle even if reading or parsing
        # fails, which the manual open()/close() pair did not guarantee.
        with open(file__name, "r", encoding="utf-8") as data_file:
            return json.load(data_file)
    except FileNotFoundError:
        return None


def check_result(res):
    """Return the part of ``res`` that follows the prompt's output marker.

    Args:
        res: Decoded model output, possibly still containing the prompt.

    Returns:
        Everything after the first ``"output (Sparql query):"`` occurrence,
        or ``res`` unchanged when the marker is absent.
    """
    marker = "output (Sparql query):"
    # partition() splits on the first occurrence; `found` is empty when the
    # marker does not appear at all.
    _, found, tail = res.partition(marker)
    return tail if found else res

def get_question(data_point):
    """Build the generation prompt for one dataset record.

    Args:
        data_point: Mapping that provides ``['question']['string']``,
            ``['entities']`` and ``['relations']``.

    Returns:
        The prompt text, ending with the ``output (Sparql query):`` marker.
    """
    question_text = data_point['question']['string']
    entities = data_point['entities']
    relations = data_point['relations']
    # The indentation and trailing whitespace inside the literal are part of
    # the prompt that is sent to the model — do not reformat it.
    return f'''<|endoftext|>Return a Sparql query for DBLP for the English text given in input, using entities and relations provided. Return only the SPARQL query.
                        input (English text): {question_text}
                        entities: {entities}
                        relations: {relations}
                        output (Sparql query):
                        '''


def main(output_file="ft_mistral-7B.json", max_new_tokens=512):
    """Generate a SPARQL query for every test question, checkpointing to JSON.

    If ``output_file`` already exists, the questions, prompts, reference
    queries and the queries generated so far are read from it and generation
    continues with the first prompt that has no result yet. Otherwise the
    lists are built from the module-level ``test_dataset``. The complete
    state is written back to ``output_file`` after every generated query.

    Args:
        output_file: Path of the JSON checkpoint / result file. The same path
            is used for loading and saving so that resuming works.
        max_new_tokens: Upper bound on newly generated tokens per prompt.
    """
    data = load_json(output_file)
    if data is None:
        # Fresh run: derive everything from the test split.
        questions = [x['question']['string'] for x in test_dataset]
        sparql = [clean(x["query"]["sparql"]) for x in test_dataset]
        ready_questions = [get_question(x) for x in test_dataset]
        suggestions = [x["template_id"] for x in test_dataset]
        gs = []
    else:
        # Resumed run: reuse what the checkpoint already holds.
        questions = data["questions"]
        ready_questions = data["ready_questions"]
        sparql = data["sparql"]
        gs = data["generated_sparql"]
        suggestions = data["suggestions"]

    # `gs` is appended to in place, so this dict always reflects the latest
    # state and does not have to be rebuilt on every iteration.
    result = {"questions": questions, "ready_questions": ready_questions, "sparql": sparql, "generated_sparql": gs, "suggestions": suggestions}

    # Switching to eval mode once is enough; it does not change per prompt.
    ft_model.eval()

    # Skip the prompts that already have a generated query.
    for q in ready_questions[len(gs):]:
        model_input = tokenizer(q, return_tensors="pt").to("cuda")
        with torch.no_grad():
            output_ids = ft_model.generate(**model_input, max_new_tokens=max_new_tokens)[0]
        res = tokenizer.decode(output_ids, skip_special_tokens=True)
        # Normalize spacing first, then keep only the text after the
        # "output (Sparql query):" marker of the prompt.
        res = check_result(clean(res)).strip()
        print(len(gs), res)
        gs.append(res)
        save_json(output_file, result)

In [None]:
# Run the generation loop; it continues from ft_mistral-7B.json when that
# checkpoint file already exists.
main()