## Generating SPARQL using Code LlaMa

### Setup of Code LLaMa 34B Quantized

In [6]:
from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
import time

#model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
model_name_or_path = "TheBloke/CodeLlama-34B-Instruct-GPTQ"
model_basename = "model"

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename,
        use_safetensors=True,
        trust_remote_code=True,
        device="cuda:0",
        skip_special_tokens= True, 
        use_triton=use_triton,        
        quantize_config=None)

tokenizer_config.json: 100%|██████████| 824/824 [00:00<00:00, 4.70MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 12.0MB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 4.05MB/s]
special_tokens_map.json: 100%|██████████| 411/411 [00:00<00:00, 1.08MB/s]
config.json: 100%|██████████| 1.25k/1.25k [00:00<00:00, 9.85MB/s]
configuration_llama.py: 100%|██████████| 8.56k/8.56k [00:00<00:00, 19.0MB/s]
quantize_config.json: 100%|██████████| 187/187 [00:00<00:00, 838kB/s]
model.safetensors: 100%|██████████| 18.3G/18.3G [02:01<00:00, 150MB/s] 
modeling_llama.py: 100%|██████████| 45.9k/45.9k [00:00<00:00, 521kB/s]
skip module injection for FusedLlamaMLPForQuantizedModel not support integrate without triton yet.


In [7]:
# Prevent printing spurious transformers error when using pipeline with AutoGPTQ
logging.set_verbosity(logging.CRITICAL)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    use_cache=True,
    device_map="auto",    
    do_sample=True,
    top_k=10,
    temperature=0.1,
    max_length=300,
    repetition_penalty=1.1,  
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,)

In [121]:
instruction= """
You are given an input question, convert it to SPARQL code to query for Wikidata. Please wrap your code answer using ```:
"""
    
def generate_sparql(question): 
    
    sparql_prompt=f'''[INST] {instruction} {question} [/INST]'''
    
    #response=pipe(sparql_prompt)
    response = pipe(sparql_prompt)[0]['generated_text']
    #print(response)
    #sparql= response[0]["generated_text"].split("<</SYS>>")[1]
    
    return response

In [123]:
#generate_sparql("Who is the country for head of state of Mahmoud Abbas")

### Importing Dataset 1 QALD 9 Test and converting to Dataframe

In [16]:
import pandas as pd
df = pd.read_json('data/test.json', orient='records')
df['id'] = range(1, len(df) + 1)
df.rename(columns={'en_ques': 'questions', 'sparql': 'sparql_qald', 'fil_sparql': 'sparql_gen'}, inplace=True)
df = df[['id', 'questions', 'sparql_qald', 'sparql_gen']]



#### Running Experiment 1 for Dataset 1 Qald9 Test data

In [18]:
import time

In [19]:
start_time = time.time()
df['output'] = df['questions'].apply(generate_sparql)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")



Execution time: 2070.9248554706573 seconds


In [30]:
df['output'].head()[0]

'[INST] \nYou are given an input question, convert it to SPARQL code to query for DBpedia. Please wrap your code answer using ```:\n what is the time zone of salt lake city? [/INST]  Here\'s the SPARQL query to find the time zone of Salt Lake City:\n```sql\nPREFIX dbpedia-owl: <http://dbpedia.org/ontology/>\nSELECT DISTINCT?timeZone\nWHERE {\n ?city rdf:type dbpedia-owl:City.\n ?city dbpedia-owl:name "Salt Lake City"@en.\n ?city dbpedia-owl:timeZone?timeZone.\n}\n```\nThis query uses the `rdf:type` and `dbpedia-owl:name` properties to identify the specific city we\'re interested in (Salt Lake City), and then uses the `dbpedia-owl:timeZone` property to retrieve the time zone information associated with that city. The `DISTINCT` keyword is used to remove any duplicate results from the output.'

In [23]:
df.to_csv('Output/exp1_prompt_output_qald.csv')

In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

#nltk.download('punkt')

for index, row in df_lcquad.iterrows():
    
    reference = nltk.word_tokenize(row['sparql_lcquad'].lower())
    if len(row['output_cleaned']) < 1:
        print(index)
        continue
    candidate = nltk.word_tokenize(row['output_cleaned'][0].lower())
    bleu_score = sentence_bleu([reference], candidate[1:])
    df_lcquad.at[index, 'Org_bleu_score'] = bleu_score


In [75]:
def cleanText(text):
    prefix_index = text.find('PREFIX')
    last_bracket_index = text.rfind(']')
    
    if prefix_index != -1 and last_bracket_index != -1:
        content_between = text[prefix_index:last_bracket_index].strip()
        clean_content = re.sub(r'^\[\'sql|\'\]$', '', content_between).strip()

        return clean_content
    else:
        print("No match found.")
        return "False"

In [100]:
average_bleu = df['Org_bleu_score'].mean()

In [101]:
average_bleu ### Without any postprocessing

0.08278954133622182

In [63]:

def extract_content(text):
    parts = text.split('[/INST]')
    if len(parts) > 1:
        matches = re.findall(r'```(.*?)```', parts[1], re.DOTALL)
        return matches
    else:
        return None



In [81]:
df['output_cleaned'] = df['output'].apply(lambda x: extract_content(x))

In [102]:
df.to_csv('Output/exp1_prompt_outputcleaned_qald_pp.csv')

### Vquanda Experiment Dataset 2

In [105]:
df_vquad = pd.read_json('data/test_vquanda.json', orient='records')
df_vquad.rename(columns={'en_ques': 'questions', 'sparql': 'sparql_vquanda'}, inplace=True)
df_vquad = df_vquad[['questions', 'sparql_vquanda']]

In [108]:
start_time = time.time()
df_vquad['output'] = df_vquad['questions'].apply(generate_sparql)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")



Execution time: 8275.652656793594 seconds


In [110]:
df_vquad.to_csv('Output/exp1_prompt_output_vquanda.csv')

In [113]:
df_vquad['output_cleaned'] = df_vquad['output'].apply(lambda x: extract_content(x))

In [118]:
average_bleu = df_vquad['Org_bleu_score'].mean()
average_bleu

0.05610608613652875

In [119]:
df_vquad.to_csv('Output/exp1_prompt_outputcleaned_vquanda_pp.csv')

### lcquad Experiment Dataset 3

In [124]:
df_lcquad = pd.read_json('data/test_lcquad2.json', orient='records')
df_lcquad.rename(columns={'question': 'questions', 'sparql_wikidata': 'sparql_lcquad'}, inplace=True)
df_lcquad = df_lcquad[['questions', 'sparql_lcquad']]

In [126]:
start_time = time.time()
df_lcquad['output'] = df_lcquad['questions'].apply(generate_sparql)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Execution time: {elapsed_time} seconds")



Execution time: 56246.60027384758 seconds


In [128]:
df_lcquad.to_csv('Output/exp1_prompt_output_lcquad.csv')

In [129]:
df_lcquad['output_cleaned'] = df_lcquad['output'].apply(lambda x: extract_content(x))

In [133]:
average_bleu = df_lcquad['Org_bleu_score'].mean()
average_bleu

0.06574901938786067

In [134]:
df_lcquad.to_csv('Output/exp1_prompt_outputcleaned_lcquad_pp.csv')

### Extra Code