In [1]:
import sys
print(sys.executable)

/home/mehrzad/anaconda3/envs/dbpedia/bin/python


In [2]:
import torch

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f'\nDevice: {device}')


Device: cuda:0


#
##  starcoderbase-1b

In [3]:
## needs HF token with accepted terms of use for this repo
# !huggingface-cli logout
# !huggingface-cli login
!huggingface-cli whoami

mehrzad-shahin


In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/starcoderbase-1b"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, use_auth_token=True).to(device)


In [5]:
model

GPTBigCodeForCausalLM(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49152, 2048)
    (wpe): Embedding(8192, 2048)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPTBigCodeBlock(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeAttention(
          (c_attn): Linear(in_features=2048, out_features=2304, bias=True)
          (c_proj): Linear(in_features=2048, out_features=2048, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=2048, out_features=8192, bias=True)
          (c_proj): Linear(in_features=8192, out_features=2048, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e

In [6]:
## candidate prompt format

prompt = """
# write a SPARQL query for;
# what is the population of Italy?
# use following prefixes:

PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

"""


inputs = tokenizer.encode_plus(prompt, return_tensors="pt").to(device)
outputs = model.generate(inputs['input_ids'], max_new_tokens=100)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.



# write a SPARQL query for;
# what is the population of Italy?
# use following prefixes:

PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>


SELECT?country?population
WHERE {
?country dbr:population?population.
?country dbo:countryName "Italy".
}
<|endoftext|>


#
## Data 

In [7]:
import pandas as pd

df = pd.read_csv("data/prompt_tuning_100k.csv")

df = df[['prompt', 'completion']]
df

Unnamed: 0,prompt,completion
0,# write a SPARQL query for:\n# How many ICD9 d...,select count(*) as ?x where{dbr:Friedreich's_a...
1,# write a SPARQL query for:\n# What was the fa...,select ?x where{dbr:Destrehan_High_School dbo:...
2,# write a SPARQL query for:\n# Does adventist ...,ask where{dbr:Adventist_Girls_High_School dbo:...
3,# write a SPARQL query for:\n# Did aliens area...,ask where{dbr:Aliens_Area dbo:lastPublicationD...
4,# write a SPARQL query for:\n# Did bridget jon...,ask where{dbr:Bridget_Jones:_The_Edge_of_Reaso...
...,...,...
99995,# write a SPARQL query for:\n# What is the pop...,select ?x where{dbr:Southern_Yukaghir_language...
99996,# write a SPARQL query for:\n# Is washington c...,"ask where{dbr:Mitchell_County,_North_Carolina ..."
99997,# write a SPARQL query for:\n# How many marrie...,select count(*) as ?x where{dbr:Bernardine_Doh...
99998,# write a SPARQL query for:\n# What are the ba...,select ?x where{dbr:Battle_of_Magdala dbo:resu...


In [8]:
for index, row in df.sample(n=5).iterrows():
    print(f"Prompt:\n{row['prompt']}\n\nSPARQL:\n{row['completion']}\n{'='*80}")

Prompt:
# write a SPARQL query for:
# How do you calculate 1622 chacornac albedo ?

# use following prefixes:
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SPARQL:
select ?x where{dbr:1622_Chacornac dbo:albedo ?x . <B> dbo:albedo ?x }
Prompt:
# write a SPARQL query for:
# What is the minimum elevation of mentonasc dialect's in mentonasc dialect language and reference ?

# use following prefixes:
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SPARQL:
select ?x where{dbr:Mentonasc_dialect dbo:spokenIn ?x1 . ?x1 dbo:minimumElevation ?x }
Prompt:
# write a SPARQL query for:
# What does it mean if the frozen of bor lake existed ?

# use following prefixes:
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>

SPARQL:
select ?x where{dbr:Bor_Lake dbo:frozen ?x }
Prompt:
# write a SPARQL query for:
# Where let's switch!'s birthplace was found ?

# use following prefixes:
PREFIX db

In [9]:
## Load Local CSV Filex to Dataset Format using HugginFace datasets library 

from datasets import load_dataset

dataset = load_dataset('csv', data_files={
    'train': '/home/mehrzad/repos/mehrz/dbpedia/data/prompt_tuning/nspm_100k_train.csv', 
    'test': '/home/mehrzad/repos/mehrz/dbpedia/data/prompt_tuning/nspm_100k_test.csv'
})


print(dataset)

Found cached dataset csv (/home/mehrzad/.cache/huggingface/datasets/csv/default-5506e1d2153c5efa/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 70000
    })
    test: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 30000
    })
})


In [10]:
print(dataset['train'][0]['prompt'])
print(dataset['train'][0]['completion'])

# write a SPARQL query for:
# Did bernard de dryver have any championships ?

# use following prefixes:
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX dbo: <http://dbpedia.org/ontology/>
ask where{dbr:Bernard_de_Dryver dbo:championships ?x }


In [11]:
tokenizer.eos_token

'<|endoftext|>'

In [12]:
import argparse
import os

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, set_peft_model_state_dict
from torch.utils.data import IterableDataset
from tqdm import tqdm
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, logging, set_seed
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

In [13]:
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        input_column_name="prompt",
        output_column_name="completion"
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.input_column_name = input_column_name
        self.output_column_name = output_column_name

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(prepare_sample_text(next(iterator), self.input_column_name, self.output_column_name))
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }



                    
                    
def prepare_sample_text(example, input_column_name="prompt", output_column_name="completion"):
    """Prepare the text from a sample of the dataset."""
    text = f"Question: {example[input_column_name]}\n\nAnswer: {example[output_column_name]}"
    return text                    
                    

    
                    
def chars_token_ratio(dataset, tokenizer, input_column_name="prompt", output_column_name="completion", nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example, input_column_name, output_column_name)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))

    return total_characters / total_tokens                    
 
    
    
    
                    
def create_datasets(tokenizer):
    dataset = load_dataset(
        'csv', data_files={
            'train': '/home/mehrzad/repos/mehrz/dbpedia/data/prompt_tuning/nspm_100k_train.csv',
            'test': '/home/mehrzad/repos/mehrz/dbpedia/data/prompt_tuning/nspm_100k_test.csv'
}
#         args.dataset_name,
#         data_dir=args.subset,
#         split=args.split,
#         use_auth_token=True,
#         num_proc=None, ##
#         streaming=args.streaming,
    )
#     if args.streaming:
#         print("Loading the dataset in streaming mode")
#         valid_data = dataset.take(args.size_valid_set)
#         train_data = dataset.skip(args.size_valid_set)
#         train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
#     else:
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer, "prompt", "completion")
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        infinite=True,
        seq_length=2048, ##
        chars_per_token=chars_per_token,
        input_column_name="prompt", ##
        output_column_name="completion" ##
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        infinite=False,
        seq_length=2048, ##
        chars_per_token=chars_per_token,
        input_column_name="prompt", ##
        output_column_name="completion" ##
    )
    return train_dataset, valid_dataset


In [14]:
train_dataset, valid_dataset = create_datasets(tokenizer)

Found cached dataset csv (/home/mehrzad/.cache/huggingface/datasets/csv/default-5506e1d2153c5efa/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/2 [00:00<?, ?it/s]

Size of the train set: 70000. Size of the validation set: 30000


100%|███████████████████████████████████████| 400/400 [00:00<00:00, 6041.51it/s]

The character to token ratio of the dataset is: 2.98





In [15]:
valid_dataset.dataset[0]

{'prompt': '# write a SPARQL query for:\n# What is the power output of a mercedes-benz m117 engine machine ?\n\n# use following prefixes:\nPREFIX dbr: <http://dbpedia.org/resource/>\nPREFIX dbo: <http://dbpedia.org/ontology/>',
 'completion': 'select ?x where{dbr:Mercedes-Benz_M117_engine dbo:powerOutput ?x }'}

# 
## Evaluate fine-tuned model with QALD-9-plus

In [16]:
import json

with open('data/qald_9_plus_train_dbpedia.json', 'r') as f:
    qald_9_plus_data = json.load(f)
    
qald_9_plus_data['questions'][0]

{'id': '1',
 'question': [{'language': 'en', 'string': 'List all boardgames by GMT.'},
  {'language': 'de', 'string': 'Liste die Brettspiele von GMT auf.'},
  {'language': 'de', 'string': 'Zeige mir alle Brettspiele von GMT.'},
  {'language': 'ru', 'string': 'Перечислите все игры GMT.'},
  {'language': 'lt', 'string': 'Išvardinkite visus stalo žaidimus pagal GMT.'},
  {'language': 'uk', 'string': 'Перерахуйте всі ігри GMT.'},
  {'language': 'lt', 'string': 'Išvardykite visus GMT žaidimus.'},
  {'language': 'fr', 'string': 'Listez tous les jeux de société de GMT.'},
  {'language': 'es',
   'string': '¿Qué juegos de mesa fueron hechos por GMT?',
   'keywords': 'juego de mesa ,  GMT '}],
 'query': {'sparql': 'PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX res: <http://dbpedia.org/resource/> PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> SELECT ?uri WHERE { ?uri dbo:publisher res:GMT_Games }'},
 'answers': [{'head': {'link': [], 'vars': ['uri']},
   'results': {'bindings': [{'uri'

In [17]:
# extracting pairs of question/sparql    
    
nl_questions, sparql_queries =[],[]

for data in qald_9_plus_data['questions']:
    nl_questions.append(
        next((item['string'] for item in data['question'] if item['language'] == 'en'), None)
    )
    sparql_queries.append(data['query']['sparql'])
    

print(f'{len(nl_questions)} pairs extracted from train set: ', "\n")
print(nl_questions[0])
print('\n', sparql_queries[0].replace(">", '>\n'))

408 pairs extracted from train set:  

List all boardgames by GMT.

 PREFIX dbo: <http://dbpedia.org/ontology/>
 PREFIX res: <http://dbpedia.org/resource/>
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 SELECT ?uri WHERE { ?uri dbo:publisher res:GMT_Games }


# 

### Calculate BELUE score

In [18]:
## TODO 
# debug the script  


from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_metric
import pandas as pd
import torch


def read_dataset(path):
    dataset = pd.read_csv(path)
    return dataset


def generate_sparql(model, tokenizer, question):
    inputs = tokenizer.encode(question, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs, max_length=100, num_beams=5, early_stopping=True)
    return tokenizer.decode(outputs[0])


def calculate_bleu(actual, generated):
    bleu_metric = load_metric('bleu')
    bleu_score = bleu_metric.compute(predictions=[generated.split()], references=[[actual.split()]])
    return bleu_score['bleu']


def main():
    
    models = [
#         "Salesforce/codegen-350M-multi", "Salesforce/codegen25-7b-instruct", 
#         "bigcode/starcoderbase-3b", "bigcode/starcoderbase-1b",
#         "EleutherAI/gpt-neo-2.7B", "tiiuae/falcon-rw-1b"
              
            ]  

    # TODO
    dataset_path = "/home/mehrzad/repos/mehrz/dbpedia/data/prompt_tuning/qald-9-plus.csv"

    dataset = read_dataset(dataset_path)

    for model_name in models:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

        
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(device)

        results = []
        total_bleu_score = 0

        for idx, row in dataset.iterrows():
            question, actual_sparql = row['question'], row['sparql_query']
            generated_sparql = generate_sparql(model, tokenizer, question)

            bleu_score = calculate_bleu(actual_sparql, generated_sparql)

            results.append((question, actual_sparql, generated_sparql, bleu_score))

            total_bleu_score += bleu_score

        average_bleu_score = total_bleu_score / len(dataset)

        print(f"Results for {model_name}:\n", results)
        print(f"Average BLEU score for {model_name}: ", average_bleu_score)
        
        
        # delete the model and tokenizer to free up memory
        del model
        del tokenizer

        # clear the GPU memory
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

            

