In [7]:
# Import necessary libraries
import re
import os
import json
import torch
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util

In [2]:
# Load the prompt-generation config; its 'onto_list' entry holds the ontology
# identifiers that the rest of the notebook iterates over.
config_path = 'config/dbpedia_webnlg_prompt_gen_config.json'
# Decode explicitly as UTF-8 (as every other open() in this notebook does) so
# the result does not depend on the platform's default locale encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config = json.load(f)
ontology_list = config['onto_list']

In [3]:
# Build the sentence encoder used for every embedding in this notebook.
# Caveat from the original author: the T5-XXL family is very large (they quote
# ~11B parameters -- TODO confirm for this checkpoint), so loading it may not
# be feasible on standard hardware. Check available resources first.
model_name = 'sentence-t5-xxl'
model = SentenceTransformer(model_name_or_path=model_name)

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Destination for the per-ontology similarity JSON files; make sure the
# directory is present (no error if it already exists).
output_path = '../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/'
os.makedirs(name=output_path, exist_ok=True)

# Source directories holding the per-ontology JSONL splits.
train_data_path = '../../data/dbpedia_webnlg/train/'
test_data_path = '../../data/dbpedia_webnlg/test/'

In [8]:
# Number of top similar sentences to retrieve
top_k = 5


def load_sentences_and_ids(jsonl_path):
    """Read a JSONL file and return parallel lists of sentences and ids.

    Every non-blank line is parsed as a JSON object and its 'sent' and 'id'
    values are collected in file order. Blank lines (for example an extra
    newline at the end of the file) are skipped rather than passed to
    json.loads, which would raise on an empty string.

    Args:
        jsonl_path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A (sentences, ids) tuple of equal-length lists.

    Raises:
        FileNotFoundError: If jsonl_path does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the 'sent' or 'id' key.
    """
    sentences = []
    ids = []
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            sentences.append(record['sent'])
            ids.append(record['id'])
    return sentences, ids


# Process each ontology category
for ontology in ontology_list:
    print(f'Processing ontology: {ontology}')

    # Split '<number>_<category>' into its two parts; fail with a clear
    # message instead of an AttributeError on None if the name is malformed.
    number_match = re.search(r'(\d+)_', ontology)
    if number_match is None:
        raise ValueError(
            f"Ontology name {ontology!r} does not match the expected "
            "'<number>_<category>' pattern"
        )
    ontology_num = number_match.group(1)
    category = ontology.split('_', 1)[1]  # Get category name after number

    # Load test and train data
    test_file = os.path.join(test_data_path, f'ont_{ontology_num}_{category}_test.jsonl')
    test_sentences, test_ids = load_sentences_and_ids(test_file)
    train_file = os.path.join(train_data_path, f'ont_{ontology_num}_{category}_train.jsonl')
    train_sentences, train_ids = load_sentences_and_ids(train_file)

    # Nothing can be retrieved without both sides; skip visibly rather than
    # failing inside the encoder or torch.topk on empty input.
    if not test_sentences or not train_sentences:
        print(f'Skipping {ontology}: no test or train sentences found.\n')
        continue

    # Compute embeddings for test and train sentences
    print('Computing embeddings for test sentences...')
    test_embeddings = model.encode(test_sentences, convert_to_tensor=True, show_progress_bar=True)
    print('Computing embeddings for train sentences...')
    train_embeddings = model.encode(train_sentences, convert_to_tensor=True, show_progress_bar=True)

    # torch.topk raises if k exceeds the number of candidates, so cap k at
    # the size of the train set for small categories.
    k = min(top_k, len(train_ids))

    # Compute similarities and find top-k similar sentences for each test sentence
    similarity_results = {}
    print('Computing similarities and finding top similar sentences...')
    for test_id, test_embedding in tqdm(zip(test_ids, test_embeddings), total=len(test_ids)):
        # cos_sim returns a (1, n_train) matrix for a single query; take row 0.
        cosine_scores = util.cos_sim(test_embedding, train_embeddings)[0]
        top_indices = torch.topk(cosine_scores, k=k).indices.tolist()
        # Train ids ordered from most to least similar.
        similarity_results[test_id] = [train_ids[i] for i in top_indices]

    # Save the results to the output file
    output_file = os.path.join(output_path, f'{ontology}_test_train_similarity.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(similarity_results, f, indent=4)

    print(f'Results saved to {output_file}\n')

print('Processing completed.')

Processing ontology: 10_city
Computing embeddings for test sentences...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches: 100%|██████████| 7/7 [01:41<00:00, 14.53s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 5/5 [01:23<00:00, 16.78s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 217/217 [00:00<00:00, 4421.73it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/10_city_test_train_similarity.json

Processing ontology: 11_meanoftransportation
Computing embeddings for test sentences...


Batches: 100%|██████████| 3/3 [00:55<00:00, 18.50s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 7/7 [02:21<00:00, 20.22s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 92/92 [00:00<00:00, 3714.52it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/11_meanoftransportation_test_train_similarity.json

Processing ontology: 12_company
Computing embeddings for test sentences...


Batches: 100%|██████████| 2/2 [00:27<00:00, 13.96s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 4/4 [00:56<00:00, 14.02s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 56/56 [00:00<00:00, 4708.26it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/12_company_test_train_similarity.json

Processing ontology: 13_celestialbody
Computing embeddings for test sentences...


Batches: 100%|██████████| 3/3 [01:03<00:00, 21.06s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 4/4 [01:40<00:00, 25.05s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 72/72 [00:00<00:00, 3996.74it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/13_celestialbody_test_train_similarity.json

Processing ontology: 14_musicalwork
Computing embeddings for test sentences...


Batches: 100%|██████████| 7/7 [02:13<00:00, 19.05s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 3/3 [00:56<00:00, 18.84s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 209/209 [00:00<00:00, 4663.96it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/14_musicalwork_test_train_similarity.json

Processing ontology: 15_athlete
Computing embeddings for test sentences...


Batches: 100%|██████████| 4/4 [01:11<00:00, 17.94s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 6/6 [02:11<00:00, 21.88s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 107/107 [00:00<00:00, 4055.73it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/15_athlete_test_train_similarity.json

Processing ontology: 16_university
Computing embeddings for test sentences...


Batches: 100%|██████████| 3/3 [01:34<00:00, 31.35s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 3/3 [00:57<00:00, 19.17s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 71/71 [00:00<00:00, 4664.79it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/16_university_test_train_similarity.json

Processing ontology: 17_sportsteam
Computing embeddings for test sentences...


Batches: 100%|██████████| 4/4 [01:45<00:00, 26.43s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 4/4 [01:27<00:00, 21.82s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 110/110 [00:00<00:00, 4553.94it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/17_sportsteam_test_train_similarity.json

Processing ontology: 18_politician
Computing embeddings for test sentences...


Batches: 100%|██████████| 5/5 [01:21<00:00, 16.32s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 6/6 [01:53<00:00, 18.92s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 135/135 [00:00<00:00, 4347.23it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/18_politician_test_train_similarity.json

Processing ontology: 19_food
Computing embeddings for test sentences...


Batches: 100%|██████████| 5/5 [01:51<00:00, 22.38s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 8/8 [02:15<00:00, 16.95s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 153/153 [00:00<00:00, 3833.89it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/19_food_test_train_similarity.json

Processing ontology: 1_writtenwork
Computing embeddings for test sentences...


Batches: 100%|██████████| 4/4 [01:21<00:00, 20.46s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 7/7 [02:07<00:00, 18.27s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 127/127 [00:00<00:00, 4709.91it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/1_writtenwork_test_train_similarity.json

Processing ontology: 2_airport
Computing embeddings for test sentences...


Batches: 100%|██████████| 3/3 [00:45<00:00, 15.17s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 8/8 [02:46<00:00, 20.86s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 79/79 [00:00<00:00, 4621.73it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/2_airport_test_train_similarity.json

Processing ontology: 3_artist
Computing embeddings for test sentences...


Batches: 100%|██████████| 3/3 [00:44<00:00, 14.92s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 10/10 [02:49<00:00, 16.97s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 84/84 [00:00<00:00, 4568.48it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/3_artist_test_train_similarity.json

Processing ontology: 4_film
Computing embeddings for test sentences...


Batches: 100%|██████████| 4/4 [01:21<00:00, 20.30s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 5/5 [01:26<00:00, 17.31s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 127/127 [00:00<00:00, 4999.03it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/4_film_test_train_similarity.json

Processing ontology: 5_monument
Computing embeddings for test sentences...


Batches: 100%|██████████| 1/1 [00:14<00:00, 14.63s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 3/3 [01:00<00:00, 20.07s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 19/19 [00:00<00:00, 3875.31it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/5_monument_test_train_similarity.json

Processing ontology: 6_comicscharacter
Computing embeddings for test sentences...


Batches: 100%|██████████| 2/2 [00:19<00:00,  9.89s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 3/3 [00:43<00:00, 14.44s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 36/36 [00:00<00:00, 4174.71it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/6_comicscharacter_test_train_similarity.json

Processing ontology: 7_scientist
Computing embeddings for test sentences...


Batches: 100%|██████████| 5/5 [01:33<00:00, 18.71s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 4/4 [01:09<00:00, 17.29s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 149/149 [00:00<00:00, 4833.79it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/7_scientist_test_train_similarity.json

Processing ontology: 8_astronaut
Computing embeddings for test sentences...


Batches: 100%|██████████| 3/3 [00:46<00:00, 15.63s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 3/3 [00:43<00:00, 14.62s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 68/68 [00:00<00:00, 4276.05it/s]


Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/8_astronaut_test_train_similarity.json

Processing ontology: 9_building
Computing embeddings for test sentences...


Batches: 100%|██████████| 4/4 [00:57<00:00, 14.47s/it]


Computing embeddings for train sentences...


Batches: 100%|██████████| 6/6 [01:43<00:00, 17.19s/it]


Computing similarities and finding top similar sentences...


100%|██████████| 103/103 [00:00<00:00, 4642.46it/s]

Results saved to ../../data/dbpedia_webnlg/baselines/test_train_sent_similarity/9_building_test_train_similarity.json

Processing completed.



