In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
device = torch.device("cuda:1")
torch.cuda.set_device(device)
torch.set_default_device(device)
import pandas as pd
import os
import gc
from utils.graph import KGraphPreproc
from utils.preprocessing import preprocess_text
import networkx as nx
os.environ["HF_HOME"] = "/models"
os.environ["TRANSFORMERS_CACHE"] = "/models"

In [4]:
def clear_cache():
    gc.collect()
    torch.cuda.empty_cache()
clear_cache()

In [3]:
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
model_path = "/models/hub/models--Qwen--Qwen2.5-0.5B-Instruct/snapshots/7ae557604adf67be50417f59c2c2f167def9a775"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [4]:
mqa = pd.read_csv(f"/datasets/MetaQA/meta.csv")
mqa_graph = KGraphPreproc.get_metaqa_graph()

In [5]:
mqa.dropna(inplace=True)

In [6]:
mqa["chain"] = mqa.apply(
    lambda r: nx.shortest_path(
        mqa_graph._graph,
        mqa_graph.name2mid[r["topic_entity"]],
        mqa_graph.preprocessed_nodes[r.Answer]
    ),
    axis=1
)

In [7]:
row = mqa.iloc[1]
row

id                                               1-1
hop                                                1
Question        [Joe Thomas] appears in which movies
topic_entity                              Joe Thomas
Answer                               the inbetween 2
actual_hops                                        2
chain                                 [20874, 20869]
Name: 1, dtype: object

In [8]:
def get_tails(node, edge):
    tails = set()
    for u,v,attrs in mqa_graph._graph.edges(node, data=True):
        if attrs.get("relation") == edge:
            tails.add(v)
    return tails

In [9]:
def max_candidates_per_chain(row):
    max_cands = -1
    for start, target in zip(row.chain, row.chain[1:]):
        edge = mqa_graph._graph[start][target]["relation"]
        num_cands = len(get_tails(start, edge))
        if num_cands > max_cands:
            max_cands = num_cands
    return max_cands

In [10]:
mqa["max_tail_cands"] = mqa.apply(
    max_candidates_per_chain,
    axis=1
)

In [23]:
# mqa[mqa["max_tail_cands"] < 1000]
mqa["max_tail_cands"].describe()

count    302514.000000
mean        391.867252
std        1095.544542
min          -1.000000
25%           9.000000
50%          21.000000
75%          42.000000
max        4176.000000
Name: max_tail_cands, dtype: float64

In [88]:
start = row.chain[0]
target = row.chain[1]
edge = mqa_graph._graph[start][target]["relation"]
cand_tails = get_tails(start, edge)
print(
    mqa_graph.mid2name[start],
    edge,
    list(map(lambda t: mqa_graph.mid2name[t], cand_tails)),
    sep="\n"
)

Joe Thomas
starred_actors
['The Inbetweeners Movie', 'The Inbetweeners 2']


### Reranking problem
*For now, the next best thing to do is to use the same embedding similarity algorithm, but this poses an issue as semantic similarity of the candidates and preceding part of the triplet tells us nothing about the relevance to the actual question (types of qs), it also is likely biased towards the training data* 

### Tail prediction
Given a question, a relation, and candidates?

In [None]:
PREDICT_EDGE_PROMPT = """
You are an expert in knowledge graphs and natural language understanding. Your task is to help explore relevant relationships from given topic entities that can aid in answering a question.
Instructions:
Input: You will be provided with a natural language question and a list of topic entities extracted from that question.
Objective: Analyze the question to understand its context and what information might be needed to answer it. Then, generate a list of 5 candidate relationship labels (i.e., edge types) that could be used to navigate a knowledge graph starting from each entity.
Requirements:
Relevance: The candidate relationship labels must be pertinent to the context of the question.
Conciseness: Provide a brief description (1–2 sentences) of why each relationship label might help answer the question.
Format: Return your answer as a numbered list in the following format: 1. (Entity; Relationship label; Reason)
Do not produce any other text.

Question: “What awards has Albert Einstein received?”
Topic Entities: Albert Einstein;
Candidate relationship labels (2 items):
1. (Albert Einstein; awardReceived; Connects a person to the awards they have received.)
2. (Albert Einstein; honorificAward; Links individuals to awards given in honor of their achievements.)

Question: “{question}”
Topic Entities: {entities}
Candidate relationship labels (5 items):
"""

### Link prediction

In [None]:
q = "Which Dickens novel features the character 'Uriah Heep'?"
e = ["Charles Dickens", "David_Copperfield-GB"]

In [38]:
predict_link_prompt = '''
You are an expert in knowledge graphs and natural language understanding. Your task is to help explore relevant relationships from given topic entities that can aid in answering a question.
Instructions:
Input: You will be provided with a natural language question and a list of topic entities extracted from that question.
Objective: Analyze the question to understand its context and what information might be needed to answer it. Then, generate a list of 5 candidate relationship labels (i.e., edge types) that could be used to navigate a knowledge graph starting from each entity.
Requirements:
Relevance: The candidate relationship labels must be pertinent to the context of the question.
Conciseness: Provide a brief description (1–2 sentences) of why each relationship label might help answer the question.
Format: Return your answer as a numbered list in the following format: 1. (Entity; Relationship label; Reason)
Do not produce any other text.

Question: “What awards has Albert Einstein received?”
Topic Entities: Albert Einstein;
Candidate relationship labels (2 items):
1. (Albert Einstein; awardReceived; Connects a person to the awards they have received.)
2. (Albert Einstein; honorificAward; Links individuals to awards given in honor of their achievements.)

Question: “{question}”
Topic Entities: {entities}
Candidate relationship labels (5 items):
'''.format(question=q, entities="; ".join(e[::-1]))

In [8]:
decompose_question_prompt = """
Given a question, decompose it into a potential reasoning chain that can be used to navigate the knowledge graph to find the answer.
Output the list of potential relation names
Question: {question}
""".format(question=q)

In [29]:
import re
def extract_rel_answer(answer_string, group=0):
    pattern = re.compile("\d+\.\s*\(([^;]+);\s*([^;]+);\s*(.+?)\)")
    pos = 0
    rels = []
    while m := pattern.search(answer_string, pos):
        pos = m.start() + 1
        entity, rel, reason = m[group].split(";")
        rels.append(rel.strip())
        # if entity in entity_set:
        #     scored_entities.append((entity, float(rank)))
    return rels

In [37]:
extract_rel_answer(text)

['bookTitle',
 'authorName',
 'locationOfBook',
 'nationality',
 'genre',
 'occupation',
 'birthPlace',
 'publicationDate',
 'deathDate',
 'biography',
 'biography']

In [41]:
inputs = tokenizer(predict_link_prompt, return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs, max_length=400, temperature=0.15)
text = tokenizer.batch_decode(outputs[:, inputs["input_ids"].shape[1]:])[0]
print(text)

1. (David_Copperfield-GB; bookTitle; Identifies the title of the book featuring Uriah Heep.)
2. (Charles_Dickens; authorName; Establishes the author's name for reference purposes.)
3. (Charles_Dickens; genreOfNovel; Specifies the genre of the book.)
4. (Charles_Dickens; characterName; Identifies the character named Uriah Heep.)
5. (Charles_Dickens; plotElement; Mentions the plot element of Uriah Heep being involved in
