In [192]:
import gc
import re
import csv
import os
import gc
import torch
# set to "cuda:1" for running in parallel on both GPUs
device = torch.device("cuda:0")
torch.cuda.set_device(device)
torch.set_default_device(device)
import networkx as nx
import Stemmer
import pandas as pd
from tqdm import tqdm
from langchain_community.graphs.networkx_graph import get_entities
from utils.preprocessing import preprocess_text
from utils.graph import KGraphPreproc
from utils.graph.chain import GraphChain
from utils.llm.mistral import MistralLLM
from utils.prompt import GRAPH_QA_PROMPT, ENTITY_PROMPT
from utils.file import export_results_to_file

### Load and generate a subset

In [8]:
cwq = pd.read_json("/datasets/CWQ/ComplexWebQuestions_dev.json")
cwq.head(1)

Unnamed: 0,ID,answers,composition_answer,compositionality_type,created,machine_question,question,sparql,webqsp_ID,webqsp_question
0,WebQTrn-1430_ac053cda0a7424c48e4809c71171fbed,"[{'aliases': [], 'answer': 'Muhammad Zia-ul-Ha...",pakistan,composition,2018-02-12T22:17:21,who was the president of the country that cont...,Who was the president in 1980 of the country t...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTrn-1430,who was the president of pakistan in 1980


### Extract answer ids

In [79]:
def convert_entity(entity, reverse=False):
    """
    converts: m.07j6w into /m/07j6w
    reverse does the opposite
    """
    if reverse:
        return entity[1:].replace("/", ".")
    return "/" + entity.replace(".", "/")

In [None]:
def get_answer_ids(answers):
    return list(map(lambda x: convert_entity(x["answer_id"]), answers))

In [84]:
def ids_in_graph(id_list):
    return any([id_ in fbkb_graph._graph for id_ in id_list])

### Counts of questions with at least one answer ids in graph

In [167]:
cwq["answer_ids"] = cwq["answers"].apply(lambda x: get_answer_ids(x))

In [None]:
cwq_answer_present = cwq[cwq["answer_ids"].apply(ids_in_graph)]
cwq_answer_present.groupby("compositionality_type").count()

Unnamed: 0_level_0,ID,answers,composition_answer,created,machine_question,question,sparql,webqsp_ID,webqsp_question,topic_entities,topic_ids,answer_ids
compositionality_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
comparative,124,124,124,124,124,124,124,124,124,0,0,124
composition,926,926,926,926,926,926,926,926,926,0,0,926
conjunction,1070,1070,1070,1070,1070,1070,1070,1070,1070,0,0,1070
superlative,107,107,107,107,107,107,107,107,107,0,0,107


### Load the graph and entity extraction chain

In [None]:
####### load the graph
fbkb_graph = KGraphPreproc.get_fbkb_graph()

In [None]:
mistral = MistralLLM()

In [None]:
chain = GraphChain.from_llm(
    llm=mistral,
    graph=fbkb_graph,
    qa_prompt=GRAPH_QA_PROMPT,
    entity_prompt=ENTITY_PROMPT,
    verbose=False,
)

### Infer topic entities from the questions

In [98]:
def extract_entity(row):
    entity_string = chain.entity_extraction_chain.run(row.question)
    entities = get_entities(entity_string)
    pbar.update(1)
    return entities

In [None]:
pbar = tqdm(total=len(cwq_answer_present))
cwq_answer_present["topic_entities"] = cwq_answer_present.apply(extract_entity, axis=1)

  2%|▏         | 54/2227 [00:31<21:01,  1.72it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cwq_answer_present["topic_entities"] = cwq_answer_present.apply(extract_entity, axis=1)


### Save entities

In [102]:
cwq_answer_present[["ID", "topic_entities"]].to_csv("/datasets/CWQ/topic_entities.csv")

### Load entities

In [183]:
topic_ents = pd.read_csv("/datasets/CWQ/topic_entities.csv")["topic_entities"].values
cwq_answer_present["topic_entities"] = topic_ents

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cwq_answer_present["topic_entities"] = topic_ents


In [186]:
def entities_to_ids(entities):
    mid_list = []
    for entity in entities:
        proc_entity = preprocess_text(entity)
        mid = fbkb_graph.preprocessed_nodes.get(proc_entity, None)
        if mid:
            mid_list.append(mid)
    return mid_list

In [187]:
cwq_answer_present["topic_ids"] = cwq_answer_present["topic_entities"].apply(entities_to_ids)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cwq_answer_present["topic_ids"] = cwq_answer_present["topic_entities"].apply(entities_to_ids)


In [188]:
cwq_both_present = cwq_answer_present[cwq_answer_present["topic_ids"].apply(lambda x: len(x) > 0)]

### Check path present

In [170]:
def has_graph_path(row):
    for start in row.topic_ids:
        for target in row.answer_ids:
            try:
                if nx.has_path(fbkb_graph._graph, start, target):
                    return True
            except nx.NodeNotFound:
                continue
    return False

In [194]:
cwq_paths = cwq_both_present[cwq_both_present.apply(has_graph_path, axis=1)]

In [196]:
cwq_paths.groupby("compositionality_type").count()

Unnamed: 0_level_0,ID,answers,composition_answer,created,machine_question,question,sparql,webqsp_ID,webqsp_question,topic_entities,topic_ids,answer_ids
compositionality_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
comparative,124,124,124,124,124,124,124,124,124,124,124,124
composition,922,922,922,922,922,922,922,922,922,922,922,922
conjunction,1062,1062,1062,1062,1062,1062,1062,1062,1062,1062,1062,1062
superlative,107,107,107,107,107,107,107,107,107,107,107,107


### Sample

In [None]:
sample_size = {
    "comparative": 100,
    "composition": 400,
    "conjunction": 400,
    "superlative": 100,
}
cwq_subset = cwq_paths.groupby("compositionality_type", group_keys=False).apply(lambda x: x.sample(n=sample_size[x.name], random_state=42))

  cwq_subset = cwq_paths.groupby("compositionality_type", group_keys=False).apply(lambda x: x.sample(n=sample_size[x.name], random_state=42))


Unnamed: 0,ID,answers,composition_answer,compositionality_type,created,machine_question,question,sparql,webqsp_ID,webqsp_question,topic_entities,topic_ids,answer_ids
894,WebQTest-1311_e920e31a99d6b7dfbeef110668d3103d,"[{'aliases': ['W. Morris'], 'answer': 'William...",,comparative,2017-10-26T10:08:36,what was antoni gaudi inspired by and the pers...,What inspiration of Antoni Gaudi died later th...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTest-1311,what was antoni gaudi inspired by,"['Antoni Gaudi', 'death date (not explicitly m...","[/m/0g84t93, /m/0g84t93, /m/0g84t93, /m/0g84t9...",[/m/08304]
1329,WebQTest-1382_b9b879060be6df6cb7cd937a7996f9d9,"[{'aliases': ['Brazilian ', 'República Federat...",,comparative,2017-10-26T10:25:29,what other countries border argentina and numb...,What country borders Argentina and has an army...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTest-1382,what other countries border argentina,"['Chile', 'Brazil']","[/m/0g84t93, /m/0g84t93, /m/02lw5z, /m/0g84t93...",[/m/015fr]
1235,WebQTest-1382_edc19b9010b39a6a7a1e3926399c8522,"[{'aliases': ['Republic of Chile'], 'answer': ...",,comparative,2017-10-26T10:25:29,what other countries border argentina and the ...,What country bordering Argentina has populatio...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTest-1382,what other countries border argentina,"['Argentina', 'border', 'population', 'less th...","[/m/0g84t93, /m/0g84t93, /m/02lw5z, /m/0g84t93...","[/m/01p1v, /m/0165v, /m/05v10, /m/015fr, /m/07..."
1778,WebQTrn-3252_34f533dd75026e91fc9acd350e6eeffb,"[{'aliases': [], 'answer': 'Benin', 'answer_id...",,comparative,2017-10-26T05:14:40,what countries does the niger river flow throu...,What countries in which the Niger River flows ...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTrn-3252,what countries does the niger river flow through,"['Niger', 'Mali', 'Guinea']","[/m/0g84t93, /m/0g84t93, /m/02lw5z, /m/0g84t93...",[/m/0164v]
1471,WebQTrn-2177_dec2523c78124e170c353878876cff1e,"[{'aliases': ['Harvard University, main campus...",,comparative,2017-10-26T02:27:02,where did caroline kennedy go to school and th...,"Where did Caroline Kennedy attend university, ...",PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTrn-2177,where did caroline kennedy go to school,"['Caroline Kennedy', 'University (name unknown)']","[/m/0g84t93, /m/0g84t93, /m/02lw5z, /m/0g84t93...","[/m/03ksy, /m/01mpwj, /m/01n951]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2898,WebQTest-1382_73e475413d79895e98983bff8b926f21,"[{'aliases': ['Brazilian ', 'República Federat...",,superlative,2017-10-26T10:25:29,what other countries border argentina and the ...,Which country bordering Argentina has the lowe...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTest-1382,what other countries border argentina,"['Chile', 'calling code']","[/m/0g84t93, /m/0g84t93, /m/02lw5z, /m/0g84t93...",[/m/015fr]
3254,WebQTrn-2525_6f63302a5c1425c7a4f31bd93c423f2b,"[{'aliases': ['TU', 'Temple', 'Temple Universi...",,superlative,2017-10-26T03:14:46,where did jerry spinelli go to college and is ...,"What college, that has the largest number of u...",PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTrn-2525,where did jerry spinelli go to college,"['College', 'Jerry Spinelli', 'largest number ...","[/m/0g84t93, /m/0g84t93, /m/02lw5z, /m/02lw5z,...",[/m/01jt2w]
1901,WebQTrn-909_86f42ab931739ed1c6ba88a8db93fd0d,"[{'aliases': ['Minneapolis, Minnesota', 'Henne...",,superlative,2017-10-25T23:23:14,where did bob dylan live and the location GNIS...,What location with the smallest GNIS feature I...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTrn-909,where did bob dylan live,"['Bob Dylan', 'location']","[/m/0g84t93, /m/0g84t93, /m/0g84t93, /m/0g84t9...",[/m/0fpzwf]
2507,WebQTrn-3358_887e6cbf6fd62ad83a3815bd45a6b28d,"[{'aliases': ['Long 'Un', 'The Flatboat Man', ...",,superlative,2017-10-26T05:32:25,what influenced whitman 's poetry and is the p...,Which politician who held office most recently...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTrn-3358,what influenced whitman 's poetry,"['Politician', 'most recently held office', 'W...","[/m/0g84t93, /m/0g84t93, /m/0g84t93, /m/0g84t9...",[/m/0gzh]


In [205]:
def extract_answers(answer_list):
    return list(map(lambda x: x["answer"], answer_list))

In [209]:
cwq_subset["answer_list"] = cwq_subset["answers"].apply(extract_answers)
cwq_subset.head(1)

Unnamed: 0,ID,answers,composition_answer,compositionality_type,created,machine_question,question,sparql,webqsp_ID,webqsp_question,topic_entities,topic_ids,answer_ids,answer_list
894,WebQTest-1311_e920e31a99d6b7dfbeef110668d3103d,"[{'aliases': ['W. Morris'], 'answer': 'William...",,comparative,2017-10-26T10:08:36,what was antoni gaudi inspired by and the pers...,What inspiration of Antoni Gaudi died later th...,PREFIX ns: <http://rdf.freebase.com/ns/>\nSELE...,WebQTest-1311,what was antoni gaudi inspired by,"['Antoni Gaudi', 'death date (not explicitly m...","[/m/0g84t93, /m/0g84t93, /m/0g84t93, /m/0g84t9...",[/m/08304],[William Morris]


In [210]:
cwq_subset[["ID", "compositionality_type", "question", "answers", "topic_ids", "answer_ids"]].to_csv("/datasets/CWQ/cwq-1000.csv")