In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `utils` package imported below) are reloaded before each
# cell executes, without needing a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import csv
import gc
import os
import re
from collections import Counter
from typing import Any, Dict, Iterator, List, Mapping, Optional

import bm25s
import networkx as nx
import pandas as pd
import Stemmer
import torch
from langchain.chains import GraphQAChain
from langchain.chains.llm import LLMChain
from langchain_community.graphs.networkx_graph import get_entities
from langchain_core.callbacks.manager import CallbackManagerForChainRun
from langchain_core.language_models import BaseLanguageModel
from langchain_core.prompts import BasePromptTemplate
from tqdm import tqdm

from utils.graph import KGraphPreproc
from utils.llm.mistral import MistralLLM
from utils.preprocessing import stemmer, preprocess_text
from utils.prompt import GRAPH_QA_PROMPT, ENTITY_PROMPT, NO_CONTEXT_PROMPT, EVALUATE_CONTEXT_PROMPT

In [3]:
# Location of the FreebaseQA evaluation split (JSON).
FBQA_EVAL_PATH = "/datasets/FreebaseQA/FreebaseQA-eval.json"

# Load the QA set into a DataFrame and build the knowledge graph wrapper
# via the project helper KGraphPreproc.get_fbkb_graph().
fbqa = pd.read_json(FBQA_EVAL_PATH)
fbkb_graph = KGraphPreproc.get_fbkb_graph()

## 1. Calculating average degree and number of repeating edges

In [4]:
# Overall size of the knowledge graph: total node count and total edge count.
nodes, edges = (
    fbkb_graph._graph.number_of_nodes(),
    fbkb_graph._graph.number_of_edges(),
)

In [8]:
# Ratio of total edge count to total node count (edges per node).
# NOTE(review): if "average degree" means the mean of the per-node Degree
# values computed below, that would be 2 * edges / nodes, since every edge
# contributes to the degree of both endpoints — confirm which is intended.
edges/nodes

17.097242280448388

In [24]:
# Map every node to its degree, then lay the mapping out as a two-column
# frame (one row per node) so it can be filtered and summarised with pandas.
deg_dict = dict(fbkb_graph._graph.degree())
deg_df = pd.DataFrame({
    "Node": list(deg_dict.keys()),
    "Degree": list(deg_dict.values()),
})
deg_df.head()

Unnamed: 0,Node,Degree
0,/m/027rn,49
1,/m/06cx9,84
2,/m/017dcd,10
3,/m/06v8s0,12
4,/m/07s9rl0,1301


In [30]:
# Look up the name that the graph wrapper's mid2name mapping stores for the
# identifier "/m/09c7w0".
fbkb_graph.mid2name["/m/09c7w0"]

'United_States_of_America'

In [40]:
# Nodes with more than 250 incident edges, ordered from lowest to highest degree.
deg_df.loc[deg_df["Degree"] > 250].sort_values("Degree")

Unnamed: 0,Node,Degree
597,/m/01vj9c,254
4709,/m/03bwzr4,259
1302,/m/015qh,261
2940,/m/01g5v,264
2149,/m/04kxsb,266
...,...,...
90,/m/09nqf,3275
141,/m/05zppz,3567
434,/m/04ztj,3679
62,/m/08mbj5d,4364


In [38]:
# Summary statistics of the degree column for nodes with degree above 300.
# Note: this cell uses a cutoff of 300, whereas the neighbouring cells use 250.
deg_df.loc[deg_df["Degree"] > 300].describe()

Unnamed: 0,Degree
count,127.0
mean,731.062992
std,877.439165
min,301.0
25%,384.0
50%,462.0
75%,714.5
max,6953.0


Some nodes might have a single relation that connects them to many other nodes. This poses a bottleneck for LLM processing, so such nodes need partitioning, similar to what is done for relations.

In [58]:
# Nodes whose degree exceeds this cutoff are inspected for a dominant relation.
HIGH_DEGREE_THRESHOLD = 250


def max_relation_count(graph, node):
    """Return how many of `node`'s edges share its most frequent relation.

    Args:
        graph: a networkx graph whose edge data dicts may carry a
            "relation" key.
        node: the node whose edges, as yielded by
            ``graph.edges(node, data=True)``, are tallied.

    Returns:
        The largest number of edges sharing one "relation" value. Edges
        without a "relation" attribute are tallied together under ``None``.
        Returns 0 when ``graph.edges(node)`` yields nothing.
    """
    # One pass over the edges instead of a list.count() call per unique relation.
    rel_counts = Counter(
        attrs.get("relation") for _, _, attrs in graph.edges(node, data=True)
    )
    return max(rel_counts.values(), default=0)


# Build the high-degree table without in-place mutation; reset_index() keeps
# the original row labels of deg_df in an "index" column.
high_deg_df = (
    deg_df[deg_df.Degree > HIGH_DEGREE_THRESHOLD]
    .sort_values(by="Degree")
    .reset_index()
)

# NOTE(review): if the underlying graph is directed, edges(node) yields only
# outgoing edges while Degree counts both directions — confirm that
# max_rel_count is meant to cover outgoing edges only.
high_deg_df["max_rel_count"] = [
    max_relation_count(fbkb_graph._graph, node) for node in high_deg_df.Node
]
high_deg_df

Unnamed: 0,index,Node,Degree,max_rel_count
0,597,/m/01vj9c,254,76.0
1,4709,/m/03bwzr4,259,204.0
2,1302,/m/015qh,261,153.0
3,2940,/m/01g5v,264,145.0
4,2149,/m/04kxsb,266,128.0
...,...,...,...,...
142,90,/m/09nqf,3275,1409.0
143,141,/m/05zppz,3567,3552.0
144,434,/m/04ztj,3679,3162.0
145,62,/m/08mbj5d,4364,4364.0


### 3010 - clearly need partitioning

## 2. Unique edges

In [5]:
# Total number of edges in the graph, as returned by number_of_edges() above.
edges

248611