In [2]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.7.0


# **Flan-T5-large + tas-b**

In [6]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# Load infoboxes
# Update the file path to the location where you uploaded the file in Kaggle
with open('/kaggle/input/infoboxes-v1/extracted_infoboxes.json') as f:
    infoboxes = json.load(f)

# Extract relevant text from infoboxes and create a searchable corpus
corpus = []
for item in infoboxes:
    title = item.get("title", "")
    infobox_text = item.get("infobox", "")
    if title and infobox_text:
        corpus.append(f"{title}: {infobox_text}")

# Initialize the retriever model
retriever = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)

# Function to retrieve top-k context
def retrieve_context(query, top_k):
    query_embedding = retriever.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    return [corpus[hit['corpus_id']] for hit in hits[0]]

# Initialize the generative model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

# Function to generate answer using the retrieved context
def generate_answer(query, top_k):
    contexts = retrieve_context(query, top_k=top_k)
    
    # Clean and format the contexts
    contexts_cleaned = []
    for context in contexts:
        context_cleaned = context.replace("[[", "").replace("]]", "").replace("'''", "").replace("|", ": ")
        context_cleaned = context_cleaned.replace("\n", " ").replace("  ", " ")
        contexts_cleaned.append(context_cleaned)
    
    combined_context = " ".join(contexts_cleaned)
    
    # Ensure combined context does not exceed max length
    max_length = 512
    combined_tokens = tokenizer.encode(combined_context)
    if len(combined_tokens) > max_length:
        combined_tokens = combined_tokens[:max_length]
        combined_context = tokenizer.decode(combined_tokens, skip_special_tokens=True)
    
    input_text = f"Context: {combined_context}\n\nQuestion: {query}\n\nAnswer (please format as a timeline):"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(inputs.input_ids, max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Function to run experiments with different top_k values
def run_experiments(query):
    for top_k in [3, 5, 7, 10]:
        print(f"Running experiment with top_k={top_k}")
        answer = generate_answer(query, top_k)
        print(f"Answer with top_k={top_k}:\n{answer}\n")

# Test the system with the sample question
query = "What were the major historical periods of Nauru and their respective timelines?"
run_experiments(query)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Running experiment with top_k=3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (947 > 512). Running this sequence through the model will result in indexing errors


Answer with top_k=3:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Running experiment with top_k=5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=5:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Running experiment with top_k=7


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=7:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Running experiment with top_k=10


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=10:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption



# **Flan-T5-XL + tas-b**

In [5]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util

# Load infoboxes
# Update the file path to the location where you uploaded the file in Kaggle
with open('/kaggle/input/infoboxes-v1/extracted_infoboxes.json') as f:
    infoboxes = json.load(f)

# Extract relevant text from infoboxes and create a searchable corpus
corpus = []
for item in infoboxes:
    title = item.get("title", "")
    infobox_text = item.get("infobox", "")
    if title and infobox_text:
        corpus.append(f"{title}: {infobox_text}")

# Initialize the retriever model
retriever = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)

# Function to retrieve top-k context
def retrieve_context(query, top_k):
    query_embedding = retriever.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    return [corpus[hit['corpus_id']] for hit in hits[0]]

# Initialize the generative model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-xl")

# Function to generate answer using the retrieved context
def generate_answer(query, top_k):
    contexts = retrieve_context(query, top_k=top_k)
    
    # Clean and format the contexts
    contexts_cleaned = []
    for context in contexts:
        context_cleaned = context.replace("[[", "").replace("]]", "").replace("'''", "").replace("|", ": ")
        context_cleaned = context_cleaned.replace("\n", " ").replace("  ", " ")
        contexts_cleaned.append(context_cleaned)
    
    combined_context = " ".join(contexts_cleaned)
    
    # Ensure combined context does not exceed max length
    max_length = 512
    combined_tokens = tokenizer.encode(combined_context)
    if len(combined_tokens) > max_length:
        combined_tokens = combined_tokens[:max_length]
        combined_context = tokenizer.decode(combined_tokens, skip_special_tokens=True)
    
    input_text = f"Context: {combined_context}\n\nQuestion: {query}\n\nAnswer (please format as a timeline):"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(inputs.input_ids, max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Function to run experiments with different top_k values
def run_experiments(query):
    for top_k in [3, 5, 7, 10]:
        print(f"Running experiment with top_k={top_k}")
        answer = generate_answer(query, top_k)
        print(f"Answer with top_k={top_k}:\n{answer}\n")

# Test the system with the sample question
query = "What were the major historical periods of Nauru and their respective timelines?"
run_experiments(query)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Running experiment with top_k=3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (947 > 512). Running this sequence through the model will result in indexing errors
2024-05-26 11:11:36.319572: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 11:11:36.319670: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 11:11:36.442220: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Answer with top_k=3:
Pre-history : data2 = until 1888 : German Empire: German Rule : data3 = 1888–1919 : Australia trust : data4 = 1920–1967 : Japanese occupation of Nauru: Japanese Rule : data5 = 1942–45 : Republic : data6 = 1968–present

Running experiment with top_k=5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=5:
Pre-history : data2 = until 1888 : German Empire: German Rule : data3 = 1888–1919 : Australia trust : data4 = 1920–1967 : Japanese occupation of Nauru: Japanese Rule : data5 = 1942–45 : Republic : data6 = 1968–present

Running experiment with top_k=7


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=7:
Pre-history : data2 = until 1888 : German Empire: German Rule : data3 = 1888–1919 : Australia trust : data4 = 1920–1967 : Japanese occupation of Nauru: Japanese Rule : data5 = 1942–45 : Republic : data6 = 1968–present

Running experiment with top_k=10


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=10:
Pre-history : data2 = until 1888 : German Empire: German Rule : data3 = 1888–1919 : Australia trust : data4 = 1920–1967 : Japanese occupation of Nauru: Japanese Rule : data5 = 1942–45 : Republic : data6 = 1968–present



# **Flan-T5-large + tas-b + evaluation?**

In [7]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer, util
import torch

# Load infoboxes
with open('/kaggle/input/infoboxes-v1/extracted_infoboxes.json') as f:
    infoboxes = json.load(f)

# Extract relevant text from infoboxes and create a searchable corpus
corpus = []
for item in infoboxes:
    title = item.get("title", "")
    infobox_text = item.get("infobox", "")
    if title and infobox_text:
        corpus.append(f"{title}: {infobox_text}")

# Initialize the retriever model
retriever = SentenceTransformer('sentence-transformers/msmarco-distilbert-base-tas-b')
corpus_embeddings = retriever.encode(corpus, convert_to_tensor=True)

# Function to retrieve top-k context
def retrieve_context(query, top_k):
    query_embedding = retriever.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    return [corpus[hit['corpus_id']] for hit in hits[0]]

# Initialize the generative model
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

# Function to generate answer using the retrieved context
def generate_answer(query, top_k):
    contexts = retrieve_context(query, top_k=top_k)
    
    # Clean and format the contexts
    contexts_cleaned = []
    for context in contexts:
        context_cleaned = context.replace("[[", "").replace("]]", "").replace("'''", "").replace("|", ": ")
        context_cleaned = context_cleaned.replace("\n", " ").replace("  ", " ")
        contexts_cleaned.append(context_cleaned)
    
    combined_context = " ".join(contexts_cleaned)
    
    # Ensure combined context does not exceed max length
    max_length = 512
    combined_tokens = tokenizer.encode(combined_context)
    if len(combined_tokens) > max_length:
        combined_tokens = combined_tokens[:max_length]
        combined_context = tokenizer.decode(combined_tokens, skip_special_tokens=True)
    
    input_text = f"Context: {combined_context}\n\nQuestion: {query}\n\nAnswer (please format as a timeline):"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = model.generate(inputs.input_ids, max_length=200)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

# Function to run experiments with different top_k values
def run_experiments(query):
    results = {}
    for top_k in [3, 5, 7, 10]:
        print(f"Running experiment with top_k={top_k}")
        answer = generate_answer(query, top_k)
        results[top_k] = answer
        print(f"Answer with top_k={top_k}:\n{answer}\n")
    return results

# Function to evaluate performance
def evaluate_performance(results, ground_truth):
    # Implement evaluation metrics: EM, F1, TimeMetric, Completeness
    evaluation_results = {}
    for top_k, answer in results.items():
        # Placeholder for actual evaluation logic
        evaluation_results[top_k] = {"EM": 0, "F1": 0, "TimeMetric": 0, "Completeness": 0}
    return evaluation_results

# Test the system with the sample question
query = "What were the major historical periods of Nauru and their respective timelines?"
results = run_experiments(query)
ground_truth = "Provide the actual ground truth here"  # Placeholder
evaluation_results = evaluate_performance(results, ground_truth)
print(f"Evaluation Results: {evaluation_results}")

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Running experiment with top_k=3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (947 > 512). Running this sequence through the model will result in indexing errors


Answer with top_k=3:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Running experiment with top_k=5


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=5:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Running experiment with top_k=7


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=7:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Running experiment with top_k=10


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Answer with top_k=10:
Pre-history : until 1888 : German Empire : German Rule : 1888–1919 : Australia trust : 1920–1967 : Japanese occupation of Nauru : Japanese Rule : 1942–45 : Republic : 1968–present : Major Events : Phosphate originally found : 1900 : Collapse of phosphate industry : 2002 The Goodies : Infobox : name = : title : The Goodies : image = File:TheGoodies.jpg : 240px : caption

Evaluation Results: {3: {'EM': 0, 'F1': 0, 'TimeMetric': 0, 'Completeness': 0}, 5: {'EM': 0, 'F1': 0, 'TimeMetric': 0, 'Completeness': 0}, 7: {'EM': 0, 'F1': 0, 'TimeMetric': 0, 'Completeness': 0}, 10: {'EM': 0, 'F1': 0, 'TimeMetric': 0, 'Completeness': 0}}
