In [10]:
import torch
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import hashlib
from tqdm import tqdm
import random

In [11]:
# Shared handles for the rest of the notebook: the Elasticsearch client, the
# sentence-embedding model, and the name of the index being queried.
index_name = "documents"

es_hosts = ["http://localhost:9200"]
es = Elasticsearch(es_hosts)

embedding_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(embedding_model_name)

In [3]:
# Ask Elasticsearch how many documents the index currently holds.
count_result = es.count(index=index_name)
doc_count = count_result['count']
print(f"Number of documents in the index: {doc_count}")

Number of documents in the index: 29348


In [4]:
# Read the topic list from CSV; the file is expected to have a 'Topic' column.
topics_df = pd.read_csv('../dataset/TopRelevant_topics.csv')

# Pick one topic at random (unseeded, so the choice differs between runs).
topic_pool = topics_df['Topic'].to_list()
random_topic = random.choice(topic_pool)

In [5]:
# Display the topic picked above so the query being searched for is visible.
random_topic

'Does lowering the federal corporate income tax rate create jobs?'

In [6]:
# Embed the chosen topic with the sentence-transformer model, then convert the
# result to a plain Python list so it can be sent as a JSON query parameter.
topic_vector = model.encode(random_topic)
topic_embedding = topic_vector.tolist()

In [7]:
# Score every document by the cosine similarity between its stored 'embedding'
# vector and the topic embedding. The script adds 1.0 to the similarity
# (presumably to keep scores non-negative, as script_score requires — confirm).
similarity_script = {
    "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
    "params": {"query_vector": topic_embedding},
}
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": similarity_script,
    }
}

# Request the 3 best-scoring documents, returning only the listed source fields.
search_body = {
    "size": 3,
    "query": script_query,
    "_source": ["title", "content"],  # Adjust fields based on your document structure
}
response = es.search(index=index_name, body=search_body)

In [21]:
# Print the ranked hits returned by the similarity search.
hits = response['hits']['hits']

# Report the number of hits actually returned instead of a hard-coded count, so
# the header stays correct if the query's "size" changes or fewer documents match.
print(f"Top {len(hits)} relevant documents for the topic '{random_topic}':\n")

for i, hit in enumerate(hits, start=1):
    doc = hit['_source']
    # Stored titles/contents can carry padding whitespace and newlines (visible
    # in the raw output); strip them so each entry prints on clean lines.
    title = doc['title'].strip()
    content = doc['content'].strip()
    print(f"{i}. Title: {title}\n   Content: {content}\n   Score: {hit['_score']}\n")

Top 3 relevant documents for the topic 'Are social media platforms doing enough to prevent cyberbullying?':

1. Title: a majority of teens have been the target of cyberbullying, with name-calling fand rumor-spreading being the most common forms of harassment % of U.S. teens who say they have experienced online or on their cellphone
   Content: A majority of U.S. teens (59%) have experienced some form of cyberbullying. About four-in-ten teens ages 13 to 17 (42%) say they have been called offensive names online or on their cellphone, 32% say they have had false rumors spread about them and one-quarter report that they have received explicit images they didn’t ask for. At the same time, nine-in-ten teens say online harassment is a problem that affects their peers. And while a majority of teens think parents are doing a good job addressing the issue, they are critical of the way teachers, social media companies and politicians are tackling cyberbullying.
   Score: 1.6866469

2. Title: 
   

In [25]:
from elasticsearch import Elasticsearch
import random

# Fetch and print one randomly chosen document from the index.
es = Elasticsearch(["http://localhost:9200"])
index_name = "documents"

# Let Elasticsearch do the sampling: wrapping match_all in a function_score
# query with random_score gives every document a random score, so the single
# top hit is a random document. This avoids counting the index, paging to a
# random `from` offset, and raising `index.max_result_window` (a persistent
# index setting) just to sample one document; it also keeps working when the
# index grows beyond any fixed result window.
response = es.search(index=index_name, body={
    "size": 1,
    "query": {
        "function_score": {
            "query": {"match_all": {}},
            "random_score": {}  # no seed: a different document on each run
        }
    }
})

# Extract and print the document if available
random_hits = response['hits']['hits']
if random_hits:
    doc = random_hits[0]
    # Stored fields carry padding whitespace/newlines; strip them for display.
    print(f"Title: {doc['_source']['title'].strip()}")
    print(f"Content: {doc['_source']['content'].strip()}")
else:
    print("No documents found.")

Title: 
                        Boat ownership rate by household income in the U.S. in 2013
                    
Content:  The statistic depicts the boat ownership rate by household income in the U.S. in 2013. 9.4 percent of the respondents with an annual household income of greater than 100,000 U.S. dollars own a boat. 
