In [2]:
import torch
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import hashlib
from tqdm import tqdm


In [3]:
# Correctly formatted URL for a local Elasticsearch instance
es = Elasticsearch(["http://localhost:9200"])

In [4]:
# Check the health of the cluster
health = es.cluster.health()
print(health)

{'cluster_name': 'elasticsearch', 'status': 'yellow', 'timed_out': False, 'number_of_nodes': 1, 'number_of_data_nodes': 1, 'active_primary_shards': 1, 'active_shards': 1, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 1, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 50.0}


In [5]:
# Ensure the entire text in each cell is displayed without truncation
pd.set_option('display.max_colwidth', None)

# Adjusting display width might not be necessary depending on your pandas version, but it's here for completeness
pd.set_option('display.width', None)

# Load your DataFrame
pew = pd.read_csv('../dataset/pew_dataset/metadata.csv')
statista = pd.read_csv('../dataset/statista_dataset/metadata.csv')
columns = ['title','caption']

# Filtering the DataFrame to include only the specified columns
pew_df = pew[columns]
statista_df = statista[columns]
combined_df = pd.concat([pew_df, statista_df], ignore_index=True)

In [6]:
print(f"Number of rows in pew_dataset: {len(pew_df)}")
print(f"Number of rows in statista_df: {len(statista_df)}")
print(f"Number of rows in statista_df: {len(combined_df)}")

Number of rows in pew_dataset: 1486
Number of rows in statista_df: 27868
Number of rows in statista_df: 29354


In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
index_name = "documents"

In [9]:
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "content": {"type": "text"},
                "embedding": {"type": "dense_vector", "dims": 384}
            }
        }
    })

In [10]:
# Check if the index exists
exists = es.indices.exists(index=index_name)
print(f"Index Exists: {exists}")

# Get index information (settings and mappings)
if exists:
    index_info = es.indices.get(index=index_name)
    print(index_info)

Index Exists: True
{'documents': {'aliases': {}, 'mappings': {'properties': {'content': {'type': 'text'}, 'embedding': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine'}, 'title': {'type': 'text'}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'documents', 'creation_date': '1709672865336', 'number_of_replicas': '1', 'uuid': '4U_yVTmYTziNKUy-6RPndg', 'version': {'created': '8500010'}}}}}


In [11]:
# indexing the documents into embeddings using Elasticsearch with its unique ID, ensuring no duplicates are created.
for _, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    # Concatenate title and caption with a space or some delimiter
    combined_text = f"{row['title']}. {row['caption']}"
    
    # Generate embedding for the combined text
    embedding = model.encode(combined_text).tolist()

    # Generate a unique ID for the document using a hash of the title and caption
    unique_id = hashlib.sha256(combined_text.encode('utf-8')).hexdigest()
    
    # Index the document with the combined embedding and use the unique_id as the document ID
    es.index(index=index_name, id=unique_id, body={
        "title": row['title'],
        "content": row['caption'],
        "embedding": embedding
    })

100%|██████████| 29354/29354 [05:01<00:00, 97.40it/s] 


In [12]:
doc_count = es.count(index=index_name)['count']
print(f"Number of documents in the index: {doc_count}")

Number of documents in the index: 29348


In [21]:
import random
# Load topics from a CSV file (assuming the CSV has a column named 'Topic')
topics_df= pd.read_csv('../dataset/topics.csv')
# Select a random topic
random_topic = random.choice(topics_df['Topic'].to_list())

In [26]:
random_topic='Are gas prices too high?'

In [27]:
# Encode the selected topic to get its embedding
topic_embedding = model.encode(random_topic).tolist()

In [28]:
# Elasticsearch query to find relevant documents based on cosine similarity
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
            "params": {"query_vector": topic_embedding}
        }
    }
}
    
# Execute the search query
response = es.search(index=index_name, body={
    "size": 6,  # Fetch top 5 relevant documents
    "query": script_query,
    "_source": ["title", "content"]  # Adjust fields based on your document structure
})

In [29]:
# Extract and print the top 5 relevant documents
print(f"Top 5 relevant documents for the topic '{random_topic}':\n")

for i, hit in enumerate(response['hits']['hits'], start=1):
    doc = hit['_source']
    print(f"{i}. Title: {doc['title']}\n   Content: {doc['content']}\n   Score: {hit['_score']}\n")

Top 5 relevant documents for the topic 'Are gas prices too high?':

1. Title: 
                        Average quarterly retail price of regular gasoline in the United States between 1st quarter of 2015 and 1st quarter of 2020 (in U.S. dollars per gallon)*
                    
   Content:  A glut in oil supply between 2014 and 2016 forced down prices and led to a low average U.S. gasoline price of roughly 1.9 U.S. dollars per gallon in the first quarter. Gasoline prices fluctuated considerably between 2019 and 2020 as a result of tensions between the United States and other oil exporters, such as Iran, and stifling oil demand during the COVID-19 pandemic. The price of West Texas Intermediate briefly dipped in the negative in April 2020. Seasonal price variations There are periodic fluctuations in gasoline prices in the United States, where the second and third quarters are typically more expensive than the rest of the year. One of the factors contributing to changing gasoline prices is