In [1]:
import torch
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import hashlib
from tqdm import tqdm
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Correctly formatted URL for a local Elasticsearch instance
es = Elasticsearch(["http://localhost:9200"])

In [4]:
# Check the health of the cluster
health = es.cluster.health()
print(health)

{'cluster_name': 'elasticsearch', 'status': 'yellow', 'timed_out': False, 'number_of_nodes': 1, 'number_of_data_nodes': 1, 'active_primary_shards': 1, 'active_shards': 1, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 1, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 50.0}


In [5]:
# Ensure the entire text in each cell is displayed without truncation
pd.set_option('display.max_colwidth', None)

# Adjusting display width might not be necessary depending on your pandas version, but it's here for completeness
pd.set_option('display.width', None)

# Load your DataFrame
pew = pd.read_csv('../dataset/pew_dataset/metadata.csv')
statista = pd.read_csv('../dataset/statista_dataset/metadata.csv')
columns = ['title','caption']

# Filtering the DataFrame to include only the specified columns
pew_df = pew[columns]
statista_df = statista[columns]
combined_df = pd.concat([pew_df, statista_df], ignore_index=True)

In [6]:
print(f"Number of rows in pew_dataset: {len(pew_df)}")
print(f"Number of rows in statista_df: {len(statista_df)}")
print(f"Number of rows in statista_df: {len(combined_df)}")

Number of rows in pew_dataset: 1486
Number of rows in statista_df: 27868
Number of rows in statista_df: 29354


In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2")

In [8]:
index_name = "documents"

In [9]:
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body={
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "content": {"type": "text"},
                "embedding": {"type": "dense_vector", "dims": 384}
            }
        }
    })

In [10]:
# Check if the index exists
exists = es.indices.exists(index=index_name)
print(f"Index Exists: {exists}")

# Get index information (settings and mappings)
if exists:
    index_info = es.indices.get(index=index_name)
    print(index_info)

Index Exists: True
{'documents': {'aliases': {}, 'mappings': {'properties': {'content': {'type': 'text'}, 'embedding': {'type': 'dense_vector', 'dims': 384, 'index': True, 'similarity': 'cosine'}, 'title': {'type': 'text'}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'documents', 'creation_date': '1709672865336', 'number_of_replicas': '1', 'uuid': '4U_yVTmYTziNKUy-6RPndg', 'version': {'created': '8500010'}}}}}


In [11]:
# indexing the documents into embeddings using Elasticsearch with its unique ID, ensuring no duplicates are created.
for _, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    # Concatenate title and caption with a space or some delimiter
    combined_text = f"{row['title']}. {row['caption']}"
    
    # Generate embedding for the combined text
    embedding = model.encode(combined_text).tolist()

    # Generate a unique ID for the document using a hash of the title and caption
    unique_id = hashlib.sha256(combined_text.encode('utf-8')).hexdigest()
    
    # Index the document with the combined embedding and use the unique_id as the document ID
    es.index(index=index_name, id=unique_id, body={
        "title": row['title'],
        "content": row['caption'],
        "embedding": embedding
    })

100%|██████████| 29354/29354 [04:32<00:00, 107.65it/s]


In [12]:
doc_count = es.count(index=index_name)['count']
print(f"Number of documents in the index: {doc_count}")

Number of documents in the index: 29348


In [13]:
# Assuming initialization and index creation as before

topics_df= pd.read_csv('../dataset/topics.csv')

# Dictionary to hold total similarity scores for each topic
topic_similarity_scores = {}

for _, row in topics_df.iterrows():
    topic = row['Topic']
    topic_embedding = model.encode(topic).tolist()
    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": topic_embedding}
            }
        }
    }
    
    response = es.search(index=index_name, body={
        "size": 10000,  # You might want to adjust this size based on your needs
        "query": script_query,
        "_source": {"excludes": ["embedding"]}
    })

    # Sum up the scores of all hits for this topic
    total_score = sum(hit['_score'] for hit in response['hits']['hits'])
    topic_similarity_scores[topic] = total_score

In [14]:
# Rank topics by their total similarity score
sorted_topics = sorted(topic_similarity_scores.items(), key=lambda x: x[1], reverse=True)

# Output the top 10 topics based on cumulative similarity scores
print("Top 10 Relevant Topics Based on Similarity Score:")
for topic, score in sorted_topics[:10]:
    print(f"Topic: {topic}, Total Similarity Score: {score}")

Top 10 Relevant Topics Based on Similarity Score:
Topic: Are gas prices too high?, Total Similarity Score: 12017.70943549993
Topic: Are social networking sites good for our society?, Total Similarity Score: 11854.20004799998
Topic: Are social media platforms doing enough to prevent cyberbullying?, Total Similarity Score: 11723.159291899972
Topic: Should government spending be reduced?, Total Similarity Score: 11606.382903299978
Topic: Should prescription drugs be advertised directly to consumers?, Total Similarity Score: 11604.013122200024
Topic: Does legal prostitution increase human trafficking?, Total Similarity Score: 11567.266546400002
Topic: Is capitalism the best form of economy?, Total Similarity Score: 11557.294478599968
Topic: Do violent video games contribute to youth violence?, Total Similarity Score: 11515.799558000026
Topic: Do we need cash?, Total Similarity Score: 11477.134343400057
Topic: Does lowering the federal corporate income tax rate create jobs?, Total Similarit

In [15]:
# Create a DataFrame from the sorted topics
topics_df = pd.DataFrame(sorted_topics[:10], columns=['Topic', 'Total Similarity Score'])

# Specify the directory and filename
save_dir = "../dataset"  # Adjust the path as needed
filename = "TopRelevant_topics.csv"
save_path = os.path.join(save_dir, filename)

# Ensure the directory exists
#os.makedirs(save_dir, exist_ok=True)

# Save the DataFrame to a CSV file
topics_df.to_csv(save_path, index=False)

print(f"Top relevant topics have been saved as a DataFrame to {save_path}")


Top relevant topics have been saved as a DataFrame to ../dataset/TopRelevant_topics.csv
