In [13]:
import torch
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
import hashlib
from tqdm import tqdm
import random


In [11]:
# Initialize Elasticsearch client and model
es = Elasticsearch(["http://localhost:9200"])
model = SentenceTransformer("all-MiniLM-L6-v2")
index_name = "documents"

In [12]:
doc_count = es.count(index=index_name)['count']
print(f"Number of documents in the index: {doc_count}")

Number of documents in the index: 29348


In [14]:
# Load topics from a CSV file (assuming the CSV has a column named 'Topic')
topics_df= pd.read_csv('../dataset/TopRelevant_topics.csv')
# Select a random topic
random_topic = random.choice(topics_df['Topic'].to_list())

In [15]:
random_topic

'Are social media platforms doing enough to prevent cyberbullying?'

In [16]:
# Encode the selected topic to get its embedding
topic_embedding = model.encode(random_topic).tolist()

In [19]:
# Elasticsearch query to find relevant documents based on cosine similarity
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
            "params": {"query_vector": topic_embedding}
        }
    }
}
    
# Execute the search query
response = es.search(index=index_name, body={
    "size": 3,  # Fetch top 5 relevant documents
    "query": script_query,
    "_source": ["title", "content"]  # Adjust fields based on your document structure
})

In [21]:
# Extract and print the top 5 relevant documents
print(f"Top 3 relevant documents for the topic '{random_topic}':\n")

for i, hit in enumerate(response['hits']['hits'], start=1):
    doc = hit['_source']
    print(f"{i}. Title: {doc['title']}\n   Content: {doc['content']}\n   Score: {hit['_score']}\n")

Top 3 relevant documents for the topic 'Are social media platforms doing enough to prevent cyberbullying?':

1. Title: a majority of teens have been the target of cyberbullying, with name-calling fand rumor-spreading being the most common forms of harassment % of U.S. teens who say they have experienced online or on their cellphone
   Content: A majority of U.S. teens (59%) have experienced some form of cyberbullying. About four-in-ten teens ages 13 to 17 (42%) say they have been called offensive names online or on their cellphone, 32% say they have had false rumors spread about them and one-quarter report that they have received explicit images they didn’t ask for. At the same time, nine-in-ten teens say online harassment is a problem that affects their peers. And while a majority of teens think parents are doing a good job addressing the issue, they are critical of the way teachers, social media companies and politicians are tackling cyberbullying.
   Score: 1.6866469

2. Title: 
   

In [1]:
# app.py
from flask import Flask, request, jsonify, render_template
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
import pandas as pd
import random

app = Flask(__name__)
es = Elasticsearch(["http://localhost:9200"])
model = SentenceTransformer("all-MiniLM-L6-v2")
index_name = "documents"

# Load topics DataFrame globally
topics_df = pd.read_csv('../dataset/TopRelevant_topics.csv')

@app.route('/')
def index():
    # Render the main page
    return render_template('index.html')

@app.route('/get-random-topic', methods=['GET'])
def get_random_topic():
    random_topic = random.choice(topics_df['Topic'].to_list())
    return jsonify({"topic": random_topic})

@app.route('/search', methods=['POST'])
def search():
    topic = request.json['topic']
    topic_embedding = model.encode(topic).tolist()

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": topic_embedding}
            }
        }
    }

    response = es.search(index=index_name, body={
        "size": 3,  # Fetch top 3 relevant documents
        "query": script_query,
        "_source": ["title", "content"]
    })

    documents = [{"title": hit["_source"]["title"], "content": hit["_source"]["content"], "score": hit["_score"]} for hit in response['hits']['hits']]
    return jsonify(documents)

if __name__ == '__main__':
    app.run(debug=True)

  from .autonotebook import tqdm as notebook_tqdm


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
Traceback (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "/home/mleshashi/thesis-sharma/sraEnv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/home/mleshashi/thesis-sharma/sraEnv/lib/python3.10/site-packages/traitlets/config/application.py", line 1074, in launch_instance
    app.initialize(argv)
  File "/home/mleshashi/thesis-sharma/sraEnv/lib/python3.10/site-packages/traitlets/config/application.py", line 118, in inner
    return method(app, *args, **kwargs)
  File "/home/mleshashi/thesis-sharma/sraEnv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 692, in initialize
    self.init_sockets()
  File "/home/mleshashi/thesis-sharma/sra

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [2]:
%tb

SystemExit: 1