## Objective of notebook:

- To explore the keyword search and retrieval for the relevant articles.
- To make the search process faster, we likely need some mechanism to generate embeddings for the keywords or tags.
- Currently, I am using hierarchical clustering to cluster all the articles.

In [1]:
import ast
import itertools
import json
import os

import numpy as np
import pandas as pd
import requests
import torch
import yaml
from dotenv import load_dotenv
from torch import nn
from tqdm import trange

# Notebook-wide setup: parse the Gradio YAML config, then pull API settings
# from the environment (after loading a .env file, if one is present).
# Open question from the author: which function should normally own this setup?
with open("../gradio_config.yaml", mode="r") as config_file:
    config = yaml.safe_load(config_file)

load_dotenv()
hf_key = os.environ.get('HUGGINGFACE_API_KEY')
dense_embedder_api = os.environ.get("HF_API_URL")

In [2]:
# Explore the current data: the JSON files that get merged into `db` below.
files = [
    "../data/test_data/test.json",
    "../data/test_data/train.json",
]
def combine_json(files):
    """Read each JSON file in ``files`` and concatenate their contents.

    Args:
        files: Iterable of paths; each file is opened as UTF-8 and parsed
            with ``json.load``.

    Returns:
        One list holding the items of every file, in the order the files
        were given.
    """
    merged = []
    for path in files:
        with open(path, mode='r', encoding='utf-8') as handle:
            # Each file's parsed content is appended item by item.
            merged += json.load(handle)
    return merged
# Merge every file listed in `files` into a single list; the rest of the notebook searches this.
db = combine_json(files)

In [3]:
# Display the first record of db to see which fields each article carries.
db[0]

 'Title': "'Extremely dangerous' Hurricane Lidia takes aim at Mexico's Pacific coast",
 'embeddings': '[-0.037888, 0.045683, -0.056672, 0.033054, 0.030117, 0.008756, -0.046498, -0.013752, -0.01705, 0.001012, -0.051107, -0.034596, 0.00603, 0.026832, 0.029468, 0.00693, -0.004397, -0.030406, -0.085586, -0.019203, -0.01197, -0.039596, -0.015063, -0.038689, 0.046689, 0.051168, -0.026357, 0.007962, -0.007839, -0.047726, 0.029933, 0.023759, 0.048617, -0.013678, 3e-06, -0.007626, 0.002103, -0.033598, 0.070793, -0.064289, -0.025603, -0.11244, -0.043352, 0.020507, -0.021366, -0.050824, -0.016674, 0.014246, 0.006658, 0.006147, 0.01222, -0.023218, 0.007297, 0.020698, 0.038888, 0.000493, -0.031275, 0.000926, -0.06436, -0.088952, 0.049344, 0.011625, 0.011707, 0.008515, 0.033215, 0.009605, -0.01627, -0.019946, -0.01995, 0.020128, 0.024457, -0.013117, 0.03441, 3.9e-05, -0.009643, -0.001638, -0.01274, -0.037272, 0.015636, -0.000866, 0.015652, 0.027185, 0.016962, 0.006075, -0.005968, 0.045111, -0.005765

In [4]:
cos_sim = nn.CosineSimilarity(dim=0)


def _parse_embedding(raw):
    """Turn a stored embedding (string literal or already-parsed list) into a float tensor."""
    if isinstance(raw, str):
        # ast.literal_eval only accepts Python literals, so a corrupted or
        # tampered data file cannot execute code the way eval() would.
        raw = ast.literal_eval(raw)
    return torch.tensor(raw, dtype=torch.float32)


def get_cosine_tags(test_article, train_article, limit):
    """Sum the ``limit`` smallest pairwise cosine similarities between two articles.

    Every embedding stored under ``'phrase_Bert_tags_embeddings'`` in
    ``test_article`` is compared with every one in ``train_article``.

    NOTE(review): this keeps the *smallest* similarities, while the callers in
    this notebook rank articles by this score in descending order. Confirm
    that is intended; the behaviour is left unchanged here.

    Args:
        test_article: Dict with a ``'phrase_Bert_tags_embeddings'`` entry,
            either a string literal of nested lists or the lists themselves.
        train_article: Same structure as ``test_article``.
        limit: How many of the smallest similarities to add up. If fewer
            pairs exist, all of them are summed.

    Returns:
        The sum as a Python float; ``0.0`` when either article has no
        embeddings.

    Raises:
        ValueError, SyntaxError: If a stored string is not a plain literal.
    """
    embeddings_1 = _parse_embedding(test_article['phrase_Bert_tags_embeddings'])
    embeddings_2 = _parse_embedding(train_article['phrase_Bert_tags_embeddings'])

    # No pairs to compare: the original nested loop summed an empty list.
    if embeddings_1.numel() == 0 or embeddings_2.numel() == 0:
        return 0.0

    # A single flat vector is treated as one embedding (assumption; the
    # original loop would have failed on this shape).
    if embeddings_1.dim() == 1:
        embeddings_1 = embeddings_1.unsqueeze(0)
    if embeddings_2.dim() == 1:
        embeddings_2 = embeddings_2.unsqueeze(0)

    # Broadcast (n, 1, d) against (1, m, d) to get the full (n, m) similarity
    # matrix in one call instead of n*m Python-level calls.
    pairwise = nn.functional.cosine_similarity(
        embeddings_1.unsqueeze(1), embeddings_2.unsqueeze(0), dim=-1
    )

    # Sort ascending and keep the `limit` smallest values.
    smallest_first = sorted(pairwise.flatten().tolist())
    return sum(smallest_first[:limit])

In [5]:
# st_ids of the articles to compare against, plus the single query article.
train_ids = [
    'st_1164990',
    'st_1165295',
    'st_1164227',
    'st_1158236',
    'st_1158485',
    'st_1159581',
    'st_1158779',
    'st_1157082',
    'st_1160289',
]
query_id = "st_1159793"

# One pass over db: st_id -> every position it occupies (an id may repeat).
# This replaces rescanning db once per id, and avoids shadowing the builtin `id`.
positions_by_id = {}
for position, article in enumerate(db):
    positions_by_id.setdefault(article['st_id'], []).append(position)

# Keep the order of train_ids; ids that are absent from db are skipped.
test_indexes = [
    position
    for st_id in train_ids
    for position in positions_by_id.get(st_id, [])
]

# Fail with a clear message instead of a NameError at the print below.
if query_id not in positions_by_id:
    raise KeyError(f"query article {query_id!r} not found in db")
# When the id occurs more than once the last occurrence wins, as before.
train_index = positions_by_id[query_id][-1]

print(train_index)
print(test_indexes)

1033
[578, 1932, 413, 1842, 1101, 45, 1097, 539, 1646]


In [6]:
# db[train_index] is the query article; the db rows at test_indexes form the candidate pool.
test_article = db[train_index]
train_db = [db[index] for index in test_indexes]

In [7]:
# Score every article in db against the test article and report the best matches.
def get_similar_by_PBtags(test_article, db, limit=7, top_k=10):
    """Rank the articles in ``db`` by their ``get_cosine_tags`` score against ``test_article``.

    Args:
        test_article: Query article dict; its ``'Title'`` and ``'tags'`` are
            printed, and it is passed to ``get_cosine_tags``.
        db: Sequence of article dicts providing ``'st_id'``, ``'Title'`` and
            ``'tags'`` (plus whatever ``get_cosine_tags`` reads).
        limit: Forwarded to ``get_cosine_tags``; defaults to the previously
            hard-coded 7.
        top_k: Number of best-scoring records to print and return; defaults
            to the previously hard-coded 10.

    Returns:
        Up to ``top_k`` dicts with keys ``'id'``, ``'Title'``, ``'Tags'`` and
        ``'cosine_score'``, highest score first.
    """
    print("Computing similarities...\n")
    by_tags_records = []
    for i in trange(len(db)):
        article = db[i]
        by_tags_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Tags': article['tags'],
            'cosine_score': get_cosine_tags(test_article, article, limit),
        })

    print("Title of test article: " + test_article['Title'])
    print("Tags of test article: " + str(test_article['tags']) + "\n")

    by_tags_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    top_records = by_tags_records[:top_k]
    for record in top_records:
        print(record['Title'])
        print(record['Tags'])
        print(record['cosine_score'])
        print()
    return top_records

def get_similar_by_MPtags(test_article, db, top_k=10):
    """Rank the articles in ``db`` by tag-embedding similarity to ``test_article``.

    NOTE(review): a later cell in this notebook defines ``get_similar_by_MPtags``
    again with the signature ``(test_article, timeline_header, db)``; when the
    notebook runs top to bottom that later definition replaces this one.

    Args:
        test_article: Query article dict; its ``'tags_embeddings'`` entry is
            the query vector, and its ``'Title'`` and ``'tags'`` are printed.
        db: Sequence of article dicts providing ``'st_id'``, ``'Title'`` and
            ``'tags'`` (plus whatever ``get_cosine_tags_mpnet`` reads).
        top_k: Number of best-scoring records to print and return.

    Returns:
        Up to ``top_k`` dicts with keys ``'id'``, ``'Title'``, ``'Tags'`` and
        ``'cosine_score'``, highest score first.
    """
    print("Computing similarities...\n")

    # get_cosine_tags_mpnet expects an embedding as its first argument, not a
    # whole article dict (it feeds that argument straight to torch.tensor).
    # Parse the query article's own tag embedding once, outside the loop.
    test_tags_embedding = test_article['tags_embeddings']
    if isinstance(test_tags_embedding, str):
        test_tags_embedding = ast.literal_eval(test_tags_embedding)

    by_tags_records = []
    for i in trange(len(db)):
        article = db[i]
        by_tags_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Tags': article['tags'],
            'cosine_score': get_cosine_tags_mpnet(test_tags_embedding, article),
        })

    print("Title of test article: " + test_article['Title'])
    print("Tags of test article: " + str(test_article['tags']) + "\n")

    by_tags_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    top_records = by_tags_records[:top_k]
    for record in top_records:
        print(record['Title'])
        print(record['Tags'])
        print(record['cosine_score'])
        print()
    return top_records


In [8]:
def dense_embed(payload: str, timeout: float = 60):
    """POST ``payload`` to the dense-embedder endpoint and return the decoded JSON reply.

    Args:
        payload: Text to embed; sent as the JSON body of the request.
        timeout: Seconds to wait for the service before giving up, so a
            stalled endpoint cannot hang the notebook indefinitely.

    Returns:
        Whatever JSON the endpoint returns. Callers in this notebook pass it
        to ``torch.tensor``, so it is presumably a list of floats — confirm
        against the endpoint.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status,
            instead of handing an error payload on to the callers.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        dense_embedder_api,
        headers={"Authorization": f"Bearer {hf_key}"},
        json=payload,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def get_cosine_titles(timeline_embed, train_article):
    """Cosine similarity between a query embedding and an article's title embedding.

    Args:
        timeline_embed: Query embedding, in a form ``torch.tensor`` accepts.
        train_article: Article dict whose ``'Title_embeddings'`` entry is a
            string literal of the stored embedding.

    Returns:
        The result of ``cos_sim`` on the two tensors.
    """
    # ast.literal_eval only parses literals, so a corrupted or tampered data
    # file cannot execute code the way eval() would.
    title_embed = ast.literal_eval(train_article['Title_embeddings'])
    return cos_sim(torch.tensor(timeline_embed), torch.tensor(title_embed))

def get_similar_by_titles(test_article, timeline_header, db, top_k=20):
    """Rank the articles in ``db`` by title similarity to a timeline header.

    The header is embedded once with ``dense_embed`` and scored against each
    article with ``get_cosine_titles``.

    Args:
        test_article: Article dict; only its ``'Title'`` is read, for the
            printed report.
        timeline_header: Text describing the desired timeline.
        db: Sequence of article dicts providing ``'st_id'``, ``'Title'`` and
            ``'tags'`` (plus whatever ``get_cosine_titles`` reads).
        top_k: Number of best-scoring records to print and return; defaults
            to the previously hard-coded 20.

    Returns:
        Up to ``top_k`` dicts with keys ``'id'``, ``'Title'``, ``'Tags'`` and
        ``'cosine_score'``, highest score first.
    """
    print("Computing similarities...\n")
    timeline_heading_embed = dense_embed(timeline_header)
    by_tags_records = []
    for i in trange(len(db)):
        article = db[i]
        by_tags_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Tags': article['tags'],
            'cosine_score': get_cosine_titles(timeline_heading_embed, article),
        })

    print("Title of test article: " + test_article['Title'])
    print(f"Below are the best articles that are closest to this desired timeline based on the titles: {timeline_header}\n")

    by_tags_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    top_records = by_tags_records[:top_k]
    for record in top_records:
        print(record['Title'])
        print(record['cosine_score'])
        print()
    return top_records

def get_cosine_text(timeline_embed, train_article):
    """Cosine similarity between a query embedding and an article's ``'embeddings'`` entry.

    Args:
        timeline_embed: Query embedding, in a form ``torch.tensor`` accepts.
        train_article: Article dict whose ``'embeddings'`` entry is a string
            literal of the stored embedding.

    Returns:
        The result of ``cos_sim`` on the two tensors.
    """
    # ast.literal_eval only parses literals, so a corrupted or tampered data
    # file cannot execute code the way eval() would.
    text_embed = ast.literal_eval(train_article['embeddings'])
    return cos_sim(torch.tensor(timeline_embed), torch.tensor(text_embed))

def get_similar_by_text(test_article, timeline_header, db, top_k=20):
    """Rank the articles in ``db`` by ``get_cosine_text`` similarity to a timeline header.

    The header is embedded once with ``dense_embed`` and scored against each
    article with ``get_cosine_text``.

    Args:
        test_article: Article dict; only its ``'Title'`` is read, for the
            printed report.
        timeline_header: Text describing the desired timeline.
        db: Sequence of article dicts providing ``'st_id'``, ``'Title'`` and
            ``'tags'`` (plus whatever ``get_cosine_text`` reads).
        top_k: Number of best-scoring records to print and return; defaults
            to the previously hard-coded 20.

    Returns:
        Up to ``top_k`` dicts with keys ``'id'``, ``'Title'``, ``'Tags'`` and
        ``'cosine_score'``, highest score first.
    """
    print("Computing similarities...\n")
    timeline_heading_embed = dense_embed(timeline_header)
    by_tags_records = []
    for i in trange(len(db)):
        article = db[i]
        by_tags_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Tags': article['tags'],
            'cosine_score': get_cosine_text(timeline_heading_embed, article),
        })

    print("Title of test article: " + test_article['Title'])
    print(f"Below are the best articles that are closest to this desired timeline based on the texts: \n{timeline_header}\n")

    by_tags_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    top_records = by_tags_records[:top_k]
    for record in top_records:
        print(record['Title'])
        print(record['cosine_score'])
        print()
    return top_records

def get_cosine_tags_mpnet(timeline_embed, train_article):
    """Cosine similarity between a query embedding and an article's tag embedding.

    Args:
        timeline_embed: Query embedding, in a form ``torch.tensor`` accepts.
        train_article: Article dict whose ``'tags_embeddings'`` entry is a
            string literal of the stored embedding.

    Returns:
        The result of ``cos_sim`` on the two tensors.
    """
    # ast.literal_eval only parses literals, so a corrupted or tampered data
    # file cannot execute code the way eval() would.
    tags_embed = ast.literal_eval(train_article['tags_embeddings'])
    return cos_sim(torch.tensor(timeline_embed), torch.tensor(tags_embed))

def get_similar_by_MPtags(test_article, timeline_header, db, top_k=10):
    """Rank the articles in ``db`` by tag-embedding similarity to a timeline header.

    The header is embedded once with ``dense_embed`` and scored against each
    article with ``get_cosine_tags_mpnet``.

    Args:
        test_article: Article dict; only its ``'Title'`` and ``'tags'`` are
            read, for the printed report.
        timeline_header: Text describing the desired timeline.
        db: Sequence of article dicts providing ``'st_id'``, ``'Title'`` and
            ``'tags'`` (plus whatever ``get_cosine_tags_mpnet`` reads).
        top_k: Number of best-scoring records to print and return; defaults
            to the previously hard-coded 10.

    Returns:
        Up to ``top_k`` dicts with keys ``'id'``, ``'Title'``, ``'Tags'`` and
        ``'cosine_score'``, highest score first.
    """
    print("Computing similarities...\n")
    timeline_heading_embed = dense_embed(timeline_header)

    by_tags_records = []
    for i in trange(len(db)):
        article = db[i]
        by_tags_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Tags': article['tags'],
            'cosine_score': get_cosine_tags_mpnet(timeline_heading_embed, article),
        })

    print("Title of test article: " + test_article['Title'])
    print("Tags of test article: " + str(test_article['tags']) + "\n")

    by_tags_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    top_records = by_tags_records[:top_k]
    for record in top_records:
        print(record['Title'])
        print(record['Tags'])
        print(record['cosine_score'])
        print()
    return top_records

# Free-text description of the timeline to retrieve articles for.
timeline_header = "The Escalating conflict in Israel and Gaza"

# Rank the whole database against the header twice: once through
# get_similar_by_titles and once through get_similar_by_text.
similar_articles_titles = get_similar_by_titles(
    test_article=test_article, timeline_header=timeline_header, db=db
)
# similar_article_mp_tags = get_similar_by_MPtags(test_article, timeline_header, db)
similar_article_text = get_similar_by_text(
    test_article=test_article, timeline_header=timeline_header, db=db
)

Computing similarities...



100%|██████████| 2007/2007 [00:01<00:00, 1149.90it/s]


Title of test article: Japan to provide $88 million in additional humanitarian aid to Palestinians
Below are the best articles that are closest to this desired timeline based on the titles: The Escalating conflict in Israel and Gaza

Israel-Hamas war and the dangers of a creeping occupation
tensor(0.7613)

The Hamas tunnel city beneath Gaza – a hidden front line for Israel
tensor(0.6825)

I negotiated Israel’s hardest hostage deal. Here’s what’s next in Gaza
tensor(0.6524)

The nightmare of delivering aid during this Israel-Hamas war
tensor(0.6482)

Gaza struggles with dead and wounded from intensified Israeli assault
tensor(0.6215)

World reacts to Israel-Hamas war
tensor(0.6205)

Scaling up Gaza aid effort faces tangle of challenges
tensor(0.6166)

Gaza hospital for Palestinians’ medical needs, not Hamas operations: Indonesia
tensor(0.6162)

More questions than answers as Israeli PM Netanyahu seeks security control over Gaza
tensor(0.6153)

Israeli and Hamas fighters in close combat 

100%|██████████| 2007/2007 [00:01<00:00, 1133.42it/s]

Title of test article: Japan to provide $88 million in additional humanitarian aid to Palestinians
Below are the best articles that are closest to this desired timeline based on the texts: 
The Escalating conflict in Israel and Gaza

More questions than answers as Israeli PM Netanyahu seeks security control over Gaza
tensor(0.7321)

Israeli army to confront resilient foe in anticipated Gaza invasion
tensor(0.7204)

Israel-Hamas war and the dangers of a creeping occupation
tensor(0.7076)

As Israel bombards Gaza, bakeries run out of bread, water runs low
tensor(0.6970)

Palestinian Americans fundraise for Gaza, as aid groups receive record donations
tensor(0.6842)

I negotiated Israel’s hardest hostage deal. Here’s what’s next in Gaza
tensor(0.6809)

Israel makes first raids into Gaza; Netanyahu says it is ‘only the beginning’
tensor(0.6670)

While You Were Sleeping: 5 stories you might have missed, Oct 16
tensor(0.6604)

Israel ramps up strikes on Hamas, US urges ‘continuous flow’ of a




In [9]:
def get_titles_str(similar_articles_titles, similar_article_text, unique=False):
    """Interleave the titles of two ranked result lists, text-based hit first.

    For each rank the title from ``similar_article_text`` is emitted, then the
    title from ``similar_articles_titles``. The lists may differ in length:
    once the shorter one runs out, the remaining titles of the longer one
    follow in order. (Indexing both lists by the text list's length raised
    IndexError when the title list was shorter.)

    Args:
        similar_articles_titles: Records (dicts with a ``'Title'`` key) ranked
            by title similarity.
        similar_article_text: Records (dicts with a ``'Title'`` key) ranked by
            text similarity.
        unique: When True, a title that has already been emitted is skipped,
            so an article found by both searches appears once. Defaults to
            False, which keeps duplicates.

    Returns:
        List of title strings.
    """
    combined_titles = []
    seen_titles = set()
    ranked_pairs = itertools.zip_longest(similar_article_text, similar_articles_titles)
    for text_hit, title_hit in ranked_pairs:
        for hit in (text_hit, title_hit):
            if hit is None:
                # zip_longest pads the shorter list with None.
                continue
            title = hit['Title']
            if unique and title in seen_titles:
                continue
            seen_titles.add(title)
            combined_titles.append(title)
    return combined_titles
# Merge the two rankings into one title list and show it under the timeline header.
combined_titles = get_titles_str(
    similar_articles_titles=similar_articles_titles,
    similar_article_text=similar_article_text,
)
print(timeline_header)
combined_titles

The Escalating conflict in Israel and Gaza


['More questions than answers as Israeli PM Netanyahu seeks security control over Gaza',
 'Israel-Hamas war and the dangers of a creeping occupation',
 'Israeli army to confront resilient foe in anticipated Gaza invasion',
 'The Hamas tunnel city beneath Gaza – a hidden front line for Israel',
 'Israel-Hamas war and the dangers of a creeping occupation',
 'I negotiated Israel’s hardest hostage deal. Here’s what’s next in Gaza',
 'As Israel bombards Gaza, bakeries run out of bread, water runs low',
 'The nightmare of delivering aid during this Israel-Hamas war',
 'Palestinian Americans fundraise for Gaza, as aid groups receive record donations',
 'Gaza struggles with dead and wounded from intensified Israeli assault',
 'I negotiated Israel’s hardest hostage deal. Here’s what’s next in Gaza',
 'World reacts to Israel-Hamas war',
 'Israel makes first raids into Gaza; Netanyahu says it is ‘only the beginning’',
 'Scaling up Gaza aid effort faces tangle of challenges',
 'While You Were Slee

## Using a re-ranker model to re-rank these results

In [32]:
from langchain_groq import ChatGroq

# Load environment variables (API keys and the embedder endpoint) from .env / the process environment
load_dotenv()
groq_api_key = os.environ.get('GROQ_API_KEY')
hf_key = os.environ.get('HUGGINGFACE_API_KEY')
pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')
dense_embedder_api = os.environ.get("HF_API_URL")


In [11]:
from sentence_transformers import CrossEncoder
# Cross-encoder used for the re-ranking experiments below; loaded on CPU with max_length=512.
cross_encoder = CrossEncoder(
    "cross-encoder/ms-marco-TinyBERT-L-2-v2",
    max_length=512,
    device="cpu",
)

  from tqdm.autonotebook import tqdm, trange


config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/525 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [27]:
query = "I love you"
documents = ["I like cats", "I love you", "I like dogs"]

# Score each (query, document) pair with the cross-encoder
scores = cross_encoder.predict([(query, doc) for doc in documents]).tolist()

# Pair every document with its score
doc_scores = list(zip(documents, scores))

# Order the pairs from highest to lowest score
sorted_docs = sorted(doc_scores, key=lambda pair: pair[1], reverse=True)

# Keep only the documents, now in re-ranked order
reranked_docs = [doc for doc, _ in sorted_docs]

print(reranked_docs)

['I love you', 'I like cats', 'I like dogs']


In [26]:
# Raw cross-encoder scores, in the same order as `documents`.
scores

[-10.89382553100586, 9.117002487182617, -11.03879451751709]

## Idea for the use of a re ranker with the hybrid search
- The clustering could retrieve around 40 articles about some topic. However, not all of them are relevant.
- The re-ranker would score each retrieved article against the required timeline, and a threshold on that score would be used to filter out the articles that are not relevant.

In [None]:
# st_ids of the articles of interest for the re-ranking experiment.
ids = [
    'st_1159793',
    'st_1164990',
    'st_1165295',
    'st_1164227',
    'st_1158236',
    'st_1158485',
    'st_1159581',
    'st_1158779',
    'st_1157082',
    'st_1160289',
]

In [None]:
# i = 0
# for d in data:
#     try:
#         if d['_source']['identification']['drupal']['id'] in ids:
#             print(d['_source']['content_metadata']['context']['body_en'])
#             print(d['_source']['entities']['body_en']['flair-fast']['content'])  
#             print(d['_source']['keywords']['body_en']['positionrank']['content'])
#             print()
#             if i == 2:
#                 break
            
#     except KeyError:
#         if d['_source']['identification']['cue']['id'] in ids:
#             print(d['_source']['content_metadata']['context']['body_en'])
#             print(d['_source']['entities']['body_en']['flair-fast']['content']) 
#             print(d['_source']['keywords']['body_en']['positionrank']['content'])
#             print() 
#     i += 1
  