## Objective of notebook:

- To explore keyword search and retrieval of relevant articles.
- To make the search process faster, we will likely need a mechanism to generate embeddings for the keywords or tags.
- Currently, I am using hierarchical clustering to cluster all the articles.

In [13]:
import ast
import json

import numpy as np
import pandas as pd
import torch
import yaml
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity
from torch import nn

# Load the YAML configuration that sits one directory above this notebook
# (presumably the Gradio app settings, judging by the file name).
# TODO: decide where this load should normally live (e.g. a dedicated function).
# The encoding is given explicitly so the result does not depend on the platform default.
with open("../gradio_config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [24]:
# Explore the current data: paths of the two JSON files that get merged by combine_json.
files = ["../data/test_data/test.json", "../data/test_data/train.json"]
def combine_json(files):
    """Read several JSON files and concatenate their contents into one list.

    Args:
        files: iterable of paths; each file is opened as UTF-8 text and parsed
            with ``json.load``.

    Returns:
        A new list holding the items of every parsed file, in the order the
        files were given.
    """
    def _read(path):
        # Parse a single file and hand back whatever JSON value it contains.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    merged = []
    for path in files:
        merged += _read(path)
    return merged
# Merge the contents of both files into one in-memory list.
db = combine_json(files)

In [25]:
db[0]

 'Title': "'Extremely dangerous' Hurricane Lidia takes aim at Mexico's Pacific coast",
 'embeddings': '[-0.037888, 0.045683, -0.056672, 0.033054, 0.030117, 0.008756, -0.046498, -0.013752, -0.01705, 0.001012, -0.051107, -0.034596, 0.00603, 0.026832, 0.029468, 0.00693, -0.004397, -0.030406, -0.085586, -0.019203, -0.01197, -0.039596, -0.015063, -0.038689, 0.046689, 0.051168, -0.026357, 0.007962, -0.007839, -0.047726, 0.029933, 0.023759, 0.048617, -0.013678, 3e-06, -0.007626, 0.002103, -0.033598, 0.070793, -0.064289, -0.025603, -0.11244, -0.043352, 0.020507, -0.021366, -0.050824, -0.016674, 0.014246, 0.006658, 0.006147, 0.01222, -0.023218, 0.007297, 0.020698, 0.038888, 0.000493, -0.031275, 0.000926, -0.06436, -0.088952, 0.049344, 0.011625, 0.011707, 0.008515, 0.033215, 0.009605, -0.01627, -0.019946, -0.01995, 0.020128, 0.024457, -0.013117, 0.03441, 3.9e-05, -0.009643, -0.001638, -0.01274, -0.037272, 0.015636, -0.000866, 0.015652, 0.027185, 0.016962, 0.006075, -0.005968, 0.045111, -0.005765

In [26]:
db[1]

{'Text': 'COLOMBO – Sri Lanka’s Cabinet approved issuing free tourist visas to visitors from seven countries, including China, India and Russia, a statement issued by the Media Ministry said on Tuesday, to boost tourism and help revive the country’s crisis-hit economy.  Tourists from China, India, Russia, Japan, Thailand, Indonesia, and Malaysia will be issued free visas till March 31, 2024, under a pilot programme, the statement detailing Cabinet decisions said. The scheme is part of attempts by Sri Lanka to boost tourism recovery and hit a target of five million arrivals by 2026, the statement added.  The country of 22 million people, famed for its beaches, ancient temples and aromatic tea, saw its tourism industry pummelled first by the Covid-19 pandemic and then by a severe financial crisis in 2022 that saw mass protests and shortages of essentials such as fuel.  But the tourism industry is seeing a turnaround in 2023, with Sri Lanka clocking a million arrivals by September, for th

In [64]:
# Cosine similarity between two 1-D vectors (dim=0 compares along the only axis).
cos_sim = nn.CosineSimilarity(dim=0)


def _parse_embeddings(raw):
    """Convert a stored embeddings value into a tensor without executing code.

    Strings are parsed with ``ast.literal_eval`` (literals only), so a malformed
    or malicious value raises instead of being evaluated as Python code.
    Values that are already lists are passed through unchanged.
    """
    if isinstance(raw, str):
        raw = ast.literal_eval(raw)
    return torch.tensor(raw)


def get_cosine_tags(test_article, train_article, n_smallest=13):
    """Score two articles by the pairwise cosine similarity of their tag embeddings.

    Every embedding stored under ``'phrase_Bert_tags_embeddings'`` in
    ``test_article`` is compared with every embedding stored under the same key
    in ``train_article``. The similarities are sorted in ascending order and the
    first ``n_smallest`` of them are summed.

    Args:
        test_article: mapping with a ``'phrase_Bert_tags_embeddings'`` entry,
            either a string holding a list-of-lists literal or the list itself.
        train_article: mapping with the same entry.
        n_smallest: how many of the lowest similarities are summed. Defaults to
            13, the value that was previously hard-coded.

    Returns:
        The sum (a Python float, or 0 when there are no pairs) of the
        ``n_smallest`` lowest pairwise cosine similarities.

    Raises:
        KeyError: if either article lacks ``'phrase_Bert_tags_embeddings'``.
        ValueError, SyntaxError: if a string value is not a valid Python literal.

    NOTE(review): the score is built from the *lowest* similarities, and the
    notebook sorts the resulting scores ascending. Confirm this is the intended
    ranking rather than "highest similarity first".
    """
    embeddings_1_tensor = _parse_embeddings(test_article['phrase_Bert_tags_embeddings'])
    embeddings_2_tensor = _parse_embeddings(train_article['phrase_Bert_tags_embeddings'])

    # All test-tag x train-tag pairs, in the same order as the original nested loops.
    cosine_similarities = [
        cos_sim(emb1, emb2).item()
        for emb1 in embeddings_1_tensor
        for emb2 in embeddings_2_tensor
    ]

    # Ascending sort, then keep only the n_smallest lowest values.
    cosine_similarities.sort()
    return sum(cosine_similarities[:n_smallest])

In [41]:
# IDs of the articles whose positions in db are collected into test_indexes.
train_ids = [
    'st_1164990',
    'st_1165295',
    'st_1164227',
    'st_1158236',
    'st_1158485',
    'st_1159581',
    'st_1158779',
    'st_1157082',
    'st_1160289',
]
# ID of the single article whose position is stored in train_index.
REFERENCE_ID = "st_1159793"

# NOTE(review): the names look swapped but are kept because later cells use them:
# db[train_index] becomes test_article, and test_indexes select the articles in train_db.

# Index db once by st_id instead of rescanning the whole list for every wanted id.
indexes_by_id = {}
for i, article in enumerate(db):
    indexes_by_id.setdefault(article['st_id'], []).append(i)

# Same ordering as before: follow train_ids, and within one id follow db order.
test_indexes = [
    position
    for st_id in train_ids
    for position in indexes_by_id.get(st_id, [])
]

# Fail with a clear message instead of a NameError when the reference is absent.
if REFERENCE_ID not in indexes_by_id:
    raise LookupError("Reference article " + REFERENCE_ID + " not found in db")
# The last occurrence wins, matching the original repeated reassignment.
train_index = indexes_by_id[REFERENCE_ID][-1]

print(train_index)
print(test_indexes)

1033
[578, 1932, 413, 1842, 1101, 45, 1097, 539, 1646]


In [50]:
# Pick the article at train_index, then gather the articles at test_indexes
# (in that order) into a list.
test_article = db[train_index]
train_db = [db[position] for position in test_indexes]

In [69]:
# Score every article in train_db against test_article via their tag embeddings,
# keeping the id, title and tags next to each score.
by_tags_records = []
for candidate in train_db:
    by_tags_records.append({
        'id': candidate['st_id'],
        'Title': candidate['Title'],
        'Tags': candidate['tags'],
        'cosine_score': get_cosine_tags(test_article, candidate),
    })

print("Title of test article: " + test_article['Title'])
print("Tags of test article: " + str(test_article['tags']) + "\n")

# Ascending order: the lowest score is printed first.
by_tags_records.sort(key=lambda record: record['cosine_score'])
for record in by_tags_records:
    # Title, score and a blank separator line for each record.
    print(record['Title'], record['cosine_score'], "", sep="\n")

Title of test article: Japan to provide $88 million in additional humanitarian aid to Palestinians
Tags of test article: ['Japan', 'Palestine', 'Humanitarian aid', 'Gaza conflict', 'Two-state solution', 'G7 foreign ministers']

White House suggests 'pauses' in Israel-Hamas conflict to get people out
5.122819602489471

Situation in Gaza growing more desperate by the hour, UN chief warns  
5.498552858829498

Gazans call for truce to be extended, Israelis divided on the issue
5.518810510635376

G-7 foreign ministers support extension of pause in fighting in Gaza
5.554603606462479

Israel PM Netanyahu rejects Gaza ceasefire, says it amounts to ‘surrendering to Hamas’
5.556248426437378

UN bodies make united call for humanitarian ceasefire in Gaza
5.566239207983017

Netanyahu accuses UN of being slow to provide Gaza refugee relief
5.57658314704895

EU continues talks on humanitarian ceasefire in Israel-Hamas war
5.58854615688324

Singapore supports humanitarian aid, calls for civilian lives

In [39]:
# All ten article IDs in one list: 'st_1159793' (the ID looked up for train_index)
# first, followed by the nine IDs that also appear in train_ids.
ids = [
'st_1159793',
'st_1164990',
'st_1165295',
'st_1164227',
'st_1158236',
'st_1158485',
'st_1159581',
'st_1158779',
'st_1157082',
'st_1160289']

In [None]:
# Print the English body text, the flair-fast entities and the positionrank keywords
# of every record in `data` whose id is listed in `ids`.
# NOTE(review): `data` is not defined in any cell visible here — confirm where it is
# loaded before running this on a fresh kernel.
i = 0
for d in data:
    try:
        # Primary id location. Any KeyError raised inside this try block, including
        # one from the prints below, falls through to the `cue` lookup.
        if d['_source']['identification']['drupal']['id'] in ids:
            print(d['_source']['content_metadata']['context']['body_en'])
            print(d['_source']['entities']['body_en']['flair-fast']['content'])  
            print(d['_source']['keywords']['body_en']['positionrank']['content'])
            print()
            # NOTE(review): `i` is incremented for every record (end of the loop body),
            # not per match, so this only stops when the record at position 2 matches
            # via the drupal id. Confirm whether "stop after N matches" was intended.
            if i == 2:
                break
            
    except KeyError:
        # Fallback id location. A KeyError raised in this branch is not caught, and
        # this branch has no early-stop check.
        if d['_source']['identification']['cue']['id'] in ids:
            print(d['_source']['content_metadata']['context']['body_en'])
            print(d['_source']['entities']['body_en']['flair-fast']['content']) 
            print(d['_source']['keywords']['body_en']['positionrank']['content'])
            print() 
    i += 1
  