In [2]:
import os
import ast
import sys
import json
import yaml
import re
from json import JSONDecodeError
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pymongo import MongoClient
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

import gradio as gr
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import cdist

# Normally where to do this? (in which function?)
with open("../gradio_config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Initialise mongo client.
mongo_client = MongoClient(config["database"]["uri"])

In [3]:
def load_mongodb():
    # Connect to the MongoDB client
    try:
        db = mongo_client[config["database"]["name"]]
        train_documents = db[config["database"]["train_collection"]].find()
        print("Train data successfully fetched from MongoDB\n")
    except Exception as error: 
        print(f"Unable to fetch train data from MongoDB. Check your connection the database...\nERROR: {error}\n")
        sys.exit()   
    try:
        test_docs = db[config["database"]["test_collection"]].find()
        print("Test data successfully fetched from MongoDB\n")
    except:
        print(f"Unable to fetch test data from MongoDB. Check your connection the database...\nERROR: {error}\n")
        sys.exit()
    df_train = pd.DataFrame.from_dict(list(train_documents))
    df_test = pd.DataFrame.from_dict(list(test_docs))
    
    return df_train, df_test

def scale_body_embeddings(df, scaler=None):
    print("Processing embedding data and scaling data...\n")
    # Deserializing the embeddings
    body_embeddings = np.array(df['embeddings'].apply(ast.literal_eval).tolist())
    
    if scaler is None:
        # Standardize embeddings for the training data
        scaler = StandardScaler()
        embeddings = scaler.fit_transform(body_embeddings)
        return embeddings, scaler
    else:
        # Standardize embeddings for the test data
        embeddings = scaler.transform(body_embeddings)
        return embeddings

df_train, df_test = load_mongodb()
# Change type of embeddings accordingly
train_embeddings, scaler = scale_body_embeddings(df_train)

# Use the fitted scaler to transform the test embeddings
test_embeddings = scale_body_embeddings(df_test, scaler=scaler)

# Pre computed hierarchical clustering
Z_train = linkage(train_embeddings, method='average', metric='cosine')

Train data successfully fetched from MongoDB

Test data successfully fetched from MongoDB

Processing embedding data and scaling data...

Processing embedding data and scaling data...



In [4]:
test_article_index = 2
if test_embeddings[test_article_index-1].ndim == 1:
    test_point_embeddings = test_embeddings[test_article_index-1].reshape(1, -1)

In [5]:
# for now we will use 0.6
max_d = 0.58
clusters_train = fcluster(Z_train, max_d, criterion='distance')
labels = clusters_train.tolist()
df_train['Cluster_labels'] = labels
print(f"There are {len(set(labels))} clusters in this clustering of max_d = {max_d}\n")

There are 635 clusters in this clustering of max_d = 0.58



In [6]:
# Step 1: Fit the Nearest Neighbors Model
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(train_embeddings)

# Step 2: Find the 5 Nearest Neighbors for Each Test Point
distances, indices = knn.kneighbors(test_point_embeddings)
df_indexes = indices.tolist()

In [7]:
from collections import Counter

assigned_cluster = Counter(df_train.loc[df_indexes[0]]['Cluster_labels'].values.tolist()).most_common(1)[0][0]
assigned_cluster

418

In [8]:
# combine the test article with the articles in the predicted cluster
cluster_df = pd.concat([df_test.iloc[[test_article_index]], df_train[df_train['Cluster_labels'] == assigned_cluster]], axis=0, ignore_index=True)
cluster_df

Unnamed: 0,_id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,article_url,st_id,phrase_Bert_tags_embeddings,Cluster_labels
0,6666ac3f6619e3e180cbbadf,BRUSSELS - European Council president Charles ...,EU leaders to hold emergency virtual summit on...,"[-0.015096, 0.010564, 0.004007, 0.049611, -0.0...",Title: EU leaders to hold emergency virtual su...,"[EU, Emergency summit, Israel-Hamas, Gaza Stri...","[0.002247289987280965, 0.0027428099419921637, ...","[0.010236, -0.079974, -0.027459, 0.040262, -0....",2023-10-15,https://www.straitstimes.com/world/middle-east...,st_1155048,"[[-0.014690160751342773, 0.142982617020607, 0....",
1,6666ac3d6619e3e180cbb35c,BEIJING – The Chinese audience liked the convi...,What did the Chinese get out of Biden-Xi meeti...,"[-0.002229, 0.080196, -0.008359, 0.010345, 0.0...",Title: What did the Chinese get out of Biden-X...,"[US-China Relations, Biden-Xi Meeting, Taiwan ...","[0.013200688175857067, 0.007806617766618729, -...","[0.012667, 0.065951, -0.002377, 0.032497, 0.00...",2023-11-17,https://www.straitstimes.com/asia/east-asia/wh...,st_1162958,"[[-0.10989061743021011, 0.4417957365512848, -0...",418.0
2,6666ac3d6619e3e180cbb4a6,WASHINGTON – The US believes Chinese Defence M...,US believes China Defence Minister Li Shangfu ...,"[0.05121, 0.097314, 0.010616, 0.020364, 0.0419...",Title: US believes China Defence Minister Li S...,"[US, China, Corruption, Foreign Affairs, Milit...","[0.020512491464614868, 0.04342474043369293, 0....","[0.035579, 0.05068, 0.014631, 0.020361, 0.0636...",2023-09-15,https://www.straitstimes.com/world/united-stat...,st_1147812,"[[0.2872277498245239, 0.7776200771331787, 0.68...",418.0
3,6666ac3d6619e3e180cbb4b3,ABOARD AIR FORCE ONE - US President Joe Biden ...,Biden says goal of Xi meeting is to resume US-...,"[0.005888, 0.091609, -0.010379, 0.012567, 0.00...",Title: Biden says goal of Xi meeting is to res...,"[Biden, Xi Jinping, US-China relations, Diplom...","[0.027303652837872505, 0.023961307480931282, -...","[0.015934, 0.050481, 0.006019, 0.007713, 0.027...",2023-11-15,https://www.straitstimes.com/asia/biden-says-g...,st_1162215,"[[-0.29569754004478455, -0.22132907807826996, ...",418.0
4,6666ac3d6619e3e180cbb4e7,BEIJING - China’s top security agency has hint...,Chinese spy agency suggests that a Biden-Xi me...,"[0.02897, 0.096915, -0.007345, 0.025401, 0.017...",Title: Chinese spy agency suggests that a Bide...,"[China, Diplomacy, International Relations, Un...","[0.04063139483332634, 0.04388551414012909, -0....","[0.025174, 0.087059, -0.013848, 0.032197, -0.0...",2023-09-05,https://www.straitstimes.com/asia/chinese-spy-...,st_1145263,"[[-0.43678149580955505, 0.6484148502349854, -1...",418.0
5,6666ac3d6619e3e180cbb56d,MOSCOW – Russian President Vladimir Putin said...,Russia's Putin says he will meet China's Xi soon,"[0.050952, 0.006066, -0.010865, -0.00157, -0.0...",Title: Russia's Putin says he will meet China'...,"[Russia, China, Diplomatic Relations, Ukraine ...","[0.029017142951488495, 0.009823884814977646, -...","[0.025647, -0.009588, -0.03389, 0.014954, 0.02...",2023-09-02,https://www.straitstimes.com/asia/russias-puti...,st_1144580,"[[-0.6540979743003845, 0.2755714952945709, -1....",418.0
6,6666ac3d6619e3e180cbb65b,BEIJING -China's Foreign Minister Wang Yi sai...,China's foreign minister suggests road to Xi-B...,"[0.022126, 0.072078, -0.012084, 0.017353, 0.01...",Title: China's foreign minister suggests road ...,"[China, Diplomacy, Xi-Biden Summit, US-China R...","[0.02889561839401722, 0.013993826694786549, -0...","[0.046906, 0.053139, -0.030886, 0.046352, 0.01...",2023-10-29,https://www.straitstimes.com/world/chinas-fore...,st_1158418,"[[-0.43678149580955505, 0.6484148502349854, -1...",418.0
7,6666ac3d6619e3e180cbb66a,NEW YORK – US and Chinese military officials m...,"US, Chinese military officials meet in Fiji in...","[0.021454, 0.060772, -0.00247, 0.046723, 0.049...","Title: US, Chinese military officials meet in ...","[Fiji, China, United States, Military Engageme...","[0.039296552538871765, 0.011443652212619781, -...","[0.039526, 0.000501, 0.002826, 0.014907, 0.063...",2023-09-01,https://www.straitstimes.com/asia/east-asia/us...,st_1144349,"[[-0.1764063984155655, 0.36731410026550293, -1...",418.0
8,6666ac3d6619e3e180cbb6fb,SAN FRANCISCO - The commerce chiefs of th...,US Commerce chief Raimondo to meet Chinese cou...,"[-0.005543, 0.091566, -0.028505, 0.051399, 0.0...",Title: US Commerce chief Raimondo to meet Chin...,"[US-China relations, APEC, Biden-Xi Jinping me...","[-0.00592433288693428, 0.0367533303797245, -0....","[0.004025, 0.036948, -0.025945, 0.06042, 0.068...",2023-11-14,https://www.straitstimes.com/asia/us-commerce-...,st_1162012,"[[-0.10989061743021011, 0.4417957365512848, -0...",418.0
9,6666ac3d6619e3e180cbb716,China’s leader Xi Jinping flew into San Franci...,What Xi got out of his meeting with Biden,"[-0.002319, 0.068781, -0.017686, 0.021379, 0.0...",Title: What Xi got out of his meeting with Bid...,"[US-China Relations, Taiwan, Trade War, Cold W...","[-0.003666245611384511, 0.0008870568126440048,...","[0.016278, 0.056853, -0.008531, 0.040193, 0.01...",2023-11-20,https://www.straitstimes.com/opinion/what-xi-g...,st_1163492,"[[-0.10989061743021011, 0.4417957365512848, -0...",418.0


In [17]:
s = cluster_df['Text'][0]

In [1]:
# Groq to generate main event for each article
import re
import json
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
chat_model = "llama3-8b-8192"
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
def groq_event(title, text):
    # Define your desired data structure.
    # class summarized_event(BaseModel):
    #     main_event: str = Field(description="Main event of the article")
    
    # parser = JsonOutputParser(pydantic_object=summarized_event)

    chat = ChatGroq(temperature=0, model_name=chat_model)
    
    template = '''
You are a news article editor. Describe the main event of the article below in one short sentence.

Article Title:
{title}
Article Text:
{text}

Answer Format example:
{{"main_event": "Israel-Hamas Conflict and Gaza Crisis"}}
{{"main_event": "Tennis players throw rackets"}}

Before you return the answer, ensure and double check that you have adhered the answer format instructions strictly.
'''
    prompt = PromptTemplate(
        template=template,
        input_variables=["text"],
    )
    
    chain = prompt | chat 
    result = chain.invoke({"title": title, "text": text}).content
    result = json.loads(result)
    return result['main_event']
event = groq_event(s)
event

NameError: name 'load_dotenv' is not defined

In [20]:
cluster_df['Title'][0]

'EU leaders to hold emergency virtual summit on Israel-Hamas conflict on Tuesday  '

## CONTINUE HERE:
- After the generation of the events per article, then use all of it (max 30 articles) to generate a timeline header for the articles. 
- 
- (one concern is finding the proper corresponding date from the events)

In [1]:
# Display the titles so that we can parse it into the re ranker and then get the top result
unranked_titles = df_train.loc[df_indexes[0]]['Text'].values.tolist()
unranked_titles

NameError: name 'df_train' is not defined

In [116]:
test_title = df_test.iloc[test_article_index].Text
test_title

'BRUSSELS - European Council president Charles Michel said on Saturday that he had convened a video conference summit of European Union leaders on Tuesday to discuss the Hamas attacks on Israelis and Israel’s response.Mr Michel said the bloc stood in “full solidarity” with the people of Israel after the “brutal terrorist attacks” of a week ago.In an invitation letter to EU leaders, Mr Michel said Israel had the right to defend itself in compliance with international law.He said the siege of the Gaza Strip was raising alarm bells in the international community, prompting him to convene a video conference meeting on Tuesday at 5.30pm Central European Time (11.30pm Singapore time).“It is of utmost importance that the European Council, in line with the treaties and our values, sets our common position and establishes a clear unified course of action that reflects the complexity of the unfolding situation,” he wrote.Mr Michel said the EU had to be an advocate of peace and respect for intern

In [101]:
# load the desired cross encoder model
from sentence_transformers import CrossEncoder

model_name = "cross-encoder/ms-marco-TinyBERT-L-2-v2"
cross_encoder = CrossEncoder(
    model_name, max_length=512, device="cpu"
)



In [117]:
unranked_docs = [(test_title, doc)for doc in unranked_titles]
unranked_docs

[('BRUSSELS - European Council president Charles Michel said on Saturday that he had convened a video conference summit of European Union leaders on Tuesday to discuss the Hamas attacks on Israelis and Israel’s response.Mr Michel said the bloc stood in “full solidarity” with the people of Israel after the “brutal terrorist attacks” of a week ago.In an invitation letter to EU leaders, Mr Michel said Israel had the right to defend itself in compliance with international law.He said the siege of the Gaza Strip was raising alarm bells in the international community, prompting him to convene a video conference meeting on Tuesday at 5.30pm Central European Time (11.30pm Singapore time).“It is of utmost importance that the European Council, in line with the treaties and our values, sets our common position and establishes a clear unified course of action that reflects the complexity of the unfolding situation,” he wrote.Mr Michel said the EU had to be an advocate of peace and respect for inte

In [118]:
unranked_docs = [(test_title, doc)for doc in unranked_titles]
# Get the scores
scores = cross_encoder.predict(unranked_docs).tolist()

# Zip the documents with the scores
doc_scores = list(zip(unranked_titles, scores))

# Sort the documents by score in descending order
sorted_docs = sorted(doc_scores, key=lambda x: x[1], reverse=True)

# Extract the sorted documents
reranked_titles = [doc for doc, score in sorted_docs]

reranked_titles

['ABOARD AIR FORCE ONE - US President Joe Biden said on Tuesday his goal during talks with Chinese President Xi Jinping this week is to resume normal communications between the two superpowers, including military-to-military contacts.Speaking to reporters at the White House before departing for San Francisco, Mr Biden said he is trying to alter the US relationship with China for the better, after a period of strained ties.Mr Biden and Mr Xi are to meet on Wednesday in the San Francisco area during the Asia-Pacific Economic Cooperation summit. It will be only the second in-person meeting between the two leaders since Mr Biden took office in January 2021.Asked how he would measure success at the talks with Mr Xi, Mr Biden said:“To get back on a normal course, corresponding and being able to pick up the phone and talk to one another in a crisis, and being able to make sure that our militaries still have contact with one another.”Aboard Air Force One en route to San Francisco, White House 

In [119]:
scores

[-9.355789184570312,
 -7.4297027587890625,
 -8.69251537322998,
 -8.382255554199219,
 -9.092162132263184]

In [107]:
test_title

'EU leaders to hold emergency virtual summit on Israel-Hamas conflict on Tuesday  '

In [111]:
df_train.iloc[df_indexes[0]]

Unnamed: 0,_id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,article_url,st_id,phrase_Bert_tags_embeddings,Cluster_labels
740,6666ac3d6619e3e180cbb5f1,"Synopsis: Every fourth Friday of the month, Th...","US cements ties with India, Vietnam and why th...","[0.060281, 0.036623, -0.007349, -0.010667, -0....","Title: US cements ties with India, Vietnam and...","[US, India, Vietnam, G20, East Asia Summit, Di...","[0.024326080456376076, 0.023225942626595497, -...","[0.009447, 0.087782, -0.008752, -0.017337, -0....",2023-09-22,https://www.straitstimes.com/asia/asian-inside...,st_1149633,"[[0.2872277498245239, 0.7776200771331787, 0.68...",416
422,6666ac3d6619e3e180cbb4b3,ABOARD AIR FORCE ONE - US President Joe Biden ...,Biden says goal of Xi meeting is to resume US-...,"[0.005888, 0.091609, -0.010379, 0.012567, 0.00...",Title: Biden says goal of Xi meeting is to res...,"[Biden, Xi Jinping, US-China relations, Diplom...","[0.027303652837872505, 0.023961307480931282, -...","[0.015934, 0.050481, 0.006019, 0.007713, 0.027...",2023-11-15,https://www.straitstimes.com/asia/biden-says-g...,st_1162215,"[[-0.29569754004478455, -0.22132907807826996, ...",418
474,6666ac3d6619e3e180cbb4e7,BEIJING - China’s top security agency has hint...,Chinese spy agency suggests that a Biden-Xi me...,"[0.02897, 0.096915, -0.007345, 0.025401, 0.017...",Title: Chinese spy agency suggests that a Bide...,"[China, Diplomacy, International Relations, Un...","[0.04063139483332634, 0.04388551414012909, -0....","[0.025174, 0.087059, -0.013848, 0.032197, -0.0...",2023-09-05,https://www.straitstimes.com/asia/chinese-spy-...,st_1145263,"[[-0.43678149580955505, 0.6484148502349854, -1...",418
836,6666ac3d6619e3e180cbb651,WASHINGTON - US Secretary of State Antony Blin...,Blinken to tour Asia after latest Middle East ...,"[0.017731, 0.085117, -0.00877, -0.000977, -0.0...",Title: Blinken to tour Asia after latest Middl...,"[Antony Blinken, Asia, Middle East, India, Sou...","[0.03027254343032837, 0.0032462086528539658, -...","[0.013972, -0.017507, -0.012021, 0.006288, -0....",2023-11-02,https://www.straitstimes.com/world/united-stat...,st_1159361,"[[-0.39704984426498413, 0.4320709705352783, -0...",578
846,6666ac3d6619e3e180cbb65b,BEIJING -China's Foreign Minister Wang Yi sai...,China's foreign minister suggests road to Xi-B...,"[0.022126, 0.072078, -0.012084, 0.017353, 0.01...",Title: China's foreign minister suggests road ...,"[China, Diplomacy, Xi-Biden Summit, US-China R...","[0.02889561839401722, 0.013993826694786549, -0...","[0.046906, 0.053139, -0.030886, 0.046352, 0.01...",2023-10-29,https://www.straitstimes.com/world/chinas-fore...,st_1158418,"[[-0.43678149580955505, 0.6484148502349854, -1...",418
