In [5]:
import os
import ast
import sys
import json
import yaml
import re
from json import JSONDecodeError
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pymongo import MongoClient


import gradio as gr
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, fcluster

# Import libraries for working with language models and Google Gemini
from langchain_core.prompts import PromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate

# Read configuration
# Populate os.environ from a local .env file when one exists. `load_dotenv`
# was imported but never invoked, so keys kept in .env were never visible to
# the os.environ lookups below. Variables that are already set in the
# environment are left untouched (python-dotenv's default).
load_dotenv()

config_file = '../gradio_config.yaml'
# Explicit encoding so parsing does not depend on the platform's locale default.
with open(config_file, 'r', encoding='utf-8') as fin:
    config = yaml.safe_load(fin)
# end with

# Required secrets: a missing variable raises KeyError here, before any client
# is constructed.
groq_api_key = os.environ['GROQ_API_KEY']
GEMINI_KEY   = os.environ['GEMINI_KEY']
MONGO_URI    = os.environ['MONGO_URI']
genai.configure(api_key=GEMINI_KEY)

# Initialise mongo client
mongo_client = MongoClient(MONGO_URI)

# Setup default LLM model
default_llm = genai.GenerativeModel('gemini-1.5-flash-latest')

In [63]:
def _fetch_documents(collection_key, label):
    """Read every document of one configured collection into a list.

    Args:
        collection_key: Key under ``config["database"]`` that holds the
            collection name (``"train_collection"`` or ``"test_collection"``).
        label: Lower-case word used in the progress/error messages
            (``"train"`` or ``"test"``).

    Returns:
        list: The documents returned by ``find()`` on that collection.

    Raises:
        SystemExit: If the lookup or the read fails; the error is printed first.
    """
    try:
        db = mongo_client[config["database"]["name"]]
        # find() only builds a lazy cursor; list() forces the actual read so
        # that failures are raised (and reported) inside this try block.
        documents = list(db[config["database"][collection_key]].find())
    except Exception as error:
        print(f"Unable to fetch {label} data from MongoDB. Check your connection the database...\nERROR: {error}\n")
        sys.exit()
    print(f"{label.capitalize()} data successfully fetched from MongoDB\n")
    return documents


def load_database():
    """Load the train and test collections from MongoDB into one DataFrame.

    The database and collection names come from ``config["database"]``.
    Train rows come first, followed by test rows. The index is not reset by
    the concatenation, so index labels from the two frames may repeat.

    Returns:
        pandas.DataFrame: Train and test documents stacked row-wise.

    Raises:
        SystemExit: If either collection cannot be read.
    """
    train_documents = _fetch_documents("train_collection", "train")
    test_docs = _fetch_documents("test_collection", "test")
    df_train = pd.DataFrame.from_dict(train_documents)
    df_test = pd.DataFrame.from_dict(test_docs)
    df = pd.concat([df_train, df_test], axis=0)
    return df

def get_text_embeddings(df):
    """Parse the ``embeddings`` column of ``df`` into a NumPy array.

    Each cell of the column is passed through ``ast.literal_eval`` and the
    parsed values are stacked, in row order, into a single array.
    """
    print("Fetching embeddings...\n")
    # Deserialise cell by cell, then let NumPy stack the parsed rows.
    parsed_rows = [ast.literal_eval(raw_value) for raw_value in df['embeddings']]
    return np.array(parsed_rows)

# Distance threshold at which the dendrogram is cut into flat clusters.
max_d = 0.58

# Load every article and parse its stored embedding.
df = load_database()
embeddings = get_text_embeddings(df)

# Pre computed hierarchical clustering: average linkage on cosine distance,
# then a flat cut of the linkage matrix at `max_d`.
Z = linkage(embeddings, metric='cosine', method='average')
cluster_labels = fcluster(Z, t=max_d, criterion='distance')

# Attach the flat cluster id of each row to the frame.
df['Cluster_label'] = cluster_labels

Train data successfully fetched from MongoDB

Test data successfully fetched from MongoDB

Fetching embeddings...



In [64]:
# `st_id` value of the article whose cluster is looked up below.
test_id = "st_1155048"
def get_predicted_cluster(test_id, df):
    """Return the cluster label of the article identified by ``test_id``.

    Args:
        test_id: Value to look for in the ``st_id`` column.
        df: DataFrame with ``st_id`` and ``Cluster_label`` columns.

    Returns:
        The ``Cluster_label`` value of the first row whose ``st_id`` equals
        ``test_id``.

    Raises:
        KeyError: If no row has that ``st_id``. The message names the id
            instead of the opaque ``KeyError: 0`` of a positional lookup.
    """
    matching_labels = df.loc[df['st_id'] == test_id, 'Cluster_label']
    if matching_labels.empty:
        raise KeyError(f"No article with st_id {test_id!r} found in df")
    # Positional access: the frame's index labels are not guaranteed unique.
    return matching_labels.iloc[0]
# Cluster label assigned to the `test_id` article in `df['Cluster_label']`.
predicted_cluster = get_predicted_cluster(test_id, df)

In [65]:
# Rows that share the precomputed cluster of the `test_id` article.
cluster_df = df[df['Cluster_label'] == predicted_cluster]

# Fallback so `pred_cluster_df` is always defined: when the precomputed cluster
# is already small enough it is used as-is, with its label column renamed so
# both branches yield the same columns (label column named `temp_label`).
pred_cluster_df = cluster_df.rename(columns={"Cluster_label": "temp_label"})

if len(cluster_df) > 30:
    # Work on local arrays instead of adding/dropping columns on `df`, so this
    # cell (and the ones before it) can be re-run without a KeyError.
    is_test_row = (df['st_id'] == test_id).to_numpy()
    in_cluster = None
    # Candidate thresholds are scanned upward from 0.4 towards `max_d`; the
    # first cut whose cluster around the test article has at most 30 rows wins.
    # NOTE(review): with average linkage a cluster presumably only grows as the
    # threshold rises, so if the 0.4 cut is already too large no later cut can
    # qualify -- confirm whether a descending scan from `max_d` was intended.
    for threshold in np.arange(0.4, max_d, 0.0001):
        new_cluster_labels = fcluster(Z, threshold, criterion='distance')
        pred_cluster = new_cluster_labels[is_test_row][0]
        in_cluster = new_cluster_labels == pred_cluster
        len_cluster_df = int(in_cluster.sum())
        print(len_cluster_df)
        if len_cluster_df <= 30:
            break
    if in_cluster is not None:
        # Same shape as before: original columns minus `Cluster_label`, plus
        # the labels of the last evaluated cut as `temp_label`.
        pred_cluster_df = (
            df.drop(columns="Cluster_label")[in_cluster]
            .assign(temp_label=new_cluster_labels[in_cluster])
        )
pred_cluster_df

6


Unnamed: 0,_id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,article_url,st_id,phrase_Bert_tags_embeddings,temp_label
198,6666ac3d6619e3e180cbb3d3,NICOSIA/PARIS/BRUSSELS - World powers mee...,"Maritime corridor, floating hospitals for Gaza...","[-0.011251, 0.05515, -0.019199, 0.036688, -0.0...","Title: Maritime corridor, floating hospitals f...","[Gaza Strip, Humanitarian Aid, Maritime Corrid...","[0.0033435772638767958, 0.024307824671268463, ...","[-0.030322, 0.047287, -0.015469, -0.032054, -0...",2023-11-07,https://www.straitstimes.com/world/europe/mari...,st_1160613,"[[0.1662367582321167, 0.05175376310944557, -1....",693
892,6666ac3d6619e3e180cbb689,PARIS – France will host an international huma...,France to host international conference for Ga...,"[0.005599, 0.060432, 0.008109, 0.067439, -0.05...",Title: France to host international conference...,"[France, Gaza, Humanitarian Aid, International...","[0.009617969393730164, 0.050256144255399704, 0...","[0.008851, 0.056725, 0.009956, 0.057874, -0.04...",2023-11-04,https://www.straitstimes.com/asia/france-to-ho...,st_1159796,"[[-0.47271475195884705, -0.2090330868959427, -...",693
1106,6666ac3d6619e3e180cbb75f,"MAGEN, Israel - EU foreign policy chief J...",EU's Borrell tells Israel: 'One horror doesn’t...,"[-0.017135, 0.014011, 0.02071, 0.042722, -0.04...",Title: EU's Borrell tells Israel: 'One horror ...,"[Israel, Gaza, Hamas, European Union, Josep Bo...","[-0.004307456314563751, 0.03101140446960926, 0...","[-0.02887, 0.031537, 0.017549, -0.008952, -0.0...",2023-11-16,https://www.straitstimes.com/asia/eus-borrell-...,st_1162690,"[[0.12629008293151855, 0.9903199076652527, -1....",693
1389,6666ac3d6619e3e180cbb87a,"MARRAKECH, Morocco - Global finance leade...","IMF, World Bank 'impotent' on Israel-Gaza war ...","[0.028992, 0.046687, 0.013369, -0.000236, -0.0...","Title: IMF, World Bank 'impotent' on Israel-Ga...","[IMF, World Bank, Israel-Gaza conflict, Econom...","[-0.010194081813097, 0.030959995463490486, -0....","[0.018668, 0.062374, 0.006034, -0.039918, -0.0...",2023-10-16,https://www.straitstimes.com/asia/imf-world-ba...,st_1155311,"[[-0.3626147508621216, 0.14135390520095825, -0...",693
1770,6666ac3d6619e3e180cbb9f7,EU countries are still discussing the idea of ...,EU continues talks on humanitarian ceasefire i...,"[0.034244, 0.034936, 0.002737, 0.039488, -0.05...",Title: EU continues talks on humanitarian ceas...,"[EU, Ceasefire, Israel-Hamas conflict, Gaza, S...","[0.004376180469989777, 0.021570585668087006, -...","[-0.011841, 0.008162, -0.005965, 0.006027, -0....",2023-10-23,https://www.straitstimes.com/asia/eu-continues...,st_1157082,"[[-0.014690160751342773, 0.142982617020607, 0....",693
2,6666ac3f6619e3e180cbbadf,BRUSSELS - European Council president Charles ...,EU leaders to hold emergency virtual summit on...,"[-0.015096, 0.010564, 0.004007, 0.049611, -0.0...",Title: EU leaders to hold emergency virtual su...,"[EU, Emergency summit, Israel-Hamas, Gaza Stri...","[0.002247289987280965, 0.0027428099419921637, ...","[0.010236, -0.079974, -0.027459, 0.040262, -0....",2023-10-15,https://www.straitstimes.com/world/middle-east...,st_1155048,"[[-0.014690160751342773, 0.142982617020607, 0....",693
