In [5]:
import json
import yaml
import os
import sys
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import torch
import requests
from torch import nn
from tqdm import trange
from pymongo import MongoClient
from datetime import datetime


# Groq to generate main event for each article
import re
import json
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import PromptTemplate
from langchain_core.prompts import PromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_groq import ChatGroq
from sentence_transformers import CrossEncoder
from langchain_core.prompts import PromptTemplate

# ---------------------------------------------------------------------------
# Notebook-wide configuration: API keys, endpoints, YAML config, Mongo client.
# Everything below is read once, at cell execution time.
# ---------------------------------------------------------------------------
chat_model = "llama3-8b-8192"

# Pull variables from a local .env file into the process environment before
# any of the os.getenv / os.environ lookups below.
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

# Credentials / endpoint used by dense_embed().
hf_key = os.getenv('HUGGINGFACE_API_KEY')
dense_embedder_api = os.getenv("HF_API_URL")

# Normally where to do this? (in which function?)
# The YAML config is parsed exactly once here (it used to be loaded twice).
with open("../gradio_config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Initialise mongo client.
mongo_client = MongoClient(config["database"]["uri"])

In [15]:
# Paths of the JSON dumps that combine_json() merges into one in-memory list.
# Nothing is read from disk here; loading happens when combine_json(files) is called.
files = ["../data/test_data/test.json", "../data/test_data/train.json"]
def combine_json(files):
    """Read every JSON file in ``files`` and concatenate their contents.

    Args:
        files: Iterable of paths to UTF-8 encoded JSON files.

    Returns:
        A single list holding the items of each file, in the order the
        files were given.
    """
    merged = []
    for path in files:
        with open(path, 'r', encoding='utf-8') as handle:
            # Each file contributes its items to the running list.
            merged.extend(json.load(handle))
    return merged

def _parse_llm_json(output):
    """Decode the JSON object carried by the first part of an LLM response.

    Only a Markdown code fence wrapped *around* the payload (an opening
    fence, optionally tagged ``json``, and a closing fence) is removed.
    The payload itself is left untouched, so the word "json" or backticks
    inside string values are preserved.

    Args:
        output: Response object exposing ``parts[0].text``.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.
    """
    raw_text = output.parts[0].text.strip()
    # Drop a leading fence such as ``` or ```json (case-insensitive tag).
    raw_text = re.sub(r"^```(?:json)?\s*", "", raw_text, flags=re.IGNORECASE)
    # Drop a trailing closing fence.
    raw_text = re.sub(r"\s*```$", "", raw_text)
    return json.loads(raw_text)


def clean_llm_score(output):
    """Parse the JSON verdict (score and reason) returned by the LLM."""
    return _parse_llm_json(output)


def clean_llm_output(output):
    """Parse a JSON reply returned by the LLM."""
    return _parse_llm_json(output)

def format_timeline_date(date_str):
    """Convert an ISO-like date string into a human-readable form.

    Supported inputs and their output:
        ``YYYY``        -> ``YYYY``
        ``YYYY-MM``     -> ``<Month name> YYYY``
        ``YYYY-MM-DD``  -> ``DD <Month name> YYYY``

    Trailing ``-XX`` placeholders are treated as "unknown", so
    ``YYYY-MM-XX`` is handled like ``YYYY-MM`` and ``YYYY-XX-XX`` like
    ``YYYY``. This matches the partial-date format that the event
    extraction prompt asks the model to emit.

    Args:
        date_str: The date string to format.

    Returns:
        The formatted date, or ``None`` when the input is not a string or
        does not match any supported format.
    """
    if not isinstance(date_str, str):
        return None

    # Strip unknown-component placeholders ("-XX") from the end.
    cleaned = re.sub(r"(-XX)+$", "", date_str.strip(), flags=re.IGNORECASE)

    # (parse format, display format), tried in the original order.
    format_pairs = (
        ('%Y', '%Y'),
        ('%Y-%m-%d', '%d %B %Y'),
        ('%Y-%m', '%B %Y'),
    )
    for parse_fmt, display_fmt in format_pairs:
        try:
            return datetime.strptime(cleaned, parse_fmt).strftime(display_fmt)
        except ValueError:
            continue
    return None

def to_generate_timeline(test_article):
    """Ask the LLM whether a timeline is worth generating for an article.

    The article's title and text are inserted into a scoring prompt. The
    model replies with a JSON object holding a ``score`` (1-5) and a
    ``Reason``. A score of 3 or more means a timeline should be built.

    Args:
        test_article: Mapping with at least the keys ``"Title"`` and ``"Text"``.

    Returns:
        A 2-tuple ``(needed, message)``:
        ``(True, None)`` when a timeline is needed, otherwise
        ``(False, explanation)`` where ``explanation`` is a printable string.
        Callers must unpack the tuple; the tuple itself is always truthy.
    """
    print("Evaluating necessity of Timeline for this article.\n")
    llm = genai.GenerativeModel('gemini-1.5-flash-latest')

    class Event(BaseModel):
        score: int = Field(description="The need for this article to have a timeline")
        Reason: str = Field(description = "The main reason for your choice why a timeline is needed or why it is not needed")

    output_parser = JsonOutputParser(pydantic_object=Event)

    # JSON-schema instructions appended to the prompt so the reply is parseable.
    format_instructions = output_parser.get_format_instructions()

    # Define the template
    template = '''
    You are a highly intelligent AI tasked with analyzing articles to determine whether generating a timeline of events leading up to the key event in the article would be beneficial. 
    Consider the following factors to make your decision:
    1. **Significance of the Event**:
       - Does the event have a significant impact on a large number of people, industries, or countries?
       - Are the potential long-term consequences of the event important?

    2. **Controversy or Debate**:
       - Is the event highly controversial or has it sparked significant debate?
       - Has the event garnered significant media attention and public interest?

    3. **Complexity**:
       - Does the event involve multiple factors, stakeholders, or causes that make it complex?
       - Does the event have deep historical roots or is it the culmination of long-term developments?

    4. **Personal Relevance**:
       - Does the event directly affect the reader or their community?
       - Is the event of particular interest to the reader due to economic implications, political affiliations, or social issues?

    5. Educational Purposes:
       - Would a timeline provide valuable learning or research information?

    Here is the information for the article:
    Title:{title}
    Text: {text}

    Based on the factors above, decide whether generating a timeline of events leading up to the key event in this article would be beneficial. 
    Your answer will include the need for this article to have a timeline with a score 1 - 5, 1 means unnecessary, 5 means necessary. It will also include the main reason for your choice.
    {format_instructions}    
    ANSWER:
    '''

    # Create the prompt template
    prompt = PromptTemplate(
        input_variables=["text", "title"],
        partial_variables={"format_instructions": format_instructions},
        template=template,
    )

    headline = test_article["Title"]
    body = test_article["Text"]

    # Format the prompt
    final_prompt = prompt.format(title=headline, text=body)

    # Generate content using the generative model
    response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        }
    )
    final_response = clean_llm_score(response)

    # The model occasionally returns the score as a string ("4"); coerce it
    # so the threshold comparison below cannot raise a TypeError.
    score = int(final_response['score'])
    reason = final_response['Reason']

    # If LLM approves
    if score >= 3:
        print("Timeline is necessary for this chosen article.\n")
        return True, None

    print("A timeline for this article is not required. \n")
    # Print the reason one sentence per paragraph. The previous
    # replace(". ", ".") + split(". ") combination never produced a split.
    for part in re.split(r"(?<=\.)\s+", reason):
        print(f"{part}\n")
    print("Hence I gave this a required timeline score of " + str(score))
    output_error = "A timeline for this article is not required. \n" \
                + "\n" + reason + "\n" + "\nHence this timeline received a necessity score of " \
                + str(score) + "\n"
    return False, output_error

# Build a generalised timeline heading from an article title.
def groq_header(title):
    """Ask the Gemini model for a generic timeline header for ``title``.

    Args:
        title: Title of the article the timeline is built around.

    Returns:
        The first value of the JSON object returned by the model.
    """
    gemini = genai.GenerativeModel('gemini-1.5-flash-latest')

    # Schema of the expected reply. The class name is left as-is so the
    # generated format instructions cannot change.
    class timeline_headaer(BaseModel):
        timeline_header: str = Field(description="Suitable header of a timeline for this article")

    header_parser = JsonOutputParser(pydantic_object=timeline_headaer)

    header_template = '''
I would like to create a timeline of events based on the title of an article.
Given a list of article titles below, you are tasked with creating an extremely generalised, suitable name for a timeline for this article that will provide a reader contextual information about a timeline of events regarding the article.
The header should be something that can be generalised to other similar articles.
For instance, if a title is "S’pore Red Cross gives $270k worth of relief aid to victims of Hamas-Israel war in Gaza", the header should be "Relief Aid for Gaza Conflict Victims"

Article Title:
{title}

{format_instructions}
Before you return the answer, ensure and double check that you have adhered the answer format instructions strictly.
'''
    header_prompt = PromptTemplate(
        template=header_template,
        input_variables=["title"],
        partial_variables={"format_instructions": header_parser.get_format_instructions()},
    )
    rendered_prompt = header_prompt.format(title=title)

    # Every listed harm category is set to BLOCK_NONE.
    no_blocking = {
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }
    reply = gemini.generate_content(rendered_prompt, safety_settings=no_blocking)

    parsed_reply = clean_llm_output(reply)
    # Take the first value regardless of the key the model used.
    header_values = list(parsed_reply.values())
    return header_values[0]

# Query the hosted dense embedder for an embedding of a piece of text.
def dense_embed(payload: str, timeout: float = 60.0) -> list:
    """POST ``payload`` to the dense-embedder endpoint and return its JSON reply.

    Args:
        payload: Text to embed; sent as the JSON body of the request.
        timeout: Seconds to wait for the endpoint before giving up.

    Returns:
        The decoded JSON body of the response. Callers pass it straight to
        ``torch.tensor`` — presumably a flat list of floats; confirm against
        the configured endpoint.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status,
            instead of letting an error payload flow into the similarity code.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    response = requests.post(
        dense_embedder_api,
        headers={"Authorization": f"Bearer {hf_key}"},
        json=payload,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def get_cosine_text(timeline_embed, train_article):
    """Cosine similarity between a query embedding and an article's text embedding.

    Args:
        timeline_embed: Query embedding, convertible by ``torch.tensor``.
        train_article: Mapping whose ``'embeddings'`` entry is a string
            holding a Python literal of the stored embedding.

    Returns:
        A tensor holding the cosine similarity computed along dim 0.
    """
    query_vector = torch.tensor(timeline_embed)
    # NOTE(review): eval() executes whatever the stored string contains.
    # If the data files are not fully trusted, switch to
    # ast.literal_eval / json.loads.
    stored_vector = torch.tensor(eval(train_article['embeddings']))
    return nn.functional.cosine_similarity(query_vector, stored_vector, dim=0)

def get_cosine_titles(timeline_embed, train_article):
    """Cosine similarity between a query embedding and an article's title embedding.

    Args:
        timeline_embed: Query embedding, convertible by ``torch.tensor``.
        train_article: Mapping whose ``'Title_embeddings'`` entry is a string
            holding a Python literal of the stored embedding.

    Returns:
        A tensor holding the cosine similarity computed along dim 0.
    """
    query_vector = torch.tensor(timeline_embed)
    # NOTE(review): eval() executes whatever the stored string contains.
    # If the data files are not fully trusted, switch to
    # ast.literal_eval / json.loads.
    stored_vector = torch.tensor(eval(train_article['Title_embeddings']))
    return nn.functional.cosine_similarity(query_vector, stored_vector, dim=0)

def get_similar_by_text(test_article, timeline_header, db, top_k=20):
    """Rank articles by similarity between the timeline header and article text.

    The header is embedded once with ``dense_embed`` and compared with each
    article's stored text embedding through ``get_cosine_text``.

    Args:
        test_article: The article the timeline is built for; only its
            ``'Title'`` is used, for logging.
        timeline_header: Header text to embed and compare against.
        db: Sequence of article mappings with the keys ``'st_id'``,
            ``'Title'``, ``'Text'``, ``'Publication_date'``,
            ``'article_url'`` and ``'embeddings'``.
        top_k: Maximum number of records to return (defaults to 20, the
            previously hard-coded value).

    Returns:
        Up to ``top_k`` dicts with keys ``id``, ``Title``, ``Text``, ``Date``,
        ``Article_URL`` and ``cosine_score``, sorted by descending score.
    """
    print("Title of test article: " + test_article['Title'] + "\n")
    print("Computing similarities between article texts...")
    timeline_heading_embed = dense_embed(timeline_header)

    scored_records = []
    for i in trange(len(db)):
        article = db[i]
        scored_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Text': article['Text'],
            'Date': article['Publication_date'],
            'Article_URL': article['article_url'],
            'cosine_score': get_cosine_text(timeline_heading_embed, article),
        })

    scored_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    print()
    # Keep only the top_k most similar articles.
    return scored_records[:top_k]

def get_similar_by_titles(timeline_header, db, top_k=20):
    """Rank articles by similarity between the timeline header and article titles.

    The header is embedded once with ``dense_embed`` and compared with each
    article's stored title embedding through ``get_cosine_titles``.

    Args:
        timeline_header: Header text to embed and compare against.
        db: Sequence of article mappings with the keys ``'st_id'``,
            ``'Title'``, ``'Text'``, ``'Publication_date'``,
            ``'article_url'`` and ``'Title_embeddings'``.
        top_k: Maximum number of records to return (defaults to 20, the
            previously hard-coded value).

    Returns:
        Up to ``top_k`` dicts with keys ``id``, ``Title``, ``Text``, ``Date``,
        ``Article_URL`` and ``cosine_score``, sorted by descending score.
    """
    print("Computing similarities between article titles...")
    timeline_heading_embed = dense_embed(timeline_header)

    scored_records = []
    for i in trange(len(db)):
        article = db[i]
        scored_records.append({
            'id': article['st_id'],
            'Title': article['Title'],
            'Text': article['Text'],
            'Date': article['Publication_date'],
            'Article_URL': article['article_url'],
            'cosine_score': get_cosine_titles(timeline_heading_embed, article),
        })

    scored_records.sort(key=lambda record: record['cosine_score'], reverse=True)
    # Keep only the top_k most similar articles.
    return scored_records[:top_k]


# Combine the similar articles retrieved by text and title embeddings
def combine_titles(similar_articles_titles, similar_article_text):
    """Interleave two candidate lists and drop duplicate titles.

    Items are taken alternately (text match first, then title match) so the
    best candidates of both lists stay near the front. The lists may have
    different lengths; the leftover tail of the longer one is appended.

    Args:
        similar_articles_titles: Records retrieved by title similarity.
        similar_article_text: Records retrieved by text similarity.

    Returns:
        A list of records with unique ``"Title"`` values, keeping the first
        occurrence of each title.
    """
    combined_similars = []
    longest = max(len(similar_article_text), len(similar_articles_titles))
    for i in range(longest):
        # Guard each side separately so unequal lengths neither raise
        # IndexError nor silently drop records.
        if i < len(similar_article_text):
            combined_similars.append(similar_article_text[i])
        if i < len(similar_articles_titles):
            combined_similars.append(similar_articles_titles[i])

    # Track titles already emitted and keep only the first record for each.
    seen_titles = set()
    unique_list = []
    for item in combined_similars:
        title = item["Title"]
        if title not in seen_titles:
            seen_titles.add(title)
            unique_list.append(item)
    print("-"* 100)
    return unique_list

def re_rank_articles(combined_titles, timeline_header):
    """Re-rank candidate articles against the timeline header with a cross-encoder.

    Each article's ``'Text'`` is scored against ``timeline_header``. Articles
    with a strictly positive score get that score stored under
    ``'reranked_score'`` and are returned; the rest are discarded.

    Args:
        combined_titles: List of article dicts, each with a ``'Text'`` key.
            Kept dicts are mutated in place.
        timeline_header: Query text the articles are scored against.

    Returns:
        The articles scored above zero in this call, in their input order.
        An empty input yields an empty list without loading the model.
    """
    if not combined_titles:
        # Nothing to score: skip the model load and the predict() call.
        print("-"* 100)
        return []

    cross_encoder = CrossEncoder(
        "cross-encoder/ms-marco-TinyBERT-L-2-v2", max_length=512, device="cpu"
    )
    print("-"* 100)
    unranked_docs = [(timeline_header, doc['Text']) for doc in combined_titles]
    # Get the scores
    scores = cross_encoder.predict(unranked_docs).tolist()

    # Collect the kept articles directly, so a 'reranked_score' left on a
    # dict by an earlier run cannot let an article through by accident.
    combined_articles = []
    for article, score in zip(combined_titles, scores):
        # Criteria that it has to be positive relationship between the timeline header and the article
        if score > 0:
            article['reranked_score'] = score
            combined_articles.append(article)
    return combined_articles

# Summarise an article into its main event and the date of that event.
def groq_event(date, title, text):
    """Ask the Gemini model for the main event of an article and its date.

    Args:
        date: Publication date of the article.
        title: Title of the article.
        text: Body text of the article.

    Returns:
        The parsed JSON reply of the model; the schema requested from the
        model has the fields ``main_event`` and ``event_date``.
    """
    gemini = genai.GenerativeModel('gemini-1.5-flash-latest')

    # Schema of the expected reply (name left as-is so the generated format
    # instructions cannot change).
    class summarized_event(BaseModel):
        main_event: str = Field(description="Main event of the article")
        event_date: str = Field(description="Date which the main event occured in YYYY-MM-DD")

    event_parser = JsonOutputParser(pydantic_object=summarized_event)

    event_template = '''
You are a news article editor. Analyse the article deeply, and describe the main event of the article below in one short sentence.
Using this main event and the publication date, identify the date at when this main event occured.
You should use any time references such as "last week," "last month," or specific dates. 
If the article does not specify the exact date, save the date in the YYYY-MM-XX or YYYY-XX-XX format.
Do not provide any explanations for your answer.

Publication Date:
{date}
Article Title:
{title}
Article Text:
{text}

{format_instructions}
Before you return the answer, ensure and double check that you have adhered the answer format instructions strictly.
'''
    event_prompt = PromptTemplate(
        template=event_template,
        input_variables=["date", "title", "text"],
        partial_variables={"format_instructions": event_parser.get_format_instructions()},
    )
    rendered_prompt = event_prompt.format(date=date, title=title, text=text)

    # Every listed harm category is set to BLOCK_NONE.
    no_blocking = {
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }
    reply = gemini.generate_content(rendered_prompt, safety_settings=no_blocking)

    return clean_llm_output(reply)

def filter_ranked_articles(combined_articles, top_k=12):
    """Keep the best re-ranked articles and attach their main event.

    Articles are sorted by descending ``'reranked_score'`` and truncated to
    ``top_k``. For each kept article ``groq_event`` is called, and its
    ``main_event`` / ``event_date`` are stored under ``'Event'`` /
    ``'Event_date'``. The intermediate ``'cosine_score'`` entry is removed.

    Args:
        combined_articles: Article dicts with ``'reranked_score'``,
            ``'Text'``, ``'Title'`` and ``'Date'`` keys. Kept dicts are
            mutated in place.
        top_k: Maximum number of articles to keep (defaults to 12, the
            previously hard-coded value).

    Returns:
        The kept articles, best score first.
    """
    # Slicing already copes with lists shorter than top_k.
    sorted_articles = sorted(
        combined_articles, key=lambda article: article['reranked_score'], reverse=True
    )[:top_k]

    print("Generating main events from each article\n")
    for i in trange(len(sorted_articles)):
        article = sorted_articles[i]
        displayed_event = groq_event(article['Date'], article['Title'], article['Text'])
        article['Event'] = displayed_event['main_event']
        article['Event_date'] = displayed_event['event_date']
        # A default avoids a KeyError when the score was already removed
        # (e.g. the cell is re-run on the same dicts).
        article.pop("cosine_score", None)
    return sorted_articles

# Order the events chronologically, then make the dates human-readable
def process_articles(filtered_articles):
    """Turn the filtered articles into display-ready timeline entries.

    Args:
        filtered_articles: Article dicts with the keys ``'Event'``,
            ``'Event_date'``, ``'Article_URL'`` and ``'Title'``.

    Returns:
        A list of dicts sorted by the raw event date string, where ``'Date'``
        has been passed through ``format_timeline_date`` and
        ``'Article_URL'`` is a one-element list of
        ``{"url": ..., "title": ...}``.
    """
    timeline_entries = []
    for article in filtered_articles:
        timeline_entries.append({
            "Event": article['Event'],
            "Date": article['Event_date'],
            "Article_URL": article['Article_URL'],
            "Article_title": article['Title'],
        })

    # Sort on the raw date string before it is reformatted for display.
    timeline_entries.sort(key=lambda entry: entry['Date'])

    for entry in timeline_entries:
        entry['Date'] = format_timeline_date(entry['Date'])
    for entry in timeline_entries:
        source_link = {"url": entry['Article_URL'], "title": entry['Article_title']}
        entry['Article_URL'] = [source_link]
    return timeline_entries

def export_hybrid_timeline(test_article, sorted_events, timeline_header):
    """Store a generated timeline in the hybrid-timeline MongoDB collection.

    Args:
        test_article: Article the timeline belongs to; its ``'st_id'`` and
            ``'Title'`` are stored with the timeline.
        sorted_events: JSON-serialisable list of timeline events.
        timeline_header: Header text; stored prefixed with ``"Timeline of "``.

    Raises:
        SystemExit: With exit status 1 if the insert fails. (Previously the
            status was 0, which reports success to the calling process.)
    """
    print("Fetching database to store the generated timeline.. \n")
    # Pull database
    db = mongo_client[config["database"]["name"]]

    # Get collection from database
    gen_timeline_documents = db[config["database"]["hybrid_timeline_collection"]]

    test_article_id = test_article['st_id']
    test_article_title = test_article['Title']

    # Build the display header for the timeline.
    print("Generating the timeline header...\n")
    timeline_display_header = "Timeline of " + timeline_header

    # The events are stored as a JSON string, not as a nested document.
    timeline_json = json.dumps(sorted_events)
    timeline_export = {"Article_id": test_article_id,
                       "Article_Title": test_article_title,
                       "Timeline_header": timeline_display_header,
                       "Timeline": timeline_json}

    # Send the timeline data to MongoDB
    try:
        # Insert result into collection
        gen_timeline_documents.insert_one(timeline_export)
        print(f"Timeline with article id {test_article_id} successfully saved to MongoDB")
    except Exception as error:
        print(f"Unable to save timeline to database. Check your connection the database...\nERROR: {error}\n")
        # Non-zero status so the failure is visible to whatever launched this.
        sys.exit(1)


In [16]:
# Run the full timeline pipeline for a single article.
db = combine_json(files)
test_id = "st_1155048"

# Look the article up explicitly so an unknown id fails with a clear message
# instead of a NameError on an undefined index.
matching_articles = [article for article in db if article['st_id'] == test_id]
if not matching_articles:
    raise ValueError(f"No article with st_id {test_id!r} found in {files}")
# Keeps the last match, as the original loop did.
test_article = matching_articles[-1]

# to_generate_timeline returns (needed, message). The tuple is always truthy,
# so it has to be unpacked before it is tested.
timeline_needed, skip_message = to_generate_timeline(test_article)
if timeline_needed:
    title = test_article['Title']
    timeline_header = groq_header(title)
    similar_article_text = get_similar_by_text(test_article, timeline_header, db)
    similar_articles_titles = get_similar_by_titles(timeline_header, db)
    combined_titles = combine_titles(similar_articles_titles, similar_article_text)
    combined_articles = re_rank_articles(combined_titles, timeline_header)
    filtered_articles = filter_ranked_articles(combined_articles)
    sorted_events = process_articles(filtered_articles)
    export_hybrid_timeline(test_article, sorted_events, timeline_header)
else:
    print("Timeline is not necessary for this article")

Evaluating necessity of Timeline for this article.

Timeline is necessary for this chosen article.

Title of test article: EU leaders to hold emergency virtual summit on Israel-Hamas conflict on Tuesday  

Computing similarities between article texts...


100%|██████████| 2007/2007 [00:01<00:00, 1157.17it/s]



Computing similarities between article titles...


100%|██████████| 2007/2007 [00:01<00:00, 1152.68it/s]


----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Generating main events from each article



100%|██████████| 8/8 [00:08<00:00,  1.06s/it]

Fetching database to store the generated timeline.. 

Generating the timeline header...

Timeline with article id st_1155048 successfully saved to MongoDB



