In [3]:
# Import necessary libraries
import os
import json
from dotenv import load_dotenv
from tqdm import trange

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

In [12]:
# Load the trial timeline from disk and decode its JSON content.
with open('../data_upload/single_timeline_trial.json', mode='r', encoding='utf-8') as timeline_file:
    timeline = json.loads(timeline_file.read())

In [13]:
# Display the loaded timeline to inspect its records
timeline

[{'Date': '2020-09-27',
  'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), which resulted in a six-week war.',
  'Article_id': '96moyhcopwdgflfi'},
 {'Date': '2020-09-27',
  'Event': 'The Second Karabakh War began, with Azerbaijan retaking control of several territories in Nagorno-Karabakh.',
  'Article_id': 'vehvbiu65a6u2ryr'},
 {'Date': '2020-09-27',
  'Event': 'Azerbaijan regained control of swathes of territory in Nagorno-Karabakh and surrounding regions.',
  'Article_id': 'bon7mzog28fpzkqk'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
  'Article_id': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A ceasefire was signed between Armenia and Azerbaijan, with Nagorno-Karabakh being returned to Azerbaijani control.',
  'Article_id': 'vehvbiu65a6u2ryr'},
 {'Date': '2022-03-24',
  'Event': 

In [14]:
# Decode the retrieval JSON and wrap it in a DataFrame; preview the first rows.
with open('../data_upload/df_retrieve.json', mode='r', encoding='utf-8') as retrieval_file:
    data = json.loads(retrieval_file.read())
retrieval = pd.DataFrame(data)
retrieval.head()

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,Cluster_labels
0,x6bs26mxih4wjnvb,LONDON - Armenia and Azerbaijan accused each o...,"Azerbaijan, Armenia accuse each other of milit...","[0.063698, -0.023136, 0.008678, 0.013113, -0.0...","Title: Azerbaijan, Armenia accuse each other o...","[Armenia, Azerbaijan, Nagorno-Karabakh, Russia...","[0.03720000758767128, 0.010111724957823753, -0...","[0.035694, -0.013111, 0.013628, 0.005414, 0.01...",2023-09-08,523
1,96moyhcopwdgflfi,Four ex-leaders of Azerbaijan's formerly ethni...,"Four Karabakh leaders held in Azerbaijan, thre...","[0.054137, -0.006537, 0.007394, 0.020186, -0.0...",Title: Four Karabakh leaders held in Azerbaija...,"[Azerbaijan, Armenia, Nagorno-Karabakh, Ex-Kar...","[0.04491448402404785, -0.0014135852688923478, ...","[0.094126, -0.055465, -0.002879, 0.023244, -0....",2023-10-04,523
2,j40zdfvo5mm8yv3c,A Russian truck carrying food aid for Armenian...,Russia truck sets off with food aid for Armeni...,"[0.030952, -0.013953, -0.001719, 0.044435, -0....",Title: Russia truck sets off with food aid for...,"[Russia, Armenia, Nagorno-Karabakh, Khankendi,...","[0.040053002536296844, -0.0006843761657364666,...","[0.03145, -0.020559, -0.000136, 0.045389, 0.00...",2023-09-12,523
3,9owr18ngmcvsob4x,"MOSCOW - Armenia needs to be ""free of con...","Armenia needs peace, PM says after Azerbaijan ...","[0.044655, 0.050847, -0.003256, -0.007795, -0....","Title: Armenia needs peace, PM says after Azer...","[Armenia, Nagorno-Karabakh, Azerbaijan, Nikol ...","[0.043720196932554245, 0.0023561876732856035, ...","[0.05158, 0.040333, -0.038155, -0.006117, -0.0...",2023-09-21,523
4,kgiisc7nhbtx8o7o,"GORIS, Armenia - After the village was bo...","Fleeing bombs and death, Karabakh Armenians re...","[0.015886, -0.005729, 0.008714, -0.003721, -0....","Title: Fleeing bombs and death, Karabakh Armen...","[Armenia, Karabakh, Azerbaijan, Refugees, Conf...","[0.03360241651535034, 0.0005320683121681213, -...","[0.031704, -0.012698, 0.001959, -0.060447, 0.0...",2023-09-25,523


## Task Description:
- Optimise the timeline by making it less dry and more contextually relevant. 
- Remove duplicate events and dates.

In [4]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Gemini model handle used for the generate_content calls in this notebook
llm = genai.GenerativeModel(model_name='gemini-1.5-flash-latest')

# Schema for a single enhanced timeline entry. It is passed to JsonOutputParser,
# whose format instructions are embedded in the prompts, so the field
# descriptions below are text the model reads.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic is understood to copy a class docstring into the generated schema,
# which would change the prompt -- confirm before adding one.
class Event(BaseModel):
    # Event date as a string; sorted later with the '%Y-%m-%d' format.
    Date: str = Field(description="The date of the event in YYYY-MM-DD format")
    # Description of what happened.
    Event: str = Field(description="A detailed description of the event")
    # Extra background for the event (typo "Conetextual" fixed in the description).
    Contextual_Annotation: str = Field(description="Contextual Anecdotes of the event.")
    # Id of the source article; later matched against the `id` column of `retrieval`.
    Article: str = Field(description="The article id from which the event was extracted")

# Parser built from the Event schema; its format instructions are injected into the prompt.
parser = JsonOutputParser(pydantic_object=Event)

# Prompt for the first pass: merge same-date duplicates and add contextual annotations.
# Placeholders: {text} (the initial timeline) and {format_instructions} (from the parser).
template = '''
You are given a timeline of events, your task is to enhance this timeline by improving its clarity, contextual information. 
If events occur on the same date and have similar descriptions, I want you to merge these events to avoid redundancy.
Add Contextual Annotations by providing brief annotations for major events to give additional context and improve understanding.
Only retain important information that would be value add when the general public reads the information.

Initial Timeline:
{text}

{format_instructions}
Ensure that the format follows the Example output format strictly before returning the output
'''
# Declare every placeholder the template uses, so the declared inputs match
# what `prompt.format(...)` is called with.
prompt = PromptTemplate(
    input_variables=["text", "format_instructions"],
    template=template
)

In [None]:
# Fill the prompt with the loaded timeline and the parser's format instructions.
final_prompt = prompt.format(
    text=timeline,
    format_instructions=parser.get_format_instructions(),
)

# Send the prompt to the model; every harm category listed is set to BLOCK_NONE.
response = llm.generate_content(
    final_prompt,
    safety_settings={
        harm_category: HarmBlockThreshold.BLOCK_NONE
        for harm_category in (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    },
)

In [17]:
# Show the raw text of the first part of the model's reply
print(response.parts[0].text)

```json
[
  {
    "Date": "2020-09-27",
    "Event": "Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), resulting in a six-week war.",
    "Contextual_Annotation": "This marked the beginning of the Second Karabakh War, a conflict rooted in the historical dispute over the Nagorno-Karabakh region.",
    "Article": "96moyhcopwdgflfi"
  },
  {
    "Date": "2020-11-10",
    "Event": "A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.",
    "Contextual_Annotation": "The ceasefire agreement resulted in Azerbaijan regaining control of significant territories in Nagorno-Karabakh and surrounding regions.",
    "Article": "96moyhcopwdgflfi"
  },
  {
    "Date": "2022-03-24",
    "Event": "Azerbaijan accuses Armenia of violating the ceasefire, sparking tensions between the two countries.",
    "Contextual_Annotation": "This marked a period of heightened tension foll

In [20]:
import re
# Pull the JSON array out of the model reply (the reply is wrapped in a ```json fence).
# The greedy `.*` with DOTALL spans from the first '[' to the last ']', so a ']'
# inside event text or a nested list no longer cuts the capture short.
array_match = re.search(r'\[.*\]', response.parts[0].text, flags=re.DOTALL)
if array_match is None:
    # Fail with a clear message instead of an AttributeError on `None.group`.
    raise ValueError("Model response does not contain a JSON array")
enhanced_timeline = array_match.group(0)
enhanced_timeline

'[\n  {\n    "Date": "2020-09-27",\n    "Event": "Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), resulting in a six-week war.",\n    "Contextual_Annotation": "This marked the beginning of the Second Karabakh War, a conflict rooted in the historical dispute over the Nagorno-Karabakh region.",\n    "Article": "96moyhcopwdgflfi"\n  },\n  {\n    "Date": "2020-11-10",\n    "Event": "A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.",\n    "Contextual_Annotation": "The ceasefire agreement resulted in Azerbaijan regaining control of significant territories in Nagorno-Karabakh and surrounding regions.",\n    "Article": "96moyhcopwdgflfi"\n  },\n  {\n    "Date": "2022-03-24",\n    "Event": "Azerbaijan accuses Armenia of violating the ceasefire, sparking tensions between the two countries.",\n    "Contextual_Annotation": "This marked a period of heightened ten

In [21]:
# Parse the extracted JSON array text into Python objects.
# Note: this rebinds `data`, which an earlier cell used for the raw retrieval JSON.
data = json.loads(enhanced_timeline)
data

[{'Date': '2020-09-27',
  'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), resulting in a six-week war.',
  'Contextual_Annotation': 'This marked the beginning of the Second Karabakh War, a conflict rooted in the historical dispute over the Nagorno-Karabakh region.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
  'Contextual_Annotation': 'The ceasefire agreement resulted in Azerbaijan regaining control of significant territories in Nagorno-Karabakh and surrounding regions.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2022-03-24',
  'Event': 'Azerbaijan accuses Armenia of violating the ceasefire, sparking tensions between the two countries.',
  'Contextual_Annotation': 'This marked a period of heightened tension following the ceasefire agreement, highlighting the fr

In [22]:
from datetime import datetime
# Order the events chronologically by parsing each 'Date' with the '%Y-%m-%d' format.
sorted_timeline = list(data)
sorted_timeline.sort(key=lambda entry: datetime.strptime(entry['Date'], '%Y-%m-%d'))
sorted_timeline

[{'Date': '2020-09-27',
  'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), resulting in a six-week war.',
  'Contextual_Annotation': 'This marked the beginning of the Second Karabakh War, a conflict rooted in the historical dispute over the Nagorno-Karabakh region.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
  'Contextual_Annotation': 'The ceasefire agreement resulted in Azerbaijan regaining control of significant territories in Nagorno-Karabakh and surrounding regions.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2022-03-24',
  'Event': 'Azerbaijan accuses Armenia of violating the ceasefire, sparking tensions between the two countries.',
  'Contextual_Annotation': 'This marked a period of heightened tension following the ceasefire agreement, highlighting the fr

In [23]:
# Distinct article ids in order of first appearance in the sorted timeline.
# dict.fromkeys keeps insertion order and drops repeats, replacing the
# list-membership loop that rescanned `unique_ids` for every event.
unique_ids = list(dict.fromkeys(event['Article'] for event in sorted_timeline))
unique_ids

['96moyhcopwdgflfi',
 'm5vcc0dfsmuho41p',
 'x6bs26mxih4wjnvb',
 'j40zdfvo5mm8yv3c',
 'bon7mzog28fpzkqk',
 '9owr18ngmcvsob4x',
 'kgiisc7nhbtx8o7o']

In [30]:
# Gemini model handle used for the per-article enhancement pass.
llm = genai.GenerativeModel(model_name='gemini-1.5-flash-latest')

# Prompt for the second pass: enrich the annotations using a reference article.
# Placeholders: {timeline}, {article} and {format_instructions}.
template = '''
You are given a timeline of events, and an article related to this timeline. 
Your task is to reference the provided article and enhance this timeline by improving its clarity and contextual information.
Add to the Contextual Annotations by providing annotations for major events to give additional context and improve understanding, \
  however, ensure that the contextual annotations added do not blatantly repeat what the contents of the event.

Initial Timeline:
{timeline}

Reference Article:
{article}

{format_instructions}
Double check and ensure that the format follows the Example output format strictly before returning the output
'''
# The declared inputs now match the template's placeholders; the previous
# declaration listed "text", which this template does not contain.
prompt = PromptTemplate(
    input_variables=["timeline", "article", "format_instructions"],
    template=template
)

In [31]:
from json import JSONDecodeError
import re
import time

def clean_output(output):
    """Parse a model reply into Python objects.

    The reply is first parsed as-is. If that fails (for example because the
    JSON is wrapped in a Markdown code fence), the text from the first '['
    to the last ']' is parsed instead. Slicing on the outermost brackets
    keeps arrays intact even when event text or nested lists contain ']'.

    Args:
        output: The raw text returned by the model.

    Returns:
        The decoded JSON value (a list of event dicts for a well-formed reply).

    Raises:
        ValueError: If the text is not valid JSON and contains no '[' ... ']'
            span, or if the extracted span is itself not valid JSON
            (JSONDecodeError is a ValueError subclass).
    """
    try:
        return json.loads(output)
    except JSONDecodeError as err:
        # Fallback: keep only the outermost bracketed span.
        start, end = output.find('['), output.rfind(']')
        if start == -1 or end < start:
            raise ValueError("Model output does not contain a JSON array") from err
        return json.loads(output[start:end + 1])

# Gather the distinct article ids in the order they first appear in the sorted timeline.
final_timeline = []
unique_ids = []
for event in sorted_timeline:
    if event['Article'] in unique_ids:
        continue
    unique_ids.append(event['Article'])

# One model call per article: send that article's events together with its text.
for i in trange(len(unique_ids)):
    # Events that were extracted from the current article.
    text = [event for event in sorted_timeline if event['Article'] == unique_ids[i]]

    # Text of the first row in `retrieval` whose id matches the current article.
    matching_rows = retrieval[retrieval['id'] == unique_ids[i]].reset_index()
    article = matching_rows['Text'][0]

    final_prompt = prompt.format(
        timeline=text,
        article=article,
        format_instructions=parser.get_format_instructions(),
    )
    # Every harm category listed is set to BLOCK_NONE for this request.
    response = llm.generate_content(
        final_prompt,
        safety_settings={
            harm_category: HarmBlockThreshold.BLOCK_NONE
            for harm_category in (
                HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                HarmCategory.HARM_CATEGORY_HARASSMENT,
                HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            )
        },
    )

    # Decode the reply and keep it as one section per article.
    section = clean_output(response.parts[0].text)
    final_timeline.append(section)

    # Pause between requests (presumably to stay under an API rate limit -- confirm).
    time.sleep(5)

final_timeline

100%|██████████| 7/7 [01:24<00:00, 12.07s/it]


[[{'Date': '2020-09-27',
   'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), resulting in a six-week war.',
   'Contextual_Annotation': 'This marked the beginning of the Second Karabakh War, a conflict rooted in the historical dispute over the Nagorno-Karabakh region. The war was a result of decades of unresolved tensions between Armenia and Azerbaijan over the region, which had been under Armenian control since the 1990s following the collapse of the Soviet Union.',
   'Article': '96moyhcopwdgflfi'},
  {'Date': '2020-11-10',
   'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
   'Contextual_Annotation': 'The ceasefire agreement resulted in Azerbaijan regaining control of significant territories in Nagorno-Karabakh and surrounding regions, including the city of Shusha, which is considered to be of great strategic and symbolic imp

In [32]:
# Flatten the per-article sections into a single list of events.
# The iteration names avoid `timeline` on purpose: the previous loop variable
# overwrote the timeline loaded from disk, so re-running the first prompt cell
# afterwards would have sent the wrong data.
unsorted_timeline = [event for section in final_timeline for event in section]
unsorted_timeline

[{'Date': '2020-09-27',
  'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), resulting in a six-week war.',
  'Contextual_Annotation': 'This marked the beginning of the Second Karabakh War, a conflict rooted in the historical dispute over the Nagorno-Karabakh region. The war was a result of decades of unresolved tensions between Armenia and Azerbaijan over the region, which had been under Armenian control since the 1990s following the collapse of the Soviet Union.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
  'Contextual_Annotation': 'The ceasefire agreement resulted in Azerbaijan regaining control of significant territories in Nagorno-Karabakh and surrounding regions, including the city of Shusha, which is considered to be of great strategic and symbolic importance

In [33]:
# Order events by their 'Date' string (assumes YYYY-MM-DD, which sorts
# chronologically as text) and serialise them as indented, non-ASCII-escaped JSON.
sorted_events = list(unsorted_timeline)
sorted_events.sort(key=lambda entry: entry['Date'])
json_data = json.dumps(sorted_events, indent=4, ensure_ascii=False)

# Persist the enhanced timeline to disk as UTF-8 text.
with open('../data_upload/enhanced_timeline_trial.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(json_data)