In [1]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
from tqdm import trange


# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: /home/jerry/Desktop/timeline project/timeline/bin/python3: No such file or directory
/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: exec: /home/jerry/Desktop/timeline project/timeline/bin/python3: cannot execute: No such file or directory


In [2]:
# Parse the timeline stored in single_timeline_trial.json into Python objects
with open('../data_upload/single_timeline_trial.json', 'r', encoding='utf-8') as timeline_file:
    timeline = json.loads(timeline_file.read())

In [3]:
# Display the loaded timeline (a bare last expression is rendered as the cell output)
timeline

[{'Date': '2020-09-27',
  'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), which resulted in a six-week war.',
  'Article_id': '96moyhcopwdgflfi'},
 {'Date': '2020-09-27',
  'Event': 'The Second Karabakh War began, with Azerbaijan retaking control of several territories in Nagorno-Karabakh.',
  'Article_id': 'vehvbiu65a6u2ryr'},
 {'Date': '2020-09-27',
  'Event': 'Azerbaijan regained control of swathes of territory in Nagorno-Karabakh and surrounding regions.',
  'Article_id': 'bon7mzog28fpzkqk'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
  'Article_id': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A ceasefire was signed between Armenia and Azerbaijan, with Nagorno-Karabakh being returned to Azerbaijani control.',
  'Article_id': 'vehvbiu65a6u2ryr'},
 {'Date': '2022-03-24',
  'Event': 

In [4]:
# Read the retrieved-articles dump and wrap it in a DataFrame for lookups by article id
with open('../data_upload/df_retrieve.json', 'r', encoding='utf-8') as retrieval_file:
    data = json.loads(retrieval_file.read())
retrieval = pd.DataFrame(data)

# Preview the first rows only
retrieval.head()

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,Cluster_labels
0,x6bs26mxih4wjnvb,LONDON - Armenia and Azerbaijan accused each o...,"Azerbaijan, Armenia accuse each other of milit...","[0.063698, -0.023136, 0.008678, 0.013113, -0.0...","Title: Azerbaijan, Armenia accuse each other o...","[Armenia, Azerbaijan, Nagorno-Karabakh, Russia...","[0.03720000758767128, 0.010111724957823753, -0...","[0.035694, -0.013111, 0.013628, 0.005414, 0.01...",2023-09-08,523
1,96moyhcopwdgflfi,Four ex-leaders of Azerbaijan's formerly ethni...,"Four Karabakh leaders held in Azerbaijan, thre...","[0.054137, -0.006537, 0.007394, 0.020186, -0.0...",Title: Four Karabakh leaders held in Azerbaija...,"[Azerbaijan, Armenia, Nagorno-Karabakh, Ex-Kar...","[0.04491448402404785, -0.0014135852688923478, ...","[0.094126, -0.055465, -0.002879, 0.023244, -0....",2023-10-04,523
2,j40zdfvo5mm8yv3c,A Russian truck carrying food aid for Armenian...,Russia truck sets off with food aid for Armeni...,"[0.030952, -0.013953, -0.001719, 0.044435, -0....",Title: Russia truck sets off with food aid for...,"[Russia, Armenia, Nagorno-Karabakh, Khankendi,...","[0.040053002536296844, -0.0006843761657364666,...","[0.03145, -0.020559, -0.000136, 0.045389, 0.00...",2023-09-12,523
3,9owr18ngmcvsob4x,"MOSCOW - Armenia needs to be ""free of con...","Armenia needs peace, PM says after Azerbaijan ...","[0.044655, 0.050847, -0.003256, -0.007795, -0....","Title: Armenia needs peace, PM says after Azer...","[Armenia, Nagorno-Karabakh, Azerbaijan, Nikol ...","[0.043720196932554245, 0.0023561876732856035, ...","[0.05158, 0.040333, -0.038155, -0.006117, -0.0...",2023-09-21,523
4,kgiisc7nhbtx8o7o,"GORIS, Armenia - After the village was bo...","Fleeing bombs and death, Karabakh Armenians re...","[0.015886, -0.005729, 0.008714, -0.003721, -0....","Title: Fleeing bombs and death, Karabakh Armen...","[Armenia, Karabakh, Azerbaijan, Refugees, Conf...","[0.03360241651535034, 0.0005320683121681213, -...","[0.031704, -0.012698, 0.001959, -0.060447, 0.0...",2023-09-25,523


## Task Description:
- Optimise the timeline by making it less dry and more contextually relevant.
- Remove duplicate events and dates.

In [8]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

# Gemini model handle used for the generation calls in the following cells
llm = genai.GenerativeModel(model_name='gemini-1.0-pro')

# Schema of a single enhanced timeline entry; passed to JsonOutputParser as `pydantic_object`.
# Documented with comments instead of a class docstring on purpose: a docstring would
# presumably be emitted as the schema description and therefore change the prompt text.
# The Field descriptions are presumably surfaced to the model via the parser's format
# instructions, so the spelling of "Contextual" matters (was "Conetextual").
class Event(BaseModel):
    Date: str = Field(description="The date of the event in YYYY-MM-DD format")
    Event: str = Field(description="A detailed description of the event")
    Contextual_Annotation: str = Field(description="Contextual Anecdotes of the event.")
    Article: str = Field(description="The article id from which the event was extracted")

# Parser whose format instructions describe the Event schema to the model
parser = JsonOutputParser(pydantic_object=Event)

# Prompt for the first pass: merge same-date duplicates and add short annotations.
# Placeholders: {text} (the raw timeline) and {format_instructions} (from the parser).
template = '''
You are given a timeline of events, your task is to enhance this timeline by improving its clarity, contextual information. 
If events occur on the same date and have similar descriptions, I want you to merge these events to avoid redundancy.
Add Contextual Annotations by providing brief annotations for major events to give additional context and improve understanding.
Only retain important information that would be value add when the general public reads the information.

Initial Timeline:
{text}

{format_instructions}
Ensure that the format follows the Example output format strictly before returning the output
'''
# input_variables must name every placeholder in the template; "format_instructions"
# was previously missing even though the template uses it.
prompt = PromptTemplate(
    input_variables=["text", "format_instructions"],
    template=template
)

In [10]:
# Fill the template with the loaded timeline and the parser's format instructions
final_prompt = prompt.format(
    text=timeline,
    format_instructions=parser.get_format_instructions(),
)

# Ask the model for the enhanced timeline; each listed harm category is set to BLOCK_NONE
response = llm.generate_content(
    final_prompt,
    safety_settings={
        category: HarmBlockThreshold.BLOCK_NONE
        for category in (
            HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            HarmCategory.HARM_CATEGORY_HARASSMENT,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        )
    },
)

In [17]:
# Show the raw text of the first part of the model's reply
print(response.parts[0].text)

[{"Date": "2020-09-27", "Event": "The Second Karabakh War began with Azerbaijan launching a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh). The war lasted six weeks, resulting in Azerbaijan regaining control of large swaths of territory.", "Contextual_Annotation": "Context: This marked a significant escalation in the long-standing conflict between the two countries over the disputed Nagorno-Karabakh region.", "Article": "96moyhcopwdgflfi"}, {"Date": "2020-11-10", "Event": "A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war. The agreement stipulated that Armenia would return all territories surrounding Nagorno-Karabakh to Azerbaijan, while Russian peacekeepers would be deployed to monitor the ceasefire.", "Contextual_Annotation": "Context: The ceasefire brought an end to the conflict, but tensions between the two countries remained high.", "Article": "96moyhcopwdgflfi"}, {"Da

In [16]:
# Keep the raw reply text (first response part); it is parsed as JSON in a later cell
enhanced_timeline = response.parts[0].text
enhanced_timeline

'[{"Date": "2020-09-27", "Event": "The Second Karabakh War began with Azerbaijan launching a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh). The war lasted six weeks, resulting in Azerbaijan regaining control of large swaths of territory.", "Contextual_Annotation": "Context: This marked a significant escalation in the long-standing conflict between the two countries over the disputed Nagorno-Karabakh region.", "Article": "96moyhcopwdgflfi"}, {"Date": "2020-11-10", "Event": "A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war. The agreement stipulated that Armenia would return all territories surrounding Nagorno-Karabakh to Azerbaijan, while Russian peacekeepers would be deployed to monitor the ceasefire.", "Contextual_Annotation": "Context: The ceasefire brought an end to the conflict, but tensions between the two countries remained high.", "Article": "96moyhcopwdgflfi"}, {"D

In [12]:
# Parse the model's reply as JSON. json.loads raises JSONDecodeError if the reply
# is not pure JSON (for example when it is wrapped in extra text).
# NOTE: this rebinds `data`, which an earlier cell used for the raw df_retrieve.json records.
data = json.loads(enhanced_timeline)
data

[{'Date': '2020-09-27',
  'Event': 'The Second Karabakh War began with Azerbaijan launching a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh). The war lasted six weeks, resulting in Azerbaijan regaining control of large swaths of territory.',
  'Contextual_Annotation': 'Context: This marked a significant escalation in the long-standing conflict between the two countries over the disputed Nagorno-Karabakh region.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war. The agreement stipulated that Armenia would return all territories surrounding Nagorno-Karabakh to Azerbaijan, while Russian peacekeepers would be deployed to monitor the ceasefire.',
  'Contextual_Annotation': 'Context: The ceasefire brought an end to the conflict, but tensions between the two countries remained high.',
  'Article': '96moyhcopwd

In [13]:
from datetime import datetime

# Copy the parsed events and order the copy by the parsed 'Date' value (YYYY-MM-DD);
# strptime raises ValueError for a date that does not match that format.
sorted_timeline = list(data)
sorted_timeline.sort(key=lambda entry: datetime.strptime(entry['Date'], '%Y-%m-%d'))
sorted_timeline

[{'Date': '2020-09-27',
  'Event': 'The Second Karabakh War began with Azerbaijan launching a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh). The war lasted six weeks, resulting in Azerbaijan regaining control of large swaths of territory.',
  'Contextual_Annotation': 'Context: This marked a significant escalation in the long-standing conflict between the two countries over the disputed Nagorno-Karabakh region.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war. The agreement stipulated that Armenia would return all territories surrounding Nagorno-Karabakh to Azerbaijan, while Russian peacekeepers would be deployed to monitor the ceasefire.',
  'Contextual_Annotation': 'Context: The ceasefire brought an end to the conflict, but tensions between the two countries remained high.',
  'Article': '96moyhcopwd

In [34]:
# Collect each article id once, in order of first appearance in the sorted timeline
unique_ids = []
for article_ref in (entry['Article'] for entry in sorted_timeline):
    if article_ref in unique_ids:
        continue
    unique_ids.append(article_ref)
unique_ids

['96moyhcopwdgflfi',
 'm5vcc0dfsmuho41p',
 'x6bs26mxih4wjnvb',
 'bon7mzog28fpzkqk',
 '9owr18ngmcvsob4x']

In [19]:
# Gemini model handle for the second, article-grounded pass
llm = genai.GenerativeModel(model_name='gemini-1.0-pro')

# Prompt for the second pass: enrich the annotations of a group of events using the
# source article they cite. Placeholders: {timeline}, {article}, {format_instructions}.
# NOTE: this rebinds `template` and `prompt` from the earlier merge-prompt cell, so that
# cell's generation call must not be re-run after this one without re-running its setup.
template = '''
You are given a timeline of events, and an article related to this timeline. 
Your task is to reference the provided article and enhance this timeline by improving its clarity and contextual information.
Add to the Contextual Annotations by providing annotations for major events to give additional context and improve understanding, \
  however, ensure that the contextual annotations added do not blatantly repeat what the contents of the event.

Initial Timeline:
{timeline}

Reference Article:
{article}

{format_instructions}
Ensure that the format follows the Example output format strictly before returning the output
'''
# input_variables must match the template placeholders; it previously listed "text",
# which this template does not contain, and omitted "timeline"/"format_instructions".
prompt = PromptTemplate(
    input_variables=["timeline", "article", "format_instructions"],
    template=template
)

In [35]:
from json import JSONDecodeError
import re

def clean_output(output):
    """Parse a model reply into Python data, tolerating text around the JSON.

    The reply is first parsed as-is. If that fails, every '[' in the reply is
    tried in order as the start of a JSON array and the first one that decodes
    is returned. Unlike a "first '[' to first ']'" regex, this handles nested
    arrays and ']' characters inside string values, and it also copes with
    replies wrapped in markdown code fences or surrounding prose.

    Args:
        output: Raw reply text from the model.

    Returns:
        The decoded JSON value of the whole reply, or the first decodable JSON
        array embedded in it.

    Raises:
        ValueError: If the reply is not valid JSON and contains no decodable
            JSON array (JSONDecodeError is itself a ValueError subclass).
    """
    try:
        return json.loads(output)
    except JSONDecodeError:
        # Fall through to the embedded-array search below.
        pass

    decoder = json.JSONDecoder()
    for bracket in re.finditer(r'\[', output):
        try:
            # raw_decode parses one JSON document starting at the given index
            # and ignores whatever follows it.
            parsed, _end = decoder.raw_decode(output, bracket.start())
        except JSONDecodeError:
            continue
        return parsed

    raise ValueError('Model output does not contain a decodable JSON list')

# Second pass: for every source article, send the events that cite it together with the
# article text to the model and collect the enhanced events (one list per article).
final_timeline = []

# Article ids in order of first appearance (dict.fromkeys de-duplicates and keeps order)
unique_ids = list(dict.fromkeys(event['Article'] for event in sorted_timeline))

for article_id in unique_ids:  # `article_id` rather than `id`, which shadows the builtin
    # Events of the sorted timeline that reference this article
    article_events = [event for event in sorted_timeline if event['Article'] == article_id]

    # The ids come from model output, so they are not guaranteed to exist in `retrieval`.
    # Keep such events unchanged instead of failing with an opaque KeyError.
    matching_text = retrieval.loc[retrieval['id'] == article_id, 'Text']
    if matching_text.empty:
        print(f"Article {article_id!r} not found in retrieval; keeping its events unchanged")
        final_timeline.append(article_events)
        continue
    article = matching_text.iloc[0]

    final_prompt = prompt.format(
        timeline=article_events,
        article=article,
        format_instructions=parser.get_format_instructions(),
    )
    # Each listed harm category is set to BLOCK_NONE, as in the first generation call
    response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        },
    )
    # Parse the reply text (first response part) into Python data
    section = clean_output(response.parts[0].text)
    final_timeline.append(section)

final_timeline

[[{'Date': '2020-09-27',
   'Event': 'The Second Karabakh War began with Azerbaijan launching a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh). The war lasted six weeks, resulting in Azerbaijan regaining control of large swaths of territory.',
   'Contextual_Annotation': 'The Second Karabakh War was a continuation of the long-standing conflict between Azerbaijan and Armenia over the disputed Nagorno-Karabakh region. The war began with Azerbaijan launching a large-scale offensive on September 27, 2020, and ended with a tripartite ceasefire agreement on November 10, 2020.',
   'Article': '96moyhcopwdgflfi'},
  {'Date': '2020-11-10',
   'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war. The agreement stipulated that Armenia would return all territories surrounding Nagorno-Karabakh to Azerbaijan, while Russian peacekeepers would be deployed to monitor the ceasefire.',

In [40]:
# Flatten the per-article lists into one list of events.
# A comprehension is used so that no loop variable leaks into the notebook namespace:
# the previous `for timeline in final_timeline` loop overwrote the `timeline` variable
# loaded from single_timeline_trial.json, silently changing what earlier cells would
# use if re-run.
unsorted_timeline = [event for section in final_timeline for event in section]
unsorted_timeline

[{'Date': '2020-09-27',
  'Event': 'The Second Karabakh War began with Azerbaijan launching a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh). The war lasted six weeks, resulting in Azerbaijan regaining control of large swaths of territory.',
  'Contextual_Annotation': 'The Second Karabakh War was a continuation of the long-standing conflict between Azerbaijan and Armenia over the disputed Nagorno-Karabakh region. The war began with Azerbaijan launching a large-scale offensive on September 27, 2020, and ended with a tripartite ceasefire agreement on November 10, 2020.',
  'Article': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war. The agreement stipulated that Armenia would return all territories surrounding Nagorno-Karabakh to Azerbaijan, while Russian peacekeepers would be deployed to monitor the ceasefire.',
  'Co

In [43]:
# Order a copy of the events by their 'Date' string. This is a plain string comparison,
# so it matches chronological order only for zero-padded YYYY-MM-DD values.
sorted_events = list(unsorted_timeline)
sorted_events.sort(key=lambda entry: entry['Date'])

# Serialise with 4-space indentation, keeping non-ASCII characters unescaped
json_data = json.dumps(sorted_events, ensure_ascii=False, indent=4)

# Write the JSON string to a file
with open('../data_upload/enhanced_timeline_trial.json', 'w', encoding='utf-8') as timeline_out:
    timeline_out.write(json_data)