## Exporting necessary libraries

In [1]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster


# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: /home/jerry/Desktop/timeline project/timeline/bin/python3: No such file or directory
/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: exec: /home/jerry/Desktop/timeline project/timeline/bin/python3: cannot execute: No such file or directory


## Load and Combine JSON files

In [6]:
def combine_json(files):
    combined_data = []
    for file in files:
        with open(file, 'r') as f:
            # Load data from the file and append it to the combined list
            data = json.load(f)
            combined_data.extend(data)
    return combined_data

# Example usage
files = ['../data_upload/final_db1.json', '../data_upload/final_db2.json', '../data_upload/final_db3.json', '../data_upload/final_db4.json']
db = combine_json(files)

In [7]:
def read_load_json(json_data):
    for item in json_data:
        #Convert the embeddings to json string as CSVs dont accept list as a data type
        item['tags_embeddings'] = json.dumps(item['tags_embeddings'])
        item['Title_embeddings'] = json.dumps(item['Title_embeddings'])
    df = pd.DataFrame(json_data)
    return df

In [8]:
df = read_load_json(db)
df.head()

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date
0,nos7tzp7jprxlqxe,GENEVA – The remains of a climber discovered i...,Remains found in Swiss Alps are those of Briti...,"[0.063923, 0.065677, -0.001089, 0.065425, -0.0...",Title: Remains found in Swiss Alps are those o...,"[Missing Climber, Swiss Alps, Glaciers, Global...","[0.025687463581562042, 0.03274165466427803, -0...","[0.021028, 0.006548, 0.037958, 0.049163, -0.00...",2023-09-01
1,zvv4ue0w64vfqoz1,Ms Greta Thunburg became a household name when...,Involve youth in shaping ethical use of AI,"[0.063668, 0.098002, -0.022514, -0.033031, -0....",Title: Involve youth in shaping ethical use of...,"[Youth activism, Artificial intelligence, Ethi...","[0.026038197800517082, 0.05095928534865379, -0...","[0.033077, 0.121931, -0.034714, 0.012957, -0.0...",2023-09-02
2,aph1tgua3xxoq2sg,NEW YORK - Defending women's champion Iga...,"Swiatek, Djokovic headline third round action ...","[-0.019315, 0.066645, 0.009547, 0.029555, -0.0...","Title: Swiatek, Djokovic headline third round ...","[US Open, Grand Slam, Novak Djokovic, Iga Swia...","[-0.04092131927609444, 0.015564153902232647, -...","[-0.018808, -0.049826, 0.005458, -0.010391, -0...",2023-09-01
3,rlh53czyst054zfn,JAKARTA – Hopes of a return to democracy in ju...,‘Systematic repression’ crushing Myanmar’s dem...,"[0.067328, -0.004407, 0.010127, -0.004268, -0....",Title: ‘Systematic repression’ crushing Myanma...,"[Myanmar, UN chief, ASEAN, Rohingya, Military ...","[0.02929660677909851, 0.0006651841104030609, -...","[0.059998, -0.014698, 0.02184, -0.031714, 0.00...",2023-09-07
4,aksixz7uun2gkpss,JERUSALEM - Israel's shekel dropped to it...,Israel's shekel falls as judicial showdown looms,"[-0.043186, 0.076352, -0.015492, -0.02859, -0....",Title: Israel's shekel falls as judicial showd...,"[Israel, Shekel, Judicial crisis, Supreme Cour...","[0.015406888909637928, 0.04966922104358673, 0....","[-0.02634, 0.070879, 0.013255, -0.008821, -0.0...",2023-09-07


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2008 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2008 non-null   object
 1   Text              2008 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2008 non-null   object
 4   combined          2007 non-null   object
 5   tags              2008 non-null   object
 6   tags_embeddings   2008 non-null   object
 7   Title_embeddings  2008 non-null   object
 8   Publication_date  2008 non-null   object
dtypes: object(9)
memory usage: 141.3+ KB


In [10]:
## NEED TO CHECK 

In [11]:
nan_rows = df[df.isnull().any(axis=1)]
df = df.drop(nan_rows.index)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


## Data Preprocessing 
- Concatentation of embeddings
- Standardisation of embeddings

In [10]:
# Deserialising of embeddings
body_embeddings= np.array(df['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings= np.array(df['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings= np.array(df['tags_embeddings'].apply(ast.literal_eval).tolist())
all_embeddings = np.concatenate((body_embeddings, title_embeddings, tags_embeddings), axis=1)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train_embeddings, test_embeddings = train_test_split(all_embeddings, test_size=10, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_embeddings)
X_test_scaled = scaler.transform(test_embeddings)

In [14]:
X_train_scaled.shape

(1997, 2304)

## Conducting PCA Experimentation to find best amount of variance.

#### For variance range of 94% to 97%

In [20]:
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.neighbors import KNeighborsClassifier
# Experiment for this variance range of 94% to 97%
variance_range = list(np.arange(0.92, 0.97, 0.01))
variance_perf = {}

for variance in variance_range:
    pca = PCA(n_components=variance)
    train_pca_embeddings = pca.fit_transform(X_train_scaled)
    test_pca_embeddings = pca.transform(X_test_scaled)
    
    # Range of max_d values to try
    max_d_values = np.arange(45, 70)
    
    # Lists to store silhouette scores
    silhouette_scores_train = []
    silhouette_scores_test = []

    # Perform hierarchical clustering
    Z = linkage(train_pca_embeddings, method='ward')

    for max_d in max_d_values:
        clusters_train = fcluster(Z, max_d, criterion='distance')
        
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(train_pca_embeddings, clusters_train)
        clusters_test = knn.predict(test_pca_embeddings)
        
        # Calculate silhouette scores only if there are at least 2 unique clusters and fewer than the number of samples
        if 1 < len(set(clusters_train)) < len(train_pca_embeddings):
            score_train = silhouette_score(train_pca_embeddings, clusters_train)
        else:
            score_train = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        if 1 < len(set(clusters_test)) < len(test_pca_embeddings):
            score_test = silhouette_score(test_pca_embeddings, clusters_test)
        else:
            score_test = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        silhouette_scores_train.append(score_train)
        silhouette_scores_test.append(score_test)

    # Determine the best max_d
    best_max_d_train = max_d_values[np.argmax(silhouette_scores_train)]
    best_max_d_test = max_d_values[np.argmax(silhouette_scores_test)]
    variance_perf[variance] = {
        'max_d_train': best_max_d_train,
        "max_d_test": best_max_d_test,
        'best_train_silhouette': max(silhouette_scores_train),
        "best_test_silhouette": max(silhouette_scores_test)
    }

In [21]:
variance_perf

{0.92: {'max_d_train': 54,
  'max_d_test': 45,
  'best_train_silhouette': 0.09662836924445259,
  'best_test_silhouette': -1},
 0.93: {'max_d_train': 55,
  'max_d_test': 45,
  'best_train_silhouette': 0.09493866482143248,
  'best_test_silhouette': -1},
 0.9400000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.09341043113832068,
  'best_test_silhouette': -1},
 0.9500000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.0918934174697297,
  'best_test_silhouette': -1},
 0.9600000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.0900331324671914,
  'best_test_silhouette': -1}}

In [25]:
# Find the best test variance based on the silhouette score
def get_best_variance(perf_results):
    highest_train_sil = 0
    best_variance_s = []
    for variance, scores in perf_results.items():
        if scores['best_train_silhouette'] > highest_train_sil:
            highest_train_sil = scores['best_train_silhouette']
            best_variance_s = [variance]  
        elif scores['best_train_silhouette'] == highest_train_sil:
            best_variance_s.append(variance)  
    
    final_best_max_d = perf_results[best_variance_s[0]]['max_d_train']
    return round(best_variance_s[0], 2), final_best_max_d

best_variance, best_max_d = get_best_variance(variance_perf)
print((best_variance , best_max_d))          

(0.92, 54)


## Best parameters when test size is 10

In [16]:
best_variance = 0.92
best_max_d = 54
print(best_variance)
print(best_max_d)

0.92
54


In [17]:
df_test = df.sample(10)
df_train = df.drop(df_test.index)

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.neighbors import KNeighborsClassifier
# Deserializing the embeddings
body_embeddings_train = np.array(df_train['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_train = np.array(df_train['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_train = np.array(df_train['tags_embeddings'].apply(ast.literal_eval).tolist())

body_embeddings_test = np.array(df_test['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_test = np.array(df_test['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_test = np.array(df_test['tags_embeddings'].apply(ast.literal_eval).tolist())

# Combine embeddings
all_embeddings_train = np.concatenate((body_embeddings_train, title_embeddings_train, tags_embeddings_train), axis=1)
all_embeddings_test = np.concatenate((body_embeddings_test, title_embeddings_test, tags_embeddings_test), axis=1)

# Standardize embeddings
scaler = StandardScaler()
train_embeddings = scaler.fit_transform(all_embeddings_train)
test_embeddings = scaler.transform(all_embeddings_test)

# Perform PCA
pca = PCA(n_components=best_variance)
pca_train_embeddings = pca.fit_transform(train_embeddings)
pca_test_embeddings = pca.transform(test_embeddings)


Z = linkage(pca_train_embeddings, method='ward', metric='euclidean')
clusters_train = fcluster(Z, best_max_d, criterion='distance')

In [22]:
# Predict clusters for test data using the nearest cluster center
def predict_cluster(test_embedding, train_embeddings, clusters):
    distances = np.linalg.norm(train_embeddings - test_embedding, axis=1)
    return clusters[np.argmin(distances)]

# Assign clusters to test points
test_clusters = [predict_cluster(te, pca_train_embeddings, clusters_train) for te in pca_test_embeddings]
# Print the contents of each cluster assigned to each of the 5 test points
for i, (test_point, test_cluster) in enumerate(zip(df_test.itertuples(), test_clusters)):
    print(f"Test point {i} (Title: {test_point.Title}\n, Tags: {test_point.tags}):")
    print()
    print(f"Cluster {test_cluster} contents:")
    
    cluster_indices = np.where(clusters_train == test_cluster)[0]
    cluster_df = df_train.iloc[cluster_indices]
    
    for _, row in cluster_df.iterrows():
        print(f"Title: {row['Title']}, Tags: {row['tags']}")
    
    print("\n" + "-"*80 + "\n")


Test point 0 (Title: Nobel Peace Prize could honour  contributions of activists: Experts
, Tags: ['Nobel Peace Prize', 'Ukraine', 'Activism', 'Climate Change', 'Indigenous Rights', 'Womens Rights']):

Cluster 672 contents:
Title: Jailed Iranian activist Narges Mohammadi wins 2023 Nobel Peace Prize , Tags: ['Nobel Peace Prize', 'Iran', 'Womens Rights', 'Human Rights', 'Activism', 'Imprisonment']
Title: Jailed Iranian Peace laureate Narges Mohammadi's daughter: 'I miss you dearly', Tags: ['Nobel Peace Prize', 'Iran', 'Human Rights', 'Womens Rights', 'Political Prisoners', 'Demonstrations']

--------------------------------------------------------------------------------

Test point 1 (Title: The tissue that connects the muscles may be a key to better health
, Tags: ['Fascia health', 'Exercise', 'Chronic pain', 'Inflammation', 'Flexibility', 'Movement']):

Cluster 444 contents:
Title: Start your day with a five-minute yoga routine to ‘wake up your body’, Tags: ['Yoga', 'Exercise', 'Health

In [23]:
test_clusters

[672, 444, 361, 613, 650, 701, 330, 497, 277, 523]

In [24]:
df_train['Cluster_labels'] = clusters_train
df_test['Cluster_labels'] = test_clusters
df_test.reset_index(drop=True, inplace=True)

In [25]:
def common_tags(tags1, tags2):
    return set(tags1).intersection(set(tags2))

# Initialize the dictionary to store similar articles
similar_articles_dict = {}

# Iterate over each test article in the filtered df_test
for index, test_row in df_test.iterrows():
    test_tags = test_row['tags']
    test_cluster_label = test_row['Cluster_labels']
    
    # Filter df_train for the same cluster label
    df_train_cluster = df_train[df_train['Cluster_labels'] == test_cluster_label]
    
    # Find similar articles in df_train
    similar_indexes = []
    for train_index, train_row in df_train_cluster.iterrows():
        train_tags = train_row['tags']
        if len(common_tags(test_tags, train_tags)) >= 2:
            similar_indexes.append(train_index)
    
    # Store the result in the dictionary if there are at least 2 supporting articles
    if len(similar_indexes) >= 2:
        similar_articles_dict[index] = {
            'Title': test_row['Title'],
            'indexes': similar_indexes,
            'Text': test_row['Text']
        }

In [26]:
similar_articles_dict

{0: {'Title': 'Nobel Peace Prize could honour  contributions of activists: Experts',
  'indexes': [1026, 1849],
  'Text': 'OSLO/STOCKHOLM - Ukrainian President Volodymyr Zelensky and Russian dissident Alexei Navalny are among favourites for the 2023 Nobel Peace Prize, but experts say campaigners for women, indigenous people or the environment could well steal the stage.Given past form, the Norwegian Nobel Committee is also capable of a complete surprise in the Oct 6 announcement. Though bookmakers have Mr Zelensky as a top candidate to join the illustrious list of laureates from Mr Nelson Mandela to Mr Martin Luther King, Nobel specialists believe that as a wartime leader, the Ukrainian President is unlikely to be named.The imprisoned Navalny’s chances are lessened because Russian dissidents won in 2022 and the year before.A third bookmakers’ favourite is jailed Uighur activist Ilham Tohti, though that would infuriate China. When jailed dissident Liu Xiaobo won the peace prize, Beijing

In [27]:
for test_index, info in similar_articles_dict.items():
    print(f"Test Article Index: {test_index}")
    print(f"Test Article Title: {info['Title']}")
    print("Supporting Articles:")
    for idx in info['indexes']:
        print(f" - {df_train.loc[idx, 'Title']}")
    print("\n" + "-"*80 + "\n")

Test Article Index: 0
Test Article Title: Nobel Peace Prize could honour  contributions of activists: Experts
Supporting Articles:
 - Jailed Iranian activist Narges Mohammadi wins 2023 Nobel Peace Prize 
 - Jailed Iranian Peace laureate Narges Mohammadi's daughter: 'I miss you dearly'

--------------------------------------------------------------------------------

Test Article Index: 3
Test Article Title: MDIS says Jokowi’s son, a V-P candidate in Indonesia, did not fake his education
Supporting Articles:
 - Chicken soup with Widodo to soothe tense souls in Indonesia as political dynasty storm rages on
 - Weak economy casts shadow on Indonesia’s election season 

--------------------------------------------------------------------------------

Test Article Index: 9
Test Article Title: Azerbaijan, Armenia accuse each other of military build-up
Supporting Articles:
 - Azerbaijan launches 'anti-terrorist operation' in Karabakh
 - Several hundred protesters gather in Armenian capital aft

In [29]:
# Initialize the generative model
llm = genai.GenerativeModel('gemini-1.0-pro')

# Define the template
template = '''
You are a highly intelligent AI tasked with analyzing articles to determine whether generating a timeline of events leading up to the key event in the article would be beneficial. Consider the following factors to make your decision:

    1. **Significance of the Event**:
       - Does the event have a significant impact on a large number of people, industries, or countries?
       - Are the potential long-term consequences of the event important?

    2. **Controversy or Debate**:
       - Is the event highly controversial or has it sparked significant debate?
       - Has the event garnered significant media attention and public interest?

    3. **Complexity**:
       - Does the event involve multiple factors, stakeholders, or causes that make it complex?
       - Does the event have deep historical roots or is it the culmination of long-term developments?

    4. **Personal Relevance**:
       - Does the event directly affect the reader or their community?
       - Is the event of particular interest to the reader due to economic implications, political affiliations, or social issues?

    5. Educational Purposes:
       - Would a timeline provide valuable learning or research information?

    Here is the information for the article:

    Title:{title}
    Text: {text}

    Based on the factors above, decide whether generating a timeline of events leading up to the key event in this article would be beneficial. Provide a brief explanation for your decision. 
    Then reply in terms of the need with a score 1 - 5, 1 means unnecessary, 5 means necessary, in JSON format, for example score 3. No need for an explanation
    ANSWER:
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text", "title"],
    template=template,
)

timeline_keys = []

for key, title_index in list(similar_articles_dict.items()):
    
    # Define the headline
    headline = title_index['Title']
    body = title_index['Text']

    # Format the prompt
    final_prompt = prompt.format(title=headline, text=body)

    # Generate content using the generative model
    response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        }
    )
    for i in range(len(response.parts[0].text)):
      if response.parts[0].text[i:i+5] == 'score':
         score = int(response.parts[0].text[i+8])
         print(key, title_index['Title'], score )
         if score >=3:
               timeline_keys.append(key)
timeline_keys
    

0 Nobel Peace Prize could honour  contributions of activists: Experts 3
3 MDIS says Jokowi’s son, a V-P candidate in Indonesia, did not fake his education 1
9 Azerbaijan, Armenia accuse each other of military build-up 3


[0, 9]

In [32]:
similar_articles_dict[timeline_keys[1]]

{'Title': 'Azerbaijan, Armenia accuse each other of military build-up',
 'indexes': [190, 766, 855, 976, 1129, 1367, 1798, 1852],
 'Text': 'LONDON - Armenia and Azerbaijan accused each other on Thursday of moving troops close to their joint border as tensions over the future of the Nagorno-Karabakh enclave rose even as the two countries said they remained committed to a peace process.Nagorno-Karabakh, an ethnic Armenian enclave internationally recognised as part of Azerbaijan but run by ethnic Armenian authorities, is at the centre of a rancorous standoff, with Azerbaijan restricting movement along the only road to it from Armenia to thwart what it says is arms smuggling.Armenian Prime Minister Nikol Pashinyan on Thursday accused Azerbaijan of conducting an “ongoing military build-up along the line of contact in Nagorno-Karabakh and the Armenia-Azerbaijan border”, according to Armenian state news agency Armenpress.Armenia’s foreign ministry, which said Yerevan was not interested in mil

In [90]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

class Event(BaseModel):
    Date: str = Field(description="The date of the event in YYYY-MM-DD format")
    Event: str = Field(description="A detailed description of the event")
    Article: int = Field(description="The article number from which the event was extracted")

output_parser = JsonOutputParser(pydantic_object=Event)

# See the prompt template you created for formatting
format_instructions = output_parser.get_format_instructions()

template = '''
Given a series of articles, each containing a publication date, title, and content, your task is to construct a detailed timeline of events leading up to the main event described in the first article.
Analyze the First Article: Begin by thoroughly analyzing the title, content, and publication date of the first article to understand the main event.
Use Subsequent Articles: For each following article, examine the title, content, and publication date. Identify events, context, and any time references such as "last week," "last month," or specific dates.

Construct the Timeline:
Chronological Order: Organize the events chronologically, using the publication dates and time references within the articles.
Detailed Descriptions: Provide detailed descriptions of each event, explaining how it relates to the main event of the first article.
Contextual Links: Use information from the articles to link events together logically and coherently.
Handle Ambiguities: If an article uses ambiguous time references, infer the date based on the publication date of the article and provide a clear rationale for your inference.

Contextual Links:
External Influences: Mention any external influences (e.g., global conflicts, economic trends, scientific discoveries) that might have indirectly affected the events.
Internal Issues: Highlight any internal issues or developments (e.g., political changes, organizational restructuring, societal movements) within the entities involved that might have impacted the events.
Efforts for Improvement: Note any indications of efforts to improve the situation (e.g., policy changes, strategic initiatives, collaborative projects) despite existing challenges.

Be as thorough and precise as possible, ensuring the timeline accurately reflects the sequence and context of events leading to the main event.

Series of Articles:
{text}

Example Output is in a json format as shown below:
JSON
[
{{'Date': '2023-10-07', 'Event': 'Event description', 'Article': 1}},
{{'Date': '2023-11-11', 'Event': 'Event description', 'Article': 2}},
{{'Date': '2023-11-12', 'Event': "Event description", 'Article': 3}}
]
Check and ensure again that the output follows the format instructions above very strictly. 

'''

prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
    template=template
)

In [80]:
similar_articles_dict[timeline_keys[1]]

{'Title': 'Azerbaijan, Armenia accuse each other of military build-up',
 'indexes': [190, 766, 855, 976, 1129, 1367, 1798, 1852],
 'Text': 'LONDON - Armenia and Azerbaijan accused each other on Thursday of moving troops close to their joint border as tensions over the future of the Nagorno-Karabakh enclave rose even as the two countries said they remained committed to a peace process.Nagorno-Karabakh, an ethnic Armenian enclave internationally recognised as part of Azerbaijan but run by ethnic Armenian authorities, is at the centre of a rancorous standoff, with Azerbaijan restricting movement along the only road to it from Armenia to thwart what it says is arms smuggling.Armenian Prime Minister Nikol Pashinyan on Thursday accused Azerbaijan of conducting an “ongoing military build-up along the line of contact in Nagorno-Karabakh and the Armenia-Azerbaijan border”, according to Armenian state news agency Armenpress.Armenia’s foreign ministry, which said Yerevan was not interested in mil

In [91]:
trial_keys = [9]
timelines_text = []
for key in trial_keys:
    df_retrieve = df_train.loc[similar_articles_dict[key]['indexes']]
    df_retrieve = pd.concat([df_retrieve, df_test.iloc[[key]]], axis=0)
    df_retrieve = df_retrieve.iloc[::-1].reset_index(drop=True)
    indiv_text = list(df_retrieve.combined.values)
    indiv_dates = list(df_retrieve.Publication_date.values)
    all = []
    for i in range(len(indiv_text)):
        s =  f'Article {i+1}: Publication date: {indiv_dates[i]}  {indiv_text[i]}'
        all.append(s)
    sum_of_text = ", ".join(all) 
    
    final_prompt = prompt.format(text=sum_of_text)
    response = llm.generate_content(final_prompt,
                                   safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    }
    )

In [93]:
print(response.parts[0].text)

{
"Timeline": [
{
"Date": "Pre-2020",
"Event": "Russia brokered a ceasefire after the last war in the region in 2020, in which Azerbaijan regained control of swathes of territory in Nagorno-Karabakh and the surrounding regions.",
"Article": 9
},
{
"Date": "2023-09-19",
"Event": "Azerbaijan launched an offensive after the separatist authority reported a major escalation of hostilities in Azerbaijan's Nagorno-Karabakh region.",
"Article": 9
},
{
"Date": "2023-09-20",
"Event": "Thousands of Armenians gathered at the airport in Nagorno-Karabakh after separatist forces agreed to a ceasefire with Azerbaijan.",
"Article": 4
},
{
"Date": "2023-09-21",
"Event": "Armenia's Prime Minister Nikol Pashinyan said that Armenia needs to be 'free of conflict' for the sake of its independence, after the fall of Nagorno-Karabakh.",
"Article": 7
},
{
"Date": "2023-09-24",
"Event": "The ethnic Armenian leadership of breakaway Nagorno-Karabakh said that the terms of their ceasefire with Azerbaijan were being

In [458]:
from datetime import datetime
def check_symmetry(string):
    open, closed=0, 0
    for char in string:
        if char =='{':
            open  += 1
        elif char =="}":
            closed += 1
    return open==closed
    

def generate_timeline(output):
    text = output.parts[0].text
    cleaned_string = text.strip('```').strip()
    # Found out that some generations had more than one '}' at the end, check for symmetry
    if not check_symmetry(cleaned_string):
        cleaned_string = text.strip('```').strip()[:-1]
    # Parse the cleaned string into a list of dictionaries

    events = json.loads(f'[{cleaned_string}]')
    for event in events:
        # Retrieve article id of respective events in timeline
        article_index = event["Article"] - 1
        event["Article_id"] = df_retrieve.iloc[article_index].id
        del event["Article"]
    
    sorted_timeline = sorted(events, key=lambda x: datetime.strptime(x['Date'], '%Y-%m-%d'))
    return sorted_timeline

In [472]:
timeline = generate_timeline(response)
timeline

[{'Date': '2023-10-07',
  'Event': 'Hamas militants carried out a deadly cross-border assault on Israel, killing around 1,200 people, mostly civilians, and taking around 240 people hostage.',
  'Article_id': '9374ojr66hnevmhg'},
 {'Date': '2023-11-11',
  'Event': 'Israel launched a ground offensive into Gaza to eliminate Hamas after its deadly cross-border assault on Oct 7.',
  'Article_id': '9374ojr66hnevmhg'},
 {'Date': '2023-11-12',
  'Event': "Israeli military offered to evacuate babies from Gaza's Al Shifa hospital after two newborns died as fuel ran out amid intense fighting.",
  'Article_id': 'pcc5muz8fbpivl7d'},
 {'Date': '2023-11-12',
  'Event': "Israeli military stated it would help evacuate babies from Gaza's Al Shifa hospital at the request of the staff.",
  'Article_id': 'lof2tcw594hrdhjd'},
 {'Date': '2023-11-14',
  'Event': "Israeli military claimed to have found signs that hostages were held in Gaza's Rantissi Hospital, a paediatric hospital.",
  'Article_id': 'gg689sax

In [2]:
timeline = [{'Date': '2023-10-07',
  'Event': 'Hamas militants carried out a deadly cross-border assault on Israel, killing around 1,200 people, mostly civilians, and taking around 240 people hostage.',
  'Article_id': '9374ojr66hnevmhg'},
 {'Date': '2023-11-11',
  'Event': 'Israel launched a ground offensive into Gaza to eliminate Hamas after its deadly cross-border assault on Oct 7.',
  'Article_id': '9374ojr66hnevmhg'},
 {'Date': '2023-11-12',
  'Event': "Israeli military offered to evacuate babies from Gaza's Al Shifa hospital after two newborns died as fuel ran out amid intense fighting.",
  'Article_id': 'pcc5muz8fbpivl7d'},
 {'Date': '2023-11-12',
  'Event': "Israeli military stated it would help evacuate babies from Gaza's Al Shifa hospital at the request of the staff.",
  'Article_id': 'lof2tcw594hrdhjd'},
 {'Date': '2023-11-14',
  'Event': "Israeli military claimed to have found signs that hostages were held in Gaza's Rantissi Hospital, a paediatric hospital.",
  'Article_id': 'gg689saxdq0uw7us'},
 {'Date': '2023-11-18',
  'Event': 'UN team visited Al-Shifa hospital and described it as a “death zone” after witnessing hundreds fleeing the hospital following Israeli army’s order to empty it.',
  'Article_id': '9374ojr66hnevmhg'},
 {'Date': '2023-11-19',
  'Event': '31 premature babies were evacuated from Gaza City’s Al-Shifa hospital in a high-risk operation by Palestinian medics.',
  'Article_id': '9374ojr66hnevmhg'},
 {'Date': '2023-11-20',
  'Event': 'Israeli tanks were reported near a hospital complex in north Gaza where 12 Palestinians were killed and dozens wounded.',
  'Article_id': 'i9ihivdhoxkpzqlz'}]

In [5]:
import json
json_data = json.dumps(timeline, indent=4, ensure_ascii=False)

# Write the JSON string to a file
with open('../data_upload/timeline_trial1.json', 'w', encoding='utf-8' ) as fin:
    fin.write(json_data)

## NEW TESTING:
- Try generating a timeline of events for each article in the selected articles.


In [13]:
# testing
# 4: {'Title': 'Blinken says Palestinian voices key to Gaza future',
#   'indexes': [280, 632, 1301, 1397, 1480, 1505],

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


In [33]:
similar_articles_dict[timeline_keys[1]]

{'Title': 'Azerbaijan, Armenia accuse each other of military build-up',
 'indexes': [190, 766, 855, 976, 1129, 1367, 1798, 1852],
 'Text': 'LONDON - Armenia and Azerbaijan accused each other on Thursday of moving troops close to their joint border as tensions over the future of the Nagorno-Karabakh enclave rose even as the two countries said they remained committed to a peace process.Nagorno-Karabakh, an ethnic Armenian enclave internationally recognised as part of Azerbaijan but run by ethnic Armenian authorities, is at the centre of a rancorous standoff, with Azerbaijan restricting movement along the only road to it from Armenia to thwart what it says is arms smuggling.Armenian Prime Minister Nikol Pashinyan on Thursday accused Azerbaijan of conducting an “ongoing military build-up along the line of contact in Nagorno-Karabakh and the Armenia-Azerbaijan border”, according to Armenian state news agency Armenpress.Armenia’s foreign ministry, which said Yerevan was not interested in mil

In [212]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
llm = genai.GenerativeModel('gemini-1.5-pro' )

class Event(BaseModel):
    Date: str = Field(description="The date of the event in YYYY-MM-DD format")
    Event: str = Field(description="A detailed description of the event")
    Article: int = Field(description="The article number from which the event was extracted")

output_parser = JsonOutputParser(pydantic_object=Event)

# See the prompt template you created for formatting
format_instructions = output_parser.get_format_instructions()

template = '''
Given an article, containing a publication date, title, and content, your task is to construct a detailed timeline of events leading up to the main event described in the first article.
Begin by thoroughly analyzing the title, content, and publication date of the first article to understand the main event. 
the dates are represented in YYYY-MM-DD format. Identify events, context, and any time references such as "last week," "last month," or specific dates. 
The article could contain more or one key events. 

Construct the Timeline:
Chronological Order: Organize the events chronologically, using the publication dates and time references within the articles.
Detailed Descriptions: Provide detailed descriptions of each event, explaining how it relates to the main event of the first article.
Contextual Links: Use information from the articles to link events together logically and coherently.
Handle Ambiguities: If the article uses ambiguous time references, infer the date based on the publication date of the article and provide a clear rationale for your inference. Do not use dates like 'Thursday', state the actual referred date instead.

Contextual Links:
External Influences: Mention any external influences (e.g., global conflicts, economic trends, scientific discoveries) that might have indirectly affected the events.
Internal Issues: Highlight any internal issues or developments (e.g., political changes, organizational restructuring, societal movements) within the entities involved that might have impacted the events.
Efforts for Improvement: Note any indications of efforts to improve the situation (e.g., policy changes, strategic initiatives, collaborative projects) despite existing challenges.

Be as thorough and precise as possible, ensuring the timeline accurately reflects the sequence and context of events leading to the main event.

Article:
{text}

Example Output:
json
[
  {{'Date': '2023-10-07', 'Event': 'Antony Blinken, U.S. top diplomat, visited Ramallah and met with Palestinian Authority President Mahmoud Abbas to discuss the future of the Gaza Strip.', 'Article': 1}},
  {{'Date': '2023-11-11', 'Event': 'Event description', 'Article': 2}},
  {{'Date': '2023-11-12', 'Event': "Event description", 'Article': 3}}
]
Check and ensure again that the output follows the format instructions above very strictly. 
'''

prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
    template=template
)

In [181]:
trial_keys = [9]
timelines_text = []

df_retrieve = df_train.loc[similar_articles_dict[key]['indexes']]
df_retrieve = pd.concat([df_retrieve, df_test.iloc[[key]]], axis=0)
df_retrieve = df_retrieve.iloc[::-1].reset_index(drop=True)
df_retrieve

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,Cluster_labels
0,x6bs26mxih4wjnvb,LONDON - Armenia and Azerbaijan accused each o...,"Azerbaijan, Armenia accuse each other of milit...","[0.063698, -0.023136, 0.008678, 0.013113, -0.0...","Title: Azerbaijan, Armenia accuse each other o...","[Armenia, Azerbaijan, Nagorno-Karabakh, Russia...","[0.03720000758767128, 0.010111724957823753, -0...","[0.035694, -0.013111, 0.013628, 0.005414, 0.01...",2023-09-08,523
1,96moyhcopwdgflfi,Four ex-leaders of Azerbaijan's formerly ethni...,"Four Karabakh leaders held in Azerbaijan, thre...","[0.054137, -0.006537, 0.007394, 0.020186, -0.0...",Title: Four Karabakh leaders held in Azerbaija...,"[Azerbaijan, Armenia, Nagorno-Karabakh, Ex-Kar...","[0.04491448402404785, -0.0014135852688923478, ...","[0.094126, -0.055465, -0.002879, 0.023244, -0....",2023-10-04,523
2,j40zdfvo5mm8yv3c,A Russian truck carrying food aid for Armenian...,Russia truck sets off with food aid for Armeni...,"[0.030952, -0.013953, -0.001719, 0.044435, -0....",Title: Russia truck sets off with food aid for...,"[Russia, Armenia, Nagorno-Karabakh, Khankendi,...","[0.040053002536296844, -0.0006843761657364666,...","[0.03145, -0.020559, -0.000136, 0.045389, 0.00...",2023-09-12,523
3,9owr18ngmcvsob4x,"MOSCOW - Armenia needs to be ""free of con...","Armenia needs peace, PM says after Azerbaijan ...","[0.044655, 0.050847, -0.003256, -0.007795, -0....","Title: Armenia needs peace, PM says after Azer...","[Armenia, Nagorno-Karabakh, Azerbaijan, Nikol ...","[0.043720196932554245, 0.0023561876732856035, ...","[0.05158, 0.040333, -0.038155, -0.006117, -0.0...",2023-09-21,523
4,kgiisc7nhbtx8o7o,"GORIS, Armenia - After the village was bo...","Fleeing bombs and death, Karabakh Armenians re...","[0.015886, -0.005729, 0.008714, -0.003721, -0....","Title: Fleeing bombs and death, Karabakh Armen...","[Armenia, Karabakh, Azerbaijan, Refugees, Conf...","[0.03360241651535034, 0.0005320683121681213, -...","[0.031704, -0.012698, 0.001959, -0.060447, 0.0...",2023-09-25,523
5,m5vcc0dfsmuho41p,"NEAR KORNIDZOR, Armenia - The ethnic Armenian ...",Karabakh Armenians say ceasefire being impleme...,"[0.034208, -0.022481, 0.008892, 0.018049, -0.0...",Title: Karabakh Armenians say ceasefire being ...,"[Nagorno-Karabakh, Ceasefire, Humanitarian Aid...","[0.030207229778170586, 0.009617523290216923, -...","[0.054287, -0.004536, -0.006971, -0.0148, -0.0...",2023-09-24,523
6,8pjpbua9arko3ovu,YEREVAN - Thousands of Armenians in Nagor...,Thousands of Armenians in Karabakh mass at air...,"[0.028429, -0.021259, -0.006373, 0.000124, -0....",Title: Thousands of Armenians in Karabakh mass...,"[Armenia, Karabakh, Ceasefire, Russia, Azerbai...","[0.030525000765919685, 0.008214612491428852, -...","[0.013365, -0.000876, 1.7e-05, -0.033337, -0.0...",2023-09-20,523
7,vehvbiu65a6u2ryr,YEREVAN - Several hundred protesters gath...,Several hundred protesters gather in Armenian ...,"[0.015835, -0.007741, 0.006544, -0.007725, -0....",Title: Several hundred protesters gather in Ar...,"[Armenia, Nagorno-Karabakh, Azerbaijan, Protes...","[0.020311029627919197, 0.02640683390200138, -0...","[-0.00594, -0.002363, -0.00611, 0.015682, 0.00...",2023-09-20,523
8,bon7mzog28fpzkqk,TBILISI - Ethnic Armenian separatist auth...,Azerbaijan launches 'anti-terrorist operation'...,"[0.060131, 0.009901, 0.00471, -0.015017, -0.06...",Title: Azerbaijan launches 'anti-terrorist ope...,"[Azerbaijan, Nagorno-Karabakh, Armenia, Russia...","[0.037528347223997116, -0.0033757074270397425,...","[0.040035, -0.037588, 0.031369, -0.039279, -0....",2023-09-19,523


In [182]:
timelines = {}
indiv_text = list(df_retrieve.combined.values)
indiv_dates = list(df_retrieve.Publication_date.values)
for i in range(len(indiv_text)):
    s =  f'Article {i+1}: Publication date: {indiv_dates[i]}  {indiv_text[i]}'
    final_prompt = prompt.format(text=s)
    response = llm.generate_content(final_prompt,
                                    safety_settings={
                                        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                        }
    )
    #key corresponds to the index in the df_test
    try:
        timelines[i] = response.parts[0].text
    except ValueError:
        timelines[i] = "Timeline could not be generated"

timelines

{0: '[\n  {\n    "Date": "2023-09-08",\n    "Event": "Armenia accuses Azerbaijan of conducting a military build-up along the border of Nagorno-Karabakh and the Armenia-Azerbaijan border.",\n    "Article": 1\n  },\n  {\n    "Date": "2023-09-08",\n    "Event": "Azerbaijan’s foreign ministry rejects Armenia\'s assertion and accuses Yerevan of conducting military and political provocations.",\n    "Article": 1\n  },\n  {\n    "Date": "2023-09-08",\n    "Event": "Azerbaijan’s foreign policy adviser, Hikmet Hajiyev, confirms that Azerbaijan\'s armed forces are conducting pre-planned drills, but accuses Armenia of concentrating troops on the border and purchasing new weaponry systems.",\n    "Article": 1\n  },\n  {\n    "Date": "2023-09-08",\n    "Event": "Russia criticizes Armenia\'s decision to host a joint exercise involving 85 US soldiers, stating that it does not contribute to stabilizing the situation in the region.",\n    "Article": 1\n  }\n]',
 1: '```json\n[\n  {\n    "Date": "2020-0

In [216]:
#Use gemini 1.5 pro for the tidy up of the timeline
llm = genai.GenerativeModel(model_name='gemini-1.5-pro-latest')
template = '''
You are given multiple JSON strings, which is a timeline of events, representing events with dates and articles. 
Some of these strings are enclosed in triple backticks and may contain unnecessary escape characters or extra braces. 
Your task is to clean and reformat these JSON strings into a well-structured list of dictionaries. 
Each dictionary should contain the keys "Date", "Event", and "Article".
Ensure that there is no information loss in your attempt to reformat the events. 

Timeline:
{text}

Example Output:
json
[
  {{'Date': '2023-10-07', 'Event': 'Antony Blinken, U.S. top diplomat, visited Ramallah and met with Palestinian Authority President.', 'Article': 1}},
  {{'Date': '2023-11-11', 'Event': 'Event description', 'Article': 2}},
  {{'Date': '2023-11-12', 'Event': "Event description", 'Article': 3}}
]
'''
prompt = PromptTemplate(
    input_variables=["text"],
    template=template
)


In [214]:
delete = []
for k,v in timelines.items():
    for i in range(len(v)):
        if v[i:i+10] == 'properties':
            delete.append(k)
for el in delete:
    del timelines[el]
timelines
    

{0: '[\n  {\n    "Date": "2023-09-08",\n    "Event": "Armenia accuses Azerbaijan of conducting a military build-up along the border of Nagorno-Karabakh and the Armenia-Azerbaijan border.",\n    "Article": 1\n  },\n  {\n    "Date": "2023-09-08",\n    "Event": "Azerbaijan’s foreign ministry rejects Armenia\'s assertion and accuses Yerevan of conducting military and political provocations.",\n    "Article": 1\n  },\n  {\n    "Date": "2023-09-08",\n    "Event": "Azerbaijan’s foreign policy adviser, Hikmet Hajiyev, confirms that Azerbaijan\'s armed forces are conducting pre-planned drills, but accuses Armenia of concentrating troops on the border and purchasing new weaponry systems.",\n    "Article": 1\n  },\n  {\n    "Date": "2023-09-08",\n    "Event": "Russia criticizes Armenia\'s decision to host a joint exercise involving 85 US soldiers, stating that it does not contribute to stabilizing the situation in the region.",\n    "Article": 1\n  }\n]',
 1: '```json\n[\n  {\n    "Date": "2020-0

In [217]:
combined_timeline = ''
for k,v in timelines.items():
    combined_timeline += v.replace("\n    ", "").replace("\n  ", "").replace("\n", "")
        
final_prompt = prompt.format(text=combined_timeline)
# response = llm.generate_content(final_prompt,
#                                     safety_settings={
#                                         HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
#                                         HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
#                                         HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
#                                         HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
#                                         }
#     )

    

In [218]:
print(response.parts[0].text)

```json
[
  {"Date": "2023-09-08", "Event": "Armenia accuses Azerbaijan of conducting a military build-up along the border of Nagorno-Karabakh and the Armenia-Azerbaijan border.", "Article": 1},
  {"Date": "2023-09-08", "Event": "Azerbaijan’s foreign ministry rejects Armenia's assertion and accuses Yerevan of conducting military and political provocations.", "Article": 1},
  {"Date": "2023-09-08", "Event": "Azerbaijan’s foreign policy adviser, Hikmet Hajiyev, confirms that Azerbaijan's armed forces are conducting pre-planned drills, but accuses Armenia of concentrating troops on the border and purchasing new weaponry systems.", "Article": 1},
  {"Date": "2023-09-08", "Event": "Russia criticizes Armenia's decision to host a joint exercise involving 85 US soldiers, stating that it does not contribute to stabilizing the situation in the region.", "Article": 1},
  {"Date": "2020-09-27", "Event": "Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (N

In [220]:
json_string = response.parts[0].text
json_string

'```json\n[\n  {"Date": "2023-09-08", "Event": "Armenia accuses Azerbaijan of conducting a military build-up along the border of Nagorno-Karabakh and the Armenia-Azerbaijan border.", "Article": 1},\n  {"Date": "2023-09-08", "Event": "Azerbaijan’s foreign ministry rejects Armenia\'s assertion and accuses Yerevan of conducting military and political provocations.", "Article": 1},\n  {"Date": "2023-09-08", "Event": "Azerbaijan’s foreign policy adviser, Hikmet Hajiyev, confirms that Azerbaijan\'s armed forces are conducting pre-planned drills, but accuses Armenia of concentrating troops on the border and purchasing new weaponry systems.", "Article": 1},\n  {"Date": "2023-09-08", "Event": "Russia criticizes Armenia\'s decision to host a joint exercise involving 85 US soldiers, stating that it does not contribute to stabilizing the situation in the region.", "Article": 1},\n  {"Date": "2020-09-27", "Event": "Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of 

In [221]:
from datetime import datetime
def check_symmetry(string):
    open, closed=0, 0
    for char in string:
        if char =='{':
            open  += 1
        elif char =="}":
            closed += 1
    return open==closed
    

def generate_timeline(output):
    text = output.replace("[", "").replace("]", "").replace("json", "")
    cleaned_string = text.strip('```').strip()
    # Found out that some generations had more than one '}' at the end, check for symmetry
    if not check_symmetry(cleaned_string):
        cleaned_string = text.strip('```').strip()[:-1]
    # Parse the cleaned string into a list of dictionaries

    events = json.loads(f'[{cleaned_string}]')
    for event in events:
        # Retrieve article id of respective events in timeline
        article_index = event["Article"] - 1
        event["Article_id"] = df_retrieve.iloc[article_index].id
        del event["Article"]
    
    return events

In [222]:
data = generate_timeline(json_string)
data

[{'Date': '2023-09-08',
  'Event': 'Armenia accuses Azerbaijan of conducting a military build-up along the border of Nagorno-Karabakh and the Armenia-Azerbaijan border.',
  'Article_id': 'x6bs26mxih4wjnvb'},
 {'Date': '2023-09-08',
  'Event': "Azerbaijan’s foreign ministry rejects Armenia's assertion and accuses Yerevan of conducting military and political provocations.",
  'Article_id': 'x6bs26mxih4wjnvb'},
 {'Date': '2023-09-08',
  'Event': "Azerbaijan’s foreign policy adviser, Hikmet Hajiyev, confirms that Azerbaijan's armed forces are conducting pre-planned drills, but accuses Armenia of concentrating troops on the border and purchasing new weaponry systems.",
  'Article_id': 'x6bs26mxih4wjnvb'},
 {'Date': '2023-09-08',
  'Event': "Russia criticizes Armenia's decision to host a joint exercise involving 85 US soldiers, stating that it does not contribute to stabilizing the situation in the region.",
  'Article_id': 'x6bs26mxih4wjnvb'},
 {'Date': '2020-09-27',
  'Event': 'Azerbaijan 

In [224]:
sorted_timeline = sorted(data, key=lambda x: datetime.strptime(x['Date'], '%Y-%m-%d'))
sorted_timeline

[{'Date': '2020-09-27',
  'Event': 'Azerbaijan launched a large-scale offensive against the self-proclaimed Republic of Artsakh (Nagorno-Karabakh), which resulted in a six-week war.',
  'Article_id': '96moyhcopwdgflfi'},
 {'Date': '2020-09-27',
  'Event': 'The Second Karabakh War began, with Azerbaijan retaking control of several territories in Nagorno-Karabakh.',
  'Article_id': 'vehvbiu65a6u2ryr'},
 {'Date': '2020-09-27',
  'Event': 'Azerbaijan regained control of swathes of territory in Nagorno-Karabakh and surrounding regions.',
  'Article_id': 'bon7mzog28fpzkqk'},
 {'Date': '2020-11-10',
  'Event': 'A tripartite ceasefire agreement was signed between Armenia, Azerbaijan, and Russia, ending the 2020 Nagorno-Karabakh war.',
  'Article_id': '96moyhcopwdgflfi'},
 {'Date': '2020-11-10',
  'Event': 'A ceasefire was signed between Armenia and Azerbaijan, with Nagorno-Karabakh being returned to Azerbaijani control.',
  'Article_id': 'vehvbiu65a6u2ryr'},
 {'Date': '2022-03-24',
  'Event': 

In [227]:
json_data = json.dumps(sorted_timeline, indent=4, ensure_ascii=False)

# Write the JSON string to a file
with open('../data_upload/single_timeline_trial.json', 'w', encoding='utf-8' ) as fin:
    fin.write(json_data)

In [226]:
llm = genai.GenerativeModel(model_name = 'gemini-1.0-pro')

In [235]:
df_retrieve

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,Cluster_labels
0,x6bs26mxih4wjnvb,LONDON - Armenia and Azerbaijan accused each o...,"Azerbaijan, Armenia accuse each other of milit...","[0.063698, -0.023136, 0.008678, 0.013113, -0.0...","Title: Azerbaijan, Armenia accuse each other o...","[Armenia, Azerbaijan, Nagorno-Karabakh, Russia...","[0.03720000758767128, 0.010111724957823753, -0...","[0.035694, -0.013111, 0.013628, 0.005414, 0.01...",2023-09-08,523
1,96moyhcopwdgflfi,Four ex-leaders of Azerbaijan's formerly ethni...,"Four Karabakh leaders held in Azerbaijan, thre...","[0.054137, -0.006537, 0.007394, 0.020186, -0.0...",Title: Four Karabakh leaders held in Azerbaija...,"[Azerbaijan, Armenia, Nagorno-Karabakh, Ex-Kar...","[0.04491448402404785, -0.0014135852688923478, ...","[0.094126, -0.055465, -0.002879, 0.023244, -0....",2023-10-04,523
2,j40zdfvo5mm8yv3c,A Russian truck carrying food aid for Armenian...,Russia truck sets off with food aid for Armeni...,"[0.030952, -0.013953, -0.001719, 0.044435, -0....",Title: Russia truck sets off with food aid for...,"[Russia, Armenia, Nagorno-Karabakh, Khankendi,...","[0.040053002536296844, -0.0006843761657364666,...","[0.03145, -0.020559, -0.000136, 0.045389, 0.00...",2023-09-12,523
3,9owr18ngmcvsob4x,"MOSCOW - Armenia needs to be ""free of con...","Armenia needs peace, PM says after Azerbaijan ...","[0.044655, 0.050847, -0.003256, -0.007795, -0....","Title: Armenia needs peace, PM says after Azer...","[Armenia, Nagorno-Karabakh, Azerbaijan, Nikol ...","[0.043720196932554245, 0.0023561876732856035, ...","[0.05158, 0.040333, -0.038155, -0.006117, -0.0...",2023-09-21,523
4,kgiisc7nhbtx8o7o,"GORIS, Armenia - After the village was bo...","Fleeing bombs and death, Karabakh Armenians re...","[0.015886, -0.005729, 0.008714, -0.003721, -0....","Title: Fleeing bombs and death, Karabakh Armen...","[Armenia, Karabakh, Azerbaijan, Refugees, Conf...","[0.03360241651535034, 0.0005320683121681213, -...","[0.031704, -0.012698, 0.001959, -0.060447, 0.0...",2023-09-25,523
5,m5vcc0dfsmuho41p,"NEAR KORNIDZOR, Armenia - The ethnic Armenian ...",Karabakh Armenians say ceasefire being impleme...,"[0.034208, -0.022481, 0.008892, 0.018049, -0.0...",Title: Karabakh Armenians say ceasefire being ...,"[Nagorno-Karabakh, Ceasefire, Humanitarian Aid...","[0.030207229778170586, 0.009617523290216923, -...","[0.054287, -0.004536, -0.006971, -0.0148, -0.0...",2023-09-24,523
6,8pjpbua9arko3ovu,YEREVAN - Thousands of Armenians in Nagor...,Thousands of Armenians in Karabakh mass at air...,"[0.028429, -0.021259, -0.006373, 0.000124, -0....",Title: Thousands of Armenians in Karabakh mass...,"[Armenia, Karabakh, Ceasefire, Russia, Azerbai...","[0.030525000765919685, 0.008214612491428852, -...","[0.013365, -0.000876, 1.7e-05, -0.033337, -0.0...",2023-09-20,523
7,vehvbiu65a6u2ryr,YEREVAN - Several hundred protesters gath...,Several hundred protesters gather in Armenian ...,"[0.015835, -0.007741, 0.006544, -0.007725, -0....",Title: Several hundred protesters gather in Ar...,"[Armenia, Nagorno-Karabakh, Azerbaijan, Protes...","[0.020311029627919197, 0.02640683390200138, -0...","[-0.00594, -0.002363, -0.00611, 0.015682, 0.00...",2023-09-20,523
8,bon7mzog28fpzkqk,TBILISI - Ethnic Armenian separatist auth...,Azerbaijan launches 'anti-terrorist operation'...,"[0.060131, 0.009901, 0.00471, -0.015017, -0.06...",Title: Azerbaijan launches 'anti-terrorist ope...,"[Azerbaijan, Nagorno-Karabakh, Armenia, Russia...","[0.037528347223997116, -0.0033757074270397425,...","[0.040035, -0.037588, 0.031369, -0.039279, -0....",2023-09-19,523


In [237]:
list_of_dicts = list_of_dicts = df.to_dict(orient='records')
json_string = json.dumps(list_of_dicts, indent=4)
with open('../data_upload/df_retrieve.json', 'w', encoding='utf-8') as f:
    f.write(json_string)