## Exporting necessary libraries

In [1]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster


# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: /home/jerry/Desktop/timeline project/timeline/bin/python3: No such file or directory
/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: exec: /home/jerry/Desktop/timeline project/timeline/bin/python3: cannot execute: No such file or directory


## Load and Combine JSON files

In [2]:
def combine_json(files):
    combined_data = []
    for file in files:
        with open(file, 'r') as f:
            # Load data from the file and append it to the combined list
            data = json.load(f)
            combined_data.extend(data)
    return combined_data

# Example usage
files = ['../data_upload/final_db1.json', '../data_upload/final_db2.json', '../data_upload/final_db3.json', '../data_upload/final_db4.json']
db = combine_json(files)

In [3]:
def read_load_json(json_data):
    for item in json_data:
        #Convert the embeddings to json string as CSVs dont accept list as a data type
        item['tags_embeddings'] = json.dumps(item['tags_embeddings'])
        item['Title_embeddings'] = json.dumps(item['Title_embeddings'])
    df = pd.DataFrame(json_data)
    return df

In [4]:
df = read_load_json(db)
df.head()

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date
0,nos7tzp7jprxlqxe,GENEVA – The remains of a climber discovered i...,Remains found in Swiss Alps are those of Briti...,"[0.063923, 0.065677, -0.001089, 0.065425, -0.0...",Title: Remains found in Swiss Alps are those o...,"[Missing Climber, Swiss Alps, Glaciers, Global...","[0.025687463581562042, 0.03274165466427803, -0...","[0.021028, 0.006548, 0.037958, 0.049163, -0.00...",2023-09-01
1,zvv4ue0w64vfqoz1,Ms Greta Thunburg became a household name when...,Involve youth in shaping ethical use of AI,"[0.063668, 0.098002, -0.022514, -0.033031, -0....",Title: Involve youth in shaping ethical use of...,"[Youth activism, Artificial intelligence, Ethi...","[0.026038197800517082, 0.05095928534865379, -0...","[0.033077, 0.121931, -0.034714, 0.012957, -0.0...",2023-09-02
2,aph1tgua3xxoq2sg,NEW YORK - Defending women's champion Iga...,"Swiatek, Djokovic headline third round action ...","[-0.019315, 0.066645, 0.009547, 0.029555, -0.0...","Title: Swiatek, Djokovic headline third round ...","[US Open, Grand Slam, Novak Djokovic, Iga Swia...","[-0.04092131927609444, 0.015564153902232647, -...","[-0.018808, -0.049826, 0.005458, -0.010391, -0...",2023-09-01
3,rlh53czyst054zfn,JAKARTA – Hopes of a return to democracy in ju...,‘Systematic repression’ crushing Myanmar’s dem...,"[0.067328, -0.004407, 0.010127, -0.004268, -0....",Title: ‘Systematic repression’ crushing Myanma...,"[Myanmar, UN chief, ASEAN, Rohingya, Military ...","[0.02929660677909851, 0.0006651841104030609, -...","[0.059998, -0.014698, 0.02184, -0.031714, 0.00...",2023-09-07
4,aksixz7uun2gkpss,JERUSALEM - Israel's shekel dropped to it...,Israel's shekel falls as judicial showdown looms,"[-0.043186, 0.076352, -0.015492, -0.02859, -0....",Title: Israel's shekel falls as judicial showd...,"[Israel, Shekel, Judicial crisis, Supreme Cour...","[0.015406888909637928, 0.04966922104358673, 0....","[-0.02634, 0.070879, 0.013255, -0.008821, -0.0...",2023-09-07


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2008 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2008 non-null   object
 1   Text              2008 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2008 non-null   object
 4   combined          2007 non-null   object
 5   tags              2008 non-null   object
 6   tags_embeddings   2008 non-null   object
 7   Title_embeddings  2008 non-null   object
 8   Publication_date  2008 non-null   object
dtypes: object(9)
memory usage: 141.3+ KB


In [140]:
## NEED TO CHECK 

In [8]:
nan_rows = df[df.isnull().any(axis=1)]
df = df.drop(nan_rows.index)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


## Data Preprocessing 
- Concatentation of embeddings
- Standardisation of embeddings

In [10]:
# Deserialising of embeddings
body_embeddings= np.array(df['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings= np.array(df['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings= np.array(df['tags_embeddings'].apply(ast.literal_eval).tolist())
all_embeddings = np.concatenate((body_embeddings, title_embeddings, tags_embeddings), axis=1)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train_embeddings, test_embeddings = train_test_split(all_embeddings, test_size=10, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_embeddings)
X_test_scaled = scaler.transform(test_embeddings)

In [14]:
X_train_scaled.shape

(1997, 2304)

## Conducting PCA Experimentation to find best amount of variance.

#### For variance range of 94% to 97%

In [20]:
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.neighbors import KNeighborsClassifier
# Experiment for this variance range of 94% to 97%
variance_range = list(np.arange(0.92, 0.97, 0.01))
variance_perf = {}

for variance in variance_range:
    pca = PCA(n_components=variance)
    train_pca_embeddings = pca.fit_transform(X_train_scaled)
    test_pca_embeddings = pca.transform(X_test_scaled)
    
    # Range of max_d values to try
    max_d_values = np.arange(45, 70)
    
    # Lists to store silhouette scores
    silhouette_scores_train = []
    silhouette_scores_test = []

    # Perform hierarchical clustering
    Z = linkage(train_pca_embeddings, method='ward')

    for max_d in max_d_values:
        clusters_train = fcluster(Z, max_d, criterion='distance')
        
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(train_pca_embeddings, clusters_train)
        clusters_test = knn.predict(test_pca_embeddings)
        
        # Calculate silhouette scores only if there are at least 2 unique clusters and fewer than the number of samples
        if 1 < len(set(clusters_train)) < len(train_pca_embeddings):
            score_train = silhouette_score(train_pca_embeddings, clusters_train)
        else:
            score_train = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        if 1 < len(set(clusters_test)) < len(test_pca_embeddings):
            score_test = silhouette_score(test_pca_embeddings, clusters_test)
        else:
            score_test = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        silhouette_scores_train.append(score_train)
        silhouette_scores_test.append(score_test)

    # Determine the best max_d
    best_max_d_train = max_d_values[np.argmax(silhouette_scores_train)]
    best_max_d_test = max_d_values[np.argmax(silhouette_scores_test)]
    variance_perf[variance] = {
        'max_d_train': best_max_d_train,
        "max_d_test": best_max_d_test,
        'best_train_silhouette': max(silhouette_scores_train),
        "best_test_silhouette": max(silhouette_scores_test)
    }

In [21]:
variance_perf

{0.92: {'max_d_train': 54,
  'max_d_test': 45,
  'best_train_silhouette': 0.09662836924445259,
  'best_test_silhouette': -1},
 0.93: {'max_d_train': 55,
  'max_d_test': 45,
  'best_train_silhouette': 0.09493866482143248,
  'best_test_silhouette': -1},
 0.9400000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.09341043113832068,
  'best_test_silhouette': -1},
 0.9500000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.0918934174697297,
  'best_test_silhouette': -1},
 0.9600000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.0900331324671914,
  'best_test_silhouette': -1}}

In [25]:
# Find the best test variance based on the silhouette score
def get_best_variance(perf_results):
    highest_train_sil = 0
    best_variance_s = []
    for variance, scores in perf_results.items():
        if scores['best_train_silhouette'] > highest_train_sil:
            highest_train_sil = scores['best_train_silhouette']
            best_variance_s = [variance]  
        elif scores['best_train_silhouette'] == highest_train_sil:
            best_variance_s.append(variance)  
    
    final_best_max_d = perf_results[best_variance_s[0]]['max_d_train']
    return round(best_variance_s[0], 2), final_best_max_d

best_variance, best_max_d = get_best_variance(variance_perf)
print((best_variance , best_max_d))          

(0.92, 54)


## Best parameters when test size is 10

In [164]:
print(best_variance)
print(best_max_d)

0.92
57


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


In [163]:
df_test = df.sample(10)
df_train = df.drop(df_test.index)

In [168]:
best_max_d += 2

In [169]:
# Deserializing the embeddings
body_embeddings_train = np.array(df_train['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_train = np.array(df_train['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_train = np.array(df_train['tags_embeddings'].apply(ast.literal_eval).tolist())

body_embeddings_test = np.array(df_test['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_test = np.array(df_test['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_test = np.array(df_test['tags_embeddings'].apply(ast.literal_eval).tolist())

# Combine embeddings
all_embeddings_train = np.concatenate((body_embeddings_train, title_embeddings_train, tags_embeddings_train), axis=1)
all_embeddings_test = np.concatenate((body_embeddings_test, title_embeddings_test, tags_embeddings_test), axis=1)

# Standardize embeddings
scaler = StandardScaler()
train_embeddings = scaler.fit_transform(all_embeddings_train)
test_embeddings = scaler.transform(all_embeddings_test)

# Perform PCA
pca = PCA(n_components=best_variance)
pca_train_embeddings = pca.fit_transform(train_embeddings)
pca_test_embeddings = pca.transform(test_embeddings)


Z = linkage(pca_train_embeddings, method='ward', metric='euclidean')
clusters_train = fcluster(Z, best_max_d, criterion='distance')

In [170]:
# Predict clusters for test data using the nearest cluster center
def predict_cluster(test_embedding, train_embeddings, clusters):
    distances = np.linalg.norm(train_embeddings - test_embedding, axis=1)
    return clusters[np.argmin(distances)]

# Assign clusters to test points
test_clusters = [predict_cluster(te, pca_train_embeddings, clusters_train) for te in pca_test_embeddings]
# Print the contents of each cluster assigned to each of the 5 test points
for i, (test_point, test_cluster) in enumerate(zip(df_test.itertuples(), test_clusters)):
    print(f"Test point {i} (Title: {test_point.Title}\n, Tags: {test_point.tags}):")
    print()
    print(f"Cluster {test_cluster} contents:")
    
    cluster_indices = np.where(clusters_train == test_cluster)[0]
    cluster_df = df_train.iloc[cluster_indices]
    
    for _, row in cluster_df.iterrows():
        print(f"Title: {row['Title']}, Tags: {row['tags']}")
    
    print("\n" + "-"*80 + "\n")


Test point 1 (Title: Rodri’s absence gives Arsenal a chance to end losing run against Manchester City
, Tags: ['Arsenal', 'Manchester City', 'Premier League', 'Erling Haaland', 'William Saliba', 'Bukayo Saka']):

Cluster 17 contents:
Title: Southgate said VAR checks left him bored in Spurs v Chelsea thriller, Tags: ['VAR', 'Tottenham Hotspur', 'Chelsea', 'Premier League', 'Gareth Southgate', 'Manchester United']
Title: Plenty of optimism in North London with Arsenal, Tottenham showing signs of progress, Tags: ['Arsenal', 'Tottenham Hotspur', 'English Premier League', 'North London', 'Mauricio Pochettino', 'Ange Postecoglou']
Title: After Everton shock, chaos looms in the English  Premier League , Tags: ['English Premier League', 'Everton', 'Chelsea', 'Manchester City', 'Roman Abramovich', 'Financial Doping']
Title: Hat-trick of disallowed goals for Son Heung-min as Spurs fall 2-1 to Villa, Tags: ['Football', 'Premier League', 'Tottenham Hotspur', 'Aston Villa', 'goals', 'disallowed goa

In [177]:
test_clusters

[17, 603, 548, 702, 510, 32, 84, 145, 113, 500]

In [175]:
df_train['Cluster_labels'] = clusters_train

In [176]:
df_test['Cluster_labels'] = test_clusters

In [178]:
df_train[df_train['Cluster_labels']==17]

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date,Cluster_labels
120,ln0wbw9261rgtd6w,LONDON - Endless VAR stoppages during Tot...,Southgate said VAR checks left him bored in Sp...,"[0.026354, -0.017797, -0.023525, -0.027427, -0...",Title: Southgate said VAR checks left him bore...,"[VAR, Tottenham Hotspur, Chelsea, Premier Leag...","[-0.013461079448461533, 0.02058023028075695, 0...","[-0.00934, -0.055659, -0.010452, -0.027771, 0....",2023-11-10,17
742,07c7msvwusnq9lj1,LONDON – To recall a time when both Tottenham ...,Plenty of optimism in North London with Arsena...,"[0.012466, 0.021113, 0.006988, 0.012907, -0.01...",Title: Plenty of optimism in North London with...,"[Arsenal, Tottenham Hotspur, English Premier L...","[-0.013860863633453846, 0.010777235962450504, ...","[-0.038067, 0.057702, -0.014818, -0.020333, 0....",2023-09-23,17
1565,bdztls82fk7uswtl,"So much for an international break, it has bee...","After Everton shock, chaos looms in the Englis...","[0.002271, 0.082522, -0.007638, 0.015125, -0.0...","Title: After Everton shock, chaos looms in the...","[English Premier League, Everton, Chelsea, Man...","[-0.013298161327838898, 0.040276069194078445, ...","[0.017072, 0.006074, -0.028485, -0.033419, 0.0...",2023-11-18,17
1931,4fmj770hedw04crt,LONDON – Before their home fixture against Ast...,Hat-trick of disallowed goals for Son Heung-mi...,"[0.028495, 0.005668, 0.007684, -0.037367, 0.03...",Title: Hat-trick of disallowed goals for Son H...,"[Football, Premier League, Tottenham Hotspur, ...","[-0.01915312558412552, 6.474554538726807e-06, ...","[-0.010974, -0.044578, -0.013848, -0.02533, -0...",2023-11-27,17


In [187]:
df_test.reset_index(drop=True, inplace=True)

In [214]:
def common_tags(tags1, tags2):
    return set(tags1).intersection(set(tags2))

# Initialize the dictionary to store similar articles
similar_articles_dict = {}

# Iterate over each test article in the filtered df_test
for index, test_row in df_test.iterrows():
    test_tags = test_row['tags']
    test_cluster_label = test_row['Cluster_labels']
    
    # Filter df_train for the same cluster label
    df_train_cluster = df_train[df_train['Cluster_labels'] == test_cluster_label]
    
    # Find similar articles in df_train
    similar_indexes = []
    for train_index, train_row in df_train_cluster.iterrows():
        train_tags = train_row['tags']
        if len(common_tags(test_tags, train_tags)) >= 2:
            similar_indexes.append(train_index)
    
    # Store the result in the dictionary if there are at least 2 supporting articles
    if len(similar_indexes) >= 2:
        similar_articles_dict[index] = {
            'Title': test_row['Title'],
            'indexes': similar_indexes,
            'Text': test_row['Text']
        }

MFA director-general charged over use of diplomatic bags containing Panadol, luxury watches 
Brussels gunman suspect was known to police but not on watchlist
US Senator Bob Menendez rejects calls to step down from Congress
Deadly storm sweeps across Greece, PM postpones keynote speech
Veteran Novak Djokovic overcomes Ben Shelton to reach US Open final
UK PM Sunak gambles on return of Cameron to win over moderate voters
Blinken, Netanyahu shelter in bunker amid air raid sirens in Tel Aviv
Book review: Hwang Bo-reum’s Welcome To The Hyunam-dong Bookshop explores books and burnout
Condo resale volume recovers slightly, price growth slows in October 
Jackson Wang and stars from 88rising light up Padang Stage on first night of Singapore GP
Cristiano Ronaldo bags brace as Portugal thrash Bosnia and Herzegovina in Euro 2024 qualifying
Italian FA to ban Tonali for 10 months for betting offences
From Angel to Icon: The success and struggles of Victoria’s Secret model Taylor Hill
Safuwan, Ikhsan

In [196]:
similar_articles_dict

{1: {'Title': 'Fiji says will strengthen defence cooperation with Australia',
  'indexes': [475, 1197, 1424, 1594, 1605],
  'Text': 'SYDNEY  -     Pacific island nation Fiji said on Tuesday it will strengthen defence and security cooperation with Australia as Prime Minister Sitiveni Rabuka visited Canberra.In a meeting with Australia\'s Defence Minister Richard Marles, Rabuka said that evolving global security threats meant Australia and Fiji should strengthen cooperation in security intelligence, cyber security, defence and police cooperation, Fiji\'s government said in a statement.The two countries already work closely together on border and maritime security, and Australia had funded a security training camp in Fiji, Rabuka noted.Rabuka is on a three-day official visit to Australia.China struck policing agreements with Pacific Island neighbours Solomon Islands and Vanautu this year, however a decade-old policing agreement between China and Fiji had been put "on hold", Rabuka confirm

In [215]:
for test_index, info in similar_articles_dict.items():
    print(f"Test Article Index: {test_index}")
    print(f"Test Article Title: {info['Title']}")
    print("Supporting Articles:")
    for idx in info['indexes']:
        print(f" - {df_train.loc[idx, 'Title']}")
    print("\n" + "-"*80 + "\n")

Test Article Index: 1
Test Article Title: Fiji says will strengthen defence cooperation with Australia
Supporting Articles:
 - Safety of defence personnel 'utmost priority', Australia says after China warship incident
 - Australia's Albanese to highlight trade on China visit as ties warm
 - Australian delegation heading to Beijing for dialogue, government says
 - Australian PM Anthony Albanese says China ties damaged by navy divers incident
 - Philippines, Australia sign strategic partnership deal with eye on China

--------------------------------------------------------------------------------

Test Article Index: 4
Test Article Title: Blinken says Palestinian voices key to Gaza future
Supporting Articles:
 - Blinken, Netanyahu shelter in bunker amid air raid sirens in Tel Aviv
 - Blinken rebuffs Arab states’ push for immediate Gaza ceasefire
 - Blinken says 'far too many' Palestinians killed as Israel battles Hamas in Gaza
 - Jordan minister doubts Israel can wipe out Hamas
 - Blink

In [216]:
# Initialize the generative model
llm = genai.GenerativeModel('gemini-1.0-pro')

# Define the template
template = '''
You are a highly intelligent AI tasked with analyzing articles to determine whether generating a timeline of events leading up to the key event in the article would be beneficial. Consider the following factors to make your decision:

    1. **Significance of the Event**:
       - Does the event have a significant impact on a large number of people, industries, or countries?
       - Are the potential long-term consequences of the event important?

    2. **Controversy or Debate**:
       - Is the event highly controversial or has it sparked significant debate?
       - Has the event garnered significant media attention and public interest?

    3. **Complexity**:
       - Does the event involve multiple factors, stakeholders, or causes that make it complex?
       - Does the event have deep historical roots or is it the culmination of long-term developments?

    4. **Personal Relevance**:
       - Does the event directly affect the reader or their community?
       - Is the event of particular interest to the reader due to economic implications, political affiliations, or social issues?

    5. **Educational Purposes**:
       - Would a timeline provide valuable learning or research information?

    Here is the information for the article:

    Title:{title}
    Text: {text}

    Based on the factors above, decide whether generating a timeline of events leading up to the key event in this article would be beneficial. Provide a brief explanation for your decision. 
    Then reply in terms of the need with a score 1 - 5, 1 means unnecessary, 5 means necessary, in JSON format, for example score 3. No need for an explanation
    ANSWER:
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text", "title"],
    template=template,
)

timeline_keys = []

for key, title_index in list(similar_articles_dict.items()):
    
    # Define the headline
    headline = title_index['Title']
    body = title_index['Text']

    # Format the prompt
    final_prompt = prompt.format(title=headline, text=body)

    # Generate content using the generative model
    response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        }
    )
    for i in range(len(response.parts[0].text)):
      if response.parts[0].text[i:i+5] == 'score':
         score = int(response.parts[0].text[i+8])
         if score >=3:
               timeline_keys.append(key)
timeline_keys
    

[4, 8, 9]

In [278]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

template = '''
Given a series of articles, each containing a publication date, title, and content, your task is to construct a detailed timeline of events leading up to the main event described in the first article.
Analyze the First Article: Begin by thoroughly analyzing the title, content, and publication date of the first article to understand the main event.
Use Subsequent Articles: For each following article, examine the title, content, and publication date. Identify events, context, and any time references such as "last week," "last month," or specific dates.

Construct the Timeline:
Chronological Order: Organize the events chronologically, using the publication dates and time references within the articles.
Detailed Descriptions: Provide detailed descriptions of each event, explaining how it relates to the main event of the first article.
Contextual Links: Use information from the articles to link events together logically and coherently.
Handle Ambiguities: If an article uses ambiguous time references, infer the date based on the publication date of the article and provide a clear rationale for your inference.

Contextual Links:
External Influences: Mention any external influences (e.g., global conflicts, economic trends, scientific discoveries) that might have indirectly affected the events.
Internal Issues: Highlight any internal issues or developments (e.g., political changes, organizational restructuring, societal movements) within the entities involved that might have impacted the events.
Efforts for Improvement: Note any indications of efforts to improve the situation (e.g., policy changes, strategic initiatives, collaborative projects) despite existing challenges.

Be as thorough and precise as possible, ensuring the timeline accurately reflects the sequence and context of events leading to the main event.

Series of Articles: {text}


Now, I want you to carefully analyse the timeline and cross reference that the information is correct, and that it corresponds with the correct date.
Then, check carefully to ensure that the dates are arranged in chronological order.

Return them as a JSON object that the keys are Dates and Events the explanation, respectively.
This is a good example of how to return the output (Do not include this in the timeline):
Date: 2000-05-22
Event: Reports emerge about the successful evacuation of thousands of residents, and the government begins planning long-term recovery efforts. Article: 1
Date: 2023-08-01
Event: Major flood hits the southern region of Country X, causing significant damage and displacing thousands of residents. Article: 2
Date: 2023-08-05 
Event: The government of Country X declares a state of emergency in response to the flooding, mobilizing national resources for disaster relief. Article: 2
Date: 2023-08-10
Event: International aid begins to arrive in Country X, with several countries sending supplies and personnel to assist in the relief efforts. Article: 3
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text"],
    template=template,
)

In [279]:
similar_articles_dict[4]

{'Title': 'Blinken says Palestinian voices key to Gaza future',
 'indexes': [280, 632, 1301, 1397, 1480, 1505],
 'Text': 'RAMALLAH/BAGHDAD - Top United States diplomat Antony Blinken said the Palestinian Authority (PA) should play a central role in the future of the Gaza Strip, as he met Iraqi leaders and toured the region amid spiralling tensions over Israel’s war with Hamas.Mr Blinken passed through Israeli checkpoints to meet Palestinian Authority President Mahmoud Abbas in the West Bank city of Ramallah, then travelled on to Iraq. It was his second visit to the region since the Hamas militants, who rule Gaza, launched a surprise attack on Israel on Oct 7, killing 1,400 people and taking more than 240 others hostage, according to Israel.Palestinian views, voices and aspirations need to be “at the centre” of conversations about the future of Gaza, Mr Blinken told reporters in Baghdad. As Israel continued a campaign of air strikes that Gaza health officials say has killed 9,770 Palest

In [280]:
trial_keys = [4]
timelines_text = []
for key in trial_keys:
    df_retrieve = df_train.loc[similar_articles_dict[key]['indexes']]
    df_retrieve = pd.concat([df_retrieve, df_test.iloc[[key-1]]], axis=0)
    indiv_text = list(df_retrieve.combined.values)
    indiv_dates = list(df_retrieve.Publication_date.values)
    all = []
    for i in range(len(indiv_text)-1,-1,-1):
        s =  f'Publication date: {indiv_dates[i]}  {indiv_text[i]}'
        all.append(s)
    sum_of_text = ", ".join(all) 
    
    final_prompt  = prompt.format(text=sum_of_text)
    response = llm.generate_content(final_prompt,
                                   safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    }
    )
    timeline = response.parts[0].text
    timelines_text.append(timeline)


'```\n{\n  "Date": "2023-10-22",\n  "Event": "US Secretary of State Anthony Blinken warns Lebanon to remain out of the conflict between Israel and the Palestinian militant group Hamas."\n},\n{\n  "Date": "2023-10-17",\n  "Event": "Blinken and Israeli PM Benjamin Netanyahu shelter in a bunker during a meeting in Tel Aviv as air raid sirens sound."\n},\n{\n  "Date": "2023-11-05",\n  "Event": "Blinken rebuffs Arab states\' push for an immediate Gaza ceasefire, arguing that it would allow Hamas to regroup."\n},\n{\n  "Date": "2023-11-11",\n  "Event": "Blinken says \'far too many\' Palestinians have been killed in the conflict between Israel and Hamas, and calls for more protection of civilians."\n},\n{\n  "Date": "2023-11-18",\n  "Event": "Jordan\'s foreign minister expresses doubts about Israel\'s ability to eliminate Hamas."\n},\n{\n  "Date": "2023-10-14",\n  "Event": "Blinken tells Arab leaders that there can be \'no more business as usual\' with Hamas, following deadly attacks in Israe

In [285]:
string = timeline.strip('```')
json_string = f"[{string}]"
data = json.loads(json_string)
data

[{'Date': '2023-10-22',
  'Event': 'US Secretary of State Anthony Blinken warns Lebanon to remain out of the conflict between Israel and the Palestinian militant group Hamas.'},
 {'Date': '2023-10-17',
  'Event': 'Blinken and Israeli PM Benjamin Netanyahu shelter in a bunker during a meeting in Tel Aviv as air raid sirens sound.'},
 {'Date': '2023-11-05',
  'Event': "Blinken rebuffs Arab states' push for an immediate Gaza ceasefire, arguing that it would allow Hamas to regroup."},
 {'Date': '2023-11-11',
  'Event': "Blinken says 'far too many' Palestinians have been killed in the conflict between Israel and Hamas, and calls for more protection of civilians."},
 {'Date': '2023-11-18',
  'Event': "Jordan's foreign minister expresses doubts about Israel's ability to eliminate Hamas."},
 {'Date': '2023-10-14',
  'Event': "Blinken tells Arab leaders that there can be 'no more business as usual' with Hamas, following deadly attacks in Israel."},
 {'Date': '2023-10-04',
  'Event': 'British Pr

In [286]:
sorted_events = sorted(data, key=lambda x: x['Date'])
sorted_events

[{'Date': '2023-10-04',
  'Event': 'British Prime Minister Rishi Sunak cancels part of the high-speed rail project to Manchester, citing a need to redirect funds to other transport projects.'},
 {'Date': '2023-10-14',
  'Event': "Blinken tells Arab leaders that there can be 'no more business as usual' with Hamas, following deadly attacks in Israel."},
 {'Date': '2023-10-17',
  'Event': 'Blinken and Israeli PM Benjamin Netanyahu shelter in a bunker during a meeting in Tel Aviv as air raid sirens sound.'},
 {'Date': '2023-10-22',
  'Event': 'US Secretary of State Anthony Blinken warns Lebanon to remain out of the conflict between Israel and the Palestinian militant group Hamas.'},
 {'Date': '2023-11-05',
  'Event': "Blinken rebuffs Arab states' push for an immediate Gaza ceasefire, arguing that it would allow Hamas to regroup."},
 {'Date': '2023-11-11',
  'Event': "Blinken says 'far too many' Palestinians have been killed in the conflict between Israel and Hamas, and calls for more prote

In [287]:
with open('timeline_trial.json', 'w') as json_file:
    json.dump(sorted_events, json_file, indent=4)