## Exporting necessary libraries

In [2]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import torch 
from torch import nn

# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

## Load and Combine JSON files

In [3]:
def combine__4_json(files):
    combined_data = []
    for file in files:
        with open(file, 'r') as f:
            # Load data from the file and append it to the combined list
            data = json.load(f)
            combined_data.extend(data)
    return combined_data

# Example usage
files = ['../data_upload/final_db1.json', '../data_upload/final_db2.json', '../data_upload/final_db3.json', '../data_upload/final_db4.json']
db = combine__4_json(files)

In [4]:
def read_load_json_to_df(json_data):
    for item in json_data:
        #Convert the embeddings to json string as CSVs dont accept list as a data type
        item['tags_embeddings'] = json.dumps(item['tags_embeddings'])
        item['Title_embeddings'] = json.dumps(item['Title_embeddings'])
    df = pd.DataFrame(json_data)
    return df

In [5]:
df = read_load_json_to_df(db)
df.head()

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date
0,nos7tzp7jprxlqxe,GENEVA – The remains of a climber discovered i...,Remains found in Swiss Alps are those of Briti...,"[0.063923, 0.065677, -0.001089, 0.065425, -0.0...",Title: Remains found in Swiss Alps are those o...,"[Missing Climber, Swiss Alps, Glaciers, Global...","[0.025687463581562042, 0.03274165466427803, -0...","[0.021028, 0.006548, 0.037958, 0.049163, -0.00...",2023-09-01
1,zvv4ue0w64vfqoz1,Ms Greta Thunburg became a household name when...,Involve youth in shaping ethical use of AI,"[0.063668, 0.098002, -0.022514, -0.033031, -0....",Title: Involve youth in shaping ethical use of...,"[Youth activism, Artificial intelligence, Ethi...","[0.026038197800517082, 0.05095928534865379, -0...","[0.033077, 0.121931, -0.034714, 0.012957, -0.0...",2023-09-02
2,aph1tgua3xxoq2sg,NEW YORK - Defending women's champion Iga...,"Swiatek, Djokovic headline third round action ...","[-0.019315, 0.066645, 0.009547, 0.029555, -0.0...","Title: Swiatek, Djokovic headline third round ...","[US Open, Grand Slam, Novak Djokovic, Iga Swia...","[-0.04092131927609444, 0.015564153902232647, -...","[-0.018808, -0.049826, 0.005458, -0.010391, -0...",2023-09-01
3,rlh53czyst054zfn,JAKARTA – Hopes of a return to democracy in ju...,‘Systematic repression’ crushing Myanmar’s dem...,"[0.067328, -0.004407, 0.010127, -0.004268, -0....",Title: ‘Systematic repression’ crushing Myanma...,"[Myanmar, UN chief, ASEAN, Rohingya, Military ...","[0.02929660677909851, 0.0006651841104030609, -...","[0.059998, -0.014698, 0.02184, -0.031714, 0.00...",2023-09-07
4,aksixz7uun2gkpss,JERUSALEM - Israel's shekel dropped to it...,Israel's shekel falls as judicial showdown looms,"[-0.043186, 0.076352, -0.015492, -0.02859, -0....",Title: Israel's shekel falls as judicial showd...,"[Israel, Shekel, Judicial crisis, Supreme Cour...","[0.015406888909637928, 0.04966922104358673, 0....","[-0.02634, 0.070879, 0.013255, -0.008821, -0.0...",2023-09-07


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2008 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2008 non-null   object
 1   Text              2008 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2008 non-null   object
 4   combined          2007 non-null   object
 5   tags              2008 non-null   object
 6   tags_embeddings   2008 non-null   object
 7   Title_embeddings  2008 non-null   object
 8   Publication_date  2008 non-null   object
dtypes: object(9)
memory usage: 141.3+ KB


In [7]:
## NEED TO CHECK 

In [8]:
nan_rows = df[df.isnull().any(axis=1)]
df = df.drop(nan_rows.index)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


## Data Preprocessing 
- Concatentation of embeddings
- Standardisation of embeddings

In [10]:
# Deserialising of embeddings
body_embeddings= np.array(df['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings= np.array(df['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings= np.array(df['tags_embeddings'].apply(ast.literal_eval).tolist())
all_embeddings = np.concatenate((body_embeddings, title_embeddings, tags_embeddings), axis=1)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train_embeddings, test_embeddings = train_test_split(all_embeddings, test_size=1, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_embeddings)
X_test_scaled = scaler.transform(test_embeddings)

In [12]:
X_train_scaled.shape

(2006, 2304)

In [13]:
X_test_scaled.shape

(1, 2304)

## Conducting PCA Experimentation to find best amount of variance.

#### For variance range of 94% to 97%

In [14]:
from sklearn.decomposition import PCA

In [15]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, fcluster

# Assuming X_train_scaled is your scaled training data
variance_range = list(np.arange(0.92, 0.95, 0.01))
variance_perf = {}

for variance in variance_range:
    pca = PCA(n_components=variance)
    train_pca_embeddings = pca.fit_transform(X_train_scaled)
    
    # Range of max_d values to try
    max_d_values = np.arange(45, 70)
    
    # List to store silhouette scores
    silhouette_scores_train = []

    # Perform hierarchical clustering
    Z = linkage(train_pca_embeddings, method='ward')

    for max_d in max_d_values:
        clusters_train = fcluster(Z, max_d, criterion='distance')
        
        # Calculate silhouette score only if there are at least 2 unique clusters and fewer than the number of samples
        if 1 < len(set(clusters_train)) < len(train_pca_embeddings):
            score_train = silhouette_score(train_pca_embeddings, clusters_train)
        else:
            score_train = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        silhouette_scores_train.append(score_train)

    # Determine the best max_d
    best_max_d_train = max_d_values[np.argmax(silhouette_scores_train)]
    variance_perf[variance] = {
        'max_d_train': best_max_d_train,
        'best_train_silhouette': max(silhouette_scores_train)
    }

# The final variance_perf dictionary contains the best max_d and silhouette score for each variance
print(variance_perf)


{0.92: {'max_d_train': 55, 'best_train_silhouette': 0.09628315224156507}, 0.93: {'max_d_train': 55, 'best_train_silhouette': 0.0947792218720515}, 0.9400000000000001: {'max_d_train': 55, 'best_train_silhouette': 0.09363049911273336}}


In [16]:
variance_perf

{0.92: {'max_d_train': 55, 'best_train_silhouette': 0.09628315224156507},
 0.93: {'max_d_train': 55, 'best_train_silhouette': 0.0947792218720515},
 0.9400000000000001: {'max_d_train': 55,
  'best_train_silhouette': 0.09363049911273336}}

In [17]:
# Find the best test variance based on the silhouette score
def get_best_variance(perf_results):
    highest_train_sil = 0
    best_variance_s = []
    for variance, scores in perf_results.items():
        if scores['best_train_silhouette'] > highest_train_sil:
            highest_train_sil = scores['best_train_silhouette']
            best_variance_s = [variance]  
        elif scores['best_train_silhouette'] == highest_train_sil:
            best_variance_s.append(variance)  
    
    final_best_max_d = perf_results[best_variance_s[0]]['max_d_train']
    return round(best_variance_s[0], 2), final_best_max_d

best_variance, best_max_d = get_best_variance(variance_perf)
print((best_variance , best_max_d))          

(0.92, 55)


## Best parameters when test size is 1

In [18]:
print(best_variance)
print(best_max_d)

0.92
55


In [56]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.neighbors import KNeighborsClassifier

#Since already using test size = 1, then dont need to scale everything again

df_test = df.sample(1)
df_train = df.drop(df_test.index)
# Deserializing the embeddings
body_embeddings_train = np.array(df_train['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_train = np.array(df_train['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_train = np.array(df_train['tags_embeddings'].apply(ast.literal_eval).tolist())

body_embeddings_test = np.array(df_test['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_test = np.array(df_test['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_test = np.array(df_test['tags_embeddings'].apply(ast.literal_eval).tolist())

# Combine embeddings
all_embeddings_train = np.concatenate((body_embeddings_train, title_embeddings_train, tags_embeddings_train), axis=1)
all_embeddings_test = np.concatenate((body_embeddings_test, title_embeddings_test, tags_embeddings_test), axis=1)

# Standardize embeddings
scaler = StandardScaler()
train_embeddings = scaler.fit_transform(all_embeddings_train)
test_embeddings = scaler.transform(all_embeddings_test)

# Perform PCA
pca = PCA(n_components=best_variance)
pca_train_embeddings = pca.fit_transform(train_embeddings)
pca_test_embeddings = pca.transform(test_embeddings)


Z = linkage(pca_train_embeddings, method='ward', metric='euclidean')
clusters_train = fcluster(Z, best_max_d, criterion='distance')

In [99]:
import torch
from torch import nn

In [111]:
# Predict clusters for test data using the nearest cluster center
def predict_cluster(test_embedding, train_embeddings, clusters):
    distances = np.linalg.norm(train_embeddings - test_embedding, axis=1)
    return clusters[np.argmin(distances)]

test_clusters = [predict_cluster(te, pca_train_embeddings, clusters_train) for te in pca_test_embeddings]

df_train['Cluster_labels'] = clusters_train
df_test['Cluster_labels'] = test_clusters
df_test.reset_index(drop=True, inplace=True)
# Create a dictionary to store the results
cluster_dict = {}

def calculate_cosine(vector1, vector2):
    cos_sim = nn.CosineSimilarity(dim=0)
    return cos_sim(torch.tensor(vector1), torch.tensor(vector2))
    
def calculate_dot_prod(vector1, vector2):
    return np.dot(vector1, vector2)

# Populate the dictionary with cluster contents for each test point
for i, (test_point, test_cluster) in enumerate(zip(df_test.itertuples(), test_clusters)):
    cluster_contents = []
    
    cluster_indices = np.where(clusters_train == test_cluster)[0]
    cluster_df = df_train.iloc[cluster_indices]
    
    # Sieve out embeddings for the tags
    test_embeddings_str = test_point.tags_embeddings
    test_embeddings_array = np.array(json.loads(test_embeddings_str), dtype=float)
    cluster_dict = {
        "Test point": {'id': test_point.id,
                       "Title": test_point.Title, 
                       "Tags": test_point.tags},
        "Cluster": test_cluster,
        "Cluster contents": cluster_contents
    }
    
    
    for _, row in cluster_df.iterrows():
        other_embeddings_str = row['tags_embeddings']
        other_embeddings_array = np.array(json.loads(other_embeddings_str), dtype=float)
        cosine_similarity = calculate_cosine(test_embeddings_array, other_embeddings_array)
        dot_similarity = calculate_dot_prod(test_embeddings_array, other_embeddings_array)
        cluster_contents.append({"id": row['id'], 
                                 "Title": row['Title'],
                                 "Tags": row['tags'], 
                                 "cosine_measure": cosine_similarity,
                                 "dot_measure": dot_similarity})

print(f"Cluster {test_cluster}\n")
input_list = ""
input_list += f"Test Artice: (Title: {cluster_dict['Test point']['Title']}\nTags: {cluster_dict['Test point']['Tags']}):\n\n"
for _, row in cluster_df.iterrows():
    input_list += f"Article id: {row['id']}, Title: {row['Title']}, Tags: {row['tags']}]\n"
print(input_list)

Cluster 399

Test Artice: (Title: Open category COE hits $152,000, large car COE reaches another high 
Tags: ['Singapore', 'Certificate of Entitlement', 'Open Category', 'Big Car', 'Electric Vehicles', 'Land Transport Authority']):

Article id: xjoa3l181bxr2apm, Title: More riders holding on to their motorcycles as COE premiums stay high        , Tags: ['Motorcycle', 'COE', 'Renewal', 'Ownership', 'Transfer', 'Leasing']]
Article id: 17ywn2lbldj116ly, Title: Forum: Consequences of having sky-high COE prices , Tags: ['Singapore', 'Certificate of Entitlement COE', 'High Living Costs', 'Brain Drain', 'Income Inequality', 'Public Transport Rebates']]
Article id: 36hjxw5rhkx90i84, Title: Podcast: Should the COE system be revamped? 2 motor industry veterans have their say, Tags: ['Singapore', 'Transportation', 'Motor industry', 'Certificate of Entitlement', 'COE premiums', 'Vehicle prices']]
Article id: jn49szly8z1toae9, Title: 472,000 S’pore resident households owned cars in 2022, up from 45

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field


In [136]:
# USE LLM to get similar articles:

llm = genai.GenerativeModel(
    "gemini-1.5-flash-latest",
  )

# Initialize the generative model
class Event(BaseModel):
    Article_id: list = Field(description="Article ids that are most relevant for the generation of the timeline")
        

output_parser = JsonOutputParser(pydantic_object=Event)

    # See the prompt template you created for formatting
format_instructions = output_parser.get_format_instructions()

template = '''
Task Description: Given the following test article, and the relevant tags of that article, and the contents of articles similar to it.
I want you to select the articles that are closest in similarity to the test article, 
for which i will be able to leverage on to build a timeline upon. Return the article ids for the chosen articles. 
Ensure that the chosen articles are relevant in terms of geographical location and main topic.
{text}

{format_instructions}
Check and ensure again that the output follows the format instructions above very strictly. 
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)

final_prompt = prompt.format(text=input_list)
response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        }
    )
def get_article_dict(output):
    new_output = re.search(r'\[[^\]]*\]', response.text).group(0)
    article_keys =  json.loads(new_output)
    if not article_keys:
        print("No useful similar articles found in database for timeline generation.")
        sys.exit()
    similar_articles_dict = {}

    # Iterate over each test article in the filtered df_test
    for index, test_row in df_test.iterrows():
        test_tags = test_row['tags']
        test_cluster_label = test_row['Cluster_labels']
        
        # Filter df_train for the same cluster label
        df_train_cluster = df_train[df_train['Cluster_labels'] == test_cluster_label]
        
        # Find similar articles in df_train
        similar_indexes = []
        for train_index, train_row in df_train_cluster.iterrows():
            train_tags = train_row['tags']
            if train_row['id'] in article_keys:
                similar_indexes.append(train_index)
        
        # Store the result in the dictionary if there are at least 2 supporting articles
        if len(similar_indexes) >= 2:
            similar_articles_dict = {
                'Title': test_row['Title'],
                'indexes': similar_indexes,
                'Text': test_row['Text']
            }
    #Show results 
    print("-"*80 + "\n")
    print(f"Test Article Title: << {similar_articles_dict['Title']}>>\n")
    print("Supporting Article Titles:")
    for idx in similar_articles_dict['indexes']:
        print(f" - {df_train.loc[idx, 'Title']}")
    print("\n" + "-"*80)

    return similar_articles_dict
            
            
    



In [139]:
import re
new_output = re.search(r'\[[^\]]*\]', response.text).group(0)
article_keys = json.loads(new_output)
article_keys

['17ywn2lbldj116ly', '36hjxw5rhkx90i84', 'vwpn0hyyt4e3awhb']

In [144]:
if not article_keys:
    print("No useful similar articles found in database for timeline generation.")
    sys.exit()
# Initialize the dictionary to store similar articles
similar_articles_dict = {}

# Iterate over each test article in the filtered df_test
for index, test_row in df_test.iterrows():
    test_tags = test_row['tags']
    test_cluster_label = test_row['Cluster_labels']
    
    # Filter df_train for the same cluster label
    df_train_cluster = df_train[df_train['Cluster_labels'] == test_cluster_label]
    
    # Find similar articles in df_train
    similar_indexes = []
    for train_index, train_row in df_train_cluster.iterrows():
        train_tags = train_row['tags']
        if train_row['id'] in article_keys:
            similar_indexes.append(train_index)
    
    # Store the result in the dictionary if there are at least 2 supporting articles
    if len(similar_indexes) >= 2:
        similar_articles_dict = {
            'Title': test_row['Title'],
            'indexes': similar_indexes,
            'Text': test_row['Text']
        }

In [145]:
similar_articles_dict

{'Title': 'Open category COE hits $152,000, large car COE reaches another high ',
 'indexes': [1060, 1209, 1394],
 'Text': 'SINGAPORE – The certificate of entitlement (COE) premium for the Open category breached the $150,000 mark at the latest tender exercise on Wednesday to close at a new all-time high of $152,000.Industry observers said dealers may be trying to accumulate more Open category COEs to register cars in the remaining two months of the year before rebates are cut from 2024, as such certificates are valid for three months and transferrable. In addition, dealers are racing to meet their year-end sales targets.The premium for the Open Category – which can be used for any vehicle type except motorcycles, but ends up being used mostly for bigger cars – surged by 5.09 per cent over the $144,640 record set at the previous tender. This is the fifth consecutive time this COE category has broken its record.The COE premium for larger cars with engines above 1,600cc and 130bhp, or mor

In [154]:
print("-"*80 + "\n")
print(f"Test Article Title: << {similar_articles_dict['Title']}>>\n")
print("Supporting Article Titles:")
for idx in similar_articles_dict['indexes']:
    print(f" - {df_train.loc[idx, 'Title']}")
print("\n" + "-"*80)

--------------------------------------------------------------------------------

Test Article Title: << Open category COE hits $152,000, large car COE reaches another high >>

Supporting Article Titles:
 - Forum: Consequences of having sky-high COE prices 
 - Podcast: Should the COE system be revamped? 2 motor industry veterans have their say
 - Days of $150k COEs ‘are over’, but system needs thorough relook, say dealers, experts

--------------------------------------------------------------------------------


In [156]:
# Initialize the generative model
llm = genai.GenerativeModel('gemini-1.5-flash-latest')
class Event(BaseModel):
    score: int = Field(description="The need for this article to have a timeline")
    Reason: str = Field(description = "The main reason for your choice why a timeline is needed or why it is not needed")
        

output_parser = JsonOutputParser(pydantic_object=Event)

    # See the prompt template you created for formatting
format_instructions = output_parser.get_format_instructions()

def clean_llm_score(output):
    text = output.parts[0].text.replace("```", '').replace('json','')
    result = json.loads(text)
    return result

# Define the template
template = '''
You are a highly intelligent AI tasked with analyzing articles to determine whether generating a timeline of events leading up to the key event in the article would be beneficial. 
Consider the following factors to make your decision:
    1. **Significance of the Event**:
       - Does the event have a significant impact on a large number of people, industries, or countries?
       - Are the potential long-term consequences of the event important?

    2. **Controversy or Debate**:
       - Is the event highly controversial or has it sparked significant debate?
       - Has the event garnered significant media attention and public interest?

    3. **Complexity**:
       - Does the event involve multiple factors, stakeholders, or causes that make it complex?
       - Does the event have deep historical roots or is it the culmination of long-term developments?

    4. **Personal Relevance**:
       - Does the event directly affect the reader or their community?
       - Is the event of particular interest to the reader due to economic implications, political affiliations, or social issues?

    5. Educational Purposes:
       - Would a timeline provide valuable learning or research information?

    Here is the information for the article:
    Title:{title}
    Text: {text}
    

    Based on the factors above, decide whether generating a timeline of events leading up to the key event in this article would be beneficial. 
    Your answer will include the need for this article to have a timeline with a score 1 - 5, 1 means unnecessary, 5 means necessary. It will also include the main reason for your choice.
    {format_instructions}    
    ANSWER:
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text", "title"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)

    # Define the headline
headline = test_data.Title[0]
body = test_data.Text[0]

    # Format the prompt
final_prompt = prompt.format(title=headline, text=body)

    # Generate content using the generative model
response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        }
    )
final_response = clean_llm_score(response)
    # If LLM approves
if final_response['score'] >=3:
       print("Timeline is necessary for this chosen article. " + final_response['Reason'] )
else:
        print("A timeline for this article is not required. " + "\n" + final_response['Reason'] + "\n" +  "Hence a required timeline score of " + str(final_response['score']))

Timeline is necessary for this chosen article.While the article discusses a significant event (the COE premium reaching a new all-time high), it's more about current market trends and potential future implications than a clear timeline of past events leading to this specific peak. The article does mention some recent developments like changes to the VES scheme and the LTA reallocation of COEs, which could be included in a timeline. However, the overall focus is on current trends and speculation, making a detailed timeline less crucial compared to articles with a strong focus on historical events and their direct impact.


In [216]:
df_test.Text[0]

'SINGAPORE – The certificate of entitlement (COE) premium for the Open category breached the $150,000 mark at the latest tender exercise on Wednesday to close at a new all-time high of $152,000.Industry observers said dealers may be trying to accumulate more Open category COEs to register cars in the remaining two months of the year before rebates are cut from 2024, as such certificates are valid for three months and transferrable. In addition, dealers are racing to meet their year-end sales targets.The premium for the Open Category – which can be used for any vehicle type except motorcycles, but ends up being used mostly for bigger cars – surged by 5.09 per cent over the $144,640 record set at the previous tender. This is the fifth consecutive time this COE category has broken its record.The COE premium for larger cars with engines above 1,600cc and 130bhp, or more powerful electric vehicles (EVs) above 110 kilowatts, climbed to $146,002, 3.63 per cent above the previous high of $140,

In [125]:
from json import JSONDecodeError
import re
def clean_output(output):
    try:
        updated_timeline = json.loads(output)
        return updated_timeline
    except JSONDecodeError:
        #try 1: Ensuring that the string ends with just the open and close lists brackets
        try:
            new_output = re.search(r'\[[^\]]*\]', output).group(0)
        except AttributeError:
            new_output = re.search(r'\{.*?\}', output, re.DOTALL).group(0)  
        updated_timeline = json.loads(new_output)
        return updated_timeline

In [None]:
from json import JSONDecodeError
import re
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

In [201]:
## NEW TESTING:
#- Generating a timeline of events for each article in the selected articles.
llm = genai.GenerativeModel('gemini-1.5-flash-latest' )

class Event(BaseModel):
    Date: str = Field(description="The date of the event in YYYY-MM-DD format")
    Event: str = Field(description="A detailed description of the important event")
    Article: int = Field(description="The article number from which the event was extracted")

output_parser = JsonOutputParser(pydantic_object=Event)

# See the prompt template you created for formatting
format_instructions = output_parser.get_format_instructions()

template = '''
Given an article, containing a publication date, title, and content, your task is to construct a detailed timeline of events leading up to the main event described in the first article.
Begin by thoroughly analyzing the title, content, and publication date of the article to understand the main event in the first article. 
the dates are represented in YYYY-MM-DD format. Identify events, context, and any time references such as "last week," "last month," or specific dates. 
The article could contain more or one key events. 
If the article does not provide a publication date or any events leading up to the main event, return NAN in the Date field, and 0 i the Article Field


Construct the Timeline:
Chronological Order: Organize the events chronologically, using the publication dates and time references within the articles.
Detailed Descriptions: Provide detailed descriptions of each event, explaining how it relates to the main event of the first article.
Contextual Links: Use information from the articles to link events together logically and coherently.
Handle Ambiguities: If an article uses ambiguous time references, infer the date based on the publication date of the article and provide a clear rationale for your inference.

Contextual Links:
External Influences: Mention any external influences (e.g., global conflicts, economic trends, scientific discoveries) that might have indirectly affected the events.
Internal Issues: Highlight any internal issues or developments (e.g., political changes, organizational restructuring, societal movements) within the entities involved that might have impacted the events.
Efforts for Improvement: Note any indications of efforts to improve the situation (e.g., policy changes, strategic initiatives, collaborative projects) despite existing challenges.

Be as thorough and precise as possible, ensuring the timeline accurately reflects the sequence and context of events leading to the main event.

Article:
{text}

{format_instructions}
Check and ensure again that the output follows the format instructions above very strictly. 
'''

prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
    template=template
)

df_retrieve = df_train.loc[similar_articles_dict['indexes']]
df_retrieve = pd.concat([df_retrieve, df_test], axis=0)
df_retrieve = df_retrieve.iloc[::-1].reset_index(drop=True)
df_retrieve
timelines = {}
indiv_text = list(df_retrieve.combined.values)
indiv_dates = list(df_retrieve.Publication_date.values)
for i in range(len(indiv_text)):
    s =  f'Article {i+1}: Publication date: {indiv_dates[i]}  {indiv_text[i]}'
    final_prompt = prompt.format(text=s)
    response = llm.generate_content(final_prompt,
                                    safety_settings={
                                        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                        }
    )
    if '[' or '{' not in response.parts[0].text:
        response = llm.generate_content(final_prompt,
                                    safety_settings={
                                        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                        }
        )
    #key corresponds to the index in the df_test
    try:
        timelines[i] = response.parts[0].text
    except ValueError:
        print("ERROR: There were issues with the generation of the timeline. The timeline could not be generated")
        
def clean_output(output):
    try:
        updated_timeline = json.loads(output)
        return updated_timeline
    except JSONDecodeError:
        #try 1: Ensuring that the string ends with just the open and close lists brackets
        try:
            new_output = re.search(r'\[[^\]]*\]', output).group(0)
        except AttributeError:
            new_output = re.search(r'\{.*?\}', output, re.DOTALL).group(0)  
        updated_timeline = json.loads(new_output)
        return updated_timeline

def get_timeline_content(timelines):
    generated_timeline = []
    for idx, line in timelines.items():
        indiv_timeline = clean_output(line)
        if type(indiv_timeline) == list:
            for el in indiv_timeline:
                generated_timeline.append(el)
        else:
            generated_timeline.append(indiv_timeline)
    return generated_timeline

In [209]:
def clean_sort_timeline(timelines):  
    generated_timeline = get_timeline_content(timelines)
    unsorted_timeline = []

    for event in generated_timeline:
        article_index = event["Article"] - 1
        event["Article_id"] = df_retrieve.iloc[article_index].id
    for event in generated_timeline:
        del event["Article"]
        unsorted_timeline.append(event)  
        
    timeline = sorted(unsorted_timeline, key=lambda x:x['Date'])
    timeline = [event for event in timeline if event['Date'].lower()!= 'nan']
    for event in timeline:
        date = event['Date']
        if date.endswith('-XX-XX'):
            event['Date'] = date[:4]
        elif date.endswith('-XX'):
            event['Date'] = date[:7]
    return timeline

cleaned_timeline = clean_sort_timeline(timelines)
cleaned_timeline

[{'Date': '2012-10-31',
  'Event': 'The industry appealed for changes to the COE system specifically for taxis.',
  'Article_id': '36hjxw5rhkx90i84'},
 {'Date': '2022-10-31',
  'Event': 'COE premiums have doubled from what they were a year ago, meaning that vehicle prices have also increased significantly.',
  'Article_id': '36hjxw5rhkx90i84'},
 {'Date': '2023-09-29',
  'Event': 'The Land Transport Authority (LTA) announced the reallocation of 300 additional COEs for smaller cars and less powerful EVs, equally distributed between the two October tender exercises. This reallocation aimed to address the anticipated surge in demand from car buyers following the September announcement of changes to the Vehicular Emissions Scheme (VES).',
  'Article_id': 'lc0braszutiz8f8s'},
 {'Date': '2023-10-04',
  'Event': 'The Open category COE premium crossed the $150,000 mark, reaching a new all-time high of $152,000 at the latest tender exercise. The increase was attributed to dealers accumulating Op

In [210]:
import json
timeline_data = json.dumps(cleaned_timeline, indent=4, ensure_ascii=False)

# Write the JSON string to a file
with open('../data_upload/single_timeline_trial.json', 'w', encoding='utf-8' ) as fout:
    fout.write(timeline_data)

In [240]:
list_of_dicts = df_retrieve.to_dict(orient='records')
json_string = json.dumps(list_of_dicts, indent=4)
with open('../data_upload/df_retrieve.json', 'w', encoding='utf-8') as f:
    f.write(json_string)