## Exporting necessary libraries

In [1]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster


# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: /home/jerry/Desktop/timeline project/timeline/bin/python3: No such file or directory
/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: exec: /home/jerry/Desktop/timeline project/timeline/bin/python3: cannot execute: No such file or directory


## Load and Combine JSON files

In [2]:
def combine_json(files):
    combined_data = []
    for file in files:
        with open(file, 'r') as f:
            # Load data from the file and append it to the combined list
            data = json.load(f)
            combined_data.extend(data)
    return combined_data

# Example usage
files = ['../data_upload/final_db1.json', '../data_upload/final_db2.json', '../data_upload/final_db3.json', '../data_upload/final_db4.json']
db = combine_json(files)

In [3]:
def read_load_json(json_data):
    for item in json_data:
        #Convert the embeddings to json string as CSVs dont accept list as a data type
        item['tags_embeddings'] = json.dumps(item['tags_embeddings'])
        item['Title_embeddings'] = json.dumps(item['Title_embeddings'])
    df = pd.DataFrame(json_data)
    return df

In [4]:
df = read_load_json(db)
df.head()

Unnamed: 0,id,Text,Title,embeddings,combined,tags,tags_embeddings,Title_embeddings,Publication_date
0,nos7tzp7jprxlqxe,GENEVA – The remains of a climber discovered i...,Remains found in Swiss Alps are those of Briti...,"[0.063923, 0.065677, -0.001089, 0.065425, -0.0...",Title: Remains found in Swiss Alps are those o...,"[Missing Climber, Swiss Alps, Glaciers, Global...","[0.025687463581562042, 0.03274165466427803, -0...","[0.021028, 0.006548, 0.037958, 0.049163, -0.00...",2023-09-01
1,zvv4ue0w64vfqoz1,Ms Greta Thunburg became a household name when...,Involve youth in shaping ethical use of AI,"[0.063668, 0.098002, -0.022514, -0.033031, -0....",Title: Involve youth in shaping ethical use of...,"[Youth activism, Artificial intelligence, Ethi...","[0.026038197800517082, 0.05095928534865379, -0...","[0.033077, 0.121931, -0.034714, 0.012957, -0.0...",2023-09-02
2,aph1tgua3xxoq2sg,NEW YORK - Defending women's champion Iga...,"Swiatek, Djokovic headline third round action ...","[-0.019315, 0.066645, 0.009547, 0.029555, -0.0...","Title: Swiatek, Djokovic headline third round ...","[US Open, Grand Slam, Novak Djokovic, Iga Swia...","[-0.04092131927609444, 0.015564153902232647, -...","[-0.018808, -0.049826, 0.005458, -0.010391, -0...",2023-09-01
3,rlh53czyst054zfn,JAKARTA – Hopes of a return to democracy in ju...,‘Systematic repression’ crushing Myanmar’s dem...,"[0.067328, -0.004407, 0.010127, -0.004268, -0....",Title: ‘Systematic repression’ crushing Myanma...,"[Myanmar, UN chief, ASEAN, Rohingya, Military ...","[0.02929660677909851, 0.0006651841104030609, -...","[0.059998, -0.014698, 0.02184, -0.031714, 0.00...",2023-09-07
4,aksixz7uun2gkpss,JERUSALEM - Israel's shekel dropped to it...,Israel's shekel falls as judicial showdown looms,"[-0.043186, 0.076352, -0.015492, -0.02859, -0....",Title: Israel's shekel falls as judicial showd...,"[Israel, Shekel, Judicial crisis, Supreme Cour...","[0.015406888909637928, 0.04966922104358673, 0....","[-0.02634, 0.070879, 0.013255, -0.008821, -0.0...",2023-09-07


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2008 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2008 non-null   object
 1   Text              2008 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2008 non-null   object
 4   combined          2007 non-null   object
 5   tags              2008 non-null   object
 6   tags_embeddings   2008 non-null   object
 7   Title_embeddings  2008 non-null   object
 8   Publication_date  2008 non-null   object
dtypes: object(9)
memory usage: 141.3+ KB


In [140]:
## NEED TO CHECK 

In [8]:
nan_rows = df[df.isnull().any(axis=1)]
df = df.drop(nan_rows.index)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


## Data Preprocessing 
- Concatentation of embeddings
- Standardisation of embeddings

In [10]:
# Deserialising of embeddings
body_embeddings= np.array(df['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings= np.array(df['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings= np.array(df['tags_embeddings'].apply(ast.literal_eval).tolist())
all_embeddings = np.concatenate((body_embeddings, title_embeddings, tags_embeddings), axis=1)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

train_embeddings, test_embeddings = train_test_split(all_embeddings, test_size=10, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_embeddings)
X_test_scaled = scaler.transform(test_embeddings)

In [14]:
X_train_scaled.shape

(1997, 2304)

## Conducting PCA Experimentation to find best amount of variance.

#### For variance range of 94% to 97%

In [20]:
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.neighbors import KNeighborsClassifier
# Experiment for this variance range of 94% to 97%
variance_range = list(np.arange(0.92, 0.97, 0.01))
variance_perf = {}

for variance in variance_range:
    pca = PCA(n_components=variance)
    train_pca_embeddings = pca.fit_transform(X_train_scaled)
    test_pca_embeddings = pca.transform(X_test_scaled)
    
    # Range of max_d values to try
    max_d_values = np.arange(45, 70)
    
    # Lists to store silhouette scores
    silhouette_scores_train = []
    silhouette_scores_test = []

    # Perform hierarchical clustering
    Z = linkage(train_pca_embeddings, method='ward')

    for max_d in max_d_values:
        clusters_train = fcluster(Z, max_d, criterion='distance')
        
        knn = KNeighborsClassifier(n_neighbors=1)
        knn.fit(train_pca_embeddings, clusters_train)
        clusters_test = knn.predict(test_pca_embeddings)
        
        # Calculate silhouette scores only if there are at least 2 unique clusters and fewer than the number of samples
        if 1 < len(set(clusters_train)) < len(train_pca_embeddings):
            score_train = silhouette_score(train_pca_embeddings, clusters_train)
        else:
            score_train = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        if 1 < len(set(clusters_test)) < len(test_pca_embeddings):
            score_test = silhouette_score(test_pca_embeddings, clusters_test)
        else:
            score_test = -1  # Assign a score of -1 if less than 2 unique clusters or too many clusters
        
        silhouette_scores_train.append(score_train)
        silhouette_scores_test.append(score_test)

    # Determine the best max_d
    best_max_d_train = max_d_values[np.argmax(silhouette_scores_train)]
    best_max_d_test = max_d_values[np.argmax(silhouette_scores_test)]
    variance_perf[variance] = {
        'max_d_train': best_max_d_train,
        "max_d_test": best_max_d_test,
        'best_train_silhouette': max(silhouette_scores_train),
        "best_test_silhouette": max(silhouette_scores_test)
    }

In [21]:
variance_perf

{0.92: {'max_d_train': 54,
  'max_d_test': 45,
  'best_train_silhouette': 0.09662836924445259,
  'best_test_silhouette': -1},
 0.93: {'max_d_train': 55,
  'max_d_test': 45,
  'best_train_silhouette': 0.09493866482143248,
  'best_test_silhouette': -1},
 0.9400000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.09341043113832068,
  'best_test_silhouette': -1},
 0.9500000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.0918934174697297,
  'best_test_silhouette': -1},
 0.9600000000000001: {'max_d_train': 56,
  'max_d_test': 45,
  'best_train_silhouette': 0.0900331324671914,
  'best_test_silhouette': -1}}

In [25]:
# Find the best test variance based on the silhouette score
def get_best_variance(perf_results):
    highest_train_sil = 0
    best_variance_s = []
    for variance, scores in perf_results.items():
        if scores['best_train_silhouette'] > highest_train_sil:
            highest_train_sil = scores['best_train_silhouette']
            best_variance_s = [variance]  
        elif scores['best_train_silhouette'] == highest_train_sil:
            best_variance_s.append(variance)  
    
    final_best_max_d = perf_results[best_variance_s[0]]['max_d_train']
    return round(best_variance_s[0], 2), final_best_max_d

best_variance, best_max_d = get_best_variance(variance_perf)
print((best_variance , best_max_d))          

(0.92, 54)


## Best parameters when test size is 10

In [28]:
print(best_variance)
print(best_max_d)

0.92
54


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2007 entries, 0 to 2007
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                2007 non-null   object
 1   Text              2007 non-null   object
 2   Title             2007 non-null   object
 3   embeddings        2007 non-null   object
 4   combined          2007 non-null   object
 5   tags              2007 non-null   object
 6   tags_embeddings   2007 non-null   object
 7   Title_embeddings  2007 non-null   object
 8   Publication_date  2007 non-null   object
dtypes: object(9)
memory usage: 156.8+ KB


In [None]:
df_test = df.sample(10, random_state=42)
df_train = df.drop(df_test.index)

In [None]:
best_max_d += 1
print(best_max_d)

In [38]:
# Deserializing the embeddings
body_embeddings_train = np.array(df_train['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_train = np.array(df_train['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_train = np.array(df_train['tags_embeddings'].apply(ast.literal_eval).tolist())

body_embeddings_test = np.array(df_test['embeddings'].apply(ast.literal_eval).tolist())
title_embeddings_test = np.array(df_test['Title_embeddings'].apply(ast.literal_eval).tolist())
tags_embeddings_test = np.array(df_test['tags_embeddings'].apply(ast.literal_eval).tolist())

# Combine embeddings
all_embeddings_train = np.concatenate((body_embeddings_train, title_embeddings_train, tags_embeddings_train), axis=1)
all_embeddings_test = np.concatenate((body_embeddings_test, title_embeddings_test, tags_embeddings_test), axis=1)

# Standardize embeddings
scaler = StandardScaler()
train_embeddings = scaler.fit_transform(all_embeddings_train)
test_embeddings = scaler.transform(all_embeddings_test)

# Perform PCA
pca = PCA(n_components=best_variance)
pca_train_embeddings = pca.fit_transform(train_embeddings)
pca_test_embeddings = pca.transform(test_embeddings)


Z = linkage(pca_train_embeddings, method='ward', metric='euclidean')
clusters_train = fcluster(Z, best_max_d, criterion='distance')

In [43]:
# Predict clusters for test data using the nearest cluster center
def predict_cluster(test_embedding, train_embeddings, clusters):
    distances = np.linalg.norm(train_embeddings - test_embedding, axis=1)
    return clusters[np.argmin(distances)]

# Assign clusters to test points
test_clusters = [predict_cluster(te, pca_train_embeddings, clusters_train) for te in pca_test_embeddings]
# Print the contents of each cluster assigned to each of the 5 test points
for i, (test_point, test_cluster) in enumerate(zip(df_test.itertuples(), test_clusters)):
    print(f"Test point {i + 1} (Title: {test_point.Title}\n, Tags: {test_point.tags}):")
    print()
    print(f"Cluster {test_cluster} contents:")
    
    cluster_indices = np.where(clusters_train == test_cluster)[0]
    cluster_df = df_train.iloc[cluster_indices]
    
    for _, row in cluster_df.iterrows():
        print(f"Title: {row['Title']}, Tags: {row['tags']}")
    
    print("\n" + "-"*80 + "\n")


Test point 1 (Title: US climate envoy Kerry meets China's VP, urges China to 'raise ambition'
, Tags: ['US', 'China', 'Climate Change', 'Energy', 'Diplomacy', 'United Nations']):

Cluster 469 contents:
Title: Biden and Xi talk past each other in San Francisco, Tags: ['United States', 'China', 'Xi Jinping', 'Joe Biden', 'Geopolitics', 'Diplomacy']
Title: Efforts to stabilise US-China ties will continue amid small window of opportunity: Panellists, Tags: ['US-China Relations', 'Stability Efforts', 'High-Level Visits', 'Geopolitical Tensions', '2024 US Presidential Election', 'Political Constraints']
Title: Moderate expectations from possible US-China summit, top diplomats in Washington say, Tags: ['US-China Relations', 'Asia-Pacific Economic Cooperation', 'Summit', 'Strategic Competition', 'Taiwan Conflict', 'Singapore', 'France', 'Australia']
Title: What did the Chinese get out of Biden-Xi meeting?   , Tags: ['US-China Relations', 'Biden-Xi Meeting', 'Taiwan Issue', 'Military Communicat

In [85]:
similar_articles_dict = {}

# Generate the list of the most similar articles in each cluster
for i, (test_point, test_cluster) in enumerate(zip(df_test.itertuples(), test_clusters)):
    print(f"Test point {i + 1} (Title: {test_point.Title}\n, Tags: {test_point.tags}):")
    print()
    print(f"Cluster {test_cluster} contents:")
    
    cluster_indices = np.where(clusters_train == test_cluster)[0]
    cluster_df = df_train.iloc[cluster_indices]
    
    test_point_tags = set(test_point.tags)
    max_matching_tags = 0
    most_similar_articles = []

    for idx, row in cluster_df.iterrows():
        row_tags = set(row['tags'])
        matching_tags = len(test_point_tags.intersection(row_tags))
        
        if matching_tags > max_matching_tags:
            max_matching_tags = matching_tags
            most_similar_articles = [idx]
        elif matching_tags == max_matching_tags:
            most_similar_articles.append(idx)
    
    # Store the titles and indexes in the dictionary if the test point has at least 2 supporting articles
    if len(most_similar_articles)>=2:
        similar_articles_dict[i + 1] = {"Title": test_point.Title, 'indexes': most_similar_articles}
    
    print(f"Most similar articles with {max_matching_tags} matching tags:")
    for idx in most_similar_articles:
        article = df_train.iloc[idx]
        print(f"Row {idx} - Title: {article['Title']}, Tags: {article['tags']}")
    
    print("\n" + "-"*80 + "\n")

# Print the dictionary
print("Dictionary of similar articles indexes:")

Test point 1 (Title: US climate envoy Kerry meets China's VP, urges China to 'raise ambition'
, Tags: ['US', 'China', 'Climate Change', 'Energy', 'Diplomacy', 'United Nations']):

Cluster 469 contents:
Most similar articles with 2 matching tags:
Row 104 - Title: Biden and Xi talk past each other in San Francisco, Tags: ['United States', 'China', 'Xi Jinping', 'Joe Biden', 'Geopolitics', 'Diplomacy']
Row 525 - Title: Russia will succeed in Ukraine unless US support continues: Pentagon chief, Tags: ['Ukraine', 'Russia', 'US support', 'Pentagon', 'Senate Appropriations Committee', 'Funding for Ukraine']
Row 813 - Title: Sri Lanka pip Pakistan to make Asia Cup final v India, Tags: ['Sri Lanka', 'Pakistan', 'Asia Cup', 'Cricket', 'Final', 'Thriller']

--------------------------------------------------------------------------------

Test point 2 (Title: Births in Italy heading for new record low in 2023: Stats office
, Tags: ['Italy', 'Birth rate', 'Demography', 'Economy', 'Prime Minister Gi

In [86]:
similar_articles_dict

{1: {'Title': "US climate envoy Kerry meets China's VP, urges China to 'raise ambition'",
  'indexes': [104, 525, 813]},
 4: {'Title': 'US concerned at Canada allegations of Sikh murder, urges India to cooperate -official',
  'indexes': [152, 168, 765, 842, 887]},
 6: {'Title': 'Forum: Affiliated schools contribute to a diverse education landscape    ',
  'indexes': [100, 679]}}

In [101]:
# Initialize the generative model
llm = genai.GenerativeModel('gemini-1.0-pro')

# Define the template
template = '''
Given a news headline, determine whether headline requires a contextual background (for example, in the form of a timeline). 
Guideline:
Needs timeline: (1) Complex article (2) Specific details or referring to individuals or entities (3) Important and impactful article (4) Political events
Does not need timeline: (1) Explainer article (2) Forum, commentaries, podcast, info-graphics (3) Accidents or independent events
Then reply in terms of the need with a score 1 - 5, 1 means unnecessary, 5 means necessary, in JSON format, for example score 3.
TEXT: {text}:
ANSWER
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text"],
    template=template,
)

timeline_keys = []

for key, title_index in similar_articles_dict.items():
    
    # Define the headline
    headline = title_index['Title']

    # Format the prompt
    final_prompt = prompt.format(text=headline)

    # Generate content using the generative model
    response = llm.generate_content(
        final_prompt,
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        }
    )
    score = json.loads(response.parts[0].text.strip("`").strip().strip("json").strip())['score']
    if score >=3:
        timeline_keys.append(key)

timeline_keys

[1, 4]

In [107]:
template = '''
Given a series of articles, each containing a publication date, title, and content, your task is to construct a detailed timeline of events leading up to the main event described in the first article.
Analyze the First Article: Begin by thoroughly analyzing the title, content, and publication date of the first article to understand the main event.
Use Subsequent Articles: For each following article, examine the title, content, and publication date. Identify events, context, and any time references such as "last week," "last month," or specific dates.

Construct the Timeline:
Chronological Order: Organize the events chronologically, using the publication dates and time references within the articles.
Detailed Descriptions: Provide detailed descriptions of each event, explaining how it relates to the main event of the first article.
Contextual Links: Use information from the articles to link events together logically and coherently.
Handle Ambiguities: If an article uses ambiguous time references, infer the date based on the publication date of the article and provide a clear rationale for your inference.

Example Format:
Date: Description of the event with context and links to the main event.
Date: Description of the event with context and links to the main event.
...

Example of a good output (Do not include this in the output):
Date: August 1, 2023: Major flood hits the southern region of Country X, causing significant damage and displacing thousands of residents. (Article 2)
Date: August 5, 2023: The government of Country X declares a state of emergency in response to the flooding, mobilizing national resources for disaster relief. (Article 1)
Date: August 10, 2023: International aid begins to arrive in Country X, with several countries sending supplies and personnel to assist in the relief efforts. (Article 3)
Date: August 15, 2023: Reports emerge about the successful evacuation of thousands of residents, and the government begins planning long-term recovery efforts. (Article 4)

Contextual Links:
External Influences: Mention any external influences (e.g., global conflicts, economic trends, scientific discoveries) that might have indirectly affected the events.
Internal Issues: Highlight any internal issues or developments (e.g., political changes, organizational restructuring, societal movements) within the entities involved that might have impacted the events.
Efforts for Improvement: Note any indications of efforts to improve the situation (e.g., policy changes, strategic initiatives, collaborative projects) despite existing challenges.
Be as thorough and precise as possible, ensuring the timeline accurately reflects the sequence and context of events leading to the main event.

Series of Articles: {text}
'''

# Create the prompt template
prompt = PromptTemplate(
    input_variables=["text"],
    template=template,
)

In [108]:
trial_keys = [1]
for key in trial_keys:
    df_retrieve = df.iloc[similar_articles_dict[key]['indexes']]
    df_retrieve = pd.concat([df_retrieve, df_test.iloc[[key-1]]], axis=0)

    indiv_text = list(df_retrieve.combined.values)
    indiv_dates = list(df_retrieve.Publication_date.values)
    all = []
    for i in range(len(indiv_text)-1,-1,-1):
        s =  f'Publication date: {indiv_dates[i]}  {indiv_text[i]}'
        all.append(s)
    sum_of_text = ", ".join(all) 
    
    final_prompt  = prompt.format(text=sum_of_text)
    response = llm.generate_content(final_prompt,
                                   safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    }
    )
timeline = response.parts[0].text
timeline

"**Timeline of Events Leading to US-China Summit:**\n\n**Date:** 2023-09-06 \n**Event:** Ukraine's parliament votes to approve Rustem Umerov as the new defense minister, amidst the country's largest wartime shake-up of the defense sector.\n\n**Date:** 2023-09-20\n**Event:** In a meeting at the United Nations General Assembly in New York City, US climate envoy John Kerry meets with Chinese Vice President Han Zheng and emphasizes the need for China to raise its ambition in efforts to address climate change.\n\n**Date:** 2023-10-29\n**Event:** China's Foreign Minister Wang Yi warns that the road to a potential summit between President Xi Jinping and US President Joe Biden would not be smooth, citing ongoing tensions between the two countries.\n\n**Date:** 2023-11-15\n**Event:** US President Joe Biden and Chinese leader Xi Jinping meet face-to-face at a grand villa outside San Francisco. Despite token bilateral agreements, the talks reveal a significant disconnect in geopolitical perspecti

In [110]:
def format_timeline(timeline_str):
    # Split the string into lines
    lines = timeline_str.strip().split('\n')
    
    # Initialize an empty list to hold formatted lines
    formatted_lines = []
    
    # Loop through each line to format it
    for line in lines:
        # Strip leading/trailing spaces
        line = line.strip()
        
        # Check for specific keywords and format accordingly
        if line.startswith("**Timeline of Events Leading to"):
            formatted_lines.append(f"# {line.replace('**', '')}\n")
        elif line.startswith("**Date:**"):
            date = line.replace("**Date:**", "").strip()
            formatted_lines.append(f"## {date}\n")
        elif line.startswith("**Event:**"):
            event = line.replace("**Event:**", "").strip()
            formatted_lines.append(f"- {event}\n")
        else:
            formatted_lines.append(line)
    
    # Join the formatted lines back into a single string
    formatted_timeline = "\n".join(formatted_lines)
    
    return formatted_timeline

formatted_timeline = format_timeline(timeline)
print(formatted_timeline)

# Timeline of Events Leading to US-China Summit:


## 2023-09-06

- Ukraine's parliament votes to approve Rustem Umerov as the new defense minister, amidst the country's largest wartime shake-up of the defense sector.


## 2023-09-20

- In a meeting at the United Nations General Assembly in New York City, US climate envoy John Kerry meets with Chinese Vice President Han Zheng and emphasizes the need for China to raise its ambition in efforts to address climate change.


## 2023-10-29

- China's Foreign Minister Wang Yi warns that the road to a potential summit between President Xi Jinping and US President Joe Biden would not be smooth, citing ongoing tensions between the two countries.


## 2023-11-15

- US President Joe Biden and Chinese leader Xi Jinping meet face-to-face at a grand villa outside San Francisco. Despite token bilateral agreements, the talks reveal a significant disconnect in geopolitical perspectives, leaving US-China relations vulnerable to a deeper crisis.

