In [1]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)


/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: /home/jerry/Desktop/timeline project/timeline/bin/python3: No such file or directory
/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: exec: /home/jerry/Desktop/timeline project/timeline/bin/python3: cannot execute: No such file or directory


In [2]:
# Function to load and combine the split dataframes
def load_and_merge_csv(file_pattern, num_files):
    file_names = [file_pattern.format(i) for i in range(1, num_files + 1)]
    dataframes = [pd.read_csv(filename) for filename in file_names]
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

df = load_and_merge_csv('../data_upload/cluster_labels{}.csv', 4)

In [3]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            2019 non-null   object
 1   Text          2019 non-null   object
 2   Title         2018 non-null   object
 3   embeddings    2019 non-null   object
 4   Cluster       2019 non-null   int64 
 5   combined      2018 non-null   object
 6   Common_Theme  2019 non-null   object
dtypes: int64(1), object(6)
memory usage: 110.5+ KB


- multi processing of inputs using multiprocessing library

## Multithreading for Tag Generation

In [8]:
#Ran this only to generat second batch of tags
df = df.loc[(range(1249,len(df)))]
df.head()

Unnamed: 0,id,Text,Title,embeddings,Cluster,combined,Common_Theme
1249,cua9j48642kyknz0,NEW YORK – For a region that lives and breathe...,Why a football-crazy continent gets just 3 Wor...,"[0.023741, 0.076849, -0.032606, 0.024595, 0.01...",5,Title: Why a football-crazy continent gets jus...,Football and Sporting Events
1250,ir2d3dafhjh0fkwn,HONG KONG – Asia’s last currency standing agai...,Asia’s last holdout against strong US dollar f...,"[-0.006965, -0.0196, -0.037286, -0.052639, 0.0...",16,Title: Asia’s last holdout against strong US d...,Financial Services and Business Developments i...
1251,36o7nvcxkupd7nxx,"With the support of EnterpriseSG, it moves int...",S’pore firm spends two years perfecting vegan ...,"[0.034536, 0.039622, -0.008628, -0.008201, 0.0...",19,Title: S’pore firm spends two years perfecting...,Entertainment and Culture
1252,b1nbk81lw7vcfsco,WASHINGTON - U.S. Secretary of State Anto...,Blinken meets Guatemalan President-elect Areva...,"[-0.000794, 0.062001, 0.012275, 0.029521, -0.0...",4,Title: Blinken meets Guatemalan President-elec...,Political Crises and Human Rights Concerns
1253,ekdsjni0osyemws5,SEOUL - South Korean President Yoon Suk Y...,South Korea's Yoon to warn APEC about risks fr...,"[0.050017, 0.006349, 0.015985, 0.028293, 0.028...",2,Title: South Korea's Yoon to warn APEC about r...,Geopolitical Alliances and Tensions in the Ind...


In [9]:
llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 6 most relevant tags that best categorize the main events, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely, and the response should not include any "-" or backslash n.
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)
        
def fetch_tags(article_pair):
    article_text, article_id = article_pair
    final_prompt = prompt.format(text=article_text)
    # response = llm.generate_content(final_prompt, safety_settings={
    #                                 HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    #                                 HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
    #                                 HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
    #                                 HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
    #                                 })
    time.sleep(1)
    try:
        tags = str(response.parts[0])[7:].strip().replace("\n","").replace("\"","" ).split(", ")
        return article_id, tags
    except IndexError:
        return article_id, "error"
    except ValueError:
        return article_id, "error"
    except AttributeError:
        return article_id, "error"
        

def process_articles(df):
    results = []  # Use a list to store all batches for simplicity in JSON output
    max_workers = 5
    batch_size = 50
    cooldown_period = 60  # seconds

    articles = df['combined'].tolist()
    article_ids = df['id'].tolist()
    article_id_pairs = list(zip(articles, article_ids))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(0, len(article_id_pairs), batch_size):
            current_batch = article_id_pairs[i:i+batch_size]
            print(f"Starting batch processing for articles {i+1} to {min(i+batch_size, len(article_id_pairs))}")
            futures = {executor.submit(fetch_tags, pair): pair for pair in current_batch}

            batch_results = {} 
            processed = i# Dictionary to store results for the current batch
            for future in as_completed(futures):
                article_id, tags = future.result()
                batch_results[article_id] = tags
                print(f"Received tags for article ID {processed}")
                processed += 1

            # Append the current batch's results to the overall results
            results.append({'batch': i//batch_size + 1, 'results': batch_results})
            
            # Write/overwrite the JSON file with the updated results
            with open('batch_results2.json', 'w') as f:
                json.dump(results, f, indent=4)

            print(f"All tasks in batch {(i // batch_size) + 1} completed, cooling down for {cooldown_period} seconds...")
            time.sleep(cooldown_period)

    return results

In [10]:
all_tags = process_articles(df)
all_tags

Starting batch processing for articles 1 to 50
Received tags for article ID 0
Received tags for article ID 1
Received tags for article ID 2
Received tags for article ID 3
Received tags for article ID 4
Received tags for article ID 5
Received tags for article ID 6
Received tags for article ID 7
Received tags for article ID 8
Received tags for article ID 9
Received tags for article ID 10
Received tags for article ID 11
Received tags for article ID 12
Received tags for article ID 13
Received tags for article ID 14
Received tags for article ID 15
Received tags for article ID 16
Received tags for article ID 17
Received tags for article ID 18
Received tags for article ID 19
Received tags for article ID 20
Received tags for article ID 21
Received tags for article ID 22
Received tags for article ID 23
Received tags for article ID 24
Received tags for article ID 25
Received tags for article ID 26
Received tags for article ID 27
Received tags for article ID 28
Received tags for article ID 29
Rec

[{'batch': 1,
  'results': {'36o7nvcxkupd7nxx': ['Singapore',
    'Vegan',
    'Food',
    'Plant-based',
    'Sustainable',
    'Meat'],
   'b1nbk81lw7vcfsco': ['Guatemala',
    'Election',
    'Political Transition',
    'Antony Blinken',
    'Bernardo Arevalo',
    'United States'],
   'ekdsjni0osyemws5': ['South Korea',
    'APEC',
    'North Korea-Russia ties',
    'Indo-Pacific',
    'IPEF',
    'United States'],
   'ir2d3dafhjh0fkwn': ['US dollar',
    'Asia',
    'Currency',
    'FED',
    'Indonesia',
    'Rupiah'],
   'cua9j48642kyknz0': ['Argentina',
    'Brazil',
    'South America',
    'FIFA',
    'World Cup',
    'Lionel Messi'],
   '70h26cojtty6nw7d': ['Singapore',
    'Seniors',
    'Active ageing',
    'Community care',
    'Built environment',
    'Assisted living'],
   'iz0iptiddry2hdmd': ['Moorpark',
    'Thousand Oaks',
    'Ventura County',
    'US',
    'Altercation',
    'Protests'],
   'mm1vo2b18nyw4emy': ['Hong Kong',
    'Democracy Activists',
    'National 

## Processing the json file

In [12]:
df = load_and_merge_csv('../data_upload/cluster_labels{}.csv', 4)
df.drop(['Cluster', "Common_Theme"], axis=1, inplace=True)

In [13]:
df.head()

Unnamed: 0,id,Text,Title,embeddings,combined
0,nos7tzp7jprxlqxe,GENEVA – The remains of a climber discovered i...,Remains found in Swiss Alps are those of Briti...,"[0.063923, 0.065677, -0.001089, 0.065425, -0.0...",Title: Remains found in Swiss Alps are those o...
1,zvv4ue0w64vfqoz1,Ms Greta Thunburg became a household name when...,Involve youth in shaping ethical use of AI,"[0.063668, 0.098002, -0.022514, -0.033031, -0....",Title: Involve youth in shaping ethical use of...
2,aph1tgua3xxoq2sg,NEW YORK - Defending women's champion Iga...,"Swiatek, Djokovic headline third round action ...","[-0.019315, 0.066645, 0.009547, 0.029555, -0.0...","Title: Swiatek, Djokovic headline third round ..."
3,rlh53czyst054zfn,JAKARTA – Hopes of a return to democracy in ju...,‘Systematic repression’ crushing Myanmar’s dem...,"[0.067328, -0.004407, 0.010127, -0.004268, -0....",Title: ‘Systematic repression’ crushing Myanma...
4,aksixz7uun2gkpss,JERUSALEM - Israel's shekel dropped to it...,Israel's shekel falls as judicial showdown looms,"[-0.043186, 0.076352, -0.015492, -0.02859, -0....",Title: Israel's shekel falls as judicial showd...


In [69]:
import json
import re

def clean_words(word_list):
    pattern_prefix_suffix = re.compile(r'(^[-\\])|(\d+\.\s*)|(\*[-\\]$)')
    pattern_general = re.compile(r'[^\w\s-]')
    
    cleaned_list = []
    for item in word_list:
        parts = item.split('\\n')
        for part in parts:
            part = pattern_prefix_suffix.sub('', part)
            part = pattern_general.sub('', part)
            clean_part = part.strip()
            if clean_part:
                cleaned_list.append(clean_part)
    return cleaned_list

def clean_tags(batch_tags):
    error_ids = []
    for id, tags in batch_tags.items():
        if tags[0] == "e":
            error_ids.append(id)
            continue
        tags = clean_words(tags)
        batch_tags[id] = tags
    return error_ids

def process_files(file_names):
    all_tags = []
    all_errors = []
    for filename in file_names:
        try:
            with open(filename, 'r') as file:
                db = json.load(file)
                for batch in db:
                    error_ids = clean_tags(batch['results'])
                    all_tags.append(batch)
                    all_errors.extend(error_ids)
        except IOError:
            print(f"Error reading file: {filename}")
        except json.JSONDecodeError:
            print(f"Error decoding JSON from file: {filename}")
    return all_tags, all_errors

file_names = ["../data/batch_results.json", "../data/batch_results2.json"]
all_tags, errors = process_files(file_names)


In [70]:
all_tags

[{'batch': 1,
  'results': {'rlh53czyst054zfn': ['Myanmar',
    'UN chief',
    'ASEAN',
    'Rohingya',
    'Military coup',
    'Democracy'],
   'aph1tgua3xxoq2sg': ['US Open',
    'Grand Slam',
    'Novak Djokovic',
    'Iga Swiatek',
    'Tennis',
    'New York'],
   'aksixz7uun2gkpss': ['Israel',
    'Shekel',
    'Judicial crisis',
    'Supreme Court',
    'Benjamin Netanyahu',
    'Isaac Herzog'],
   'nos7tzp7jprxlqxe': ['Missing Climber',
    'Swiss Alps',
    'Glaciers',
    'Global Warming',
    'DNA Identification',
    'Interpol'],
   'zvv4ue0w64vfqoz1': ['Youth activism',
    'Artificial intelligence',
    'Ethics',
    'Risk management',
    'Generative AI',
    'Geopolitics'],
   'jmcyx62frlc3i24s': ['Brazil',
    'Peru',
    'Football',
    'World Cup',
    '2026 Qualifying',
    'Marquinhos'],
   'zlimezzuv9k0v2mo': ['Sembawang',
    'Car accident',
    'Multiple vehicles',
    'Female driver',
    'Hospitalization',
    'Investigation'],
   'slue2wdvlok4sfy6': ['Fumio

In [119]:
def get_retry_rows(df, errors):
    error_idx = []
    for i in range(len(df.id)):
        if df.id[i] in errors:
            error_idx.append(i)
    return df.iloc[error_idx]

df_retry = get_retry_rows(df, errors)
df_retry.head()  

Unnamed: 0,id,Text,Title,embeddings,combined
43,b8u2ielc9dfhje69,SINGAPORE – A 31-year-old paedophile who first...,Paedophile who was ‘clear danger to young boys...,"[0.064711, 0.049099, -0.00296, -0.03887, -0.05...",Title: Paedophile who was ‘clear danger to you...
438,dohc17a594mb8mpp,SINGAPORE - He was a lauded student volunteer ...,Suspended teacher gets 8 years’ jail for sex a...,"[0.057775, 0.045111, 0.011124, -0.036343, -0.0...",Title: Suspended teacher gets 8 years’ jail fo...
727,hwefkiup56caklj8,SINGAPORE - A man who was 18 years old when he...,"16 years’ jail, caning for man who took part i...","[0.058396, 0.043246, -0.001864, -0.053115, -0....","Title: 16 years’ jail, caning for man who took..."
827,sqcodbyycdg2swl2,LOS ANGELES – Actor Ashton Kutcher has stepped...,Ashton Kutcher resigns from anti-child sex abu...,"[0.055323, 0.095354, -0.012802, 0.029383, -0.0...",Title: Ashton Kutcher resigns from anti-child ...
875,d232da5iiu6ygo9e,SINGAPORE - Eve (not her real name) was relent...,Call to let victims seek quicker redress for m...,"[0.091526, 0.053794, -0.019304, -0.038749, -0....",Title: Call to let victims seek quicker redres...


In [126]:
#articles = list(map(lambda x: x[:70],list(map(str,df_retry.combined))))
articles = df_retry['combined'].tolist()
article_ids = df_retry['id'].tolist()
article_id_pairs = list(zip(articles, article_ids))

llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 6 most relevant tags that best categorize the main events, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely, and the response should not include any "-" or backslash n.
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)

def fetch_tags(article_pair):
    article_text, article_id = article_pair
    final_prompt = prompt.format(text=article_text)
    retries = 3  # Maximum number of retries
    attempt = 0  # Current attempt count

    while attempt < retries:
        # response = llm.generate_content(final_prompt, safety_settings={
        #     HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        #     HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        #     HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
        #     HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
        # })

        try:
            # Assuming the tag list is expected to be a comma-separated string following "Generated tags:"
            tag_str = str(response.parts[0])[7:].strip().replace("\n","").replace("\"","")
            if tag_str:  # Check if the string is not empty
                tags = tag_str.split(", ")
                return article_id, tags
            else:
                raise ValueError("Empty tag string")
        except (IndexError, ValueError):
            attempt += 1  # Increment attempt count if there's an error or empty string
            time.sleep(1)  # Sleep to avoid hitting rate limits or to allow for any transient issues

    return article_id, "error"

In [132]:
results = []
for i in range(len(article_id_pairs)):
    retried = fetch_tags(article_id_pairs[i])
    results.append(retried)
with open('tag_retries.json', 'w') as f:
                json.dump(results, f, indent=4)


In [18]:
## Unable to generate tags for these articles Will try a diff method.