In [30]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange

# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()


True

In [33]:
# Function to load and combine the split dataframes
def load_and_merge_csv(file_pattern, num_files):
    file_names = [file_pattern.format(i) for i in range(1, num_files + 1)]
    dataframes = [pd.read_csv(filename) for filename in file_names]
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

df = load_and_merge_csv('data_upload/cluster_labels{}.csv', 4)

In [4]:
def visualise_titles(df, cluster):
    sample = pd.DataFrame(df[df['Cluster'] == cluster]).reset_index()
    print('Common Theme:' +sample.Common_Theme[0])
    print('Titles: ')
    print(sample.sample(8).Title.values)
    
visualise_titles(df,3)

Common Theme:Technology, Sustainability, and Social Impact
Titles: 
['Most online hate targets women, says European Union report'
 'Evening Update: Today’s headlines from The Straits Times on Nov 19, 2023'
 'Amazon and TikTok leave opening in next e-commerce boom '
 "Lessons that helped Singapore's Osim open some 400 stores in 100 cities"
 'Hassle-free cleaning: Smart robot vacuum cleans own mop pads and revisits dirty areas'
 'TikTok opposes mooted Indonesia social media transaction ban'
 'Binance sees $1.3 billion in outflows after Zhao steps down to settle US probe '
 "Apple files legal challenge to EU's Digital Markets Act"]


In [35]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            2019 non-null   object
 1   Text          2019 non-null   object
 2   Title         2018 non-null   object
 3   embeddings    2019 non-null   object
 4   Cluster       2019 non-null   int64 
 5   combined      2018 non-null   object
 6   Common_Theme  2019 non-null   object
dtypes: int64(1), object(6)
memory usage: 110.5+ KB


In [6]:
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

- multi processing of inputs using multiprocessing library

## Multiprocessing for Tag Generation

In [17]:
df = df.iloc[range(100)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            100 non-null    object
 1   Text          100 non-null    object
 2   Title         100 non-null    object
 3   embeddings    100 non-null    object
 4   Cluster       100 non-null    int64 
 5   combined      100 non-null    object
 6   Common_Theme  100 non-null    object
dtypes: int64(1), object(6)
memory usage: 6.2+ KB


In [20]:
from multiprocessing import Pool
import requests

In [38]:
df1 = df.loc[range(25)]

In [41]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            25 non-null     object
 1   Text          25 non-null     object
 2   Title         25 non-null     object
 3   embeddings    25 non-null     object
 4   Cluster       25 non-null     int64 
 5   combined      25 non-null     object
 6   Common_Theme  25 non-null     object
dtypes: int64(1), object(6)
memory usage: 1.6+ KB


## 25 articles with thread pool

In [89]:
from concurrent.futures import ThreadPoolExecutor, as_completed

llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 3 to 5 relevant tags that categorize the main themes, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely. 
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)
        
def fetch_tags(article):
    article_text, article_id = article
    final_prompt = prompt.format(text=article_text)
    response = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    try:
        return article_id, response.text.strip().split(", ")
    except ValueError:
        error_msg = 'unable to generate'
        return article_id, error_msg
        

def process_articles(articles, ids):
    results = {}
    with ThreadPoolExecutor(max_workers=5) as executor:
        # Pair each article with its ID before submission
        article_id_pairs = zip(articles, ids)
        # Submit each paired article and ID as a task
        futures = {executor.submit(fetch_tags, pair): pair for pair in article_id_pairs}
        for future in as_completed(futures):
            article_id, tags = future.result()
            results[article_id] = tags
    return results

# Assuming df1['combined'] is a list of articles and df1['id'] contains their IDs
articles = df1['combined'].tolist()
article_ids = df1['id'].tolist()
tags = process_articles(articles, article_ids)


In [101]:
copy_tags = tags
copy_tags

{'aksixz7uun2gkpss': ['Israel',
  'Judicial crisis',
  'Shekel',
  'Supreme Court hearing',
  'Political compromise'],
 'rlh53czyst054zfn': ['Myanmar', 'UN', 'Democracy', 'Repression', 'Rohingya'],
 'nos7tzp7jprxlqxe': ['Swiss Alps',
  'Missing Climber',
  'Remains Discovered',
  'Glaciers Melting',
  'Global Warming'],
 'aph1tgua3xxoq2sg': ['U.S. Open',
  'Tennis',
  'Novak Djokovic',
  'Iga Swiatek',
  'Caroline Wozniacki'],
 'slue2wdvlok4sfy6': ['Japan',
  'Prime Minister Kishida',
  'Assassination attempt',
  'pipe bomb\n    Ryuji Kimura suspect',
  'Minimum age for running in elections'],
 'zlimezzuv9k0v2mo': ['Sembawang',
  'Car Crash',
  'Multiple Car Collision',
  'Singapore',
  'Road Accident'],
 'jmcyx62frlc3i24s': ['Brazil',
  'Peru',
  'FIFA World Cup 2026',
  'South American Qualifiers',
  'Marquinhos'],
 'szltbvfarltlhw2v': ['India', 'Cricket', 'World Cup', 'Asia Cup', 'KL Rahul'],
 'zvv4ue0w64vfqoz1': ['- Artificial intelligence (AI)\n- AI Ethics\n- AI Governance\n- Gen 

In [100]:
## Cleaning and filtering 
def map_to_id(df1, tags):
    ids = df1.id.to_list()
    ordered_tags = []
    for id in ids:
        # Clean each tag by stripping extra spaces, removing '*', replacing newlines and dashes, and capitalizing
        clean_tag = [tag.strip().replace('#', '').replace('*', '').replace('\n', ',').replace('-', '').title() for tag in tags[id] if tag.strip()]
        ordered_tags.append(clean_tag)
    return ordered_tags

# USE COPY _TAGS
clean_and_ordered_tags_list = map_to_id(df1,copy_tags)
clean_and_ordered_tags_list
#strs = list(map(lambda x: ', '.join(x), copy_tags))
# df1['tags'] = pd.DataFrame(strs)
# df1.head()

KeyError: 'zuq666o1ibnqwucu'

## All articles with thread pool

In [97]:
df1 = df.loc[range(300)]
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 0 to 299
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            300 non-null    object
 1   Text          300 non-null    object
 2   Title         300 non-null    object
 3   embeddings    300 non-null    object
 4   Cluster       300 non-null    int64 
 5   combined      300 non-null    object
 6   Common_Theme  300 non-null    object
dtypes: int64(1), object(6)
memory usage: 18.8+ KB


In [99]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 3 to 5 relevant tags that categorize the main themes, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely. 
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)
        
def fetch_tags(article_pair):
    article_text, article_id = article_pair
    final_prompt = prompt.format(text=article_text)
    response = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    
    try:
        return article_id, response.text.strip().split(", ")
    except ValueError:
        error_msg = 'unable to generate'
        return article_id, error_msg

def process_articles(articles, ids):
    results = {}
    max_workers = 5
    batch_size = 100
    cooldown_period = 90  # seconds

    # Create pairs of articles and IDs
    article_id_pairs = list(zip(articles, ids))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Process each batch
        for i in range(0, len(article_id_pairs), batch_size):
            # Slice the batch
            current_batch = article_id_pairs[i:i+batch_size]

            # Submit tasks for the current batch
            futures = {executor.submit(fetch_tags, pair): pair for pair in current_batch}

            # Wait for all futures in the current batch to complete
            for future in as_completed(futures):
                article_id, tags = future.result()
                results[article_id] = tags

            # If there is a next batch, apply the cooldown period
            if i + batch_size < len(article_id_pairs):
                print(f"All tasks in batch completed, cooling down for {cooldown_period} seconds...")
                time.sleep(cooldown_period)

    return results

articles = df1['combined'].tolist()
article_ids = df1['id'].tolist()
all_tags = process_articles(articles, article_ids)

ResourceExhausted: 429 Quota exceeded for quota metric 'Generate Content API requests per minute' and limit 'GenerateContent request limit per minute for a region' of service 'generativelanguage.googleapis.com' for consumer 'project_number:1047065474834'. [reason: "RATE_LIMIT_EXCEEDED"
domain: "googleapis.com"
metadata {
  key: "service"
  value: "generativelanguage.googleapis.com"
}
metadata {
  key: "quota_metric"
  value: "generativelanguage.googleapis.com/generate_content_requests"
}
metadata {
  key: "quota_location"
  value: "us-west4"
}
metadata {
  key: "quota_limit"
  value: "GenerateContentRequestsPerMinutePerProjectPerRegion"
}
metadata {
  key: "quota_limit_value"
  value: "60"
}
metadata {
  key: "consumer"
  value: "projects/1047065474834"
}
, links {
  description: "Request a higher quota limit."
  url: "https://cloud.google.com/docs/quota#requesting_higher_quota"
}
]

In [28]:
#Run this tomorrow:
llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 3 to 5 relevant tags that categorize the main themes, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely. 
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
            
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)

all_tags = []
for i in trange(len(df)):
    article = df.combined[i]
    final_prompt  = prompt.format(text=article)
    article_tags = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    all_tags.append(article_tags.text.strip().split(", "))
    
all_tags


  4%|▍         | 4/100 [00:08<03:30,  2.19s/it]


[['Missing climbers',
  'Global warming',
  'Swiss Alps',
  'Saas Valley',
  'British mountaineer'],
 ['AI', 'Ethics', 'Climate change', 'Youth involvement', 'Generative AI'],
 ['Tennis', 'US Open', 'Novak Djokovic', 'Iga Swiatek', 'Grand Slam'],
 ['Myanmar', 'Political Crisis', 'UN', 'ASEAN', 'Rohingya Refugees'],
 ['', 'Judicial Crisis', 'Israel', 'Shekel', 'Politics']]

In [60]:
def clean_tags(nested_tags):
    cleaned_nested_tags = []
    for tag_list in nested_tags:
        # Include condition to filter out empty or whitespace-only tags
        cleaned_tags = [tag.strip().replace('*', '').title() for tag in tag_list if tag.strip()]
        cleaned_nested_tags.append(cleaned_tags)
    return cleaned_nested_tags

cleaned_tags = clean_tags(all_tags)
cleaned_tags

[['Swiss Alps',
  'Missing Climber',
  'Glacial Melt',
  'Dna Identification',
  'Climate Change'],
 ['Artificial Intelligence',
  'Generative Ai',
  'Ethics',
  'Risk Management',
  'Youth Engagement'],
 ['Tennis', 'U.S. Open', 'Grand Slam', 'Novak Djokovic', 'Iga Swiatek'],
 ['Myanmar', 'United Nations', 'Military Coup', 'Asean', 'Rohingya Crisis'],
 ['Israel', 'Politics', 'Currency', 'Economy', 'Judicial Reform']]

In [None]:
# For the enew article that come sin, use the generate_tag function, (create one),\
    # then find similar articles that have like 2 or 3 of the same tags, means relevant