In [1]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)


/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: /home/jerry/Desktop/timeline project/timeline/bin/python3: No such file or directory
/Users/jerryyang/Desktop/SPH/sph-timeline-project/timeline/bin/pip: line 2: exec: /home/jerry/Desktop/timeline project/timeline/bin/python3: cannot execute: No such file or directory


In [2]:
# Function to load and combine the split dataframes
def load_and_merge_csv(file_pattern, num_files):
    file_names = [file_pattern.format(i) for i in range(1, num_files + 1)]
    dataframes = [pd.read_csv(filename) for filename in file_names]
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

df = load_and_merge_csv('../data_upload/cluster_labels{}.csv', 4)

In [3]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            2019 non-null   object
 1   Text          2019 non-null   object
 2   Title         2018 non-null   object
 3   embeddings    2019 non-null   object
 4   Cluster       2019 non-null   int64 
 5   combined      2018 non-null   object
 6   Common_Theme  2019 non-null   object
dtypes: int64(1), object(6)
memory usage: 110.5+ KB


- multi processing of inputs using multiprocessing library

## Multithreading for Tag Generation

In [8]:
#Ran this only to generat second batch of tags
df = df.loc[(range(1249,len(df)))]
df.head()

Unnamed: 0,id,Text,Title,embeddings,Cluster,combined,Common_Theme
1249,cua9j48642kyknz0,NEW YORK – For a region that lives and breathe...,Why a football-crazy continent gets just 3 Wor...,"[0.023741, 0.076849, -0.032606, 0.024595, 0.01...",5,Title: Why a football-crazy continent gets jus...,Football and Sporting Events
1250,ir2d3dafhjh0fkwn,HONG KONG – Asia’s last currency standing agai...,Asia’s last holdout against strong US dollar f...,"[-0.006965, -0.0196, -0.037286, -0.052639, 0.0...",16,Title: Asia’s last holdout against strong US d...,Financial Services and Business Developments i...
1251,36o7nvcxkupd7nxx,"With the support of EnterpriseSG, it moves int...",S’pore firm spends two years perfecting vegan ...,"[0.034536, 0.039622, -0.008628, -0.008201, 0.0...",19,Title: S’pore firm spends two years perfecting...,Entertainment and Culture
1252,b1nbk81lw7vcfsco,WASHINGTON - U.S. Secretary of State Anto...,Blinken meets Guatemalan President-elect Areva...,"[-0.000794, 0.062001, 0.012275, 0.029521, -0.0...",4,Title: Blinken meets Guatemalan President-elec...,Political Crises and Human Rights Concerns
1253,ekdsjni0osyemws5,SEOUL - South Korean President Yoon Suk Y...,South Korea's Yoon to warn APEC about risks fr...,"[0.050017, 0.006349, 0.015985, 0.028293, 0.028...",2,Title: South Korea's Yoon to warn APEC about r...,Geopolitical Alliances and Tensions in the Ind...


In [9]:
llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 6 most relevant tags that best categorize the main events, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely, and the response should not include any "-" or backslash n.
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)
        
def fetch_tags(article_pair):
    article_text, article_id = article_pair
    final_prompt = prompt.format(text=article_text)
    response = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    time.sleep(1)
    try:
        tags = str(response.parts[0])[7:].strip().replace("\n","").replace("\"","" ).split(", ")
        return article_id, tags
    except IndexError:
        return article_id, "error"
    except ValueError:
        return article_id, "error"
    except AttributeError:
        return article_id, "error"
        

def process_articles(df):
    results = []  # Use a list to store all batches for simplicity in JSON output
    max_workers = 5
    batch_size = 50
    cooldown_period = 60  # seconds

    articles = df['combined'].tolist()
    article_ids = df['id'].tolist()
    article_id_pairs = list(zip(articles, article_ids))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(0, len(article_id_pairs), batch_size):
            current_batch = article_id_pairs[i:i+batch_size]
            print(f"Starting batch processing for articles {i+1} to {min(i+batch_size, len(article_id_pairs))}")
            futures = {executor.submit(fetch_tags, pair): pair for pair in current_batch}

            batch_results = {} 
            processed = i# Dictionary to store results for the current batch
            for future in as_completed(futures):
                article_id, tags = future.result()
                batch_results[article_id] = tags
                print(f"Received tags for article ID {processed}")
                processed += 1

            # Append the current batch's results to the overall results
            results.append({'batch': i//batch_size + 1, 'results': batch_results})
            
            # Write/overwrite the JSON file with the updated results
            with open('batch_results2.json', 'w') as f:
                json.dump(results, f, indent=4)

            print(f"All tasks in batch {(i // batch_size) + 1} completed, cooling down for {cooldown_period} seconds...")
            time.sleep(cooldown_period)

    return results

In [10]:
all_tags = process_articles(df)
all_tags

Starting batch processing for articles 1 to 50
Received tags for article ID 0
Received tags for article ID 1
Received tags for article ID 2
Received tags for article ID 3
Received tags for article ID 4
Received tags for article ID 5
Received tags for article ID 6
Received tags for article ID 7
Received tags for article ID 8
Received tags for article ID 9
Received tags for article ID 10
Received tags for article ID 11
Received tags for article ID 12
Received tags for article ID 13
Received tags for article ID 14
Received tags for article ID 15
Received tags for article ID 16
Received tags for article ID 17
Received tags for article ID 18
Received tags for article ID 19
Received tags for article ID 20
Received tags for article ID 21
Received tags for article ID 22
Received tags for article ID 23
Received tags for article ID 24
Received tags for article ID 25
Received tags for article ID 26
Received tags for article ID 27
Received tags for article ID 28
Received tags for article ID 29
Rec

[{'batch': 1,
  'results': {'36o7nvcxkupd7nxx': ['Singapore',
    'Vegan',
    'Food',
    'Plant-based',
    'Sustainable',
    'Meat'],
   'b1nbk81lw7vcfsco': ['Guatemala',
    'Election',
    'Political Transition',
    'Antony Blinken',
    'Bernardo Arevalo',
    'United States'],
   'ekdsjni0osyemws5': ['South Korea',
    'APEC',
    'North Korea-Russia ties',
    'Indo-Pacific',
    'IPEF',
    'United States'],
   'ir2d3dafhjh0fkwn': ['US dollar',
    'Asia',
    'Currency',
    'FED',
    'Indonesia',
    'Rupiah'],
   'cua9j48642kyknz0': ['Argentina',
    'Brazil',
    'South America',
    'FIFA',
    'World Cup',
    'Lionel Messi'],
   '70h26cojtty6nw7d': ['Singapore',
    'Seniors',
    'Active ageing',
    'Community care',
    'Built environment',
    'Assisted living'],
   'iz0iptiddry2hdmd': ['Moorpark',
    'Thousand Oaks',
    'Ventura County',
    'US',
    'Altercation',
    'Protests'],
   'mm1vo2b18nyw4emy': ['Hong Kong',
    'Democracy Activists',
    'National 

## Processing the json file