In [13]:
# Import necessary libraries
import os
import ast
import csv
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from dotenv import load_dotenv
from tqdm import trange
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Import libraries for working with language models and Google Gemini
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Install the google-generativeai package (uncomment the line below to run the installation)
!pip install -U -q google-generativeai

# Set up the environment for plotting
%matplotlib inline

# Load environment variables
load_dotenv()


True

In [4]:
!pwd

/Users/jerryyang/Desktop/SPH/sph-timeline-project/src


In [14]:
# Function to load and combine the split dataframes
def load_and_merge_csv(file_pattern, num_files):
    file_names = [file_pattern.format(i) for i in range(1, num_files + 1)]
    dataframes = [pd.read_csv(filename) for filename in file_names]
    merged_df = pd.concat(dataframes, ignore_index=True)
    return merged_df

df = load_and_merge_csv('../data_upload/cluster_labels{}.csv', 4)

In [7]:
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            2019 non-null   object
 1   Text          2019 non-null   object
 2   Title         2018 non-null   object
 3   embeddings    2019 non-null   object
 4   Cluster       2019 non-null   int64 
 5   combined      2018 non-null   object
 6   Common_Theme  2019 non-null   object
dtypes: int64(1), object(6)
memory usage: 110.5+ KB


In [15]:
GEMINI_KEY = os.environ.get('GEMINI_KEY')
genai.configure(api_key=GEMINI_KEY)

- multi processing of inputs using multiprocessing library

## Multiprocessing for Tag Generation

In [9]:
df = df.iloc[range(100)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            100 non-null    object
 1   Text          100 non-null    object
 2   Title         100 non-null    object
 3   embeddings    100 non-null    object
 4   Cluster       100 non-null    int64 
 5   combined      100 non-null    object
 6   Common_Theme  100 non-null    object
dtypes: int64(1), object(6)
memory usage: 6.2+ KB


In [16]:
from multiprocessing import Pool
import requests

In [17]:
df1 = df.loc[range(25)]

In [11]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, 0 to 24
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            25 non-null     object
 1   Text          25 non-null     object
 2   Title         25 non-null     object
 3   embeddings    25 non-null     object
 4   Cluster       25 non-null     int64 
 5   combined      25 non-null     object
 6   Common_Theme  25 non-null     object
dtypes: int64(1), object(6)
memory usage: 1.6+ KB


In [None]:
df1.head()

## 25 articles with thread pool

In [89]:

llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 3 to 5 relevant tags that categorize the main themes, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely. 
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)
        
def fetch_tags(article_pair):
    article_text, article_id = article_pair
    final_prompt = prompt.format(text=article_text)
    response = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    time.sleep(1)
    try:
        return article_id, response.text.strip().split(", ")
    except ValueError:
        return article_id, response.prompt_feedback

def process_articles(df):
    results = {}
    max_workers = 10
    batch_size = 100
    cooldown_period = 90

    articles = df['Text'].tolist()
    ids = df['id'].tolist()
    article_id_pairs = list(zip(articles, ids))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(0, len(article_id_pairs), batch_size):
            current_batch = article_id_pairs[i:i+batch_size]
            print(f"Starting batch processing for articles {i+1} to {min(i+batch_size, len(article_id_pairs))}")
            futures = {executor.submit(fetch_tags, pair): pair for pair in current_batch}

            for future in as_completed(futures):
                article_id, tags = future.result()
                results[article_id] = tags

            if i + batch_size < len(article_id_pairs):
                print(f"All tasks in batch {i//batch_size + 1} completed, cooling down for {cooldown_period} seconds...")
                time.sleep(cooldown_period)

    return results

articles = df1['combined'].tolist()
article_ids = df1['id'].tolist()
tags = process_articles(articles, article_ids)


In [None]:
copy_tags = tags
copy_tags

In [None]:
## Cleaning and filtering 
def map_to_id(df1, tags):
    ids = df1.id.to_list()
    ordered_tags = []
    for id in ids:
        # Clean each tag by stripping extra spaces, removing '*', replacing newlines and dashes, and capitalizing
        clean_tag = [tag.strip().replace('#', '').replace('*', '').replace('\n', ',').replace('-', '').title() for tag in tags[id] if tag.strip()]
        ordered_tags.append(clean_tag)
    return ordered_tags

# USE COPY _TAGS
clean_and_ordered_tags_list = map_to_id(df1,copy_tags)
clean_and_ordered_tags_list
#strs = list(map(lambda x: ', '.join(x), copy_tags))
# df1['tags'] = pd.DataFrame(strs)
# df1.head()

In [19]:
df1 = df.loc[range(300)]
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 0 to 299
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            300 non-null    object
 1   Text          300 non-null    object
 2   Title         300 non-null    object
 3   embeddings    300 non-null    object
 4   Cluster       300 non-null    int64 
 5   combined      300 non-null    object
 6   Common_Theme  300 non-null    object
dtypes: int64(1), object(6)
memory usage: 18.8+ KB


In [20]:
df1['Text'] = df1['Text'].apply(lambda x: x[:170])
df1.Text[0]

'GENEVA – The remains of a climber discovered in the Swiss Alps in 2022 have been identified as those of a British mountaineer who went missing 52 years ago, local police '

In [None]:
llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 2 most relevant tags that best categorize the main events, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely, and the response should not include any "-" or backslash n.
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
                    
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)
        
def fetch_tags(article_pair):
    article_text, article_id = article_pair
    final_prompt = prompt.format(text=article_text)
    response = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    time.sleep(1)
    try:
        tags = str(response.parts[0])[7:].strip().replace("\n","").replace("\"","" ).split(", ")
        return article_id, tags
    except ValueError:
        error_msg = "error, message"
        return article_id, error_msg

def process_articles(df):
    results = {}
    max_workers = 10
    batch_size = 100
    cooldown_period = 90

    articles = df1['combined'].tolist()
    article_ids = df1['id'].tolist()
    article_id_pairs = list(zip(articles, ids))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        for i in range(0, len(article_id_pairs), batch_size):
            current_batch = article_id_pairs[i:i+batch_size]
            print(f"Starting batch processing for articles {i+1} to {min(i+batch_size, len(article_id_pairs))}")
            futures = {executor.submit(fetch_tags, pair): pair for pair in current_batch}

            processed_count = i
            for future in as_completed(futures):
                article_id, tags = future.result()
                results[article_id] = tags
                processed_count += 1
                print(f"Received tags for article {processed_count}")
                
            if processed_count >= len(article_id_pairs):
                return results
            
            print(f"All tasks in batch {i//batch_size + 1} completed, cooling down for {cooldown_period} seconds...")
            time.sleep(cooldown_period)
    return results

In [None]:
tags = process_articles(df1)

In [56]:
#Run this tomorrow:
llm = genai.GenerativeModel('gemini-1.0-pro')

template = '''
    Task Description: Given the following news article, identify and suggest 6 most relevant tags that best categorize the main events, 
    topics, entities, and geographical locations mentioned. 
    The tags should be concise, informative, and reflect the content accurately to facilitate effective searching and organization within a database.
    
    Combined Title and Summaries:
    {text}
    
    Formatting convention: List the tags to me in this example format:
    Singapore, Big family, climbing, Baby, crying, hungry
    
    Ensure that the tags generated follow the formatting convention very closely, and the response should not include any "-" or backslash n.
    Generated tags:
    
    Check again that the format follows the formatting convention stated above
        '''
            
prompt = PromptTemplate(
            input_variables=["text"],
            template=template)

all_tags = []
for i in trange(len(df)):
    article = df.combined[i]
    final_prompt  = prompt.format(text=article)
    article_tags = llm.generate_content(final_prompt, safety_settings={
                                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
                                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, 
                                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE
                                    })
    all_tags.append(str(article_tags.parts[0])[7:].strip().replace("\n","").replace("\"","" ).split(", "))
    if i ==1:
        break
    
all_tags


  0%|          | 1/2019 [00:05<2:55:21,  5.21s/it]


[['British Climber',
  'Swiss Alps',
  'Glacial Melt',
  'Missing Person',
  'DNA Identification',
  'Global Warming'],
 ,
 ['Generative AI',
  'AI Ethics',
  'Youth Engagement',
  'Risk Management',
  'AI Governance',
  'ChatGPT'],
 ]

In [57]:
article_tags

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({'candidates': [{'content': {'parts': [{'text': 'Generative AI, AI Ethics, Youth Engagement, Risk Management, AI Governance, ChatGPT'}], 'role': 'model'}, 'finish_reason': 1, 'index': 0, 'safety_ratings': [{'category': 9, 'probability': 1, 'blocked': False}, {'category': 8, 'probability': 1, 'blocked': False}, {'category': 7, 'probability': 1, 'blocked': False}, {'category': 10, 'probability': 1, 'blocked': False}], 'token_count': 0, 'grounding_attributions': []}]}),
)

In [60]:
def clean_tags(nested_tags):
    cleaned_nested_tags = []
    for tag_list in nested_tags:
        # Include condition to filter out empty or whitespace-only tags
        cleaned_tags = [tag.strip().replace('*', '').title() for tag in tag_list if tag.strip()]
        cleaned_nested_tags.append(cleaned_tags)
    return cleaned_nested_tags

cleaned_tags = clean_tags(all_tags)
cleaned_tags

[['Swiss Alps',
  'Missing Climber',
  'Glacial Melt',
  'Dna Identification',
  'Climate Change'],
 ['Artificial Intelligence',
  'Generative Ai',
  'Ethics',
  'Risk Management',
  'Youth Engagement'],
 ['Tennis', 'U.S. Open', 'Grand Slam', 'Novak Djokovic', 'Iga Swiatek'],
 ['Myanmar', 'United Nations', 'Military Coup', 'Asean', 'Rohingya Crisis'],
 ['Israel', 'Politics', 'Currency', 'Economy', 'Judicial Reform']]

In [None]:
# For the enew article that come sin, use the generate_tag function, (create one),\
    # then find similar articles that have like 2 or 3 of the same tags, means relevant