In [210]:
from pymongo import MongoClient
import openai
import json
from bson import ObjectId

In [211]:
# Connection URI
uri = "mongodb://egemenNewcheAdmin:passNewche@localhost:27017/newcheDB"
# Connect to the MongoDB client
client = MongoClient(uri)
# Select the database
db = client['newcheDB']
# Select the collection
collection = db['unprocessedNews']

In [212]:
def gather_news_id_and_summary():
    # Query to find documents where tags array is empty or tags field does not exist
    query = {
        "$or": [
            {"tags": {"$exists": False}},  # Tags field does not exist
            {"tags": {"$size": 0}}         # Tags array is empty
        ]
    }
    # Projection to specify only to return the _id and summary fields
    projection = {
        "_id": 1,
        "summary": 1
    }
    # Find documents based on the query and projection
    news_items = collection.find(query, projection)
    
    # Convert the cursor to a list (optional, depending on your use case)
    news_list = list(news_items)
    
    return news_list

In [213]:
# Call the function and store the result
news_without_tags = gather_news_id_and_summary()

In [214]:
# Depending on your application, you can print, return, or process the news_list further.
# For example, to print each news item:
for news in news_without_tags:
    print(news)

{'_id': ObjectId('65bc1c6a2b814ceb11b00c2c'), 'summary': 'Measles Cases Soaring Worldwide as WHO Reports Alarming 45-Fold Rise in Europe Health 01 February 2024 By Jaya Dantas, The Conversation. In recent weeks a series of measles alerts have been issued around Australia.'}
{'_id': ObjectId('65bc1c6a2b814ceb11b00c2d'), 'summary': 'Financial Stress Could Impact Your Health More Than Grief, Study Finds. Financial stress, bereavement, and longstanding illness showed the greatest long-term changes in immune and neuroendocrine biomarkers. This indicates an ongoing physical effect of chronic stress.'}
{'_id': ObjectId('65bc1c6a2b814ceb11b00c2e'), 'summary': 'We Finally Know How Ancient Roman Concrete Was Able to Last Thousands of years. Researchers studied 2,000-year-old samples of Roman concrete from Privernum in Italy.'}
{'_id': ObjectId('65bc1c6a2b814ceb11b00c2f'), 'summary': "Biogen's Controversial Alzheimer's Drug Withdrawn From Market. Biogen's Leqembi, which it co-manufactures with Ei

In [215]:
def save_first_3_news_as_string(news_list):
    # Initialize an empty string to store the formatted news
    news_string = ""
    # Loop through the first 3 news items in the list
    for news in news_list[:2]:
        # Format each news item and append it to the news_string
        news_string += f"ID: {news['_id']} \nSUMMARY: {news.get('summary')}\n\n"
    return news_string


In [216]:
news_string_result = save_first_3_news_as_string(news_without_tags)

In [217]:
news_string_result

'ID: 65bc1c6a2b814ceb11b00c2c \nSUMMARY: Measles Cases Soaring Worldwide as WHO Reports Alarming 45-Fold Rise in Europe Health 01 February 2024 By Jaya Dantas, The Conversation. In recent weeks a series of measles alerts have been issued around Australia.\n\nID: 65bc1c6a2b814ceb11b00c2d \nSUMMARY: Financial Stress Could Impact Your Health More Than Grief, Study Finds. Financial stress, bereavement, and longstanding illness showed the greatest long-term changes in immune and neuroendocrine biomarkers. This indicates an ongoing physical effect of chronic stress.\n\n'

In [218]:
from openai import OpenAI
client = OpenAI()

def generate_tags_for_news_batch_json(news_string_result):
    prompt_base = """Generate multi-level clustering tags for the following news summaries in JSON format. Each level should allow for multiple tags, progressively narrowing down the focus from the broadest category to the most specific details. Format the output as a JSON object with "id" as the news ID and "tags" as a dictionary containing arrays of tags for each level.

Tagging structure should be as follows:
- Level 1: The broadest category covering the general subject of the news, such as broad sectors including but not limited to sports, health, technology.
- Level 2: Sub-categories within the broad sector, focusing on more specific domains or types within the general subject.
- Level 3: Even more specific themes, types, or areas within the sub-category, detailing particular aspects or fields.
- Level 4: Detailed elements, focusing on very specific outcomes, implications, or aspects within the themes or areas identified in Level 3.
- Level 5: The most detailed tags, pinpointing precise topics, events, technologies, or outcomes mentioned in the news summary.

Analyze each news summary to identify appropriate tags for each level, ensuring that the tags are relevant and specific to the content of the news. Avoid using the provided examples; instead, derive tags directly from the summary.

Please provide tags for each level without using the provided examples, ensuring that the tags reflect the content of the news summary accurately. and do not write result in "```json\n```", do not specify the format you give me just give the result in response. and always give in the json format
"""

    prompt = prompt_base + news_string_result + "\n\nProvide the tags for each news item formatted as specified."

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": prompt},
            {"role": "user", "content": "Please generate the tags for each news item formatted as specified above."}
        ]
    )

    # Assuming the last message in the response is what we're interested in
    if response.choices and response.choices[0].message:
        gpt_result = response.choices[0].message.content
    else:
        gpt_result = "No response generated."

    return gpt_result

In [219]:
from openai import OpenAI
client = OpenAI()

# Call the function and store the output in `gpt_result`
gpt_result = generate_tags_for_news_batch_json(news_string_result)

# Assuming you want to print or further process the `gpt_result`
print(gpt_result)

{
    "id": "65bc1c6a2b814ceb11b00c2c",
    "tags": {
        "Level 1": ["Health"],
        "Level 2": ["Disease Outbreak"],
        "Level 3": ["Measles"],
        "Level 4": ["Global Impact"],
        "Level 5": ["World Health Organization"]
    }
} 

{
    "id": "65bc1c6a2b814ceb11b00c2d",
    "tags": {
        "Level 1": ["Health"],
        "Level 2": ["Impact on Immune System"],
        "Level 3": ["Chronic Stress Effects"],
        "Level 4": ["Immune and Neuroendocrine Biomarkers"],
        "Level 5": ["Financial Stress"]
    }
}


In [220]:
type(gpt_result)

str

In [221]:
json_strings = gpt_result.strip().split('\n\n')

tags_data = [json.loads(json_str) for json_str in json_strings]



In [222]:
tags_data

[{'id': '65bc1c6a2b814ceb11b00c2c',
  'tags': {'Level 1': ['Health'],
   'Level 2': ['Disease Outbreak'],
   'Level 3': ['Measles'],
   'Level 4': ['Global Impact'],
   'Level 5': ['World Health Organization']}},
 {'id': '65bc1c6a2b814ceb11b00c2d',
  'tags': {'Level 1': ['Health'],
   'Level 2': ['Impact on Immune System'],
   'Level 3': ['Chronic Stress Effects'],
   'Level 4': ['Immune and Neuroendocrine Biomarkers'],
   'Level 5': ['Financial Stress']}}]

In [223]:
def convert_to_objectid(doc_id):
    try:
        # Validate if doc_id is already an ObjectId
        if isinstance(doc_id, ObjectId):
            return doc_id
        
        # Check if doc_id is in ObjectId format
        if len(doc_id) == 24 and all(c in "0123456789abcdef" for c in doc_id):
            mongo_id = ObjectId(doc_id)
            return mongo_id
        else:
            raise ValueError("Invalid ObjectId format")
    except Exception as e:
        # If conversion fails, handle the exception (e.g., print an error message)
        print(f"Error converting '{doc_id}' to ObjectId: {e}")
        return None  # Return None or any other value to indicate failure


In [224]:
def process_tags_data(tags_data, news_list):
    for tag_item in tags_data:
        tag_id = tag_item['id']
        # Find the corresponding news item by ID
        corresponding_news_item = next((item for item in news_list if str(item['_id']) == tag_id), None)
        if not corresponding_news_item:
            print(f"No corresponding news item found for ID: {tag_id}")
            continue
        doc_summary = corresponding_news_item['summary']
        
        # Flatten the tags into a single list
        tags_array = [tag for level_tags in tag_item['tags'].values() for tag in level_tags]
        
        # Update the document in MongoDB by matching the summary
        result = collection.update_one(
            {"summary": doc_summary},  # Use the summary to match the document
            {"$set": {"tags": tags_array}}
        )
        
        if result.matched_count == 0:
            print(f"No document found with the summary: '{doc_summary}'")
        else:
            print(f"Document with summary: '{doc_summary}' updated with tags.")


In [225]:
process_tags_data(tags_data, news_without_tags)
# Count the number of items in the list
news_count = len(news_without_tags)

# Print the count
print("Number of news summaries without tags:", news_count)

Document with summary: 'Measles Cases Soaring Worldwide as WHO Reports Alarming 45-Fold Rise in Europe Health 01 February 2024 By Jaya Dantas, The Conversation. In recent weeks a series of measles alerts have been issued around Australia.' updated with tags.
Document with summary: 'Financial Stress Could Impact Your Health More Than Grief, Study Finds. Financial stress, bereavement, and longstanding illness showed the greatest long-term changes in immune and neuroendocrine biomarkers. This indicates an ongoing physical effect of chronic stress.' updated with tags.
Number of news summaries without tags: 1001
