In [None]:
import sys

sys.path.append("..")

In [None]:
import os
import asyncio
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.AI.openai_embedder import OpenAIEmbedder
from toolkit.helpers.constants import CACHE_PATH

import toolkit.query_text_data.input_processor as input_processor
import toolkit.query_text_data.question_answerer as question_answerer
import toolkit.query_text_data.pattern_detector as pattern_detector
import toolkit.graph.graph_fusion_encoder_embedding as gfee
import toolkit.query_text_data.helper_functions as helper_functions

import newspaper  # poetry add newspaper3k lxml_html_clean
import nest_asyncio  # poetry add nest_asyncio

nest_asyncio.apply()

In [None]:
# Uncomment the following lines to access library code updates
# import importlib
# importlib.reload(input_processor)
# importlib.reload(pattern_detector)
# importlib.reload(helper_functions)
# importlib.reload(question_answerer)
# importlib.reload(gfee)

In [None]:
# Display the result of newspaper's popular_urls() helper as the cell output
# (presumably a list of well-known news-site URLs that could be used as a
# scrape target — confirm against the newspaper docs).
newspaper.popular_urls()

In [None]:
# Scrape settings read by the article-collection cell.
target_news = "https://www.cnn.com"  # site URL handed to newspaper.build
target_articles = 50  # collection stops once this many articles have been kept
target_chars = 1000  # articles whose text is shorter than this are discarded

In [None]:
# Crawl the target site and collect up to `target_articles` usable articles.
# An article is kept only if it has a publish date and at least
# `target_chars` characters of body text. Results are keyed by title, so a
# repeated title overwrites the earlier entry instead of adding a new one.
news = newspaper.build(target_news, language="en", memoize_articles=False)
file_to_text_json = {}
for article in news.articles:
    # Check the quota before fetching so no extra download is made once the
    # target is reached; `>=` also terminates cleanly for a target of 0.
    if len(file_to_text_json) >= target_articles:
        break

    try:
        article.download()
        article.parse()
    except newspaper.ArticleException as exc:
        # parse() raises when the download failed (timeouts, 4xx/5xx, ...).
        # One bad URL should not abort the whole scrape.
        print(f"Skipping {article.url}: {exc}")
        continue

    # Skip undated or too-short articles.
    if not article.publish_date or len(article.text) < target_chars:
        continue

    title = article.title
    file_to_text_json[title] = {
        "title": title,
        "timestamp": article.publish_date.isoformat(),
        "text": article.text,
    }
    print(f"Processed {len(file_to_text_json)} articles")

In [None]:
# Model settings for the OpenAI-backed components. The API key is read from
# the OPENAI_API_KEY environment variable; a KeyError is raised right here if
# that variable is not set.
ai_configuration = OpenAIConfiguration(
    dict(
        api_type="OpenAI",
        api_key=os.environ["OPENAI_API_KEY"],
        model="gpt-4o-2024-08-06",
    )
)

# Embedder constructed from the same configuration object.
text_embedder = OpenAIEmbedder(configuration=ai_configuration)

In [None]:
# Choose the time-window option (DAY) and turn the title -> article-JSON
# mapping into chunks. Later cells compare `window_period` against
# PeriodOption.NONE to decide whether the period-based steps run.
window_period = input_processor.PeriodOption.DAY
file_to_chunks = input_processor.process_json_texts(file_to_text_json, window_period)
# Bare last expression: the full mapping is rendered as the cell output.
file_to_chunks

In [None]:
# Run the chunk-processing step and unpack its twelve results positionally.
# The order of the names below must match the order returned by
# input_processor.process_chunks; every one of them is consumed by a later cell
# or by answer().
# NOTE(review): the meaning of each mapping is inferred from its name only
# (e.g. cid presumably = chunk id) — confirm against input_processor.
(
    cid_to_text,
    text_to_cid,
    period_concept_graphs,
    community_to_concepts,
    concept_to_community,
    concept_to_cids,
    cid_to_concepts,
    previous_cid,
    next_cid,
    period_to_cids,
    node_period_counts,
    edge_period_counts,
) = input_processor.process_chunks(file_to_chunks=file_to_chunks, max_cluster_size=25)
print(f"Processed chunks")

In [None]:
# Compute the per-period graph fusion encoder embedding, but only when the
# input was windowed; with PeriodOption.NONE both outputs are left as None.
if window_period != input_processor.PeriodOption.NONE:
    node_to_period_to_pos, node_to_period_to_shift = gfee.generate_graph_fusion_encoder_embedding(
        period_to_graph=period_concept_graphs,
        node_to_label=concept_to_community,
        correlation=True,
        diaga=True,
        laplacian=True,
    )
else:
    node_to_period_to_pos = None
    node_to_period_to_shift = None
# Status line is emitted in both branches.
print("Generated graph fusion encoder embedding")

In [None]:
# Converging-pair detection runs only for windowed input; otherwise the
# result is None.
cid_to_converging_pairs = (
    pattern_detector.detect_converging_pairs(
        period_to_cids,
        cid_to_concepts,
        node_to_period_to_pos,
    )
    if window_period != input_processor.PeriodOption.NONE
    else None
)
# Status line is emitted whether or not detection ran.
print("Detected converging pairs")

In [None]:
# Build per-chunk significance summaries from the converging pairs and the
# node/edge period counts. Skipped (None) when the input is not windowed.
cid_to_summary = (
    pattern_detector.explain_chunk_significance(
        period_to_cids,
        cid_to_converging_pairs,
        node_period_counts,
        edge_period_counts,
    )
    if window_period != input_processor.PeriodOption.NONE
    else None
)
# Status line is emitted whether or not the summaries were built.
print("Explained chunk significance")

In [None]:
# For windowed input, merge each chunk's text with its significance summary;
# otherwise reuse the raw chunk text mapping unchanged.
# (The helper's name is spelled "explantion" in the library.)
cid_to_explained_text = (
    pattern_detector.combine_chunk_text_and_explantion(cid_to_text, cid_to_summary)
    if window_period != input_processor.PeriodOption.NONE
    else cid_to_text
)
# Status line is emitted in both cases.
print("Combined chunk text and explanation")

In [None]:
# Embed the (possibly explanation-augmented) chunk texts with the OpenAI embedder.
# The top-level `await` relies on the notebook kernel's async support.
# NOTE(review): cid_to_vector is presumably a chunk-id -> embedding mapping — confirm
# against helper_functions.embed_texts.
cid_to_vector = await helper_functions.embed_texts(
    cid_to_text=cid_to_explained_text,
    text_embedder=text_embedder,
)
print(f"Embedded chunk text")

In [None]:
# The user question; answer() reads this module-level name when it runs.
question = "What events are discussed?"


async def answer():
    """Answer the notebook-level `question` over the processed chunks.

    Takes no arguments: every input (AI configuration, chunk texts, concept
    mappings, embeddings, the "ALL" concept graph, embedder and cache path)
    is read from notebook globals and forwarded to
    `question_answerer.answer_question`.

    Returns:
        A 4-tuple `(relevant_cids, partial_answers, chunk_progress,
        answer_progress)` unpacked from the awaited call.
    """
    # Search/answer tuning knobs, forwarded unchanged as keyword arguments.
    search_settings = dict(
        select_logit_bias=5,
        adjacent_search_steps=1,
        relevance_test_budget=10,
        community_relevance_tests=5,
        relevance_test_batch_size=5,
        irrelevant_community_restart=3,
        answer_batch_size=10,
    )
    cids, partials, chunk_log, answer_log = await question_answerer.answer_question(
        ai_configuration=ai_configuration,
        question=question,
        cid_to_text=cid_to_explained_text,
        cid_to_concepts=cid_to_concepts,
        concept_to_cids=concept_to_cids,
        cid_to_vector=cid_to_vector,
        concept_graph=period_concept_graphs["ALL"],
        community_to_concepts=community_to_concepts,
        concept_to_community=concept_to_community,
        previous_cid=previous_cid,
        next_cid=next_cid,
        embedder=text_embedder,
        embedding_cache=CACHE_PATH,
        **search_settings,
    )
    return cids, partials, chunk_log, answer_log


# Run the question-answering coroutine and keep its four results as notebook
# globals; the cells that follow print chunk_progress, answer_progress and
# partial_answers[0].
# NOTE(review): the top-level `await` below only works under an IPython/Jupyter
# kernel (autoawait); in a plain Python script it is a SyntaxError, so the
# __main__ guard does not make this cell runnable as a script.
if __name__ == "__main__":
    relevant_cids, partial_answers, chunk_progress, answer_progress = await answer()
    print(f"Answered question")

In [None]:
# Show the two progress values returned by answer() (chunk search, then answering).
print(chunk_progress)
print(answer_progress)

In [None]:
# Treat the first partial answer as the final answer and print it.
# NOTE(review): assumes partial_answers is a non-empty sequence; indexing [0]
# raises if no answer was produced — confirm against question_answerer.
final_answer = partial_answers[0]
print(final_answer)