In [None]:
import os
import sys
# Make the parent directory importable; presumably that is where the
# project's `python` package lives — confirm against the repo layout.
sys.path.append('..')
import importlib
import asyncio
import python.question_answering.process_inputs as process_inputs
import python.question_answering.generate_answer as generate_answer
import python.AI.embedder
import python.AI.client
from python.AI.embedder import Embedder
from python.AI.openai_configuration import OpenAIConfiguration
from python.helpers.constants import CACHE_PATH
# Re-import the project modules so that edits made to them on disk are picked
# up when this cell is re-run without restarting the kernel.
# NOTE(review): `Embedder` above was bound before python.AI.embedder is
# reloaded, so it may still refer to the pre-reload class — confirm.
importlib.reload(process_inputs)
importlib.reload(generate_answer)
importlib.reload(python.AI.embedder)
importlib.reload(python.AI.client)
import newspaper # pip install newspaper3k, pip install lxml_html_clean
import nest_asyncio # pip install nest_asyncio
# Patch asyncio so an event loop can be started while the notebook's own loop
# is already running (needed for the asyncio.run(...) call further down).
nest_asyncio.apply()

In [None]:
# Crawl settings used by the scraping cell.
target_news = "https://www.bbc.com/news/articles"  # site handed to newspaper.build
target_articles = 50  # stop once this many articles have been kept
target_chars = 1_000  # minimum text length (in characters) for an article to be kept

In [None]:
# Crawl the target site and collect article texts keyed by title, keeping only
# articles with at least `target_chars` characters, until `target_articles`
# have been gathered. Articles that share a title overwrite one another.
news = newspaper.build(target_news, language='en', memoize_articles=False)
article_texts = {}
for article in news.articles:
    # Check the quota before fetching so no extra page is downloaded once it
    # is met; `>=` also covers a target of zero.
    if len(article_texts) >= target_articles:
        break
    try:
        article.download()
        article.parse()
    except newspaper.ArticleException as error:
        # parse() raises when the download failed (e.g. dead link); one bad
        # page should not abort the whole crawl.
        print(f'Skipping {article.url}: {error}')
        continue
    title = article.title
    text = article.text
    if len(text) >= target_chars:
        article_texts[title] = text
        print(f'Processed {len(article_texts)} articles')

In [None]:
# Model settings for the OpenAI client. The API key comes from the
# OPENAI_API_KEY environment variable (KeyError if it is not set) so that no
# secret is stored in the notebook.
ai_configuration = OpenAIConfiguration(
    {
        "api_type": "OpenAI",
        "api_key": os.environ["OPENAI_API_KEY"],
        "model": "gpt-4o-2024-08-06",
    }
)

# Embedder sharing the same configuration, pointed at the project's cache path.
# NOTE(review): local=False presumably selects API-based embeddings — confirm.
text_embedder = Embedder(configuration=ai_configuration, pickle_path=CACHE_PATH, local=False)

In [None]:
# Hand the {title: text} mapping of scraped articles to the project helper.
# Presumably the result maps each title to its list of text chunks — confirm
# against process_inputs.process_texts.
title_to_chunks = process_inputs.process_texts(article_texts)

In [None]:
# Embed the chunks and derive the lookup structures that the answering step
# consumes. The helper returns eight values, unpacked here in order.
(
    text_to_vectors, concept_graph,
    community_to_concepts, concept_to_community,
    concept_to_chunks, chunk_to_concepts,
    previous_chunk, next_chunk,
) = process_inputs.process_chunks(
    text_to_chunks=title_to_chunks,
    embedder=text_embedder,
    embedding_cache=CACHE_PATH,
    max_cluster_size=25,
)
print('Processed chunks')

In [None]:
# Question put to the answering pipeline; answer() reads it from the notebook
# namespace, so edit it here and re-run this cell to ask something else.
question = "What events are discussed?"

async def answer():
    """Run the question-answering pipeline for the notebook-level `question`.

    Takes no arguments: the configuration, embedder, cache path and the
    chunk/concept structures are all read from the notebook namespace and
    forwarded to `generate_answer.answer_question`.

    Returns:
        A 4-tuple of the values unpacked from `answer_question`'s result, in
        order: relevant chunks, partial answers, chunk progress, answer
        progress.

    NOTE(review): `answer_question` is called without `await`, so this
    coroutine assumes it is a regular blocking function — confirm.
    """
    # Search-depth and batching knobs, grouped so they are easy to tune.
    tuning = {
        "select_logit_bias": 5,
        "semantic_search_depth": 5,
        "structural_search_steps": 1,
        "relational_search_depth": 5,
        "relevance_test_limit": 20,
        "relevance_test_batch_size": 5,
        "answer_batch_size": 5,
        "augment_top_concepts": 10,
    }
    chunks, partials, chunk_log, answer_log = generate_answer.answer_question(
        ai_configuration=ai_configuration,
        question=question,
        text_to_chunks=title_to_chunks,
        chunk_to_concepts=chunk_to_concepts,
        concept_to_chunks=concept_to_chunks,
        text_to_vectors=text_to_vectors,
        community_to_concepts=community_to_concepts,
        concept_to_community=concept_to_community,
        previous_chunk=previous_chunk,
        next_chunk=next_chunk,
        embedder=text_embedder,
        embedding_cache=CACHE_PATH,
        **tuning,
    )
    return chunks, partials, chunk_log, answer_log

# Drive the coroutine to completion and unpack its four results into the
# notebook namespace. asyncio.run() normally refuses to start inside an
# already-running event loop such as Jupyter's; this notebook calls
# nest_asyncio.apply() in its first cell to allow it.
relevant_chunks, partial_answers, chunk_progress, answer_progress = asyncio.run(answer())


In [None]:
# Show the first entry of partial_answers as the answer.
# NOTE(review): raises IndexError if partial_answers is empty; presumably the
# first entry holds the complete answer — confirm against answer_question.
final_answer = partial_answers[0]
print(final_answer)

