In [None]:
import sys
sys.path.append('..')

In [None]:
import os
import asyncio
from toolkit.AI.openai_configuration import OpenAIConfiguration
from toolkit.AI.openai_embedder import OpenAIEmbedder
from toolkit.helpers.constants import CACHE_PATH

import toolkit.question_answering.input_processor as input_processor
import toolkit.question_answering.question_answerer as question_answerer
import toolkit.AI.client
import toolkit.question_answering.graph_builder
import toolkit.question_answering.answer_builder

import newspaper # pip install newspaper3k lxml_html_clean
import nest_asyncio # pip install nest_asyncio
nest_asyncio.apply()

In [None]:
# Scraping parameters read by the article-collection loop.
target_news = 'https://www.bbc.com/news/articles'  # source URL handed to newspaper.build
target_articles = 50  # collection stops once this many articles have been kept
target_chars = 1000  # articles whose text has fewer characters than this are not kept

In [None]:
# Collect up to `target_articles` articles from `target_news`, keeping only
# those whose extracted text is at least `target_chars` characters long.
# Each kept article is stored as a {'title': ..., 'text': ...} record.
news = newspaper.build(target_news, language='en', memoize_articles=False)
article_text_jsons = []
for article in news.articles:
    # Check the limit before fetching so no extra download happens once the
    # target is reached; `>=` also covers a target of zero.
    if len(article_text_jsons) >= target_articles:
        break
    try:
        article.download()
        article.parse()
    except newspaper.ArticleException as error:
        # A single unreachable or unparsable article should not abort the
        # whole crawl; report it and move on to the next one.
        print(f'Skipped {article.url}: {error}')
        continue
    title = article.title
    text = article.text
    if len(text) >= target_chars:
        article_text_json = {'title': title, 'text': text}
        article_text_jsons.append(article_text_json)
        print(f'Processed {len(article_text_jsons)} articles')

In [None]:
# OpenAI settings for the run; the API key is read from the OPENAI_API_KEY
# environment variable (a KeyError is raised here if it is not set).
ai_configuration = OpenAIConfiguration(
    dict(
        api_type="OpenAI",
        api_key=os.environ['OPENAI_API_KEY'],
        model="gpt-4o-2024-08-06",
    )
)

# Embedder constructed from the same configuration object.
text_embedder = OpenAIEmbedder(configuration=ai_configuration)

In [None]:
# Turn the scraped article records into text chunks.
# NOTE(review): the name suggests a mapping from article title to its chunks;
# confirm against input_processor.process_json_texts.
title_to_chunks = input_processor.process_json_texts(article_text_jsons)
# Bare expression: the notebook renders the value as this cell's output.
title_to_chunks

In [None]:
# Run chunk processing over the chunked articles and unpack its eight results.
# Every one of them is passed on to question_answerer.answer_question later.
(text_to_vectors, concept_graph,
 community_to_concepts, concept_to_community,
 concept_to_chunks, chunk_to_concepts,
 previous_chunk, next_chunk) = input_processor.process_chunks(
    text_to_chunks=title_to_chunks,
    embedder=text_embedder,
    embedding_cache=CACHE_PATH,
    max_cluster_size=25,
)
print(f'Processed chunks')

In [None]:
question = "What events are discussed?"


async def answer():
    """Answer `question` over the processed article chunks.

    Forwards the notebook-level processing results and search settings to
    question_answerer.answer_question and hands back its four results:
    relevant chunks, partial answers, chunk progress and answer progress.

    NOTE(review): nothing is awaited inside this coroutine; presumably the
    async wrapper only exists so the call can be driven through asyncio.run.
    Confirm whether answer_question is synchronous.
    """
    return question_answerer.answer_question(
        ai_configuration=ai_configuration,
        question=question,
        # Outputs of the chunk-processing step
        text_to_chunks=title_to_chunks,
        chunk_to_concepts=chunk_to_concepts,
        concept_to_chunks=concept_to_chunks,
        text_to_vectors=text_to_vectors,
        concept_graph=concept_graph,
        community_to_concepts=community_to_concepts,
        concept_to_community=concept_to_community,
        previous_chunk=previous_chunk,
        next_chunk=next_chunk,
        # Embedding setup
        embedder=text_embedder,
        embedding_cache=CACHE_PATH,
        # Search and answering settings
        select_logit_bias=5,
        semantic_search_depth=5,
        structural_search_steps=1,
        community_search_breadth=5,
        relevance_test_limit=20,
        relevance_test_batch_size=5,
        answer_batch_size=5,
        augment_top_concepts=10,
    )


# Drive the coroutine to completion and unpack its four results.
(relevant_chunks,
 partial_answers,
 chunk_progress,
 answer_progress) = asyncio.run(answer())
print(f'Answered question')


In [None]:
# Show both progress reports, one per line.
print(chunk_progress, answer_progress, sep='\n')

In [None]:
# Take the first partial answer as the final answer and print it.
# NOTE(review): raises IndexError if no partial answers were produced, and
# assumes the first entry is the one to report — confirm against
# question_answerer.answer_question.
final_answer = partial_answers[0]
print(final_answer)