In [1]:
# Free-text newsletter descriptions used as the seed inputs of this notebook.
# Each entry gets its own training-data file (see the cells below), and its text
# is embedded and matched against news items to build relevancy prompts.
newsletter_descriptions = [
    "I want a newsletter about current Hollywood celebrity news and gossip.",
    "I want a newsletter about the space sector. I am not interested in scientific or technological advancements. I am mostly interested in space-sector financial and political news, like new laws, new startup initiatives, VC funding, etc.",
    "I am interested in a newsletter on Electric Vehicle news. I am mostly interested in the scientific discoveries that enable improvements in the vehicles.",
    "I want a newsletter focused on emerging trends in renewable energy, specifically solar and wind power.",
    "I'm looking for a newsletter about advancements in AI and machine learning, particularly in healthcare applications.",
    "I need a newsletter that covers the latest in cybersecurity threats and protection strategies for small businesses.",
    "I'm interested in a newsletter that provides updates on major infrastructure projects worldwide, like bridges, tunnels, and highways.",
    "I want a newsletter about the latest developments in virtual reality technology, especially in gaming and education.",
    "I'm looking for a newsletter that discusses new trends and techniques in digital marketing, with a focus on social media platforms.",
    "I need a newsletter that tracks the progress and challenges in the quest for Mars colonization.",
    "I'm interested in a newsletter about the impact of climate change on coastal cities and the measures being taken to mitigate it.",
    "I want a newsletter that covers breakthroughs in medical research, particularly in cancer treatment.",
    "I'm looking for a newsletter about the evolution of smart home technology and IoT devices.",
    "I need a newsletter focused on the global impact of blockchain technology beyond cryptocurrencies.",
    "I'm interested in a newsletter that provides insights into the future of transportation, including hyperloop and drone delivery systems.",
    "I want a newsletter about the changing landscape of the film industry, with a focus on independent filmmaking and streaming platforms.",
    "I'm looking for a newsletter that delves into sustainable fashion, covering eco-friendly materials and ethical manufacturing practices.",
    "I need a newsletter about the latest in agricultural technology and its role in addressing food security.",
    "I'm interested in a newsletter that covers the intersection of technology and education, focusing on e-learning platforms and digital classrooms.",
    "I want a newsletter about the developments in 3D printing technology, particularly in healthcare and construction."
]

In [2]:
import hashlib

from app.path_utils import root_path

# Folder that receives one relevancy-training text file per newsletter description.
# NOTE(review): assumes root_path() returns a pathlib.Path-like object (it is
# joined with "/" here and .exists()/.mkdir() are called on the result) — confirm.
data_folder_path = root_path() / ".." / ".." / "training-data" / "relevancy"

def write_to_newsletter_description_file(
    description: str, new_text: str, only_if_not_exist: bool = False
) -> None:
    """Append ``new_text`` to the training-data file that belongs to ``description``.

    The file lives in ``data_folder_path`` and is named after the SHA-256 hex
    digest of the UTF-8 encoded description. A content digest is used instead
    of the built-in ``hash()`` because ``hash()`` of a ``str`` is salted per
    interpreter process: its value changes on every kernel restart, so the same
    description would map to a different file each session and
    ``only_if_not_exist`` could never find a file written earlier.

    Args:
        description: Newsletter description that identifies the target file.
        new_text: Text appended to the file (the file is created if missing).
        only_if_not_exist: When true, write only if the file does not exist
            yet; an existing file is left untouched.
    """
    # parents/exist_ok: create missing intermediate folders and tolerate the
    # folder already being there, without a separate exists() check.
    data_folder_path.mkdir(parents=True, exist_ok=True)

    description_digest = hashlib.sha256(description.encode("utf-8")).hexdigest()
    description_file_path = data_folder_path / f"{description_digest}.txt"

    if only_if_not_exist and description_file_path.exists():
        return

    # Explicit encoding so the output does not depend on the platform default.
    with open(description_file_path, "a", encoding="utf-8") as f:
        f.write(new_text)
    

In [3]:
# Seed every description's file with a "DESCRIPTION: ..." header line, skipping
# any description whose file already exists.
for description in newsletter_descriptions:
    write_to_newsletter_description_file(
        description, new_text=f"DESCRIPTION: {description}", only_if_not_exist=True
    )


In [4]:
import random

from app.core.newsletter_creator.newsletter_creator import (
    get_news_items
)
from app.core.newsletter_creator.utils import (
    get_article_selection_text,
    is_valid_candidate_item,
)
from app.schemas.newsletter_issue import NewsletterIssueCreate
from app.schemas.issue_metrics import IssueMetricsCreate
from app.cache.redis.aio_redis_cache import AioRedisCache
from app.cache.redis.redis_utils import redis_config
from app.core.api_provider import APIProvider
from app.core.data_processors.openai.openai import OpenAI
from app.core.data_processors.openai.openai_utils import (
    OpenAIModels,
    openai_config, call_openai_api_with_rate_limit_protection,
)
from app.core.newsletter_creator.newsletter_creator_utils import (
    newsletter_creator_config,
)
from app.core.selection_algos.representative_items_algo import (
    generate_most_representative_items,
)


async def generate_llm_training_prompts(description: str):
    """Append relevancy-prompt training stubs for one newsletter description.

    Steps:
      1. Load news items through ``get_news_items`` and embed ``description``
         with the ``TEXT_EMBEDDING_ADA_002`` model.
      2. Iterate the items yielded by ``generate_most_representative_items``
         and keep those accepted by ``is_valid_candidate_item`` until
         ``newsletter_creator_config.max_processed_articles_per_newsletter``
         items have been collected.
      3. For every kept item, format
         ``newsletter_creator_config.article_relevancy_prompt`` with the
         description and the item's article-selection text.
      4. Append all PROMPT/RESPONSE sections to the description's file via
         ``write_to_newsletter_description_file``.

    The RESPONSE section is the literal placeholder ``yes/no``; presumably it
    is replaced by a hand-assigned label afterwards.

    Args:
        description: Free-text newsletter description to generate prompts for.
    """
    api_provider_ = APIProvider()
    openai = OpenAI(api_provider=api_provider_, config=openai_config)
    cache = AioRedisCache(config=redis_config)

    # NOTE(review): the cache is initialized but never released in this
    # function — confirm whether AioRedisCache needs an explicit close.
    await cache.initialize()

    # newsletter_issue_id=-1 is passed throughout: no issue id is available here.
    news_items = await get_news_items(
        cache=cache, newsletter_issue_id=-1
    )

    newsletter_description_embedding = await call_openai_api_with_rate_limit_protection(
        newsletter_issue_id=-1,
        async_func=openai.get_embedding,
        model=OpenAIModels.TEXT_EMBEDDING_ADA_002,
        text=description,
    )

    representative_items_generator = generate_most_representative_items(
        target_vector=newsletter_description_embedding.vector,
        cache_items=news_items,
    )

    max_candidates = newsletter_creator_config.max_processed_articles_per_newsletter
    candidate_titles = []
    candidates = []

    async for item in representative_items_generator:
        if not is_valid_candidate_item(
            newsletter_issue_id=-1,
            item=item,
            candidate_titles=candidate_titles,
        ):
            continue

        candidates.append(item)
        candidate_titles.append(item.article.title)
        if len(candidate_titles) == max_candidates:
            break

    # Collect the sections in a list and join once instead of repeatedly
    # concatenating onto a growing string.
    prompt_sections = []

    for candidate in candidates:
        # A fresh dummy issue is built for each call, as the helper requires
        # an ``in_issue`` argument and no real issue exists in this notebook.
        article_selection_text = await get_article_selection_text(
            openai=openai,
            cache=cache,
            newsletter_issue_id=-1,
            item=candidate,
            in_issue=NewsletterIssueCreate(
                issue_id=0,
                subscription_id=0,
                timestamp=0,
                metrics=IssueMetricsCreate(
                    metrics_id="",
                    newsletter_generation_config_id=-1,
                    time_to_generate=-1,
                ),
            ),
        )

        relevancy_prompt = newsletter_creator_config.article_relevancy_prompt.format(
            newsletter_description=description,
            current_article_summary=article_selection_text,
        )

        prompt_sections.append(
            f"\n\nPROMPT:"
            f"\n\n{relevancy_prompt}"
            f"\n\nRESPONSE:"
            f"\n\nyes/no"
        )

    write_to_newsletter_description_file(
        description=description, new_text="".join(prompt_sections)
    )

2024-01-27 10:45:57,843 - passlib.utils.compat - DEBUG - loaded lazy attr 'SafeConfigParser': <class 'configparser.ConfigParser'>
2024-01-27 10:45:57,843 - passlib.utils.compat - DEBUG - loaded lazy attr 'NativeStringIO': <class '_io.StringIO'>
2024-01-27 10:45:57,844 - passlib.utils.compat - DEBUG - loaded lazy attr 'BytesIO': <class '_io.BytesIO'>
2024-01-27 10:45:57,851 - passlib.registry - DEBUG - registered 'bcrypt' handler: <class 'passlib.handlers.bcrypt.bcrypt'>


In [5]:
import asyncio

for description in newsletter_descriptions:
    await generate_llm_training_prompts(description=description)

2024-01-27 10:46:42,469 - app.core.newsletter_creator.logging_utils - DEBUG - ni--1: Fetching new items since 1706240802.469415
2024-01-27 10:46:42,728 - app.cache.redis.aio_redis_cache - DEBUG - Found 59466 keys in the cache
2024-01-27 10:46:56,068 - app.core.newsletter_creator.logging_utils - DEBUG - ni--1: Retrieved 12643 new items since 1706240802.469415
2024-01-27 10:46:59,176 - app.core.newsletter_creator.logging_utils - INFO - ni--1: Article too short:

Super League news: Jonny Lomax confirmed as new St Helens captain following James Roby retirement
Stay up to date with the latest rugby league news and gossip from Super League and beyond. - www.skysports.com
2024-01-27 10:46:59,177 - app.core.newsletter_creator.logging_utils - INFO - ni--1: Article too short:

Samsung Galaxy S24 Series Breaks Pre-order Record For the Galaxy S Lineup
Samsung unrolled the curtains of the Galaxy S24 series just a week ago. After the official unveiling, the company started to take pre-orders, which 