### Metadata source

In [None]:
import sys
import os
import pandas as pd
import json
from tqdm import tqdm
from dotenv import load_dotenv
from pydantic import BaseModel
from langchain_core.rate_limiters import InMemoryRateLimiter

# Put the parent directory of the working directory on sys.path (once) so the
# `src.*` imports that follow can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from src.articles import create_static_metadata
from src.llm import get_llm_client
from src.prompts import get_metadata_prompt

# Stop immediately when load_dotenv() reports failure, so the missing
# configuration surfaces here rather than in a later cell.
env_loaded = load_dotenv()
if not env_loaded:
    raise Exception('Error loading .env file. Make sure to place a valid OPEN_AI_KEY in the .env file.')

In [None]:
# Client-side throttle: 0.5 requests per second, i.e. one request every two
# seconds (the value chosen here for the Gemini free tier).
rate_limiter = InMemoryRateLimiter(requests_per_second=0.5, check_every_n_seconds=0.1)

# Gemini's OpenAI-compatible endpoint.
GEMINI_OPENAI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"

# The limiter is handed to the client factory together with the generation
# settings (response length cap and a low sampling temperature).
llm_client = get_llm_client(
    base_url=GEMINI_OPENAI_BASE_URL,
    max_tokens=1024,
    temperature=0.2,
    rate_limiter=rate_limiter,
)

Set up the paths to the data sources

In [None]:
# Both locations live under the `data` directory one level above the notebook.
DATA_DIR = os.path.join("..", "data")

# Directory holding the cleaned article JSON files (input).
ARTICLES_CLEAN_DIR = os.path.join(DATA_DIR, "articles_clean")
# CSV file the extracted metadata is written to (output).
METADATA_PATH = os.path.join(DATA_DIR, "metadata.csv")

Extract metadata

In [None]:
# Structured-output schema for the tagging request: the model's reply is
# parsed into this class (see `with_structured_output(ArticleTags)` below).
# NOTE(review): documented with comments rather than a docstring on purpose —
# pydantic includes class docstrings in the generated JSON schema, so adding
# one could change what is sent to the model. Confirm before converting.
class ArticleTags(BaseModel):
    # Tags assigned to one article, as plain strings.
    tags: list[str]

In [None]:
# Build one metadata record per cleaned article -- the static fields from
# `create_static_metadata` plus LLM-generated tags -- and write them to CSV.

# os.listdir returns entries in arbitrary order, so sort for a reproducible
# processing order and CSV row order. Non-file entries (e.g. a stray
# `.ipynb_checkpoints` directory) are skipped because open() would fail on them.
articles = sorted(
    entry
    for entry in os.listdir(ARTICLES_CLEAN_DIR)
    if os.path.isfile(os.path.join(ARTICLES_CLEAN_DIR, entry))
)

# Neither the prompt template nor the structured-output wrapper depends on the
# article being processed, so build them once instead of on every iteration.
tags_prompt = get_metadata_prompt()
llm = llm_client.with_structured_output(ArticleTags)

metadata = []

for article_file in tqdm(articles):
    # Keep the file name and the parsed document in separate variables.
    article_path = os.path.join(ARTICLES_CLEAN_DIR, article_file)
    with open(article_path, "r", encoding="utf-8") as file:
        article = json.load(file)

    article_metadata = create_static_metadata(article, article_path)

    # Ask the model for tags based on the article's "text" field.
    query = tags_prompt.format(article_text=article["text"])
    response = llm.invoke([query])
    article_metadata["tags"] = response.tags

    metadata.append(article_metadata)

# One row per article; the index is omitted from the CSV.
df_metadata = pd.DataFrame(metadata)
df_metadata.to_csv(METADATA_PATH, index=False)

In [None]:
# Descriptive statistics of the metadata table; kept as the cell's final
# expression so the notebook renders the result.
df_metadata.describe()

In [None]:
# Tag frequencies across all articles: expand each article's tag list into one
# entry per tag, then count how often each tag occurs.
df_metadata["tags"].explode().value_counts()