# Task Overview

## Summarization
This consists of a query whereby the agent (Validator) requests a summary of a topic from the miners. Always uses API.

The context comes from Wikipedia. It is used to create the reference and may or may not also be sent to the miners.

1. Select a Wikipedia article at random and use it to define TOPIC (e.g. Pearl Harbour) and CONTEXT (article content)
2. Extract TAGS (e.g. history, WW2, Japan, USA) associated with the article
3. Generate SYSTEM PROMPT such as 'You are a student who wants a summary of the main events of TOPIC (TAGS) in a XYZ tone'.
4. Generate QUERY using MODEL and SYSTEM PROMPT
5. Generate K REFERENCES using MODEL with & without CONTEXT (helps us understand the efficacy of tool use in miners)
6. Repeat step 5 using GPT and other models (e.g. mixtral, solar)

----
system_prompt = 'You are a student who wants a summary of Pradeep Kumar Dubey (politics) in an interested tone.'

The system prompt is given to our agent (LLM), and the agent generates a query, for example:

query = 'Give me an overview of the politician Pradeep Kumar Dubey'
query = 'Provide me with a summary of Pradeep Kumar Dubey'
query = 'I want to know about Pradeep Kumar Dubey, can you give me a summary?'

Query is then sent to the miners.



## Question Answering
This consists of a query whereby the agent (Validator) requests an answer to a question from the miners. Always uses API.

## Debugging
This can consist of either:
- Non API: Reference answer (code snippet) provided by the agent, followed by a corruption step to create the challenge. Only a single reference answer exists
- API: Stack Overflow is used to find a random thread containing a question and one or more accepted/upvoted answers. In this case the reference answers are weighted by upvotes and the challenge is the user question. Multiple reference answers exist.


In [None]:
!pip install beautifulsoup4

In [None]:
import os
import openai
openai.api_key = api_key = os.environ["OPENAI_API_KEY"]


import bittensor as bt

import pandas as pd
       
from utils import load_llm
from prompting.agent import Agent
from prompting.tasks import DebuggingTask, QuestionAnsweringTask, SummarizationTask


# Prompt template for an LLM judge that rates whether a generated query obeys
# its system prompt and reads like a human-written message (answer 0 or 1).
# Placeholders: {system_prompt} and {query}.
# NOTE(review): the closing "{ }" is meant literally, so `str.format` would
# raise on this template as written — fill the placeholders with `str.replace`
# or escape the braces; confirm against however the judge call is implemented.
gpt_judge_prompt = """I'm using a roleplaying AI assistant to imitate human queries. Your task is to assess whether the following query follows the instruction correctly.  If the assistant-generated query contains system messages such as 'sure i can help' or similar, this is a bad result because humans would not talk to an AI assistant in that way.

system_prompt = {system_prompt}

query = {query}

Does the above query follow the system prompt and strongly resemble a human message? 

Simply answer 0 or 1, and your result must be enclosed in { } tags"""

In [None]:
model = 'gpt-4'
llm = load_llm(model, api_key=api_key)


In [None]:
llm

In [None]:
import requests


def get_random_wikipedia_article(min_length=1000, min_backlinks=1):
    """Return the first random Wikipedia article that passes the filters.

    Requests batches of random main-namespace pages from the MediaWiki API
    and returns the first page that has a non-empty extract and whose
    reported length and number of returned backlinks reach the minimums.

    Args:
        min_length: Minimum value of the page's reported ``length``.
        min_backlinks: Minimum number of ``linkshere`` entries returned for
            the page.

    Returns:
        dict with keys ``title``, ``url``, ``length``, ``extract``,
        ``backlinks`` (a count) and ``categories`` (category titles with the
        ``Category:`` prefix removed).

    Raises:
        Exception: if no page qualifies after ``max_tries`` requests.
    """
    # Wikipedia API endpoint for a random article
    url = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'info|linkshere|categories|categoryinfo|extracts',
        'generator': 'random',
        'grnnamespace': 0,  # Namespace 0 indicates articles
        'grnlimit': 10,     # Number of random articles to fetch
        'inprop': 'url|displaytitle|length',  # Requesting URL, title, and length of the page
        'lhprop': 'pageid',  # Properties for links here (backlinks)
        'lhlimit': 'max',    # Maximum number of backlinks to retrieve
        'exlimit': 'max',    # Get extracts for each page
        'cllimit': 'max'     # Get all categories for each page
    }

    category_prefix = 'Category:'
    max_tries = 10
    for _ in range(max_tries):
        response = requests.get(url, params=params)
        data = response.json()
        if not data.get('query'):
            # Nothing usable in this response; spend another try.
            continue

        for page_info in data['query']['pages'].values():
            length = page_info.get('length', 0)
            backlinks = len(page_info.get('linkshere', []))
            extract = page_info.get('extract')

            categories = []
            for cat in page_info.get('categories', []):
                cat_title = cat.get('title', '')
                # Remove the prefix explicitly: str.strip('Category:') treats
                # its argument as a character set and would also eat leading
                # and trailing letters of the category name itself.
                if cat_title.startswith(category_prefix):
                    cat_title = cat_title[len(category_prefix):]
                categories.append(cat_title)

            if length >= min_length and backlinks >= min_backlinks and extract:
                return {
                    'title': page_info['title'],
                    'url': page_info['fullurl'],
                    'length': length,
                    'extract': extract,
                    'backlinks': backlinks,
                    'categories': categories
                }
    raise Exception(f"Could not find an article with length >= {min_length} and backlinks >= {min_backlinks} after {max_tries} tries.")

# Example usage
filtered_data = get_random_wikipedia_article()
filtered_data


In [None]:
import requests

def get_random_wikipedia_article():
    """Pick one random Wikipedia article and return its title and URL.

    Returns:
        tuple: ``(title, article_url)`` where ``article_url`` is built from
        the title with spaces replaced by underscores.

    Raises:
        ValueError: if the API response contains no pages.
    """
    # Wikipedia API endpoint for a random article
    url = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'info|linkshere|categories|categoryinfo|pageviews',#|extracts
        'generator': 'random',
        'grnnamespace': 0,  # Namespace 0 indicates articles
        'grnlimit': 20       # Number of random articles to fetch
    }

    # Making the API request
    response = requests.get(url, params=params)
    data = response.json()

    # With 'generator': 'random' the results are returned under
    # query.pages (keyed by page id), not under query.random.
    pages = data.get('query', {}).get('pages', {})
    if not pages:
        raise ValueError("Wikipedia API response contained no random pages.")

    # Extracting the title of the first random article in the batch
    title = next(iter(pages.values()))['title']

    # URL of the random article
    article_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"

    return title, article_url

get_random_wikipedia_article()

In [None]:

def get_wikipedia_article_content(title, remove_headers=False):
    """Fetch the plain-text extract of the Wikipedia page called `title`.

    Args:
        title: Page title passed to the API as ``titles``.
        remove_headers: When truthy, drop every line that both starts and
            ends with ``==``.

    Returns:
        The extract with each kept line terminated by a newline, or
        ``'Content not found.'`` plus a newline when the page has no extract.
    """
    api_response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'extracts',
            'explaintext': True,  # Get plain text content
        },
    )

    # Only the first page of the response is used.
    pages_by_id = api_response.json()['query']['pages']
    first_page = next(iter(pages_by_id.values()))
    raw_text = first_page.get('extract', 'Content not found.')

    def is_header(candidate):
        return candidate.startswith('==') and candidate.endswith('==')

    kept_lines = [
        candidate
        for candidate in raw_text.split('\n')
        if not (remove_headers and is_header(candidate))
    ]
    return ''.join(candidate + '\n' for candidate in kept_lines)

from bs4 import BeautifulSoup
# TODO: maybe this?
def extract_categories(url):
    """Download `url` and return the text of every link pointing at a category.

    A link counts when its ``href`` attribute is present and contains
    ``"Category:"``.

    Returns:
        list of link texts, in document order.
    """
    page = requests.get(url)
    parsed = BeautifulSoup(page.content, 'html.parser')

    def is_category_href(href):
        return href and "Category:" in href

    return [anchor.get_text() for anchor in parsed.find_all("a", href=is_category_href)]

# Assuming you have a title from the previous function
title, url = get_random_wikipedia_article()
content = get_wikipedia_article_content(title, remove_headers=True)
categories = extract_categories(url)
print(f"Title: {title}\nContent:\n{content}\nCategories: {categories}")


In [None]:
content['query']['pages']

In [None]:
import requests
import random

def get_pages_in_category(category):
    """Return the titles of the article pages listed in ``Category:<category>``.

    Issues a single ``categorymembers`` query restricted to namespace 0 and
    returns the titles from that one response.
    """
    query = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': f"Category:{category}",
        'cmnamespace': 0,  # Specify namespace 0 for articles
        'cmlimit': 'max'
    }
    reply = requests.get("https://en.wikipedia.org/w/api.php", params=query)
    members = reply.json()['query']['categorymembers']

    titles = []
    for member in members:
        titles.append(member['title'])
    return titles


def select_random_page(pages):
    """Return a random element of `pages`, or None when `pages` is empty or falsy."""
    return random.choice(pages) if pages else None

# Example: starting from the "Science" category, pick a random page three times.
# NOTE(review): `category` is overwritten with the selected page title on each
# iteration, so the next get_pages_in_category call queries
# "Category:<page title>" (or "Category:None" once `pages` is empty) —
# confirm this walk is intended.
category = "Science"
for i in range(3):
    pages = get_pages_in_category(category)
    print(f'Category: {category}. Content: {pages}')
    # Chained assignment: both names are bound to the same selected title,
    # so the message below prints the page title in the "category" slot too.
    random_page_title = category = select_random_page(pages)
    print(f"Random page from {category} category: {random_page_title}. Total pages: {len(pages)}")

# Repeats the message for the values left over from the final iteration.
print(f"Random page from {category} category: {random_page_title}. Total pages: {len(pages)}")
# content = get_wikipedia_article_content(random_page_title, remove_headers=False)
# print(content)

In [None]:
import requests

def get_top_level_categories(category="Contents"):
    """Return the names of the subcategories of ``Category:<category>``.

    Issues a single ``categorymembers`` query limited to subcategories and
    removes the ``"Category:"`` text from each returned title.
    """
    request_params = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': f"Category:{category}",
        'cmtype': 'subcat',  # Fetch subcategories
        'cmlimit': 'max'     # Maximum number of subcategories
    }
    payload = requests.get("https://en.wikipedia.org/w/api.php", params=request_params).json()

    names = []
    for member in payload['query']['categorymembers']:
        names.append(member['title'].replace("Category:", ""))
    return names

# Fetch top-level categories
top_level_categories = get_top_level_categories(category='History')
print(f"Top-level Categories:\n{top_level_categories}")


In [None]:
import requests
import random

def get_pages_in_category(category):
    """Return the titles of the article pages listed in ``Category:<category>``.

    Spaces in `category` are replaced by underscores before the lookup. A
    single ``categorymembers`` query restricted to namespace 0 is issued and
    the titles from that one response are returned.
    """
    underscored = category.replace(' ', '_')
    query = {
        'action': 'query',
        'format': 'json',
        'list': 'categorymembers',
        'cmtitle': f"Category:{underscored}",
        'cmnamespace': 0,
        'cmlimit': 'max'
    }

    payload = requests.get("https://en.wikipedia.org/w/api.php", params=query).json()

    titles = []
    for member in payload['query']['categorymembers']:
        titles.append(member['title'])
    return titles

def get_random_wikipedia_article(categories):
    """Pick a random category, then a random article in it, and fetch its text.

    Args:
        categories: Sequence of category names to choose from.

    Returns:
        dict with keys ``title``, ``url``, ``content`` (plain-text extract,
        or ``'Content not found.'`` when absent) and ``category``.

    Raises:
        ValueError: if the chosen category lists no articles.
    """
    chosen_category = random.choice(categories)
    print(f"Selected Category: {chosen_category}")

    candidates = get_pages_in_category(chosen_category)
    if not candidates:
        raise ValueError("No articles found in the category.")

    article_title = random.choice(candidates)
    print(f"Selected Article: {article_title}")

    # Ask the API for the plain-text extract of the chosen article.
    reply = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            'action': 'query',
            'format': 'json',
            'titles': article_title,
            'prop': 'extracts',
            'explaintext': True,
        },
    )
    first_page = next(iter(reply.json()['query']['pages'].values()))

    return {
        'title': article_title,
        'url': f"https://en.wikipedia.org/wiki/{article_title.replace(' ', '_')}",
        'content': first_page.get('extract', 'Content not found.'),
        'category': chosen_category,
    }

# List of categories
categories = ['Artificial intelligence', 'World history', 'Astrophysics', 'Classical music', 
              'Environmental science', 'Food', 'Mythology', 'Contemporary art', 'Linguistics']

# Override: sample from a single category for this experiment
categories = ['Machine learning algorithms']

import tqdm

# Draw 100 random articles; categories without articles are reported and skipped
results = []
for _ in tqdm.tqdm(range(100)):
    try:
        data = get_random_wikipedia_article(categories)
    except ValueError as e:
        print(e)
    else:
        results.append(data)

# Preview the most recently sampled article (read from `results`, not from
# variables left behind by earlier cells).
if results:
    latest = results[-1]
    print(f"Title: {latest['title']}\nURL: {latest['url']}\nContent: {latest['content'][:500]}...")  # Print the first 500 characters of content


In [None]:
pd.DataFrame(results).title.value_counts().value_counts()

In [None]:

from collections.abc import Iterator

class Dataset(Iterator):
    """Endless iterator over random Wikipedia articles.

    Each ``next()`` call keeps sampling articles, in rounds of ``max_tries``
    attempts, until one is accepted; a debug message is logged after every
    unsuccessful round and sampling then starts over.
    """

    def __init__(self):
        super().__init__()

    def __next__(self):
        """Return a dict with keys ``text``, ``title``, ``url`` and ``tags``.

        Articles are rejected when they contain "<title> may refer to:",
        have fewer than 250 whitespace-separated words, or are blank.
        ``tags`` is currently always an empty list.
        """
        max_tries = 20
        while True:
            bt.logging.debug("Retrieving data from prompting.dataset...")

            attempt = 0
            while attempt < max_tries:
                attempt += 1
                title, url = get_random_wikipedia_article()
                content = get_wikipedia_article_content(title)

                is_disambiguation = f'{title} may refer to:' in content
                too_short = len(content.split()) < 250
                if is_disambiguation or too_short or not content.strip():
                    continue

                # TBD: tags are not extracted yet
                # TODO return useful addition fields
                return {"text": content, 'title': title, 'url': url, 'tags': []}

            bt.logging.debug(f"Failed to retrieve data from prompting.dataset after {max_tries} tries")
            

In [None]:
dataset = Dataset()
next(dataset)

In [None]:

# Experiment driver: for every trial and task, build an agent, read its
# query and generate reference answers.
n_trials = 1
n_references = 3

tasks = [SummarizationTask(llm=llm, dataset=dataset)]#, DebuggingTask, QuestionAnsweringTask]

df = pd.DataFrame()
for i in range(n_trials):

    # loop over all task types
    # for now, just summarization
    for task in tasks:

        # loop over all the different formulations of a given task
        # for task_params in [{}]:

        # If this is debugging a reference has been created, otherwise there is no reference
        # For now we will ignore debugging tasks for simplicity
        # task = task_class(llm, **task_params)

        bt.logging.info("🤖 Creating agent...")
        agent = Agent(llm=llm, task=task)

        query = agent.query
        # Create reference answers
        bt.logging.info("🤖 Creating reference answers...")
        references = agent.generate_reference_answers(n=n_references)

        # NOTE(review): `GPT` is not defined or imported in the visible part
        # of this notebook, and the judge template is passed without its
        # {system_prompt}/{query} placeholders filled in — confirm the
        # intended judge call before relying on `query_eval`.
        query_eval = GPT(gpt_judge_prompt).parse()
            
            
            
    


In [None]:
agent

In [None]:
task

In [None]:
print(task.challenge)

In [None]:
print( task.create_summary_prompt(task.challenge) )

In [None]:
references