# Date based Question Answering
Below is an example of how we can create novel questions and answers based on the dates of famous events, using Wikipedia.

> Retrieving https://en.wikipedia.org/wiki/January_01...

> Random Date: January 01

> Event: 1502 – The present-day location of Rio de Janeiro, Brazil, is first explored by the Portuguese.[13]

------
The **query** is based on the event, e.g.

> "On what date was the present-day location of Rio de Janeiro, Brazil, first explored by the Portuguese?"

or simpler still

> "The present-day location of Rio de Janeiro, Brazil, is first explored by the Portuguese on what date?"

On the other hand, the **challenge** should dress this up some more, e.g.

> "What date did the Portuguese first explore the present-day location of Rio de Janeiro, Brazil? Help me out here I'm trying to win a bet with my friend."

----
The reference can be simply the date.

> "January 01 1502"



In [56]:
import random
import datetime
import requests
from bs4 import BeautifulSoup
from tasks.dataset import WikiDataset

# Step 1: Generate a random date
# The year is only a placeholder: just the month and day are formatted below.
year = 2000
month = random.randint(1, 12)
day = random.randint(1, 28)  # Simplified to avoid dealing with different month lengths
random_date = datetime.date(year, month, day)

# Step 2: Format the date for Wikipedia URL
formatted_date = random_date.strftime("%B_%d")  # E.g., "January_01"

# Step 3: Scrape Wikipedia
url = f"https://en.wikipedia.org/wiki/{formatted_date}"
print(f"Retrieving {url}...")
response = requests.get(url)
events = []
# Bound up front so every later read of `link` is defined, even when the
# request fails or the selected item has no usable link.
link = None

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remember the heading anchor of each section that exists on this page.
    section_spans = {}
    for name in ['Events', 'Births', 'Deaths']:
        span = soup.find('span', id=name)
        if span is not None:
            section_spans[name] = span
    available_sections = list(section_spans)
    print(f"Available sections: {', '.join(available_sections)}")

    # random.choice raises IndexError on an empty list, so only pick a section
    # when at least one was found.
    if available_sections:
        section = random.choice(available_sections)
        print(f'Extracting a random item from the "{section}" section...')
        # NOTE(review): only the first <ul> after the heading is read; if the
        # section is split into several lists, the later ones are skipped.
        events_list = section_spans[section].parent.find_next_sibling('ul')
        if events_list is not None:
            events.extend(events_list.find_all('li'))
        print(f'Section {section} has {len(events)} events.')

    # Step 4: Extract Event Information and Step 5: Select an Event
    if events:
        selected_event = random.choice(events)
        links = selected_event.find_all('a')
        link_titles = [anchor.get("title") for anchor in links]
        print(f'Date contains the following {len(links)} links: {link_titles}')
        # Only anchors that carry a title can be used with link['title'] below;
        # anchors without one would raise KeyError.
        titled_links = [anchor for anchor in links if anchor.get("title")]
        if titled_links:
            link = random.choice(titled_links)
        print(f"Random Date: {random_date.strftime('%B %d')}")
        print(f"Event: {selected_event.get_text()}")
        if link is not None:
            print(f"Link: {link['title']}")
        else:
            print("No titled link found in the selected event.")
    else:
        print("No events found for this date.")
else:
    print("Failed to retrieve data from Wikipedia.")


# Kept as a bare trailing expression so its value remains the cell's result;
# evaluates to None when no link could be selected above.
WikiDataset().get_wikipedia_article_content(link['title']) if link is not None else None

Retrieving https://en.wikipedia.org/wiki/August_21...
Available sections: Events, Births, Deaths
Extracting a random item from the "Births" section...
Section Births has 7 events.
Date contains the following 2 links: ['1481', 'Jorge de Lencastre, Duke of Coimbra']
Random Date: August 21
Event: 1481 – Jorge de Lencastre, Duke of Coimbra (d. 1550)
Link: Jorge de Lencastre, Duke of Coimbra


In [57]:
from tasks.dataset import WikiDataset

# Look up the article named by the title of `link`. `link` is not defined in
# this cell, so it must already exist from an earlier cell. Judging by the
# output below, the result is a dict keyed by section heading, with a None key
# for the leading text.
WikiDataset().get_wikipedia_article_content(link['title'])

{None: 'Jorge de Lencastre (English: George; 21 August 1481 – 22 July 1550) was a Portuguese prince, illegitimate son of King John II of Portugal and Ana de Mendonça, a lady-in-waiting to Joanna la Beltraneja.  He was created the second Duke of Coimbra in 1509. He was also master of the Order of Santiago and administrator of the Order of Aviz from 1492 to 1550.\n\n\n',
 'Early life': "Jorge de Lencastre was born in Abrantes on 21 August 1481 and raised by his aunt, the king's sister, Joan of Portugal, in the Convent of Jesus in Aveiro.  On Joan's death in 1490, Jorge was brought to the royal court, and was soon placed under the tutorship of monteiro-mor Diogo Fernandes de Almeida (the son of John II's late ally, Lopo de Almeida, Count of Abrantes).\n\n\n",
 'Succession Campaign': "After the death of the royal heir Prince Afonso in July 1491, King John II was left with no legitimate sons and no daughters he could marry off.  The next legitimate successor to the throne was his cousin (an

In [5]:
from datetime import datetime, timedelta


def get_events_on_dates(day_of_year, year=2021):
    """Return the entries listed under "Events" on the Wikipedia page for a day.

    Args:
        day_of_year: 1-based day number within ``year`` (1 is January 1).
            Values past the end of ``year`` roll over into the following year.
        year: Calendar year used to convert ``day_of_year`` into a month and
            day. Only the month and day appear in the requested URL.

    Returns:
        A list of strings, one per ``<li>`` inside the first ``<ul>`` that
        follows the "Events" heading. The list is empty when the response
        status is not 200, or when the heading or its list is not present
        in the page.
    """
    start_of_year = datetime(year, 1, 1)
    date = start_of_year + timedelta(days=day_of_year - 1)
    # Step 2: Format the date for Wikipedia URL
    formatted_date = date.strftime("%B_%d")  # E.g., "January_01"

    # Step 3: Scrape Wikipedia
    url = f"https://en.wikipedia.org/wiki/{formatted_date}"
    # NOTE(review): no timeout is passed, so a stalled connection blocks the caller.
    response = requests.get(url)

    if response.status_code != 200:
        print("Failed to retrieve data from Wikipedia.")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find the events section. Both lookups return None when nothing matches,
    # so guard each step instead of letting an AttributeError abort a
    # multi-day scrape.
    heading = soup.find('span', id='Events')
    if heading is None or heading.parent is None:
        return []

    # NOTE(review): only the first <ul> after the heading is read; if the
    # section is split into several lists, the later ones are skipped.
    events_list = heading.parent.find_next_sibling('ul')
    if events_list is None:
        return []

    return [li.get_text() for li in events_list.find_all('li')]

import tqdm

# Gather the event texts for day numbers 1 through 365 into one flat list.
events = []
pbar = tqdm.tqdm(range(1, 366), desc='Getting events', unit='day')
for day_number in pbar:
    events.extend(get_events_on_dates(day_number))
    # Replace the bar's label with the running total collected so far.
    pbar.set_description(f'Total events: {len(events)}')

Total events: 2047:  89%|████████▉ | 326/365 [05:08<00:36,  1.06day/s]


KeyboardInterrupt: 