# Custom Chatbot

TODO: In this cell, write an explanation of which dataset you have chosen and why it is appropriate for this task

In [104]:
import os
import dotenv
from dotenv import load_dotenv
import openai

In [99]:
load_dotenv(".env")

True

In [105]:
openai_client = openai.OpenAI(
  api_key=os.environ['OPENAI_API_KEY'],
)

## Data Wrangling

TODO: In the cells below, load your chosen dataset into a `pandas` dataframe with a column named `"text"`. This column should contain all of your text data, separated into at least 20 rows.

### Get the Data

In [69]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from datetime import datetime
from typing import Optional


def get_wikipedia_events(url: str) -> list[dict]:
    """Scrape events from a Wikipedia year-in-country page."""
    BASE_URL = "https://en.wikipedia.org"
    HEADERS = {"User-Agent": "Mozilla/5.0"}

    def _clean_event_text(text: str) -> str:
        """Normalize dashes, remove references like [2], and trim."""
        # Normalize en dash/em dash to plain ASCII hyphen
        text = text.replace("\u2013", "-").replace("\u2014", "-")

        # Remove [x] reference markers
        import re
        text = re.sub(r"\[\s*\d+\s*\]", "", text)
        return text.strip()

    def _parse_dates(date_str: str, year: int = 2024) -> tuple[datetime.date, Optional[datetime.date]]:
        """Parse single or span dates into datetime objects."""
        date_str = date_str.strip()
        # Normalize dashes
        date_str = date_str.replace("\u2013", "-").replace("\u2014", "-")

        if "-" in date_str:  # Handle spans like "30 January-3 February"
            start_str, end_str = [s.strip() for s in date_str.split("-", 1)]
            try:
                start_date = datetime.strptime(f"{start_str} {year}", "%d %B %Y").date()
            except ValueError:
                start_date = None
            try:
                end_date = datetime.strptime(f"{end_str} {year}", "%d %B %Y").date()
            except ValueError:
                end_date = None
            return start_date, end_date
        else:
            try:
                start_date = datetime.strptime(f"{date_str} {year}", "%d %B %Y").date()
            except ValueError:
                start_date = None
            return start_date, None

    response = requests.get(url, headers=HEADERS)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    events = []
    events_div = soup.find("h2", id="Events").parent
    sibling = events_div.find_next_sibling()

    while sibling and not (sibling.name == "div" and sibling.find("h2")):
        if sibling.name == "div" and sibling.find("h3"):  # month header
            month = sibling.get_text(strip=True).replace("[edit]", "")
        elif sibling.name == "ul":
            for li in sibling.find_all("li", recursive=False):
                text_raw = li.get_text(" ", strip=True)
                text_clean = _clean_event_text(text_raw)

                # Split into date + event
                if " - " in text_clean:
                    date_text, event_text = text_clean.split(" - ", 1)
                else:
                    date_text, event_text = None, text_clean

                # Parse dates
                date, date_end = _parse_dates(date_text or "", year=2024)

                refs = [a["href"][1:] for a in li.select("sup.reference a[href^='#cite_note']")]
                reference_urls, reference_entities = [], []

                # External reference section
                for ref in refs:
                    ref_li = soup.find("li", id=ref)
                    if ref_li:
                        link = ref_li.find("a", class_="external text")
                        if link:
                            url_ref = link["href"]
                            reference_urls.append(url_ref)
                            reference_entities.append(urlparse(url_ref).netloc.replace("www.", ""))

                # Fallback: inline links
                if not reference_urls:
                    for a in li.find_all("a", href=True):
                        href = a["href"]
                        if href.startswith("/wiki/"):
                            url_ref = urljoin(BASE_URL, href)
                            reference_urls.append(url_ref)
                            reference_entities.append("Wikipedia")

                events.append({
                    "month": month,
                    "date_text": date_text,
                    "date": date,
                    "date_end": date_end,
                    "event": event_text,
                    "refs": refs,
                    "reference_urls": reference_urls,
                    "reference_entities": reference_entities
                })
        sibling = sibling.find_next_sibling()

    return events


In [74]:
# Get events
URL = "https://en.wikipedia.org/wiki/2024_in_Spain"
events = get_wikipedia_events(URL)

print(f"Total events found: {len(events)}")
for e in events[:10]:
    print(e)

Total events found: 54
{'month': 'January', 'date_text': '30 January-3 February', 'date': datetime.date(2024, 1, 30), 'date_end': datetime.date(2024, 2, 3), 'event': 'Benidorm Fest 2024 (1st semifinal)', 'refs': ['cite_note-2', 'cite_note-3'], 'reference_urls': ['https://www.formulatv.com/noticias/fechas-benidorm-fest-2024-semifinales-final-123651/', 'https://www.escplus.es/eurovision/2023/los-candidatos-del-benidorm-fest-2024-se-presentaran-en-el-teatro-alameda-a-las-1830-el-proximo-11-de-noviembre/'], 'reference_entities': ['formulatv.com', 'escplus.es']}
{'month': 'February', 'date_text': '3 February', 'date': datetime.date(2024, 2, 3), 'date_end': None, 'event': '16th Gaudí Awards in Barcelona , Catalonia', 'refs': ['cite_note-4'], 'reference_urls': ['https://www.lavanguardia.com/vida/20230711/9102019/premios-gaudi-2024-otorgaran-3-febrero-teniendo-cuenta-codigo-conducta.html'], 'reference_entities': ['lavanguardia.com']}
{'month': 'February', 'date_text': '9 February', 'date': dat

In [None]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_event_reference_contents(
    events: list[dict],
    max_chars: int = 1000,
    timeout: int = 10,
) -> list[dict]:
    """
    Fetches reference content for each event and adds it as a new field.
    
    Args:
        events (list[dict]): List of event dictionaries.
        max_chars (int): Max number of characters to keep from scraped text.
        timeout (int): Timeout for HTTP requests.
    
    Returns:
        list[dict]: Enriched events with 'reference_content' field.
    """
    HEADERS = {"User-Agent": "Mozilla/5.0"}
    for e in tqdm(events):
        reference_content = []
        for url in e.get("reference_urls", []):
            try:
                resp = requests.get(url, headers=HEADERS, timeout=timeout)
                resp.raise_for_status()
                soup = BeautifulSoup(resp.text, "html.parser")

                # Grab visible text (first few paragraphs)
                paragraphs = soup.find_all("p")
                text = " ".join(p.get_text(" ", strip=True) for p in paragraphs)
                text = text.replace("\n", " ").strip()

                # Truncate long text
                if len(text) > max_chars:
                    text = text[:max_chars] + "..."

                reference_content.append(text)

            except Exception as ex:
                reference_content.append(None)

        # Join contents if multiple refs
        e["reference_content"] = reference_content
    
    return events


In [76]:
# Extend event context
events_extended = get_event_reference_contents(events)

100%|██████████| 54/54 [01:20<00:00,  1.48s/it]


In [94]:
events_extended[0]

{'month': 'January',
 'date_text': '30 January-3 February',
 'date': datetime.date(2024, 1, 30),
 'date_end': datetime.date(2024, 2, 3),
 'event': 'Benidorm Fest 2024 (1st semifinal)',
 'refs': ['cite_note-2', 'cite_note-3'],
 'reference_urls': ['https://www.formulatv.com/noticias/fechas-benidorm-fest-2024-semifinales-final-123651/',
  'https://www.escplus.es/eurovision/2023/los-candidatos-del-benidorm-fest-2024-se-presentaran-en-el-teatro-alameda-a-las-1830-el-proximo-11-de-noviembre/'],
 'reference_entities': ['formulatv.com', 'escplus.es'],
 'reference_content': ["CALENDARIO Así lo ha confirmado la organización en la rueda de prensa celebrada en Benidorm. Programa relacionado Benidorm Fest 2022 - Act España Concursos Entretenimiento Popularidad: #72 de 2.150 Vídeos FormulaTV 'Traitors: El debate' Programa 2 Tu Cara Me Suena El Debate! Analizamos la gala 6 Nos colamos en la grabación de Cifras y Letras Tu Cara Me Suena El Debate! Analizamos la gala 2 María Bernardeau y Biel Anton nos

### Persist the Data

In [None]:
import json
from datetime import datetime, date

def save_events_to_jsonl(events, path="events.jsonl") -> None:
    with open(path, "w", encoding="utf-8") as f:
        for event in events:
            event_copy = event.copy()
            # Convert dates to ISO string
            for k in ["date", "date_end"]:
                if isinstance(event_copy.get(k), (datetime, date)):
                    event_copy[k] = event_copy[k].isoformat()
            f.write(json.dumps(event_copy, ensure_ascii=False) + "\n")


def load_events_from_jsonl(path="events.jsonl") -> list[dict]:
    events = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            # Parse back dates
            for k in ["date", "date_end"]:
                if obj.get(k):
                    try:
                        obj[k] = datetime.fromisoformat(obj[k]).date()
                    except Exception:
                        pass
            events.append(obj)
    return events


In [89]:
# Save events to file
save_events_to_jsonl(events_extended, path="data/events_spain_2024.jsonl")

In [90]:
events_loaded = load_events_from_jsonl(path="data/events_spain_2024.jsonl")

In [93]:
events_loaded[0]

{'month': 'January',
 'date_text': '30 January-3 February',
 'date': datetime.date(2024, 1, 30),
 'date_end': datetime.date(2024, 2, 3),
 'event': 'Benidorm Fest 2024 (1st semifinal)',
 'refs': ['cite_note-2', 'cite_note-3'],
 'reference_urls': ['https://www.formulatv.com/noticias/fechas-benidorm-fest-2024-semifinales-final-123651/',
  'https://www.escplus.es/eurovision/2023/los-candidatos-del-benidorm-fest-2024-se-presentaran-en-el-teatro-alameda-a-las-1830-el-proximo-11-de-noviembre/'],
 'reference_entities': ['formulatv.com', 'escplus.es'],
 'reference_content': ["CALENDARIO Así lo ha confirmado la organización en la rueda de prensa celebrada en Benidorm. Programa relacionado Benidorm Fest 2022 - Act España Concursos Entretenimiento Popularidad: #72 de 2.150 Vídeos FormulaTV 'Traitors: El debate' Programa 2 Tu Cara Me Suena El Debate! Analizamos la gala 6 Nos colamos en la grabación de Cifras y Letras Tu Cara Me Suena El Debate! Analizamos la gala 2 María Bernardeau y Biel Anton nos

### Create the Dataframe with the Embeddings

In [121]:
import pandas as pd
import numpy as np
import ast

In [125]:
import pandas as pd
from openai import OpenAI
from typing import Union

def compute_embeddings(
    texts: Union[str, list[str]],
    openai_client: OpenAI,
    embeddings_model_name: str = "text-embedding-ada-002"
) -> list[list[float]]:
    """
    Compute embeddings for a single string or a list of strings using OpenAI.
    
    Args:
        texts: A string or list of strings to embed.
        openai_client: An OpenAI client instance.
        embeddings_model_name: Model name for embeddings.
    
    Returns:
        List of embeddings (one per input string).
    """
    if isinstance(texts, str):
        texts = [texts]

    response = openai_client.embeddings.create(
        model=embeddings_model_name,
        input=texts
    )

    return [item.embedding for item in response.data]


def compute_embeddings_from_df(
    df: pd.DataFrame,
    openai_client: OpenAI,
    embeddings_model_name: str = "text-embedding-ada-002",
    text_column: str = "text",
    batch_size: int = 100
) -> list[list[float]]:
    """
    Compute embeddings for a dataframe column of text using OpenAI, in batches.
    
    Args:
        df: DataFrame containing text data.
        openai_client: OpenAI client instance.
        embeddings_model_name: Model name for embeddings.
        text_column: Name of the column with text data.
        batch_size: Number of rows per batch for embedding computation.
    
    Returns:
        List of embeddings corresponding to each row in the DataFrame.
    """
    embeddings = []
    for i in range(0, len(df), batch_size):
        batch_texts = df.iloc[i:i + batch_size][text_column].tolist()
        batch_embeddings = compute_embeddings(batch_texts, openai_client, embeddings_model_name)
        embeddings.extend(batch_embeddings)

    return embeddings

In [126]:
def create_dataframe_from_events(
    events: list[dict],
    openai_client: openai.OpenAI,
) -> pd.DataFrame:
    """
    Create a DataFrame from events with columns:
      - date
      - event
      - text
    And compute embeddings for the `text` column.
    """
    # Build rows
    rows = []
    for e in events:
        date_text = e.get("date_text", "")
        event_text = e.get("event", "")
        reference_content = e.get("reference_content", [])

        # Ensure reference_content is always a list of strings
        if not isinstance(reference_content, list):
            reference_content = [str(reference_content)]

        text = f"Date: {date_text}. Event: {event_text}."
        if reference_content:
            ref_texts_valid = [str(rc) for rc in reference_content if rc]
            text += f" Event context: {'. '.join(ref_texts_valid)}"

        rows.append({
            "date": e.get("date"),
            "event": event_text,
            "text": text
        })

    # Create DataFrame
    df = pd.DataFrame(rows)

    # Compute embeddings
    df["embeddings"] = compute_embeddings_from_df(df, openai_client)

    return df


In [127]:
# Create the dataframe with embeddings
df = create_dataframe_from_events(events_extended, openai_client)

In [128]:
df.head()

Unnamed: 0,date,event,text,embeddings
0,2024-01-30,Benidorm Fest 2024 (1st semifinal),Date: 30 January-3 February. Event: Benidorm F...,"[-0.021250776946544647, -0.024269504472613335,..."
1,2024-02-03,"16th Gaudí Awards in Barcelona , Catalonia",Date: 3 February. Event: 16th Gaudí Awards in ...,"[-0.011150036007165909, 0.006784180179238319, ..."
2,2024-02-09,Two police officers are killed and two more ar...,Date: 9 February. Event: Two police officers a...,"[-0.012281500734388828, -0.0061645121313631535..."
3,2024-02-10,"38th Goya Awards in Valladolid , Castile and León",Date: 10 February. Event: 38th Goya Awards in ...,"[-0.020344838500022888, -0.020213834941387177,..."
4,2024-02-18,2024 Galician regional election,Date: 18 February. Event: 2024 Galician region...,"[-0.01590176671743393, -0.0005188596551306546,..."


In [129]:
# Dimensionality of the embeddings: 1536
print(len(df.loc[0, "embeddings"]))

1536


In [130]:
# Save to CSV
df.to_csv("data/events_spain_2024_embeddings.csv")

In [131]:
# Load from CSV
df = pd.read_csv("data/events_spain_2024_embeddings.csv", index_col=0)
df["embeddings"] = df["embeddings"].apply(ast.literal_eval)

In [132]:
df.head()

Unnamed: 0,date,event,text,embeddings
0,2024-01-30,Benidorm Fest 2024 (1st semifinal),Date: 30 January-3 February. Event: Benidorm F...,"[-0.021250776946544647, -0.024269504472613335,..."
1,2024-02-03,"16th Gaudí Awards in Barcelona , Catalonia",Date: 3 February. Event: 16th Gaudí Awards in ...,"[-0.011150036007165909, 0.006784180179238319, ..."
2,2024-02-09,Two police officers are killed and two more ar...,Date: 9 February. Event: Two police officers a...,"[-0.012281500734388828, -0.0061645121313631535..."
3,2024-02-10,"38th Goya Awards in Valladolid , Castile and León",Date: 10 February. Event: 38th Goya Awards in ...,"[-0.020344838500022888, -0.020213834941387177,..."
4,2024-02-18,2024 Galician regional election,Date: 18 February. Event: 2024 Galician region...,"[-0.01590176671743393, -0.0005188596551306546,..."


## Custom Query Completion

TODO: In the cells below, compose a custom query using your chosen dataset and retrieve results from an OpenAI `Completion` model. You may copy and paste any useful code from the course materials.

Total events found: 54
{'month': 'January', 'date_text': '30 January-3 February', 'date': datetime.date(2024, 1, 30), 'date_end': datetime.date(2024, 2, 3), 'event': 'Benidorm Fest 2024 (1st semifinal)', 'refs': ['cite_note-2', 'cite_note-3'], 'reference_urls': ['https://www.formulatv.com/noticias/fechas-benidorm-fest-2024-semifinales-final-123651/', 'https://www.escplus.es/eurovision/2023/los-candidatos-del-benidorm-fest-2024-se-presentaran-en-el-teatro-alameda-a-las-1830-el-proximo-11-de-noviembre/'], 'reference_entities': ['formulatv.com', 'escplus.es']}
{'month': 'February', 'date_text': '3 February', 'date': datetime.date(2024, 2, 3), 'date_end': None, 'event': '16th Gaudí Awards in Barcelona , Catalonia', 'refs': ['cite_note-4'], 'reference_urls': ['https://www.lavanguardia.com/vida/20230711/9102019/premios-gaudi-2024-otorgaran-3-febrero-teniendo-cuenta-codigo-conducta.html'], 'reference_entities': ['lavanguardia.com']}
{'month': 'February', 'date_text': '9 February', 'date': dat

## Custom Performance Demonstration

TODO: In the cells below, demonstrate the performance of your custom query using at least 2 questions. For each question, show the answer from a basic `Completion` model query as well as the answer from your custom query.

### Question 1

### Question 2