In [57]:
import re
from PyPDF2 import PdfReader


def extract_itinerary(pdf_path: str) -> str:
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The extracted text from the PDF file.

    """
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text()

        # Replace unpaired surrogates with a placeholder
        text = re.sub(r"[\ud800-\udfff]", "�", text)

        return text
    except Exception as e:
        print(e)
        return ""


if __name__ == "__main__":
    pdf_path = "/home/ubuntu/Desktop/golfthing/pdfs/event_3.pdf"
    print(extract_itinerary(pdf_path))

             
ARMY  INSTITUTE OF  MANAGEMENT  & TECHNOLOGY  
                                    GREATER  NOIDA  
Armotsav 2K22 -Annual Cultural and Sports Fest  
 
27th April 2022 - Sports Itinerary   
Time  Event  Venue  
5:00-5:30 PM  March Past  Football Ground  
5:30-6:00 PM  100 m Heats Boys and 
Girls  Football Ground  
5:30-6:30 PM  Chess Boys and Girls  Amphitheatre  
6:15-7:00PM  Football (Match -1) Football Ground  
 
 
28th April 2022 - Sports Event Itinerary  
Time  Event  Venue  
7:00-8:00 AM  Basketball Boys (Match -1) Basketball court  
7:00-8:30 AM  Badminton Boys and Girls  Badminton court  
7:00-8:30 AM  Table Tennis Boys and Girls  Cafeteria  
8:00-9:00 AM  Football (Match -2) Football Court  
5:30-6:15 PM  Volleyball Boys (Match -1) Volleyball Court  
6:20-7:00PM  Volleyball Boys (Match -2) Volleyball Court  
 
 
28th April 2022 - Cultu ral Event Itinerary  
Time  Event  Venue  
10:00 -11:30 Hrs  Quiz  Seminar hall  
11:45-13:00 Hrs Solo Singing  Seminar hall  
13:

In [58]:
# Optimized Prompt for Parsing Event Itinerary and Generating Structured Response

optimized_prompt = """
Parse the raw text event itinerary and generate a structured JSON response. Ensure the response is sorted by event start times. Handle missing data, multiple hosts, and events without end times.
- Only return the json response.
- The response should be sorted by event start times.

Required fields:

- `event_id`: Unique identifier.
- `title`: Event title.
- `start_time`: Start time.
- `end_time`: End time (if available).
- `duration`: Duration in minutes (if available).
- `host`: Host(s) as a list.
- `location`: Event location.
- `categories`: List of event categories.
- `description`: Event description.
- `date`: Event date.

Edge cases:

- Events without end times or durations: estimate based on context.
- Multiple hosts: include them as a list.
- Events with duration in minutes: convert to minutes for consistency.
- Multiple categories: include them as a list.
- If `end_time`, `duration`, `categories`, or `description` is missing, generate plausible values.
- Extract event date from the context.

Example raw text event itinerary:

Event 1: Golf Tournament, 2023-06-15 09:00 - 12:00, Hosted by Tiger Woods and Lee Trevino, Location: Pine Valley Golf Course
Categories: Golf, Social
Description: Join Tiger Woods and other golf enthusiasts for a thrilling golf tournament.

Event 2: Welcome to the Conference, 2023-06-15 10:00, Hosted by Conference Chair, Location: Main Hall
Duration: 15 minutes
Categories: Social, Educational
Description: Get ready to kick off the conference with a warm welcome from our esteemed chair.

### Structured JSON Response

```json
[
    {
        "date": "2023-06-15",
        "events": [
            {
                "event_id": "E001",
                "title": "Golf Tournament",
                "start_time": "09:00",
                "end_time": "12:00",
                "duration": 180,
                "host": ["Tiger Woods", "Lee Trevino"],
                "location": "Pine Valley Golf Course",
                "categories": ["Golf", "Social"],
                "description": "Join Tiger Woods and other golf enthusiasts for a thrilling golf tournament."
            },
            {
                "event_id": "E002",
                "title": "Welcome to the Conference",
                "start_time": "10:00",
                "end_time": "10:15",
                "duration": 15,
                "host": ["Conference Chair"],
                "location": "Main Hall",
                "categories": ["Social", "Educational"],
                "description": "Get ready to kick off the conference with a warm welcome from our esteemed chair."
            }
        ]
    }
]
"""

In [59]:
import os
import instructor
from groq import Groq
from dotenv import load_dotenv
from typing import List, Optional
from pydantic import BaseModel, Field, RootModel

load_dotenv()


class Event(BaseModel):
    event_id: str
    title: str
    start_time: str
    end_time: Optional[str] = None
    duration: Optional[int] = None
    host: List[str]
    location: str
    categories: List[str]
    description: Optional[str] = None


class DateEvents(BaseModel):
    date: str
    events: List[Event]


class EventItinerary(BaseModel):
    RootModel: List[DateEvents]


client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
)

client = instructor.from_groq(client, mode=instructor.Mode.TOOLS)


def parse_itinerary(pdf_text: str) -> EventItinerary:
    """
    Parses the raw text event itinerary and generates a structured JSON response.

    Args:
        pdf_text (str): The raw text event itinerary.

    Returns:
        EventItinerary: The structured JSON response.

    """
    # Your code here
    resp = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {
                "role": "system",
                "content": optimized_prompt,
                "role": "user",
                "content": (pdf_text),
            }
        ],
        response_model=EventItinerary,
        # temperature=0,
        # max_tokens=8192,
        # top_p=1,
    )

    return resp

In [60]:
pdf_text = extract_itinerary(pdf_path)

In [61]:
from groq import Groq


def generate_completion(pdf_text: str, optimized_prompt: str):
    client = Groq()
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[
            {"role": "system", "content": optimized_prompt},
            {
                "role": "user",
                "content": pdf_text,
            },
        ],
        temperature=0,
        max_tokens=8192,
        top_p=1,
        stream=False,
        stop=None,
    )
    return completion

In [62]:
json_response = generate_completion(pdf_text, optimized_prompt)

In [64]:
cont = json_response.choices[0].message.content

In [65]:
print(cont)

Here is the structured JSON response:

[
    {
        "date": "27th April 2022",
        "events": [
            {
                "event_id": "E001",
                "title": "March Past",
                "start_time": "5:00 PM",
                "end_time": "5:30 PM",
                "duration": 30,
                "host": [],
                "location": "Football Ground",
                "categories": ["Sports"],
                "description": "",
                "date": "27th April 2022"
            },
            {
                "event_id": "E002",
                "title": "100 m Heats Boys and Girls",
                "start_time": "5:30 PM",
                "end_time": "6:00 PM",
                "duration": 30,
                "host": [],
                "location": "Football Ground",
                "categories": ["Sports"],
                "description": "",
                "date": "27th April 2022"
            },
            {
                "event_id": "E003",
            