In [6]:
import json
import random
import tqdm
import jsonlines
from datetime import datetime

In [2]:
contents = []
with open('/<YOUR_OWN_PATH>/ToolQA/data/raw_data/agenda/agenda_events.jsonl', 'r') as f:
    for item in jsonlines.Reader(f):
        contents.append(item)

In [3]:
import pandas as pd
df = pd.DataFrame(contents)

In [4]:
print(df.head())

                      event start_time  end_time                location  \
0         Breakfast meeting    9:00 AM  10:00 AM           Bluebird Cafe   
1  Sales pitch presentation   10:30 AM  11:30 AM          Hilton Chicago   
2             Cooking class   11:00 AM   1:00 PM           Sur La Table    
3     Lunch with colleagues    1:00 PM   2:00 PM  The Cheesecake Factory   
4        Charity fundraiser    2:30 PM   4:30 PM        The Ritz-Carlton   

    person        date           id  
0      Mia  2022/12/15  agenda-0000  
1  William  2022/07/05  agenda-0001  
2    Emily  2022/03/04  agenda-0002  
3     Adam  2022/12/02  agenda-0003  
4    Layla  2022/08/21  agenda-0004  


In [7]:
def compute_duration(start_time: str, end_time: str, time_format: str = "%I:%M %p") -> str:
    start_time_obj = datetime.strptime(start_time, time_format)
    end_time_obj = datetime.strptime(end_time, time_format)

    duration = end_time_obj - start_time_obj

    return str(duration)


start_time = df.iloc[0]["start_time"]
end_time = df.iloc[0]["end_time"]
print(compute_duration(start_time, end_time))

1:00:00


In [8]:
question_id = 0
questions = []
num_question_per_template = 20

In [9]:
# How many events happen on {date}?
for index in tqdm.tqdm(range(num_question_per_template)):
    date_list = df['date'].unique()
    date = random.choice(date_list)
    question = "How many events happen on {} in the agenda table?".format(date)
    answer = len(df[df['date'] == date])
    questions.append({"qid": "hard-agenda-{:0>4d}".format(question_id), "question":question, "answer":answer})
    question_id += 1
print(questions[-1])

100%|██████████| 20/20 [00:00<00:00, 764.88it/s]

{'qid': 'hard-agenda-0019', 'question': 'How many events happen on 2022/05/06 in the agenda table?', 'answer': 21}





In [10]:
def convert_time(current_time: str, time_format: str = "%I:%M %p") -> str:
    time_obj = datetime.strptime(current_time, time_format)
    return time_obj
def convert_time2(current_time: str, time_format: str = "%I %p") -> str:
    time_obj = datetime.strptime(current_time, time_format)
    return time_obj

# How many people are unavailble between {start_time} and {end_time} on {date}?
for index in tqdm.tqdm(range(num_question_per_template)):
    date_list = df['date'].unique()
    date = random.choice(date_list)
    sub_data = df[df['date'] == date]
    random_indices = random.randint(0, len(sub_data)-1)
    start_time = sub_data.iloc[random_indices]['start_time']
    end_time = sub_data.iloc[random_indices]['end_time']
    question = "Who is unavailble between {} and {} on {} in the agenda table?".format(start_time, end_time, date)
    # answer = len(list(set(df[(df['date'] == date) & (df['end_time'] > start_time) & (df['start_time'] < end_time)]['person'].unique())))
    sub_table = df[(df['date'] == date)]
    answer = []
    for i in range(len(sub_table)):
        row = sub_table.iloc[i]
        try:
            row_start_time = convert_time(row['start_time'])
        except:
            row_start_time = convert_time2(row['start_time'])
        try:
            row_end_time = convert_time(row['end_time'])
        except:
            row_end_time = convert_time2(row['end_time'])
        try:
            st_time = convert_time(start_time)
        except:
            st_time = convert_time2(start_time)
        try:
            ed_time = convert_time(end_time)
        except:
            ed_time = convert_time2(end_time)
        if row_start_time < ed_time and row_end_time > st_time:
            answer.append(row['person'])
        
    answer = len(list(set(answer)))
    questions.append({"qid": "hard-agenda-{:0>4d}".format(question_id), "question":question, "answer":answer})
    question_id += 1
print(questions[-1])

100%|██████████| 20/20 [00:00<00:00, 230.82it/s]

{'qid': 'hard-agenda-0039', 'question': 'Who is unavailble between 9:00 AM and 2:00 PM on 2022/06/07 in the agenda table?', 'answer': 15}





In [11]:
def find_available_slots(meetings: list, meeting_duration: int, start_boundary: str, end_boundary: str, time_format: str = "%I:%M %p") -> list:
    sorted_meetings = sorted(meetings, key=lambda x: datetime.strptime(x[0], time_format))
    available_slots = []

    start_boundary_obj = datetime.strptime(start_boundary, time_format)
    end_boundary_obj = datetime.strptime(end_boundary, time_format)

    if (datetime.strptime(sorted_meetings[0][0], time_format) - start_boundary_obj).seconds >= meeting_duration * 60:
        available_slots.append((start_boundary, sorted_meetings[0][0]))

    for i in range(len(sorted_meetings) - 1):
        start_gap = datetime.strptime(sorted_meetings[i][1], time_format)
        end_gap = datetime.strptime(sorted_meetings[i + 1][0], time_format)
        gap_duration = int((end_gap - start_gap).total_seconds() / 60)

        if gap_duration >= meeting_duration:
            available_slots.append((sorted_meetings[i][1], sorted_meetings[i + 1][0]))

    if (end_boundary_obj - datetime.strptime(sorted_meetings[-1][1], time_format)).seconds >= meeting_duration * 60:
        available_slots.append((sorted_meetings[-1][1], end_boundary))

    return available_slots
# When should I schedule a meeting with {person} on {date}?
for index in tqdm.tqdm(range(num_question_per_template)):
    date_list = df['date'].unique()
    date = random.choice(date_list)
    person = random.choice(df[df['date'] == date]['person'].unique())
    question = "When should I schedule a meeting with {} from 9:00 AM to 6:00 PM on {} in the agenda table?".format(person, date)
    sub_data = df[(df['date'] == date) & (df['person'] == person)]
    times = []
    for i in range(len(sub_data)):
        row = sub_data.iloc[i]
        times.append((row['start_time'], row['end_time']))
    meeting_duration = 60  # Meeting duration in minutes
    start_boundary = "9:00 AM"
    end_boundary = "6:00 PM"

    available_slots = find_available_slots(times, meeting_duration, start_boundary, end_boundary)
    for i in range(len(available_slots)):
        available_slots[i] = available_slots[i][0]+'-'+available_slots[i][1]
    answer = ", ".join(available_slots)
    questions.append({"qid": "hard-agenda-{:0>4d}".format(question_id), "question":question, "answer":answer})
    question_id += 1
print(questions[-1])

100%|██████████| 20/20 [00:00<00:00, 339.14it/s]

{'qid': 'hard-agenda-0059', 'question': 'When should I schedule a meeting with Jessica from 9:00 AM to 6:00 PM on 2022/11/28 in the agenda table?', 'answer': '9:00 AM-7:00 PM, 9:00 PM-6:00 PM'}





In [12]:
# What events does {person} have on {date}?
for index in tqdm.tqdm(range(num_question_per_template)):
    date_list = df['date'].unique()
    date = random.choice(date_list)
    person = random.choice(df[df['date'] == date]['person'].unique())
    question = "What events does {} have on {} in the agenda table?".format(person, date)
    answer = ", ".join(list(df[(df['date'] == date) & (df['person'] == person)]['event'].unique()))
    questions.append({"qid": "hard-agenda-{:0>4d}".format(question_id), "question":question, "answer":answer})
    question_id += 1
print(questions[-1])

100%|██████████| 20/20 [00:00<00:00, 346.89it/s]

{'qid': 'hard-agenda-0079', 'question': 'What events does Emily have on 2022/09/19 in the agenda table?', 'answer': 'Theatre performance, Art Walk'}





In [13]:
 # How many dates in the agenda table have {person} scheduled?
for index in tqdm.tqdm(range(num_question_per_template)):
    person_list = df['person'].unique()
    person = random.choice(person_list)
    question = "How many dates in the agenda table have {} scheduled?".format(person)
    answer = len(df[df['person'] == person]['date'].unique())
    questions.append({"qid": "hard-agenda-{:0>4d}".format(question_id), "question":question, "answer":answer})
    question_id += 1
print(questions[-1])

100%|██████████| 20/20 [00:00<00:00, 680.72it/s]

{'qid': 'hard-agenda-0099', 'question': 'How many dates in the agenda table have Imogen scheduled?', 'answer': 88}





In [14]:
with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/genda-hard.jsonl', mode='w') as writer:
    for row in questions:
        writer.write(row)