In [1]:
import csv
import json
import random
import jsonlines
import time
import pandas as pd
import geopy
import geopy.distance
import re

In [2]:
def question11_gen(data, num_questions_per_template, questions, question_id):
    # What is the total price at least if you want to stay at {NAME} with id {id} for {number} nights?
    target = question_id + num_questions_per_template
    while question_id < target:
        random_indices = random.randint(0, len(data) - 1)
        row = data.iloc[random_indices]
        Name = row["NAME"]
        id = row["id"]
        neighbourhood = row["neighbourhood"]
        number = random.randint(1, 20)
        least_days = row["minimum nights"]
        question = "What is the total price at least if you want to stay at {} in {} for {} nights?".format(Name, neighbourhood, number)
        if type(row["price"]) == str:
            price1 = float(re.sub(",","",row["price"][1:]))
        else:
            price1 = row["price"]
        if type(row["service fee"]) == str:
            price2 = float(re.sub(",","",row["service fee"][1:]))
        else:
            price2 = row["service fee"]
        if number < least_days:
            answer = "$ {:.1f}".format((price1 + price2) * least_days)
        else:
            answer = "$ {:.1f}".format((price1 + price2) * number)
        questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
        question_id += 1
    return questions, question_id

In [3]:
def question12_gen(data, num_questions_per_template, questions, question_id):
    # How many airbnbs are there in {neighbourhood}?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        question = "How many airbnbs are there in {}?".format(neighbourhood)
        answer = str(len(data[data["neighbourhood"] == neighbourhood]))
        questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
        question_id += 1
    return questions, question_id

In [4]:
def question13_gen(data, num_questions_per_template, questions, question_id):
    # What is the average price of airbnbs in {neighbourhood}?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        sub_data = data[data["neighbourhood"] == neighbourhood]
        average_price = 0
        for row_id in range(len(sub_data)):
            content = sub_data.iloc[row_id]
            if type(content["price"]) == str:
                average_price += float(re.sub(",", "", content["price"][1:]))
            else:
                average_price += content["price"]
        if len(sub_data) != 0:
            average_price = average_price / len(sub_data)
            question = "What is the average price of airbnbs in {}?".format(neighbourhood)
            answer = "$ {:.1f}".format(average_price)
            questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
            question_id += 1
    return questions, question_id

In [5]:
def question14_gen(data, num_questions_per_template, questions, question_id):
    # What is the average review rates within 5 miles from {NAME} with id {id}?
    target = question_id + num_questions_per_template
    while question_id < target:
        random_indices = random.randint(0, len(data) - 1)
        row = data.iloc[random_indices]
        name = row["NAME"]
        id = row["id"]
        latitude = row["lat"]
        longitude = row["long"]
        neighbourhood = row["neighbourhood"]
        _, lo_max, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=90)
        _, lo_min, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=270)
        la_max, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=0)
        la_min, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=180)
        sub_data = data[(data["lat"] <= la_max) & (data["lat"] >= la_min) & (data["long"] <= lo_max) & (data["long"] >= lo_min)]
        if len(sub_data) != 0:
            question = "What is the average review rates within 5 miles from {} in {}?".format(name, neighbourhood)
            answer = round(sub_data["review rate number"].mean(), 2)
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [6]:
def question15_gen(data, num_questions_per_template, questions, question_id):
    # How much proporion of airbnbs in {neighbourhood} have a flexible cancellation policy?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        sub_data = data[data["neighbourhood"] == neighbourhood]
        sub_data = sub_data[sub_data["cancellation_policy"] == "flexible"]
        question = "How much proporion of airbnbs in {} have a flexible cancellation policy?".format(neighbourhood)
        if len(data[data["neighbourhood"] == neighbourhood]) != 0:
            answer = str(round(len(sub_data) / len(data[data["neighbourhood"] == neighbourhood]), 2))
            questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
            question_id += 1
    return questions, question_id

In [7]:
def question16_gen(data, num_questions_per_template, questions, question_id):
    # How much does it cost per night to stay at the most expensive entire home/apt in {neighbourhood}?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        sub_data = data[data["neighbourhood"] == neighbourhood]
        sub_data = sub_data[sub_data["room type"] == "Entire home/apt"]
        sub_data = sub_data.sort_values(by=["price"], ascending=False)
        question = "How much does it cost per night to stay at the most expensive entire home/apt in {}?".format(neighbourhood)
        if len(sub_data) != 0:
            if type(sub_data.iloc[0]["price"]) == str:
                answer = sub_data.iloc[0]["price"]
            else:
                answer = "$ {}".format(sub_data.iloc[0]["price"])
            questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
            question_id += 1
    return questions, question_id

In [8]:
def question17_gen(data, num_questions_per_template, questions, question_id):
    # How many airbnbs are there in {neighbourhood} that have a review rate higher than 4?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        sub_data = data[data["neighbourhood"] == neighbourhood]
        sub_data = sub_data[sub_data["review rate number"] >= 4]
        question = "How many airbnbs are there in {} that have a review rate higher than 4?".format(neighbourhood)
        answer = str(len(sub_data))
        questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
        question_id += 1
    return questions, question_id

In [9]:
def question18_gen(data, num_questions_per_template, questions, question_id):
    # Can you recommend me a hotel room with the lowest price in {neighbourhood}?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        sub_data = data[data["neighbourhood"] == neighbourhood]
        sub_data = sub_data.sort_values(by=["price"], ascending=True)
        question = "Can you recommend me a hotel room with the lowest price in {}?".format(neighbourhood)
        answer = sub_data.iloc[0]["NAME"]
        questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
        question_id += 1
    return questions, question_id

In [10]:
def question19_gen(data, num_questions_per_template, questions, question_id):
    # Can you recommend me a private room with the highest reviews per month that can host at least 2 people in  {neighbourhood}?
    target = question_id + num_questions_per_template
    neighbourhood_list = data["neighbourhood"].unique()
    while question_id < target:
        neighbourhood = random.choice(neighbourhood_list)
        sub_data = data[data["neighbourhood"] == neighbourhood]
        sub_data = sub_data[sub_data["room type"] == "Private room"]
        sub_data = sub_data[sub_data["calculated host listings count"] >= 2]
        sub_data = sub_data.sort_values(by=["reviews per month"], ascending=False)
        if len(sub_data) != 0:
            question = "Can you recommend me a private room with the highest review rate that can host at least 2 people in {}?".format(neighbourhood)
            answer = sub_data.iloc[0]["NAME"]
            questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
            question_id += 1
    return questions, question_id

In [11]:
def question20_gen(data, num_questions_per_template, questions, question_id):
    # Can you recommend a shared room with the lowest price within 5 miles from {longitude} longitude and {latitude} latitude?
    target = question_id + num_questions_per_template
    while question_id < target:
        longitude = random.uniform(-74.2589, -73.7004)
        latitude = random.uniform(40.4774, 40.9176)
        sub_data = data[(data["long"] - longitude)**2 + (data["lat"] - latitude)**2 <= 0.0169]
        sub_data = sub_data[sub_data["room type"] == "Shared room"]
        sub_data = sub_data.sort_values(by=["price"], ascending=True)
        if len(sub_data) != 0:
            question = "Can you recommend a shared room with the lowest price within 10 miles from {} longitude and {} latitude?".format(longitude, latitude)
            answer = sub_data.iloc[0]["NAME"]
            questions.append({"qid": "hard-airbnb-{:0>4d}".format(question_id), "question":question, "answer":answer})
            question_id += 1
    return questions, question_id

In [12]:
start_time = time.time()
file_path = "/<YOUR_OWN_PATH>/ToolQA/data/raw_data/airbnb/Airbnb_Open_Data.csv"
data = pd.read_csv(file_path)

num_questions_per_template = 10
question_id = 0
questions = []

  data = pd.read_csv(file_path)


In [13]:
# question template 11
questions, question_id = question11_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 11: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0000', 'question': 'What is the total price at least if you want to stay at Bright, modern room with panoramic window in Maspeth for 8 nights?', 'answer': '$ 42450.0'}, {'qid': 'hard-airbnb-0001', 'question': 'What is the total price at least if you want to stay at Quiet and spacious room for 2 in Maspeth for 4 nights?', 'answer': '$ 3848.0'}, {'qid': 'hard-airbnb-0002', 'question': 'What is the total price at least if you want to stay at Spacious Lovely 1B1B in UP Manhattan 4Min to Train in Washington Heights for 19 nights?', 'answer': '$ nan'}, {'qid': 'hard-airbnb-0003', 'question': 'What is the total price at least if you want to stay at Beautiful loft in DUMBO Brooklyn in Vinegar Hill for 7 nights?', 'answer': '$ 3661.0'}, {'qid': 'hard-airbnb-0004', 'question': 'What is the total price at least if you want to stay at Ditmas Park Beauty in Flatbush for 2 nights?', 'answer': '$ 3498.0'}, {'qid': 'hard-airbnb-0005', 'question': 'What is the total price at least

In [14]:
# question template 12
questions, question_id = question12_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 12: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0010', 'question': 'How many airbnbs are there in Rosedale?', 'answer': '171'}, {'qid': 'hard-airbnb-0011', 'question': 'How many airbnbs are there in Flatlands?', 'answer': '217'}, {'qid': 'hard-airbnb-0012', 'question': 'How many airbnbs are there in Brighton Beach?', 'answer': '168'}, {'qid': 'hard-airbnb-0013', 'question': 'How many airbnbs are there in Whitestone?', 'answer': '26'}, {'qid': 'hard-airbnb-0014', 'question': 'How many airbnbs are there in Jamaica Estates?', 'answer': '50'}, {'qid': 'hard-airbnb-0015', 'question': 'How many airbnbs are there in Windsor Terrace?', 'answer': '331'}, {'qid': 'hard-airbnb-0016', 'question': 'How many airbnbs are there in Port Richmond?', 'answer': '25'}, {'qid': 'hard-airbnb-0017', 'question': 'How many airbnbs are there in Bronxdale?', 'answer': '48'}, {'qid': 'hard-airbnb-0018', 'question': 'How many airbnbs are there in Roosevelt Island?', 'answer': '159'}, {'qid': 'hard-airbnb-0019', 'question': 'How many airbnbs

In [15]:
# question template 13
questions, question_id = question13_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 13: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0020', 'question': 'What is the average price of airbnbs in Boerum Hill?', 'answer': '$ 627.0'}, {'qid': 'hard-airbnb-0021', 'question': 'What is the average price of airbnbs in Woodside?', 'answer': '$ nan'}, {'qid': 'hard-airbnb-0022', 'question': 'What is the average price of airbnbs in Upper East Side?', 'answer': '$ nan'}, {'qid': 'hard-airbnb-0023', 'question': 'What is the average price of airbnbs in Financial District?', 'answer': '$ nan'}, {'qid': 'hard-airbnb-0024', 'question': 'What is the average price of airbnbs in Grant City?', 'answer': '$ 558.2'}, {'qid': 'hard-airbnb-0025', 'question': 'What is the average price of airbnbs in East Morrisania?', 'answer': '$ 764.7'}, {'qid': 'hard-airbnb-0026', 'question': 'What is the average price of airbnbs in Howland Hook?', 'answer': '$ 657.3'}, {'qid': 'hard-airbnb-0027', 'question': 'What is the average price of airbnbs in Glen Oaks?', 'answer': '$ 550.5'}, {'qid': 'hard-airbnb-0028', 'question': 'What is th

In [16]:
# question template 14
questions, question_id = question14_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 14: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0030', 'question': 'What is the average review rates within 5 miles from Spacious 1 BR in Midtown East in Midtown?', 'answer': 3.27}, {'qid': 'hard-yelp-0031', 'question': 'What is the average review rates within 5 miles from 10 mins/Airports JFK/LGA/Hosp/malls bus/train#4 in Richmond Hill?', 'answer': 3.39}, {'qid': 'hard-yelp-0032', 'question': 'What is the average review rates within 5 miles from Williamsburg Home with a View in Williamsburg?', 'answer': 3.28}, {'qid': 'hard-yelp-0033', 'question': 'What is the average review rates within 5 miles from Spacious Midtown East 2 Bedroom in Midtown?', 'answer': 3.27}, {'qid': 'hard-yelp-0034', 'question': 'What is the average review rates within 5 miles from Slick Prime Lower East East ~BRAND NEW 1br Apt~ in Lower East Side?', 'answer': 3.27}, {'qid': 'hard-yelp-0035', 'question': 'What is the average review rates within 5 miles from Heart of Park Slope in Park Slope?', 'answer': 3.26}, {'qid': 'hard-yelp-0036', 'ques

In [17]:
# question template 15
questions, question_id = question15_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 15: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0040', 'question': 'How much proporion of airbnbs in West Brighton have a flexible cancellation policy?', 'answer': '0.31'}, {'qid': 'hard-airbnb-0041', 'question': 'How much proporion of airbnbs in Fordham have a flexible cancellation policy?', 'answer': '0.31'}, {'qid': 'hard-airbnb-0042', 'question': 'How much proporion of airbnbs in Ditmars Steinway have a flexible cancellation policy?', 'answer': '0.3'}, {'qid': 'hard-airbnb-0043', 'question': 'How much proporion of airbnbs in Co-op City have a flexible cancellation policy?', 'answer': '0.2'}, {'qid': 'hard-airbnb-0044', 'question': 'How much proporion of airbnbs in SoHo have a flexible cancellation policy?', 'answer': '0.34'}, {'qid': 'hard-airbnb-0045', 'question': 'How much proporion of airbnbs in Wakefield have a flexible cancellation policy?', 'answer': '0.4'}, {'qid': 'hard-airbnb-0046', 'question': 'How much proporion of airbnbs in East Elmhurst have a flexible cancellation policy?', 'answer': '0.33'},

In [18]:
# question template 16
questions, question_id = question16_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 16: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0050', 'question': 'How much does it cost per night to stay at the most expensive entire home/apt in Eastchester?', 'answer': '$961 '}, {'qid': 'hard-airbnb-0051', 'question': 'How much does it cost per night to stay at the most expensive entire home/apt in Throgs Neck?', 'answer': '$984 '}, {'qid': 'hard-airbnb-0052', 'question': 'How much does it cost per night to stay at the most expensive entire home/apt in Kingsbridge?', 'answer': '$978 '}, {'qid': 'hard-airbnb-0053', 'question': 'How much does it cost per night to stay at the most expensive entire home/apt in Emerson Hill?', 'answer': '$317 '}, {'qid': 'hard-airbnb-0054', 'question': 'How much does it cost per night to stay at the most expensive entire home/apt in Williamsbridge?', 'answer': '$990 '}, {'qid': 'hard-airbnb-0055', 'question': 'How much does it cost per night to stay at the most expensive entire home/apt in Edgemere?', 'answer': '$964 '}, {'qid': 'hard-airbnb-0056', 'question': 'How much does i

In [19]:
# question template 17
questions, question_id = question17_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 17: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0060', 'question': 'How many airbnbs are there in Little Italy that have a review rate higher than 4?', 'answer': '119'}, {'qid': 'hard-airbnb-0061', 'question': 'How many airbnbs are there in Springfield Gardens that have a review rate higher than 4?', 'answer': '115'}, {'qid': 'hard-airbnb-0062', 'question': 'How many airbnbs are there in Eltingville that have a review rate higher than 4?', 'answer': '6'}, {'qid': 'hard-airbnb-0063', 'question': 'How many airbnbs are there in Belmont that have a review rate higher than 4?', 'answer': '21'}, {'qid': 'hard-airbnb-0064', 'question': 'How many airbnbs are there in Concord that have a review rate higher than 4?', 'answer': '28'}, {'qid': 'hard-airbnb-0065', 'question': 'How many airbnbs are there in Douglaston that have a review rate higher than 4?', 'answer': '10'}, {'qid': 'hard-airbnb-0066', 'question': 'How many airbnbs are there in Prospect-Lefferts Gardens that have a review rate higher than 4?', 'answer': '550

In [20]:
# question template 18
questions, question_id = question18_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 18: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0070', 'question': 'Can you recommend me a hotel room with the lowest price in College Point?', 'answer': 'Doris’s Fresh House'}, {'qid': 'hard-airbnb-0071', 'question': 'Can you recommend me a hotel room with the lowest price in SoHo?', 'answer': 'Greenwich village 1 bedroom - spacious!'}, {'qid': 'hard-airbnb-0072', 'question': 'Can you recommend me a hotel room with the lowest price in Eltingville?', 'answer': '“No Place Like Home”\n1st Floor Suburban Apt.'}, {'qid': 'hard-airbnb-0073', 'question': 'Can you recommend me a hotel room with the lowest price in Holliswood?', 'answer': 'ROOM EN NEW YORK, MANHATTAN'}, {'qid': 'hard-airbnb-0074', 'question': 'Can you recommend me a hotel room with the lowest price in Concourse?', 'answer': 'One Bedroom Apartment  in TownHouse'}, {'qid': 'hard-airbnb-0075', 'question': 'Can you recommend me a hotel room with the lowest price in Silver Lake?', 'answer': 'New Apartment, Close to Ferry'}, {'qid': 'hard-airbnb-0076', 'ques

In [21]:
# question template 19
questions, question_id = question19_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 19: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0080', 'question': 'Can you recommend me a private room with the highest review rate that can host at least 2 people in College Point?', 'answer': 'Beautiful and Clean Private Bedroom with Bathroom'}, {'qid': 'hard-airbnb-0081', 'question': 'Can you recommend me a private room with the highest review rate that can host at least 2 people in St. George?', 'answer': '@FERRY, Private Cozy Room, Renovated&Stylish.'}, {'qid': 'hard-airbnb-0082', 'question': 'Can you recommend me a private room with the highest review rate that can host at least 2 people in Flatiron District?', 'answer': 'Sonder Flatiron | Spacious King Room+'}, {'qid': 'hard-airbnb-0083', 'question': 'Can you recommend me a private room with the highest review rate that can host at least 2 people in Co-op City?', 'answer': 'A unique apartment,  clean, quiet, and peaceful !'}, {'qid': 'hard-airbnb-0084', 'question': 'Can you recommend me a private room with the highest review rate that can host at least 

In [22]:
# question template 20
questions, question_id = question20_gen(data, num_questions_per_template, questions, question_id)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 20: {} seconds".format(end_time - start_time))

[{'qid': 'hard-airbnb-0090', 'question': 'Can you recommend a shared room with the lowest price within 10 miles from -74.06269614778004 longitude and 40.5813729578663 latitude?', 'answer': 'Premium Spot for female. Close to subway & Park'}, {'qid': 'hard-airbnb-0091', 'question': 'Can you recommend a shared room with the lowest price within 10 miles from -73.79998226317166 longitude and 40.581286157201006 latitude?', 'answer': 'Quiet & Elegant, 30 mins to midtown Manhattan!'}, {'qid': 'hard-airbnb-0092', 'question': 'Can you recommend a shared room with the lowest price within 10 miles from -73.70247078883878 longitude and 40.58720602526452 latitude?', 'answer': 'Home Away From Home'}, {'qid': 'hard-airbnb-0093', 'question': 'Can you recommend a shared room with the lowest price within 10 miles from -73.83109495286112 longitude and 40.835727665445795 latitude?', 'answer': 'Brand New Renovated Shared Room In  Manhattan'}, {'qid': 'hard-airbnb-0094', 'question': 'Can you recommend a shar

In [23]:
with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/airbnb-hard.jsonl', mode='w') as writer:
    for row in questions:
        writer.write(row)