In [2]:
import json
import pandas as pd
import random
import time
import jsonlines
import geopy
import geopy.distance

In [3]:
def read_data(file_path):
    data_file = open(file_path)
    data = []
    for line in data_file:
        data.append(json.loads(line))
    data = pd.DataFrame(data)
    data_file.close()
    return data

In [4]:
def question11_gen(data, num_questions_per_template, questions, question_id, categories):
    # How many {category} businesses are there in {city}, {state}?
    city_list = data["city"].unique()
    target = question_id + num_questions_per_template
    while question_id < target:
        city = random.choice(city_list)
        cat = random.choice(categories)
        sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]
        if len(sub_data) != 0:
            question = "How many {} businesses are there in {}, {}?".format(cat, city, data[data["city"] == city]["state"].iloc[0])
            answer = len(sub_data)
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [5]:
def question12_gen(data, num_questions_per_template, questions, question_id, categories):
    # How many bussinesses are there in {postal_code} area of {city}, {state}?
    postalcode_list = data["postal_code"].unique()
    target = question_id + num_questions_per_template
    while question_id < target:
        postalcode = random.choice(postalcode_list)
        sub_data = data[data["postal_code"] == postalcode]
        if len(sub_data) != 0:
            question = "How many businesses are there in {} area of {}, {}?".format(postalcode, sub_data["city"].iloc[0], sub_data["state"].iloc[0])
            answer = len(sub_data)
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [6]:
def question13_gen(data, num_questions_per_template, questions, question_id, categories):
    # Which {category} business has the highest star rating in {city}, {state}?
    city_list = data["city"].unique()
    target = question_id + num_questions_per_template
    while question_id < target:
        city = random.choice(city_list)
        cat = random.choice(categories)
        sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]
        if len(sub_data) != 0:
            question = "Which {} business has the highest star rating in {}, {}?".format(cat, city, data[data["city"] == city]["state"].iloc[0])
            answer = sub_data[sub_data["stars"] == max(sub_data["stars"])]["name"].iloc[0]
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [7]:
def question14_gen(data, num_questions_per_template, questions, question_id, categories):
    # Which {category} business has the highest review count in {city}, {state}?
    city_list = data["city"].unique()
    target = question_id + num_questions_per_template
    while question_id < target:
        city = random.choice(city_list)
        cat = random.choice(categories)
        sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]
        if len(sub_data) != 0:
            question = "Which {} business has the highest review count in {}, {}?".format(cat, city, data[data["city"] == city]["state"].iloc[0])
            answer = sub_data[sub_data["review_count"] == max(sub_data["review_count"])]["name"].iloc[0]
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [8]:
def question15_gen(data, num_questions_per_template, questions, question_id, categories):
    # What is the average review counts of businesses within a 5-mile radius from {name}?
    target = question_id + num_questions_per_template
    while question_id < target:
        random_indices = random.randint(0, len(data) - 1)
        selected_data = data.iloc[random_indices]
        latitude = selected_data["latitude"]
        longitude = selected_data["longitude"]
        _, lo_max, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=90)
        _, lo_min, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=270)
        la_max, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=0)
        la_min, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=180)
        sub_data = data[(data["latitude"] <= la_max) & (data["latitude"] >= la_min) & (data["longitude"] <= lo_max) & (data["longitude"] >= lo_min)]
        if len(sub_data) != 0:
            question = "What is the average review counts of businesses within a 5-mile radius from {}?".format(selected_data["name"])
            answer = round(sub_data["review_count"].mean(), 2)
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [9]:
def question16_gen(data, num_questions_per_template, questions, question_id, categories):
    # Which is the nearest {category} business to {name}?
    target = question_id + num_questions_per_template
    while question_id < target:
        random_indices = random.randint(0, len(data) - 1)
        selected_data = data.iloc[random_indices]
        city = selected_data["city"]
        try:
            cat = random.choice(categories)
            sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]
            if len(sub_data) != 0:
                question = "Which is the nearest {} business to {}?".format(cat, selected_data["name"])
                answer = sub_data[(sub_data["latitude"] - selected_data["latitude"])**2 + (sub_data["longitude"] - selected_data["longitude"])**2 == min((sub_data["latitude"] - selected_data["latitude"])**2 + (sub_data["longitude"] - selected_data["longitude"])**2)]["name"].iloc[0]
                questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
                question_id += 1
        except:
            print(cat)
            input()
    return questions, question_id

In [10]:
def question17_gen(data, num_questions_per_template, questions, question_id, categories):
    # Can you recommend a {category} business with the highest star rating within a 5-mile radius of {address}?
    target = question_id + num_questions_per_template
    while question_id < target:
        random_indices = random.randint(0, len(data) - 1)
        selected_data = data.iloc[random_indices]
        address = selected_data["address"]
        latitude = selected_data["latitude"]
        longitude = selected_data["longitude"]
        city = selected_data["city"]
        cat = random.choice(categories)
        sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]
        _, lo_max, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=90)
        _, lo_min, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=270)
        la_max, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=0)
        la_min, _, _ = geopy.distance.distance(kilometers=5).destination(point=(latitude, longitude), bearing=180)
        sub_data = data[(data["latitude"] <= la_max) & (data["latitude"] >= la_min) & (data["longitude"] <= lo_max) & (data["longitude"] >= lo_min)]
        if len(sub_data) != 0:
            question = "Can you recommend a {} business with the highest star rating within a 5-mile radius of {}?".format(cat, address)
            answer = sub_data[sub_data["stars"] == max(sub_data["stars"])]["name"].iloc[0]
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [11]:
def question18_gen(data, num_questions_per_template, questions, question_id, categories):
    # How many businesses are not open currently in {city}?
    target = question_id + num_questions_per_template
    city_list = data["city"].unique()
    while question_id < target:
        city = random.choice(city_list)
        sub_data = data[data["city"] == city]
        if len(sub_data) != 0:
            question = "How many businesses are not open currently in {}?".format(city)
            answer = len(sub_data[sub_data["is_open"] == 0])
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [12]:
def question19_gen(data, num_questions_per_template, questions, question_id, categories):
    # What is the average star rating of {category} businesses in {city}?
    target = question_id + num_questions_per_template
    city_list = data["city"].unique()
    while question_id < target:
        city = random.choice(city_list)
        cat = random.choice(categories)
        sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]
        if len(sub_data) != 0:
            question = "What is the average star rating of {} businesses in {}?".format(cat, city)
            answer = round(sub_data["stars"].mean(), 2)
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [13]:
def question20_gen(data, num_questions_per_template, questions, question_id, categories):
    # Which region has most bussinesses in {city}, {state}?
    target = question_id + num_questions_per_template
    city_list = data["city"].unique()
    while question_id < target:
        city = random.choice(city_list)
        state = data[data["city"] == city]["state"].iloc[0]
        sub_data = data[(data["city"] == city) & (data["state"] == state)]
        if len(sub_data) != 0:
            question = "Which postal code region has most bussinesses in {}, {}?".format(city, state)
            answer = sub_data["postal_code"].value_counts().index[0]
            questions.append({"qid": "hard-yelp-{:0>4d}".format(question_id), "question": question, "answer": answer})
            question_id += 1
    return questions, question_id

In [14]:
start_time = time.time()
file_path = "/<YOUR_OWN_PATH>/ToolQA/data/raw_data/yelp/yelp_academic_dataset_business.json"
data = read_data(file_path)
num_questions_per_template = 10
question_id = 0
questions = []

In [17]:
categories_list = data["categories"].unique()
categories = []
for cat in categories_list:
    if cat != None:
        categories += cat.split(", ")
categories = sorted(list(set(categories)))

In [18]:
# question template 11
questions, question_id = question11_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 11: {} seconds".format(end_time - start_time))

  sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]


[{'qid': 'hard-yelp-0000', 'question': 'How many Home Organization businesses are there in Franklin, TN?', 'answer': 4}, {'qid': 'hard-yelp-0001', 'question': 'How many Hair Salons businesses are there in Trooper, PA?', 'answer': 1}, {'qid': 'hard-yelp-0002', 'question': 'How many Tours businesses are there in Wayne, PA?', 'answer': 1}, {'qid': 'hard-yelp-0003', 'question': 'How many Southern businesses are there in Glenolden, PA?', 'answer': 1}, {'qid': 'hard-yelp-0004', 'question': 'How many Brewpubs businesses are there in Lumberton, NJ?', 'answer': 1}, {'qid': 'hard-yelp-0005', 'question': 'How many Wedding Planning businesses are there in Cinnaminson, NJ?', 'answer': 1}, {'qid': 'hard-yelp-0006', 'question': 'How many Florists businesses are there in Lansdale, PA?', 'answer': 5}, {'qid': 'hard-yelp-0007', 'question': 'How many Wholesale Stores businesses are there in Boise, ID?', 'answer': 7}, {'qid': 'hard-yelp-0008', 'question': 'How many Discount Store businesses are there in S

In [19]:
# question template 12
questions, question_id = question12_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 12: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0010', 'question': 'How many businesses are there in 18917 area of Dublin, PA?', 'answer': 25}, {'qid': 'hard-yelp-0011', 'question': 'How many businesses are there in T5L 2L4 area of Edmonton, AB?', 'answer': 1}, {'qid': 'hard-yelp-0012', 'question': 'How many businesses are there in 19120 area of Philadelphia, PA?', 'answer': 161}, {'qid': 'hard-yelp-0013', 'question': 'How many businesses are there in T6H 1M2 area of Edmonton, AB?', 'answer': 1}, {'qid': 'hard-yelp-0014', 'question': 'How many businesses are there in T6V 1B1 area of Edmonton, AB?', 'answer': 5}, {'qid': 'hard-yelp-0015', 'question': 'How many businesses are there in T5N 1R1 area of Edmonton, AB?', 'answer': 3}, {'qid': 'hard-yelp-0016', 'question': 'How many businesses are there in T5Y 2W7 area of Edmonton, AB?', 'answer': 5}, {'qid': 'hard-yelp-0017', 'question': 'How many businesses are there in T8H 2A2 area of Sherwood Park, AB?', 'answer': 3}, {'qid': 'hard-yelp-0018', 'question': 'How many b

In [20]:
# question template 13
questions, question_id = question13_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 13: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0020', 'question': 'Which Dentists business has the highest star rating in Wenonah, NJ?', 'answer': 'Nester & Wyckoff General Dentistry'}, {'qid': 'hard-yelp-0021', 'question': 'Which Sandwiches business has the highest star rating in Garnet Valley, PA?', 'answer': 'Lancaster County Sausage'}, {'qid': 'hard-yelp-0022', 'question': 'Which Restaurants business has the highest star rating in Affton, MO?', 'answer': 'Sushi Hana'}, {'qid': 'hard-yelp-0023', 'question': 'Which Dry Cleaning business has the highest star rating in Sun City Center, FL?', 'answer': 'Amazing Cleaners'}, {'qid': 'hard-yelp-0024', 'question': 'Which French business has the highest star rating in Indian Rocks Beach, FL?', 'answer': 'Cafe de Paris Bakery'}, {'qid': 'hard-yelp-0025', 'question': 'Which ATV Rentals/Tours business has the highest star rating in Sparks, NV?', 'answer': 'SWARMFIRE'}, {'qid': 'hard-yelp-0026', 'question': 'Which Videos & Video Game Rental business has the highest star r

In [21]:
# question template 14
questions, question_id = question14_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 14: {} seconds".format(end_time - start_time))

  sub_data = data[(data["city"] == city) & (data["categories"].str.contains(cat))]


[{'qid': 'hard-yelp-0030', 'question': 'Which Condominiums business has the highest review count in Tampa, FL?', 'answer': 'The Place At Channelside'}, {'qid': 'hard-yelp-0031', 'question': 'Which Imported Food business has the highest review count in Santa Barbara, CA?', 'answer': 'El Sitio'}, {'qid': 'hard-yelp-0032', 'question': 'Which Windshield Installation & Repair business has the highest review count in Lutz, FL?', 'answer': 'Gerber Collision & Glass'}, {'qid': 'hard-yelp-0033', 'question': 'Which Diagnostic Imaging business has the highest review count in New Orleans, LA?', 'answer': 'All American Healthcare New Orleans'}, {'qid': 'hard-yelp-0034', 'question': 'Which Restaurants business has the highest review count in Fortville, IN?', 'answer': "Simeri's Italian"}, {'qid': 'hard-yelp-0035', 'question': 'Which Bridal business has the highest review count in Ardmore, PA?', 'answer': 'Bijou Bridal & Special Occasion Ardmore'}, {'qid': 'hard-yelp-0036', 'question': 'Which Pet Sto

In [22]:
# question template 15
questions, question_id = question15_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 15: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0040', 'question': 'What is the average review counts of businesses within a 5-mile radius from Uptown Delivery Pharmacy?', 'answer': 101.2}, {'qid': 'hard-yelp-0041', 'question': 'What is the average review counts of businesses within a 5-mile radius from Residence Inn Tampa Suncoast Parkway at NorthPointe Village?', 'answer': 27.21}, {'qid': 'hard-yelp-0042', 'question': "What is the average review counts of businesses within a 5-mile radius from Kohl's?", 'answer': 50.91}, {'qid': 'hard-yelp-0043', 'question': 'What is the average review counts of businesses within a 5-mile radius from Massage Heights?', 'answer': 16.17}, {'qid': 'hard-yelp-0044', 'question': 'What is the average review counts of businesses within a 5-mile radius from Sublime Yoga and Wellness?', 'answer': 56.52}, {'qid': 'hard-yelp-0045', 'question': 'What is the average review counts of businesses within a 5-mile radius from Duvall Flooring?', 'answer': 41.82}, {'qid': 'hard-yelp-0046', 'questi

In [23]:
# question template 16
questions, question_id = question16_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 16: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0050', 'question': 'Which is the nearest Chinese business to Sugar Mamas Custom Cakes?', 'answer': 'Hong Kong Restaurant'}, {'qid': 'hard-yelp-0051', 'question': 'Which is the nearest Cooking Classes business to 5-Star Staffing Solutions?', 'answer': 'Sur La Table'}, {'qid': 'hard-yelp-0052', 'question': 'Which is the nearest Orthopedists business to TNS Diamonds?', 'answer': "Benjamin's On the Row"}, {'qid': 'hard-yelp-0053', 'question': 'Which is the nearest Immigration Law business to MYNT Cannabis Dispensary Downtown Reno?', 'answer': 'Law Office Of Mark Mausert'}, {'qid': 'hard-yelp-0054', 'question': 'Which is the nearest Himalayan/Nepalese business to Roller City?', 'answer': 'Himalaya Kabob Korner'}, {'qid': 'hard-yelp-0055', 'question': 'Which is the nearest Weight Loss Centers business to Everingham Elecrtic?', 'answer': 'Fresh Vitality Medical Spa & Center for Health'}, {'qid': 'hard-yelp-0056', 'question': 'Which is the nearest Bangladeshi business to Ha

In [24]:
# question template 17
questions, question_id = question17_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 17: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0060', 'question': 'Can you recommend a Art Supplies business with the highest star rating within a 5-mile radius of 615 Channelside Dr?', 'answer': 'Thach Used Tires'}, {'qid': 'hard-yelp-0061', 'question': 'Can you recommend a Rock Climbing business with the highest star rating within a 5-mile radius of 2428 Nolensville Rd?', 'answer': 'barre3 Nashville - The Gulch'}, {'qid': 'hard-yelp-0062', 'question': 'Can you recommend a Salvadoran business with the highest star rating within a 5-mile radius of 540 Simpson Dr?', 'answer': 'Monkey Fish Toys'}, {'qid': 'hard-yelp-0063', 'question': 'Can you recommend a General Festivals business with the highest star rating within a 5-mile radius of 719 Toulouse St?', 'answer': 'Ann Becnel Companion Dogs'}, {'qid': 'hard-yelp-0064', 'question': 'Can you recommend a Pet Waste Removal business with the highest star rating within a 5-mile radius of 598 Sam Ridley Pkwy W?', 'answer': "WilkerSon's Heating & Cooling"}, {'qid': 'hard-

In [25]:
# question template 18
questions, question_id = question18_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 18: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0070', 'question': 'How many businesses are not open currently in Cottage Hills?', 'answer': 0}, {'qid': 'hard-yelp-0071', 'question': 'How many businesses are not open currently in Apollo beach?', 'answer': 0}, {'qid': 'hard-yelp-0072', 'question': 'How many businesses are not open currently in West Norriton ?', 'answer': 0}, {'qid': 'hard-yelp-0073', 'question': 'How many businesses are not open currently in Tarpon springs?', 'answer': 0}, {'qid': 'hard-yelp-0074', 'question': 'How many businesses are not open currently in Mt Laurel Township?', 'answer': 0}, {'qid': 'hard-yelp-0075', 'question': 'How many businesses are not open currently in Glen Carbon?', 'answer': 3}, {'qid': 'hard-yelp-0076', 'question': 'How many businesses are not open currently in Washington Township?', 'answer': 2}, {'qid': 'hard-yelp-0077', 'question': 'How many businesses are not open currently in McCordsville?', 'answer': 0}, {'qid': 'hard-yelp-0078', 'question': 'How many businesses are

In [26]:
# question template 19
questions, question_id = question19_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 19: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0080', 'question': 'What is the average star rating of Limos businesses in St. Ann?', 'answer': 5.0}, {'qid': 'hard-yelp-0081', 'question': 'What is the average star rating of Television Service Providers businesses in Saint Petersburg?', 'answer': 1.5}, {'qid': 'hard-yelp-0082', 'question': 'What is the average star rating of Body Shops businesses in Brentwood?', 'answer': 3.12}, {'qid': 'hard-yelp-0083', 'question': 'What is the average star rating of Asian Fusion businesses in Ladue?', 'answer': 3.0}, {'qid': 'hard-yelp-0084', 'question': 'What is the average star rating of Pet Groomers businesses in Hermitage?', 'answer': 2.83}, {'qid': 'hard-yelp-0085', 'question': 'What is the average star rating of Car Buyers businesses in Woodbury?', 'answer': 2.0}, {'qid': 'hard-yelp-0086', 'question': 'What is the average star rating of Local Services businesses in Mount Holly?', 'answer': 3.4}, {'qid': 'hard-yelp-0087', 'question': 'What is the average star rating of Arts

In [27]:
# question template 20
questions, question_id = question20_gen(data, num_questions_per_template, questions, question_id, categories)
end_time = time.time()
print(questions[-10:])
print("Time elapsed for Question 20: {} seconds".format(end_time - start_time))

[{'qid': 'hard-yelp-0090', 'question': 'Which region has most bussinesses in Port Richey, FL?', 'answer': '34668'}, {'qid': 'hard-yelp-0091', 'question': 'Which region has most bussinesses in Spring House, PA?', 'answer': '19477'}, {'qid': 'hard-yelp-0092', 'question': 'Which region has most bussinesses in Mission Canyon, CA?', 'answer': '93105'}, {'qid': 'hard-yelp-0093', 'question': 'Which region has most bussinesses in Goodlettsville, TN?', 'answer': '37072'}, {'qid': 'hard-yelp-0094', 'question': 'Which region has most bussinesses in Upper Gwynedd, PA?', 'answer': '19446'}, {'qid': 'hard-yelp-0095', 'question': 'Which region has most bussinesses in Webster Grvs, MO?', 'answer': '63119'}, {'qid': 'hard-yelp-0096', 'question': 'Which region has most bussinesses in Schwenksville, PA?', 'answer': '19473'}, {'qid': 'hard-yelp-0097', 'question': 'Which region has most bussinesses in Brentwood, MO?', 'answer': '63144'}, {'qid': 'hard-yelp-0098', 'question': 'Which region has most bussines

In [28]:
with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/hard/yelp-hard.jsonl', mode='w') as writer:
    for row in questions:
        writer.write(row)