In [1]:
import os
import re

import langchain
from dotenv import load_dotenv
from googlesearch import search
from pydantic import BaseModel

In [2]:
# Load settings via python-dotenv so that later cells can read SECRET_KEY and
# OPENAI_API_KEY through os.environ.
load_dotenv()

True

In [3]:
# Reddit application (OAuth client) identifier passed to praw.Reddit below.
# It can be overridden through the REDDIT_CLIENT_ID environment variable so the
# notebook does not have to be edited to run against a different Reddit app;
# the literal is kept only as a backward-compatible default.
CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "eL0dPt3KXcqqRG5Ka79cZQ")

In [4]:
# Reddit client secret, taken from the SECRET_KEY environment variable.
# The value is None when the variable is not set.
secret = os.getenv("SECRET_KEY")

In [5]:
# Search Google for Reddit threads about NYC restaurant recommendations and
# collect whatever `search` yields into a list of URLs.
search_query = "new york city restaurant recommendations site:reddit.com"
urls = list(search(search_query, num_results=1))

In [6]:
# Inspect the thread URLs returned by the search.
urls

['https://www.reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/',
 'https://www.reddit.com/r/nyc/comments/13pltp/best_of_new_york_restaurants/',
 'https://www.reddit.com/r/FoodNYC/comments/yvpr48/top_5_must_eat_food_for_people_visiting_nyc/']

In [7]:
import praw

In [8]:
# Reddit API client used to fetch submissions and their comments.
#
# The account username and password are read from the environment
# (REDDIT_USERNAME / REDDIT_PASSWORD, e.g. via the .env file) instead of being
# written into the notebook. Any password that was previously committed in this
# cell should be treated as leaked and rotated.
#
# NOTE(review): when the two variables are unset, None is passed through;
# PRAW is expected to fall back to read-only access in that case, which should
# be enough for reading comments -- confirm against the PRAW version in use.
reddit = praw.Reddit(
    client_id=CLIENT_ID,
    client_secret=secret,
    username=os.environ.get("REDDIT_USERNAME"),
    password=os.environ.get("REDDIT_PASSWORD"),
    user_agent='APIMagic/0.0.1',
)
                    

In [9]:
# Collect up to five top-level comments (sorted by "top") from every thread.
#
# The three lists are kept index-aligned: posts[i], comment_urls[i] and
# comment_upvotes[i] always describe the same comment. Later cells rely on
# this when they map a "List N" number back to its comment.
posts = []

comment_urls = []
comment_upvotes = []

for url in urls:
    submission = reddit.submission(url=url)
    # Must be set before the comments are first accessed.
    submission.comment_sort = 'top'
    # Remove "load more comments" placeholders; they are not real comments and
    # have neither a body nor a permalink.
    submission.comments.replace_more(limit=0)

    comments = 0
    # Top 5 comments in this post by upvotes
    for top_level_comment in submission.comments:
        if comments == 5:
            break
        # Skip comments whose text is no longer available.
        if top_level_comment.body in ("[deleted]", "[removed]"):
            continue

        # Record the text together with its permalink and upvote count, and
        # only for comments that are actually kept, so the lists stay aligned.
        posts.append(top_level_comment.body)
        comment_urls.append("https://reddit.com" + top_level_comment.permalink)
        comment_upvotes.append(top_level_comment.ups)
        comments += 1

    # posts.append(submission.selftext)

print(posts, len(posts))


['I do appreciate whomever added Olive Garden to the list. I assume they mean the one in Times Square for maximum authenticity.', 'On a technical side of things: anyone else getting only the first \'page\' loading and you have to keep scrolling through each \'page\' to get all the pins to load?  Very screwy.\n\nThoughts/suggestions:\n\nFor someone working around Washington Sq Pk, you\'ve got not much there, so let\'s do that.\n\n* Saigon Shack - If you have good pho where you\'re from, skip, but I happen to love their oxtail and their lemon grass chicken pho.\n* Spicy Moon - Fucking fantastic vegan Szechuan\n* Pommes Frites - One of my favorite places in the world.  get fries, some sauces and go sit in the park.  War Sauce was my staple in HS, but I\'ve switched to Curry Ketchup Especial, and Ziggy\'s is fucking amazing, but honestly, they\'re all good and varied, ask for samples.  Much more fun w/ friends, b/c there\'s more sauces.\n* The Kati Roll Company - You have the Bryant Park l

In [10]:
from langchain import PromptTemplate

In [11]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

In [12]:
# Pydantic model with a single string field.
# NOTE(review): no other cell visible here references this class -- confirm
# whether it is still needed.
class Recommendation(BaseModel):
    body: str  # presumably the text of one recommendation/comment -- confirm

In [13]:
from langchain.output_parsers import CommaSeparatedListOutputParser

# Parser used below to turn the model's comma separated reply into a list.
output_parser = CommaSeparatedListOutputParser()

# Instruction text from the parser; it is injected into the extraction prompt
# as the `format_instructions` partial variable.
format_instructions = output_parser.get_format_instructions()

In [14]:
# Show the instruction text that gets appended to the prompt.
format_instructions

'Your response should be a list of comma separated values, eg: `foo, bar, baz`'

In [15]:
# Prompt that asks the model to pull business/attraction names out of one
# Reddit comment. `body` is filled in per comment; `format_instructions` is
# pre-filled from the output parser's instructions.
# NOTE(review): "recommendated" is a typo for "recommended"; the text is left
# unchanged here so the model input stays exactly as before.
extraction_template = """I would like you to extract the recommendated businesses and attractions from this Reddit comment. Do not include the address or physical locations, only the business names. Do not create extra locations that are not in the comment, only extract locations that are stated in the comment itself.

{body}

{format_instructions}
"""

prompt = PromptTemplate(
    input_variables=["body"],
    partial_variables={"format_instructions": format_instructions},
    template=extraction_template,
)

In [16]:
# Completion model with temperature 0; the API key is read from the
# OPENAI_API_KEY environment variable.
model = OpenAI(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0,
)

In [17]:
# Render the extraction prompt for the first collected comment and display it
# as a sanity check before calling the model.
inp = prompt.format(body=posts[0])
inp

'I would like you to extract the recommendated businesses and attractions from this Reddit comment. Do not include the address or physical locations, only the business names. Do not create extra locations that are not in the comment, only extract locations that are stated in the comment itself.\n\nI do appreciate whomever added Olive Garden to the list. I assume they mean the one in Times Square for maximum authenticity.\n\nYour response should be a list of comma separated values, eg: `foo, bar, baz`\n'

In [18]:
# Send the rendered prompt to the model and keep its raw reply.
output = model(inp)

In [19]:
# Parse the raw reply with the comma separated list parser and display it.
output_parser.parse(output)

['Olive Garden']

In [20]:
# Run the extraction prompt over every collected comment. outputs[i] is the
# parsed list of names the model returned for posts[i].
outputs = [
    output_parser.parse(model(prompt.format(body=comment_text)))
    for comment_text in posts
]

In [21]:
# Flatten the per-comment name lists into one text block, one numbered line per
# comment ("List 1: ...", "List 2: ..."). Numbering is 1-based and follows the
# order of `outputs`; newlines inside a line are removed.
outputs_formatted = '\n'.join(
    f"List {list_no}: " + ', '.join(names).replace('\n', '')
    for list_no, names in enumerate(outputs, start=1)
)
outputs_formatted


"List 1: Olive Garden\nList 2: Saigon Shack, Spicy Moon, Pommes Frites, The Kati Roll Company, The Dosa Man, Third Rail, Meskerem, Jajaja, The Donut Project, Jeju Noodle Bar, Hide Chan, Nakamura, TabeTomo, Joe's, 100 Mott St, New Green Bo, The Bao, Memories of Shanghai, Kajitsu, Grace St, Katsu-Hama, Peter Lugar, Keens, Playground, Khao Kang\nList 3: Lucali, Zooba, Veselka, Haile, Thursday Kitchen, 4 Charles, Suki, Raku, Salma, Forma, Taim, Cote, Misi, Lilia, Bernie's, 4 Horsemen, Haidilao Hotpot, Claro, Oxomoco, Le Crocodile, Jing Fong, Golden Unicorn, East Harbor Seafood Palace, Carbone, Kopitiam, Dhamaka, Semma, Magnolia Bakery, Levain, B&H Dairy, Golda, Maya Congee, Astoria Seafood, Queen's Night Market, Wayla, Casa Mono, Paulie Gee's, Atomix, 63 Clinton, Oxalis, Xixa, Torien, Blue Hill at Stone Barns, Contra, Atera, Aska, Fulgarances\nList 4: Jackson Heights, Birria Landia, Samudra, Angel Indian, King of Falafel, Taverna Kyclades, Sal Kris & Charlies, Harlem, Washington Heights, P

In [22]:
# Prompt that asks the model to pick at most 20 locations from the numbered
# lists and to answer as "(location, list number)" pairs.
ranking_template = """Please find the best 20 locations to visit in New York City, based on the locations given in these lists. Do not return more than 20 locations in the list. The lists are:

{body}

Your response should be a list of comma separated values of the location and the list number paired together using parentheses, eg: `(Bakery, 1), (Bar, 7), (Park, 5)`
"""

prompt_2 = PromptTemplate(
    input_variables=["body"],
    template=ranking_template,
)

In [23]:
# Render the ranking prompt with the numbered lists and display it.
inp2 = prompt_2.format(body=outputs_formatted)
inp2

"I to find the best 20 locations to visit in New York City, based on the locations given in these lists. Do not return more than 20 locations in the list. The lists are:\n\nList 1: Olive Garden\nList 2: Saigon Shack, Spicy Moon, Pommes Frites, The Kati Roll Company, The Dosa Man, Third Rail, Meskerem, Jajaja, The Donut Project, Jeju Noodle Bar, Hide Chan, Nakamura, TabeTomo, Joe's, 100 Mott St, New Green Bo, The Bao, Memories of Shanghai, Kajitsu, Grace St, Katsu-Hama, Peter Lugar, Keens, Playground, Khao Kang\nList 3: Lucali, Zooba, Veselka, Haile, Thursday Kitchen, 4 Charles, Suki, Raku, Salma, Forma, Taim, Cote, Misi, Lilia, Bernie's, 4 Horsemen, Haidilao Hotpot, Claro, Oxomoco, Le Crocodile, Jing Fong, Golden Unicorn, East Harbor Seafood Palace, Carbone, Kopitiam, Dhamaka, Semma, Magnolia Bakery, Levain, B&H Dairy, Golda, Maya Congee, Astoria Seafood, Queen's Night Market, Wayla, Casa Mono, Paulie Gee's, Atomix, 63 Clinton, Oxalis, Xixa, Torien, Blue Hill at Stone Barns, Contra, At

In [24]:
# Ask the model for the ranked "(location, list number)" pairs.
output2 = model(inp2)
# NOTE(review): the next cell rebuilds `final` from `output2`, so this value is
# overwritten. The comma separated parser also splits each "(name, n)" pair in
# two, so this result is not in the shape the later cells use.
final = output_parser.parse(output2)

In [25]:
def _parse_location_pairs(text, limit=20):
    """Extract ``(location, list_number)`` pairs from the model's reply.

    Args:
        text: Raw model output, expected to look like
            ``(Bakery, 1), (Bar, 7), (Park, 5)``.
        limit: Maximum number of pairs to return.

    Returns:
        A list of ``(name, list_number)`` tuples in order of appearance, with
        ``name`` stripped of surrounding whitespace and ``list_number`` as an
        ``int``. Fragments that are not a complete ``(name, number)`` pair
        (for example a reply cut off mid-pair) are ignored instead of raising.
    """
    # [^()] keeps each match inside one pair of parentheses. Only the last
    # comma separated field has to be the number, so names that themselves
    # contain commas stay intact. An optional "List" prefix before the number
    # is tolerated.
    pair_pattern = re.compile(
        r'\(\s*([^()]+?)\s*,\s*(?:list\s*)?(\d+)\s*\)',
        re.IGNORECASE,
    )
    return [
        (name, int(number))
        for name, number in pair_pattern.findall(text)[:limit]
    ]


# Parse the final outputs into a list of (location, list number) tuples.
final = _parse_location_pairs(output2)
final

[('Olive Garden', 1),
 ('Saigon Shack', 2),
 ('Spicy Moon', 2),
 ('Pommes Frites', 2),
 ('The Kati Roll Company', 2),
 ('The Dosa Man', 2),
 ('Third Rail', 2),
 ('Meskerem', 2),
 ('Jajaja', 2),
 ('The Donut Project', 2),
 ('Jeju Noodle Bar', 2),
 ('Hide Chan', 2),
 ('Nakamura', 2),
 ('TabeTomo', 2),
 ("Joe's", 2),
 ('100 Mott St', 2),
 ('New Green Bo', 2),
 ('The Bao', 2),
 ('Memories of Shanghai', 2),
 ('Kajitsu', 2)]

In [26]:
# Create the final list of (location, upvotes, comment link). The list number
# in `final` is 1-based, so it is shifted by one to index the comment lists.
#
# List numbers come from model output, so pairs whose number does not refer to
# a collected comment are skipped: 0 would silently wrap around to the last
# comment (index -1) and a number that is too large would raise IndexError.
n_comments = min(len(comment_upvotes), len(comment_urls))
final_return = [
    (name, comment_upvotes[list_no - 1], comment_urls[list_no - 1])
    for name, list_no in final
    if 1 <= list_no <= n_comments
]
# Display the locations with the most-upvoted source comment first; ties keep
# their original order.
sorted(final_return, key=lambda row: row[1], reverse=True)

[('Olive Garden',
  43,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i66zygl/'),
 ('Saigon Shack',
  27,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i677hz5/'),
 ('Spicy Moon',
  27,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i677hz5/'),
 ('Pommes Frites',
  27,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i677hz5/'),
 ('The Kati Roll Company',
  27,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i677hz5/'),
 ('The Dosa Man',
  27,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i677hz5/'),
 ('Third Rail',
  27,
  'https://reddit.com/r/FoodNYC/comments/ubxf1s/looking_for_input_on_my_nyc_staple_restaurant_list/i677hz5/'),
 ('Meskerem',
  27,
  'https://reddit.com/r/FoodN