In [2]:
import pandas as pd
import numpy as np
import csv
import sys

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma 
from langchain.document_loaders.csv_loader import CSVLoader

In [2]:
df = pd.read_csv('/Users/mrunmayeerane/Desktop/progress/Flavors/Merged_sorted_data.csv')
df.shape

(24716, 18)

In [3]:
%%time
# creates a hashmap to aggregate multiple reviews associated with each business.
business_data = {}
for _, row in df.iterrows():
    business_id = row['business_id']
    
    if business_id not in business_data:
       
        business_data[business_id] = {
            'name': row['name'],
            'address': row['address'],
            'city': row['city'],
            'state': row['state'],
            'postal_code': row['postal_code'],
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'stars_x': [row['stars_x']] if pd.notnull(row['stars_y']) else 0, 
            'review_count': row['review_count'],
            'is_open': row['is_open'],
            'attributes': row['attributes'],  
            'categories': row['categories'],  
            'hours': row['hours'],  
            'user_reviews': [row['text']], 
            'stars_y': [row['stars_y']] if pd.notnull(row['stars_y']) else 0
        }
    else:
        if pd.notnull(row['stars_x']):
            business_data[business_id]['stars_x'].append(row['stars_x'])
        
        if pd.notnull(row['stars_y']):
            business_data[business_id]['stars_y'].append(row['stars_y'])
       
        business_data[business_id]['user_reviews'].append((row['text']))     

CPU times: user 939 ms, sys: 12.1 ms, total: 951 ms
Wall time: 953 ms


In [4]:
# creates a dataframe from hashmap for mapping.
aggregated_df = pd.DataFrame.from_dict(business_data, orient='index')

In [None]:
# calculates the mean for business ratings
def calculate_mean(stars_list):
    if stars_list: 
        return round(sum(stars_list) / len(stars_list), 2)
    else:
        return None 
aggregated_df['stars_x'] = aggregated_df['stars_x'].apply(calculate_mean)
aggregated_df['stars_y'] = aggregated_df['stars_y'].apply(calculate_mean)

In [None]:
# saving csv file
aggregated_df.to_csv('aggregated_data.csv', index=False)

In [None]:
# loading csv file and splitting it in to chunks
csv.field_size_limit(sys.maxsize)

loader =  CSVLoader(file_path = "/Users/mrunmayeerane/Desktop/progress/Flavors/aggregated_data.csv")
data = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(data)

In [4]:
# Text embeddings using sentence Transformer for CSV file
# !pip install sentence-transformers
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

model_args = {'device':'cpu'}

encode_args = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     
    model_kwargs=model_args, 
    encode_kwargs=encode_args)

In [None]:
%%time
db = Chroma.from_documents(docs, embeddings)

In [6]:
query = "I want to have a vegan food around santa Barbara"

In [None]:
retriever = db.as_retriever(search_type="mmr")
results = retriever.get_relevant_documents(query, num_results=7)
results

In [None]:
%%time
db2 = Chroma.from_documents(docs, embeddings, persist_directory="./chroma_db_1")
db2.persist()

In [14]:
# !pip install chromadb
db3 = Chroma(embedding_function= embeddings,persist_directory="./chroma_db_1")
result = db3.similarity_search_with_score(query, 5)[::-1]

In [22]:
docs = db3.similarity_search(query,5)[::-1]
docs

[Document(page_content='name: Farmer Boy\naddress: 3427 State St\ncity: Santa Barbara\nstate: CA\npostal_code: 93105.0\nlatitude: 34.4400429\nlongitude: -119.7373433\nstars_x: 4.0\nreview_count: 255\nis_open: 1', metadata={'row': 416, 'source': '/Users/mrunmayeerane/Desktop/progress/Flavors/aggregated_data.csv'}),
 Document(page_content="name: C'est Cheese\naddress: 827 Santa Barbara St\ncity: Santa Barbara\nstate: CA\npostal_code: 93103.0\nlatitude: 34.422317\nlongitude: -119.698081\nstars_x: \nreview_count: 272\nis_open: 0", metadata={'row': 946, 'source': '/Users/mrunmayeerane/Desktop/progress/Flavors/aggregated_data.csv'}),
 Document(page_content='name: Santa Barbara Chicken Ranch\naddress: 149 N Fairview Ave\ncity: Goleta\nstate: CA\npostal_code: 93117.0\nlatitude: 34.441799\nlongitude: -119.832712\nstars_x: \nreview_count: 127\nis_open: 1', metadata={'row': 545, 'source': '/Users/mrunmayeerane/Desktop/progress/Flavors/aggregated_data.csv'}),
 Document(page_content='name: Santa Ba

In [24]:
df_ret = pd.read_csv('/Users/mrunmayeerane/Desktop/progress/Flavors/aggregated_data.csv')

In [25]:
from langchain import PromptTemplate

# Define the PromptTemplate
prompt_template = PromptTemplate(
    template="""Business Review:\n"""
              """Name: {Name}\n"""
              """Address: {Address}, {City}, {State}, {PostalCode}\n"""
              """Hours: {Hours}\n"""
              """Rating: {Stars} stars\n""",
    input_variables=["Name", "Address", "City", "State", "PostalCode", "Hours", "Stars"]
)

combined_reviews = ""

# Iterate through your documents
for i in range(len(docs)):
    row_value = docs[i].metadata.get('row', None)

    if row_value is not None:
        # Extracting data from the DataFrame
        data = {
            "Name": df_ret.iloc[row_value]['name'],
            "Address": df_ret.iloc[row_value]['address'],
            "City": df_ret.iloc[row_value]['city'],
            "State": df_ret.iloc[row_value]['state'],
            "PostalCode": df_ret.iloc[row_value]['postal_code'],
            "Hours": df_ret.iloc[row_value]['hours'],
            "Stars": df_ret.iloc[row_value]['stars_y']
        }

        # Format the prompt and append to combined_reviews
        combined_reviews += prompt_template.format(**data) + "\n"

# Append the instruction at the end of the combined reviews
final_prompt = combined_reviews + "You are a smart recommender system, Please provide a recommendation based on this business information.\nRecommend places from suggested additional context only and from file aggregated_data.csv \nDo not suggest places on your own\n Do not mention aggregated_data.csv file in your response and your response must suggest all Business Reviews included in prompt"

print(final_prompt)


Business Review:
Name: Farmer Boy
Address: 3427 State St, Santa Barbara, CA, 93105.0
Hours: {'Monday': '0:0-0:0', 'Tuesday': '7:0-14:0', 'Wednesday': '7:0-14:0', 'Thursday': '7:0-14:0', 'Friday': '7:0-14:0', 'Saturday': '7:0-14:0', 'Sunday': '7:0-14:0'}
Rating: 3.96 stars

Business Review:
Name: C'est Cheese
Address: 827 Santa Barbara St, Santa Barbara, CA, 93103.0
Hours: {'Monday': '0:0-0:0', 'Saturday': '1:0-1:30'}
Rating: nan stars

Business Review:
Name: Santa Barbara Chicken Ranch
Address: 149 N Fairview Ave, Goleta, CA, 93117.0
Hours: {'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0', 'Wednesday': '11:0-21:0', 'Thursday': '11:0-21:0', 'Friday': '11:0-21:0', 'Saturday': '11:0-21:0', 'Sunday': '11:0-21:0'}
Rating: nan stars

Business Review:
Name: Santa Barbara Chicken Ranch
Address: 2618 De La Vina St, Santa Barbara, CA, 93105.0
Hours: {'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0', 'Wednesday': '11:0-22:0', 'Thursday': '11:0-22:0', 'Friday': '11:0-22:0', 'Saturday': '11:0-22:0', 'Su

In [26]:
import requests
def nvidia_api_call(query, api_key, invoke_url, fetch_url_format):
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Accept": "application/json",
    }

    payload = {
        "messages": [
            {
                "content": query,
                "role": "user"
            }
        ],
        "temperature": 0.2,
        "top_p": 0.7,
        "max_tokens": 1024,
        "stream": False
    }

    session = requests.Session()
    response = session.post(invoke_url, headers=headers, json=payload)

    while response.status_code == 202:
        request_id = response.headers.get("NVCF-REQID")
        fetch_url = fetch_url_format + request_id
        response = session.get(fetch_url, headers=headers)

    response.raise_for_status()
    response_body = response.json()
    return response_body['choices'][0]['message']['content']

In [27]:
api_key = "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"
invoke_url = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/0e349b44-440a-44e1-93e9-abe8dcb27158"
fetch_url_format = "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/"

In [28]:
recommendations = nvidia_api_call(final_prompt, api_key, invoke_url, fetch_url_format)
print(recommendations)

Based on the provided business information, I recommend the following places:

1. Farmer Boy - Located at 3427 State St, Santa Barbara, CA, 93105, this restaurant has a rating of 3.96 stars and is open from 7:00 AM to 14:00 PM from Monday to Friday, and 7:00 AM to 14:00 PM on Saturday and Sunday.
2. Santa Barbara Chicken Ranch - Located at 149 N Fairview Ave, Goleta, CA, 93117, this restaurant has a rating of nan stars and is open from 11:00 AM to 21:00 PM from Monday to Sunday.
3. Santa Barbara Chicken Ranch - Located at 2618 De La Vina St, Santa Barbara, CA, 93105, this restaurant has a rating of nan stars and is open from 11:00 AM to 22:00 PM from Monday to Sunday.
4. Vegan GreenGO - Located at 3613 State St, Santa Barbara, CA, 93105, this restaurant has a rating of nan stars and is open from 11:00 AM to 20:00 PM from Monday to Friday, and 12:00 PM to 20:00 PM on Saturday and Sunday.

I recommend these places based on their ratings and hours of operation, which suggest that they are