In [1]:
import csv
import re
import json
import os
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

# Read the OpenAI API key from the environment rather than hardcoding it.
# A missing OPENAI_API_KEY variable raises KeyError here, before any API call.
openai_api_key = os.environ['OPENAI_API_KEY']




In [2]:
def load_document(filename):
    """Load a CSV file and return its contents split into chunks.

    Args:
        filename: Path of the file to load; it must end with ".csv".

    Returns:
        The result of running the loaded documents through a
        CharacterTextSplitter (newline separator, chunk size 1000,
        overlap 30).

    Raises:
        ValueError: If ``filename`` does not end with ".csv".
    """
    # Reject unsupported files up front so nothing is loaded for them.
    if not filename.endswith(".csv"):
        raise ValueError("Invalid file type")

    raw_documents = CSVLoader(filename).load()
    splitter = CharacterTextSplitter(separator="\n",
                                     chunk_size=1000,
                                     chunk_overlap=30)
    return splitter.split_documents(documents=raw_documents)

In [13]:
def query_data(query, retriever):
    """Ask the chat model which category id best matches ``query``.

    A "stuff" RetrievalQA chain is built around gpt-3.5-turbo (temperature 0)
    with a prompt that instructs the model to answer with a single category
    id only. The retriever supplies the ``{context}`` part of the prompt and
    ``query`` fills ``{question}``.

    Args:
        query: Text describing the category to classify.
        retriever: Retriever handed to the RetrievalQA chain.

    Returns:
        Whatever ``RetrievalQA.run`` returns for ``query`` (the raw model
        answer; callers post-process it).
    """
    prompt_text = """
    Use the list of categories to give me the best category for this category id and name. Also, use your rationally from your own knowledge base.
    In addition, also be sure to use any information you have about this business from any other sources.
    Answer only with the category id. Don't explain your rationale, only the category id. nothing else.
    Avoid general categories. The category id must have at least one dot.
    GIVE ME ONLY ONE ID, NOT THE NAME. THE BEST MATCH. THE ID IS THE NUMBER NEXT TO THE CATEGORY NAME
    {context}
    Category Id and Name: {question}
    """
    prompt = PromptTemplate(template=prompt_text,
                            input_variables=["context", "question"])
    chat_model = ChatOpenAI(openai_api_key=openai_api_key,
                            temperature=0,
                            model_name="gpt-3.5-turbo")
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={"prompt": prompt},
    )
    return qa_chain.run(query)

In [4]:
def get_base_category_dict(base_categories_path):
    """Read the categories CSV into an ``{id: name}`` dictionary.

    Args:
        base_categories_path: Path to a CSV file with "id" and "name" columns.

    Returns:
        dict mapping each row's "id" value to its "name" value. When an id
        appears more than once, the last row wins.
    """
    with open(base_categories_path, 'r', newline='') as categories_file:
        return {
            record["id"]: record["name"]
            for record in csv.DictReader(categories_file)
        }

def extract_numbers(s):
    """Return only the digit characters of ``s``, joined in their original order.

    The result is an empty string when ``s`` contains no digits.
    """
    digits = [ch for ch in s if str.isdigit(ch)]
    return ''.join(digits)

def find_single_uuid(text):
    """Return the UUID contained in ``text`` if there is exactly one.

    Args:
        text: String to scan for 8-4-4-4-12 hexadecimal UUIDs.

    Returns:
        The matched UUID string, or None when ``text`` holds zero or more
        than one UUID.
    """
    # Five hyphen-separated hex groups of 8, 4, 4, 4 and 12 characters.
    uuid_pattern = r'([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12})'
    found = re.findall(uuid_pattern, text)

    # Anything other than a single hit is treated as "no usable UUID".
    return found[0] if len(found) == 1 else None

In [5]:
def backup(row):
    """Append ``row`` to the global ``results`` list and persist it to backup.json.

    The whole list is serialised to a temporary sibling file first and then
    moved over backup.json with ``os.replace``. Writing in place would
    truncate the existing backup before the new content is complete, so an
    interruption mid-write could leave unparsable JSON and lose every
    previously saved match.

    Args:
        row: JSON-serialisable record to add to ``results``.
    """
    results.append(row)
    tmp_path = "backup.json.tmp"
    # The context manager guarantees the handle is closed (and flushed)
    # before the file is moved into place, even if serialisation fails.
    with open(tmp_path, "w") as backup_file:
        backup_file.write(json.dumps(results))
    os.replace(tmp_path, "backup.json")

In [6]:
def process_csv(base_categories_path):
    """Index the categories CSV in FAISS and return the index reloaded from disk.

    The documents produced by ``load_document`` are embedded with
    OpenAIEmbeddings, stored in a FAISS index, saved under the local
    "faiss_index_constitution" directory and then loaded back from there.

    Args:
        base_categories_path: Path to the categories CSV file.

    Returns:
        The FAISS vector store as returned by ``FAISS.load_local``.
    """
    category_docs = load_document(base_categories_path)
    embedder = OpenAIEmbeddings(openai_api_key=openai_api_key)

    index_dir = "faiss_index_constitution"
    FAISS.from_documents(category_docs, embedder).save_local(index_dir)
    return FAISS.load_local(index_dir, embedder)

In [7]:
# CSV listing the target categories; it is read twice below, once for the
# vector index and once for the plain id -> name lookup.
base_categories_path = "CATEGORIES-YEXT.csv"
# Embeds the CSV into a FAISS index (this calls the OpenAI embeddings API).
persisted_vectorstore = process_csv(base_categories_path)
# {id: name} mapping used to validate the model's answers and to label matches.
base_categories = get_base_category_dict(base_categories_path)

In [18]:
# Resume support: reload the matches saved by a previous run from backup.json,
# or start a fresh backup file when none exists or its content is not valid JSON.
try:
    with open("backup.json", "r") as backup_file:
        results = json.load(backup_file)
except (FileNotFoundError, ValueError):
    # ValueError covers json.JSONDecodeError (and undecodable bytes). Anything
    # else, e.g. KeyboardInterrupt or a permissions problem, is allowed to
    # surface instead of silently resetting the backup.
    with open("backup.json", "w") as backup_file:
        backup_file.write('[]')
    results = []
# Ids that already have a saved match, so they can be skipped on this run.
resultsIds = [r["id"] for r in results]
def process_csv_files(query_data_path):
    """Match every row of the query CSV to a base category and write output.csv.

    For each row whose "id" has no saved match yet, the model is asked (via
    ``query_data``) for a category id based on the row's "name". The answer is
    reduced to its digits with ``extract_numbers`` and accepted only if it is
    a key of ``base_categories``; otherwise the query is repeated, alternating
    a plain retry with a retry that tells the model what was wrong with its
    previous answer. Each accepted match is stored through ``backup`` (which
    appends it to the global ``results`` and rewrites backup.json) and
    printed. Finally all results, one per id, are written to output.csv.

    Args:
        query_data_path: Path to a CSV file with "id" and "name" columns.

    NOTE(review): the retry loop has no upper bound; a name the model can
    never classify keeps issuing API calls until interrupted.
    """
    # Build the retriever once instead of once per query.
    retriever = persisted_vectorstore.as_retriever()

    # Ids that already have a match. Kept up to date while processing so a
    # repeated id in the input is not sent to the model twice.
    processed_ids = set(resultsIds)
    processed_ids.update(r['id'] for r in results)

    with open(query_data_path, 'r', newline='') as query_csvfile:
        for row in csv.DictReader(query_csvfile):
            if row['id'] in processed_ids:
                continue

            result = extract_numbers(query_data(row['name'], retriever))
            while result not in base_categories:
                # Plain retry first.
                result = extract_numbers(query_data(row['name'], retriever))
                if result not in base_categories:
                    # extract_numbers returns '' (never None) when the answer
                    # holds no digits, so test for emptiness.
                    if not result:
                        prompt_addition = "YOU DIDNT GIVE ME ANY RESULT\n"
                    else:
                        prompt_addition = "THE CATEGORY ID " + result + " DO NOT EXIST\n"
                    result = extract_numbers(
                        query_data(prompt_addition + row['name'], retriever))

            match = {'match': result, 'id': row['id'], 'name': row['name'],
                     'match_name': base_categories[result]}
            # backup() appends the match to ``results`` itself; appending here
            # as well would store every match twice.
            backup(match)
            processed_ids.add(row['id'])
            print(match)

    # csv.writer quotes fields containing commas, quotes or newlines, which
    # category names commonly do; "\n" keeps the original line endings.
    with open('output.csv', 'w', newline='') as output_csvfile:
        writer = csv.writer(output_csvfile, lineterminator="\n")
        writer.writerow(["match", "id", "name", "match_name"])
        written_ids = set()
        for r in results:
            # Older backups may contain the same id more than once.
            if r['id'] in written_ids:
                continue
            writer.writerow([r['match'], r['id'], r['name'], r['match_name']])
            written_ids.add(r['id'])

# Run the matching over every row of CATEGORIES-V_CATEGORY.csv. Ids already
# present in backup.json are skipped, so re-running this cell resumes an
# interrupted pass; each new match is printed below as it is found.
process_csv_files('CATEGORIES-V_CATEGORY.csv')

{'match': '1324843', 'id': 'e646630a-e638-4321-bbc8-4932ccea46d5', 'name': 'Arts & Entertainment > Event ticket seller', 'match_name': 'Arts & Entertainment > Event Venue > Live Music Venue'}
{'match': '1324849', 'id': 'e654869c-9f54-4658-b9a8-16245304c654', 'name': 'Arts & Entertainment > Memorial park', 'match_name': 'Arts & Entertainment > Cultural Attractions > National Park'}
{'match': '255', 'id': 'e65f4770-eb59-4357-a9c7-df59b07acc77', 'name': 'real estate', 'match_name': 'Real Estate Services'}
{'match': '1191', 'id': 'e66967ab-8c75-49c5-8668-2476eb81a629', 'name': 'Restaurants > Palatine restaurant', 'match_name': 'Food & Dining > Restaurant > Italian Restaurant'}
{'match': '1206', 'id': 'e6777a62-ee87-46b3-906d-3e7293855b13', 'name': 'Restaurants > Sushi restaurant', 'match_name': 'Food & Dining > Restaurant > Sushi Restaurant'}
{'match': '1197', 'id': 'e688494f-d6fe-49e9-9e34-ae81b23e6e65', 'name': 'Restaurants > Burmese restaurant', 'match_name': 'Food & Dining > Restaurant

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


{'match': '1324846', 'id': 'f7681244-012a-4aab-8367-4930fe0297a3', 'name': 'Shopping > Adult DVD store', 'match_name': 'Arts & Entertainment > Adult Entertainment > Adult DVD Store'}
{'match': '1357', 'id': 'f77a2d98-e8be-455b-afb3-8457a10c31d6', 'name': 'Health & Medical > Gastroenterologist', 'match_name': 'Health & Medicine > Medical Specialties > Gastroenterology'}
{'match': '1190', 'id': 'f783c74f-2498-42dd-9fc6-5c7c479c8101', 'name': 'Restaurants > Scottish restaurant', 'match_name': 'Food & Dining > Restaurant > Irish Restaurant'}
{'match': '1192', 'id': 'f7898887-49c7-4947-bc42-c5ff220973cc', 'name': 'Restaurants > Sukiyaki and Shabu Shabu restaurant', 'match_name': 'Food & Dining > Restaurant > Japanese Restaurant'}
{'match': '1313', 'id': 'f7b97c4b-2faa-4cbc-96db-fc62dc76de96', 'name': 'Health & Medical > STD clinic', 'match_name': 'Health & Medicine > Medical Diagnostics & Labs > STD Testing'}
{'match': '11', 'id': 'f7ca968e-bea0-4310-a3ff-d2363ef23ff3', 'name': 'Industrial 