In [None]:
#NLP
!pip install langchain==0.0.191 chromadb==0.3.22 llama-cpp-python==0.1.66 \
pdfminer.six==20221105 InstructorEmbedding sentence-transformers faiss-cpu \
huggingface_hub transformers protobuf==3.20.0; sys_platform != 'darwin' protobuf==3.20.0; sys_platform == 'darwin' and platform_machine != 'arm64' \
protobuf==3.20.3; sys_platform == 'darwin' and platform_machine == 'arm64' auto-gptq==0.2.2 docx2txt unstructured
# Utilities
!pip install urllib3==1.26.6 accelerate bitsandbytes ; sys_platform != 'win32' bitsandbytes-windows ; sys_platform == 'win32' click flask requests
# Streamlit related
!pip install streamlit Streamlit-extras
# Excel File Manipulation
!pip install openpyxl
#GPTQ
!pip install auto_gptq
#PYNGROK
!pip install pyngrok
#Excel
!pip install xformers
!pip install unstructured


In [1]:
#Ngrok config
!ngrok authtoken 2U1L7h0W1hitgbVVijS1tsdCi0V_2KrJUQ3fibyro8UGqak7V
!pkill -f ngrok

Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [2]:
from flask import Flask, request, jsonify
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma

from pyngrok import ngrok

# Flask application object; the /prompt route further down the notebook is registered on it.
app = Flask(__name__)

In [3]:
!mkdir DB
!mkdir SOURCE_DOCUMENTS

In [None]:
import os

# from dotenv import load_dotenv
from chromadb.config import Settings

# https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/excel.html?highlight=xlsx#microsoft-excel
from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader, UnstructuredExcelLoader, Docx2txtLoader

# load_dotenv()
# ROOT_DIRECTORY = os.path.dirname(os.path.realpath(__file__))
# The root is hard-coded here instead of being derived from __file__ (see the line above).
ROOT_DIRECTORY = "/content"

# Folder that is scanned for input documents
SOURCE_DIRECTORY = f"{ROOT_DIRECTORY}/SOURCE_DOCUMENTS"

# Folder where the vector database is persisted
PERSIST_DIRECTORY = f"{ROOT_DIRECTORY}/DB"

# Upper bound on ingestion workers: the CPU count, or 8 when os.cpu_count() returns None.
# Can be changed to a specific number.
INGEST_THREADS = os.cpu_count() or 8

# Chroma client settings shared by the ingestion step and the query step
CHROMA_SETTINGS = Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory=PERSIST_DIRECTORY,
    anonymized_telemetry=False,
)

# File extension -> LangChain loader class.
# https://python.langchain.com/en/latest/_modules/langchain/document_loaders/excel.html#UnstructuredExcelLoader
# NOTE(review): ".doc" is routed to Docx2txtLoader; confirm that loader can really
# read legacy .doc files before relying on this entry.
DOCUMENT_MAP = {
    **dict.fromkeys((".txt", ".md", ".py"), TextLoader),
    ".pdf": PDFMinerLoader,
    ".csv": CSVLoader,
    **dict.fromkeys((".xls", ".xlsx"), UnstructuredExcelLoader),
    **dict.fromkeys((".docx", ".doc"), Docx2txtLoader),
}

# Embedding model (default: Instructor).
# A smaller model can be used instead; in that case HuggingFaceInstructEmbeddings
# must be swapped for HuggingFaceEmbeddings everywhere embeddings are created
# (ingest.py and run_localGPT.py in the original project layout).
# EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
EMBEDDING_MODEL_NAME = "hkunlp/instructor-large"

# LLM used to generate the natural-language responses: model id + model basename.

# GPTQ (quantized) models
MODEL_ID = "TheBloke/WizardLM-7B-uncensored-GPTQ"
MODEL_BASENAME = "WizardLM-7B-uncensored-GPTQ-4bit-128g.compat.no-act-order.safetensors"

# GGML (quantized cpu+gpu+mps) models - check if they support llama.cpp
# MODEL_ID = "TheBloke/Llama-2-7B-Chat-GGML"
# MODEL_BASENAME = "llama-2-7b-chat.ggmlv3.q4_0.bin"
# MODEL_ID = "TheBloke/orca_mini_3B-GGML"
# MODEL_BASENAME = "orca-mini-3b.ggmlv3.q4_0.bin"




In [None]:
import logging
import os
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

import click
import torch
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

def load_single_document(file_path: str) -> Document:
    """Load one file with the loader registered for its extension in DOCUMENT_MAP.

    Args:
        file_path: Path of the file to load. The extension is matched
            case-insensitively (DOCUMENT_MAP keys are lower-case).

    Returns:
        The first document produced by the loader.
        NOTE(review): loaders that return several documents per file have everything
        after the first one dropped here - confirm this is intended.

    Raises:
        ValueError: If the extension has no registered loader, or the loader
            returned no documents for the file.
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    loader_class = DOCUMENT_MAP.get(file_extension)
    if loader_class is None:
        raise ValueError(f"Document type is undefined: {file_extension!r} ({file_path})")

    documents = loader_class(file_path).load()
    if not documents:
        # Without this guard an empty result surfaces as a bare IndexError with no file name.
        raise ValueError(f"No content could be loaded from {file_path}")
    return documents[0]

def load_document_batch(filepaths):
    """Load a batch of files concurrently, one thread per file.

    Args:
        filepaths: Paths to load; each one is passed to load_single_document.

    Returns:
        A tuple (documents, filepaths) where documents[i] is the result for
        filepaths[i]. An empty batch yields ([], filepaths).
    """
    logging.info("Loading document batch")
    if not filepaths:
        # ThreadPoolExecutor rejects max_workers=0, so short-circuit the empty batch.
        return ([], filepaths)

    # create a thread pool sized to the batch
    with ThreadPoolExecutor(len(filepaths)) as exe:
        # load files
        futures = [exe.submit(load_single_document, name) for name in filepaths]
        # collect data in submission order; result() re-raises any loader error
        data_list = [future.result() for future in futures]
    # return data and file paths
    return (data_list, filepaths)

def load_documents(source_dir: str) -> list[Document]:
    """Load every supported file found directly inside ``source_dir``.

    Files whose extension is a key of DOCUMENT_MAP are grouped into batches and each
    batch is handed to load_document_batch in a worker process. Results are gathered
    in completion order.

    Args:
        source_dir: Directory to scan (not recursive).

    Returns:
        The loaded documents from all batches.
    """
    supported_paths = [
        os.path.join(source_dir, entry)
        for entry in os.listdir(source_dir)
        if os.path.splitext(entry)[1] in DOCUMENT_MAP
    ]

    # At least one worker, at most INGEST_THREADS; batch size is never zero.
    worker_count = min(INGEST_THREADS, max(len(supported_paths), 1))
    batch_size = max(round(len(supported_paths) / worker_count), 1)
    batches = [
        supported_paths[start : start + batch_size]
        for start in range(0, len(supported_paths), batch_size)
    ]

    loaded = []
    with ProcessPoolExecutor(worker_count) as pool:
        pending = [pool.submit(load_document_batch, batch) for batch in batches]
        for finished in as_completed(pending):
            # each result is (documents, filepaths); only the documents are kept
            batch_docs, _ = finished.result()
            loaded.extend(batch_docs)

    return loaded

def split_documents(documents: list[Document]) -> tuple[list[Document], list[Document]]:
    """Partition documents by source extension so each group gets the right splitter.

    Args:
        documents: Documents whose ``metadata["source"]`` holds the originating path.

    Returns:
        ``(text_docs, python_docs)``: documents whose source ends in ".py" go to the
        second list, everything else to the first. Input order is kept within each list.
    """
    # Keyed by "is this a Python source file?"
    buckets = {False: [], True: []}
    for document in documents:
        _, suffix = os.path.splitext(document.metadata["source"])
        buckets[suffix == ".py"].append(document)

    return buckets[False], buckets[True]


In [None]:
def main():
    """Ingest SOURCE_DIRECTORY into the persisted Chroma vector store.

    Steps: load the supported documents, split them into overlapping chunks
    (Python sources use the Python-aware splitter), embed the chunks with the
    Instructor model and persist the store under PERSIST_DIRECTORY.

    If no chunks are produced, a warning is logged and no store is written.
    """
    # Load documents and split in chunks
    logging.info(f"Loading documents from {SOURCE_DIRECTORY}")
    documents = load_documents(SOURCE_DIRECTORY)
    text_documents, python_documents = split_documents(documents)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    python_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.PYTHON, chunk_size=1000, chunk_overlap=200
    )
    texts = text_splitter.split_documents(text_documents)
    texts.extend(python_splitter.split_documents(python_documents))
    logging.info(f"Loaded {len(documents)} documents from {SOURCE_DIRECTORY}")
    logging.info(f"Split into {len(texts)} chunks of text")

    if not texts:
        # Nothing to embed: skip loading the embedding model and building a store.
        logging.warning(f"No supported documents found in {SOURCE_DIRECTORY}; vector store not built")
        return

    # Create embeddings on the GPU when one is available, otherwise on the CPU
    device_type = "cuda" if torch.cuda.is_available() else "cpu"
    embeddings = HuggingFaceInstructEmbeddings(
        model_name=EMBEDDING_MODEL_NAME,
        model_kwargs={"device": device_type},
    )

    db = Chroma.from_documents(
        texts,
        embeddings,
        persist_directory=PERSIST_DIRECTORY,
        client_settings=CHROMA_SETTINGS,
    )
    db.persist()
    # Drop the reference to the store once it has been persisted.
    db = None

main()

In [None]:
from auto_gptq import AutoGPTQForCausalLM
from huggingface_hub import hf_hub_download
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline, LlamaCpp
from langchain.memory import ConversationBufferMemory
from langchain.prompts import PromptTemplate

# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    pipeline,
)

def load_model(device_type, model_id, model_basename=None):
    """
    Build the text-generation LLM used by the QA chain.

    The model files are fetched from the HuggingFace hub by the underlying loaders.
    Which loader is used depends on ``model_basename`` and ``device_type``:

    * basename containing ".ggml": the file is downloaded with hf_hub_download and
      wrapped in LlamaCpp (GPU layer offloading is requested for "cuda" and "mps").
    * any other basename: loaded with AutoGPTQForCausalLM. This branch always
      targets "cuda:0", whatever ``device_type`` says.
    * no basename and device "cuda": AutoModelForCausalLM in float16.
    * no basename and any other device: LlamaForCausalLM with LlamaTokenizer.

    Args:
        device_type (str): Device to run on, e.g. "cuda", "cpu" or "mps".
        model_id (str): Repository id of the model on the HuggingFace hub.
        model_basename (str, optional): File basename for quantized models.
            Defaults to None.

    Returns:
        LlamaCpp for GGML basenames, otherwise a HuggingFacePipeline wrapping a
        transformers "text-generation" pipeline.
    """
    logging.info(f"Loading Model: {model_id}, on: {device_type}")
    logging.info("This action can take a few minutes!")

    # --- GGML: handled entirely by llama.cpp, returns early -------------------
    if model_basename is not None and ".ggml" in model_basename:
        logging.info("Using Llamacpp for GGML quantized models")
        ggml_file = hf_hub_download(repo_id=model_id, filename=model_basename)
        context_window = 2048
        llama_kwargs = dict(
            model_path=ggml_file,
            n_ctx=context_window,
            max_tokens=context_window,
        )
        if device_type.lower() == "mps":
            llama_kwargs["n_gpu_layers"] = 1000
        if device_type.lower() == "cuda":
            llama_kwargs.update(n_gpu_layers=1000, n_batch=context_window)
        return LlamaCpp(**llama_kwargs)

    if model_basename is not None:
        # --- GPTQ: HF repos ending in GPTQ that ship some variation of
        # .no-act.order / .safetensors weights.
        logging.info("Using AutoGPTQForCausalLM for quantized models")

        # from_quantized expects the basename without the ".safetensors" ending
        # (replace is a no-op when the ending is absent)
        model_basename = model_basename.replace(".safetensors", "")

        tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
        logging.info("Tokenizer loaded")

        model = AutoGPTQForCausalLM.from_quantized(
            model_id,
            model_basename=model_basename,
            use_safetensors=True,
            trust_remote_code=True,
            device="cuda:0",
            use_triton=False,
            quantize_config=None,
        )
    elif device_type.lower() == "cuda":
        # --- Full-precision HF models (repos ending in -HF or shipping a .bin file)
        logging.info("Using AutoModelForCausalLM for full models")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        logging.info("Tokenizer loaded")

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            # max_memory={0: "15GB"} # Uncomment this line with you encounter CUDA out of memory errors
        )
        model.tie_weights()
    else:
        # --- Fallback for every other device
        logging.info("Using LlamaTokenizer")
        tokenizer = LlamaTokenizer.from_pretrained(model_id)
        model = LlamaForCausalLM.from_pretrained(model_id)

    # Load configuration from the model to avoid warnings
    generation_config = GenerationConfig.from_pretrained(model_id)

    # Wrap model + tokenizer in a text-generation pipeline and expose it to LangChain
    text_generation = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_length=2048,
        temperature=0,
        top_p=0.95,
        repetition_penalty=1.15,
        generation_config=generation_config,
    )
    local_llm = HuggingFacePipeline(pipeline=text_generation)
    logging.info("Local LLM Loaded")

    return local_llm


In [None]:
"""
This cell implements the information retrieval task.


1. Loads an embedding model, which can be HuggingFaceInstructEmbeddings or HuggingFaceEmbeddings
2. Loads the existing vectorstore that was created by ingest.py
3. Loads the local LLM using the load_model function - you can now set different LLMs.
4. Sets up the question-answer retrieval chain.
5. Answers questions.
"""

# Single switch for the device used by both the embedding model and the LLM loader,
# so the log line, the embeddings and load_model() cannot drift apart.
# Set to "cpu" to run without a GPU.
DEVICE_TYPE = "cuda"
logging.info(f"Running on: {DEVICE_TYPE}")

embeddings = HuggingFaceInstructEmbeddings(
    model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": DEVICE_TYPE}
)

# load the vectorstore
db = Chroma(
    persist_directory=PERSIST_DIRECTORY,
    embedding_function=embeddings,
    client_settings=CHROMA_SETTINGS,
)
retriever = db.as_retriever()


# The trailing backslash joins the first two lines of the prompt; the space before
# it keeps "answer," and "just" from being glued together.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, \
just say that you don't know, don't try to make up an answer.

{context}

{history}
Question: {question}
Helpful Answer:"""

prompt = PromptTemplate(input_variables=["history", "context", "question"], template=template)
# Conversation history is fed back into the prompt through the {history} slot.
memory = ConversationBufferMemory(input_key="question", memory_key="history")

llm = load_model(DEVICE_TYPE, model_id=MODEL_ID, model_basename=MODEL_BASENAME)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt, "memory": memory},
)
# Interactive questions and answers

# while True:
#     # query = input("\nEnter a query: ")
#     if query == "exit":
#         break
#     # Get the answer from the chain
#     res = qa(query)
#     answer, docs = res["result"], res["source_documents"]

#     # Print the result
#     print("\n\n> Question:")
#     print(query)
#     print("\n> Answer:")
#     print(answer)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk import word_tokenize, pos_tag

def _extract_phrases(pos_tags):
    """Group (word, tag) pairs into adjective-led noun phrases and standalone nouns."""
    phrases = []
    i, total = 0, len(pos_tags)
    while i < total:
        tag = pos_tags[i][1]
        if tag.startswith('JJ'):
            # Scan forward to the noun this adjective modifies. The scan is bounded:
            # an adjective with no noun after it must not run past the end of the list.
            j = i
            while j < total and not pos_tags[j][1].startswith('NN'):
                j += 1
            # Keep adjectives, adverbs and the closing noun; drop anything in between.
            phrases.append([
                word for word, word_tag in pos_tags[i:j + 1]
                if word_tag.startswith(('JJ', 'RB', 'NN'))
            ])
            i = j + 1
        elif tag.startswith('NN'):
            phrases.append([pos_tags[i][0]])
            i += 1
        else:
            i += 1
    return phrases


def getRecommendedProducts(inputQuery):
    """Rank catalogue products against the noun phrases found in ``inputQuery``.

    The product sheet ``db2kxl.xlsx`` in SOURCE_DIRECTORY is read and lower-cased,
    the query is POS-tagged, and for every adjective/noun phrase the products are
    ranked by TF-IDF cosine similarity between the phrase and each product's
    category tree + specifications.

    Args:
        inputQuery (str): Free text to mine for product descriptions.

    Returns:
        list: Lower-cased product names, up to 10 per phrase, best match first,
        concatenated in phrase order. Empty when the query yields no phrase.
    """
    productDf = pd.read_excel(SOURCE_DIRECTORY+'/db2kxl.xlsx')
    # Lower-case every string cell; Series.map is used column by column because
    # DataFrame.applymap is deprecated in recent pandas releases.
    preprocessedProducts = productDf.apply(
        lambda column: column.map(lambda x: x.lower() if isinstance(x, str) else x))

    # Tokenize, POS-tag and group the query into phrases
    pos_tags = pos_tag(word_tokenize(inputQuery.lower()))
    adjective_noun_pairs = _extract_phrases(pos_tags)
    if not adjective_noun_pairs:
        return []

    # Text each product is matched on (rows with a missing part become '')
    combined = preprocessedProducts['product_category_tree'].str.cat(
        preprocessedProducts['product_specifications'], sep=' ').fillna('')

    # The product matrix does not depend on the phrase: fit it once, outside the loop.
    vectorizer = TfidfVectorizer()
    productVectors = vectorizer.fit_transform(combined)
    productNames = preprocessedProducts['product_name']

    ranking = []
    for phrase in adjective_noun_pairs:
        query_vector = vectorizer.transform([" ".join(phrase)])
        similarity_scores = cosine_similarity(query_vector, productVectors).flatten()
        # Ten best-scoring rows, highest similarity first
        top_indices = similarity_scores.argsort()[::-1][:10]
        ranking.extend(productNames.iloc[top_indices].tolist())
    return ranking

In [None]:
from chromadb.api.types import QueryResult
# Define your endpoint for handling prompts
@app.route('/prompt', methods=['POST'])
def handle_prompt():
    """Answer a prompt: run the QA chain, then map its answer to product names.

    Expects a JSON body of the form ``{"query": "<text>"}``.

    Returns:
        200 with ``{"question": ..., "answer": [...]}`` on success,
        400 when the body is not a JSON object or has no non-empty ``query``,
        500 with the error message when the chain or the ranking fails.
    """
    # silent=True turns a missing/invalid JSON body into None instead of raising,
    # so malformed requests get the 400 below rather than a 500.
    data = request.get_json(silent=True)
    query = data.get('query') if isinstance(data, dict) else None
    if not query:
        return jsonify({"error": "Missing 'query' parameter"}), 400

    try:
        logging.info(f"Received query: {query}")
        res = qa(query)
        answer = res['result']
        responseToSend = getRecommendedProducts(answer)

        return jsonify({"question": query, "answer": responseToSend})

    except Exception as e:
        logging.exception(e)
        # An exception object is not JSON-serialisable; send its message instead.
        return jsonify({"error": str(e)}), 500

# Use ngrok to expose the local Flask app to a public URL
# (the tunnel targets port 5000, the same port app.run() is given below)
public_url = ngrok.connect(addr="5000", proto="http")

print('Public URL:', public_url)

# Run the Flask app on all interfaces, port 5000.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)