## 0. Importing Libraries

In [1]:
import pandas as pd
import openai
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.docstore.document import Document
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain.agents import Tool
from langchain.agents import initialize_agent
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from tqdm.auto import tqdm
from uuid import uuid4
import pandas as pd
import os
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Loading sample json data

In [3]:
from langchain_community.document_loaders import CSVLoader

# Specify the path to your CSV file
csv_file_path = 'data/product-data/product_data.csv'

# Initialize the CSVLoader with a specified encoding (e.g., 'utf-8')
loader = CSVLoader(csv_file_path, encoding='utf-8')

# Load the documents
try:
    documents = loader.load()
    print("Documents loaded successfully.")
except Exception as e:
    print(f"Error loading documents: {e}")

Documents loaded successfully.


In [4]:
len(documents)

10002

In [5]:
documents[1]

Document(page_content='Uniq Id: 66d49bbed043f5be260fa9f7fbff5957\nProduct Name: Electronic Snap Circuits Mini Kits Classpack, FM Radio, Motion Detector, Music Box (Set of 5)\nBrand Name: \nAsin: \nCategory: Toys & Games | Learning & Education | Science Kits & Toys\nUpc Ean Code: \nList Price: \nSelling Price: $99.95\nQuantity: \nModel Number: 55324\nAbout Product: Make sure this fits by entering your model number. | Snap circuits mini kits classpack provides basic electronic circuitry activities for students in grades 2-6 | Includes 5 separate mini building kits- an FM radio, a motion detector, music box, space battle sound effects, and a flying saucer | Each kit includes separate components and instructions to build | Each component represents one function in a circuit; components snap together to create working models of everyday electronic devices | Activity guide provides additional projects to teach students how circuitry works\nProduct Specification: Product Dimensions:         1

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap = 100
)

doc = text_splitter.split_documents(documents)

In [7]:
doc[1]

Document(page_content='About Product: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to none. Pumping & carving have never been so satisfying! Flex 2 is recommended for people 120 to 170 pounds. | COREFLEX TECH: CoreFlex construction is water resistant, impact resistant, scratch resistant and has a flex like you won’t believe. These boards combine fiberglass, epoxy, HD plastic and bamboo to create a perfect blend of performance and strength. | INSPIRED BY THE NORTHWEST: Our founding ideal is chasing adventure & riding the best boards possible, inspired by the hills,', metadata={'source': 'data/product-data/product_data.csv', 'row': 0})

In [8]:
len(doc)

50885

In [9]:
doc_lim = doc[:1000]

In [10]:
len(doc_lim)

1000

### Embeddings and vector store

In [11]:
import torch
from langchain_community.embeddings import HuggingFaceInstructEmbeddings

model_name = 'hkunlp/instructor-base'

embedding_model = HuggingFaceInstructEmbeddings(
    model_name = model_name,
    model_kwargs = {"device" : device}
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [12]:
#locate vectorstore
vector_path = './vector-store'
if not os.path.exists(vector_path):
    os.makedirs(vector_path)
    print('create path done')

In [13]:
#save vector locally
from langchain_community.vectorstores import FAISS

vectordb = FAISS.from_documents(
    documents = doc_lim,
    embedding = embedding_model
)

db_file_name = 'nlp-project-product-data'

vectordb.save_local(
    folder_path = os.path.join(vector_path, db_file_name),
    index_name = 'nlp' #default index
)

In [14]:
#calling vector from local
vector_path = './vector-store'

from langchain.vectorstores import FAISS

vectordb = FAISS.load_local(
    folder_path = os.path.join(vector_path, db_file_name),
    embeddings = embedding_model,
    index_name = 'nlp' #default index
)  

In [15]:
#ready to use
retriever = vectordb.as_retriever()

In [16]:
retriever.get_relevant_documents("What is Glue")

[Document(page_content='Uniq Id: ed8d9032bc6d9f45b4db9209d693fdc0\nProduct Name: Martha Stewart Crafts Stencil Tape, 32292\nBrand Name: \nAsin: \nCategory: Arts, Crafts & Sewing | Crafting | Paper & Paper Crafts | Paper Craft Tools\nUpc Ean Code: \nList Price: \nSelling Price: $9.97\nQuantity: \nModel Number: 32292\nAbout Product: Create plaids, stripes and checkerboards | Low tack adhesive | Easily removed | Holds stencils securely in place | Prevents run under of paint', metadata={'source': 'data/product-data/product_data.csv', 'row': 101}),
 Document(page_content='moisture absorption, and is non-conductive. The ball is latex free for latex sensitivity. Science education products are commonly used as educational aids in scientific classrooms and office settings. Science education products incorporate applied math and science principles into classroom and homeschool-based projects. Teachers in pre-K, elementary, and secondary classrooms use science education kits and products alongsid

### Test model

In [17]:
#locate models
model_path = './models'
if not os.path.exists(model_path):
    os.makedirs(model_path)
    print('create path done')

In [18]:
# %cd ./models
# !git clone https://huggingface.co/anas-awadalla/gpt2-span-head-few-shot-k-16-finetuned-squad-seed-0

In [19]:
from transformers import AutoTokenizer, pipeline, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from langchain import HuggingFacePipeline
import torch

model_id = 'models/gpt2-span-head-few-shot-k-16-finetuned-squad-seed-0/'

tokenizer = AutoTokenizer.from_pretrained(
    model_id)

tokenizer.pad_token_id = tokenizer.eos_token_id

bitsandbyte_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bitsandbyte_config,
    device_map='cuda:0',
    load_in_8bit=True
)

pipe = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    max_new_tokens=100,
    model_kwargs={
        "temperature": 0,
        "repetition_penalty": 1.5
    }
)

llm = HuggingFacePipeline(pipeline=pipe)


In [20]:
from langchain.chains import LLMChain
from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import ConversationalRetrievalChain

In [21]:
CONDENSE_QUESTION_PROMPT

PromptTemplate(input_variables=['chat_history', 'question'], template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:')

In [22]:
question_generator = LLMChain(
    llm = llm,
    prompt = CONDENSE_QUESTION_PROMPT,
    verbose = True
)

In [23]:
query = 'Comparing both of them'
chat_history = "Human:What is Machine Learning\nAI:\nHuman:What is Deep Learning\nAI:"

question_generator({'chat_history' : chat_history, "question" : query})

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.

Chat History:
Human:What is Machine Learning
AI:
Human:What is Deep Learning
AI:
Follow Up Input: Comparing both of them
Standalone question:[0m

[1m> Finished chain.[0m


{'chat_history': 'Human:What is Machine Learning\nAI:\nHuman:What is Deep Learning\nAI:',
 'question': 'Comparing both of them',
 'text': ' "What is Machine Learning?"\n"What type of Machine Learning is that?"This may seem counterintuitive. It may sound counterintuitive at first. Machine learning seems to be able to "learn" anything, anywhere. It can remember what you searched for the next time you looked, when you gave it a try, and then from that you could make a judgment about the product. It was able to figure out which states of affairs to buy, which currencies to buy, which parts to sell,'}

In [24]:
from langchain.prompts import PromptTemplate

prompt_template = """
    Test prompt for NLP Amazon sales chatbot.
    {context}
    Question: {question}
    Answer:
    """.strip()

PROMPT = PromptTemplate.from_template(
    template = prompt_template
)

PROMPT
#using str.format 
#The placeholder is defined using curly brackets: {} {}

PromptTemplate(input_variables=['context', 'question'], template='Test prompt for NLP Amazon sales chatbot.\n    {context}\n    Question: {question}\n    Answer:')

In [25]:
doc_chain = load_qa_chain(
    llm = llm,
    chain_type = 'stuff',
    prompt = PROMPT,
    verbose = True
)
doc_chain

StuffDocumentsChain(verbose=True, llm_chain=LLMChain(verbose=True, prompt=PromptTemplate(input_variables=['context', 'question'], template='Test prompt for NLP Amazon sales chatbot.\n    {context}\n    Question: {question}\n    Answer:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x000001C7F6C86F10>)), document_variable_name='context')

In [26]:
memory = ConversationBufferWindowMemory(
    k=3, 
    memory_key = "chat_history",
    return_messages = True,
    output_key = 'answer'
)

chain = ConversationalRetrievalChain(
    retriever=retriever,
    question_generator=question_generator,
    combine_docs_chain=doc_chain,
    return_source_documents=True,
    memory=memory,
    verbose=True,
    get_chat_history=lambda h : h
)
chain

ConversationalRetrievalChain(memory=ConversationBufferWindowMemory(output_key='answer', return_messages=True, memory_key='chat_history', k=3), verbose=True, combine_docs_chain=StuffDocumentsChain(verbose=True, llm_chain=LLMChain(verbose=True, prompt=PromptTemplate(input_variables=['context', 'question'], template='Test prompt for NLP Amazon sales chatbot.\n    {context}\n    Question: {question}\n    Answer:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x000001C7F6C86F10>)), document_variable_name='context'), question_generator=LLMChain(verbose=True, prompt=PromptTemplate(input_variables=['chat_history', 'question'], template='Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n\nChat History:\n{chat_history}\nFollow Up Input: {question}\nStandalone question:'), llm=HuggingFacePipeline(pipeline=<transformers.pipelines.text_genera

In [27]:
prompt_question = "Can you tell me what crossbows are available?"
answer = chain({"question":prompt_question})
answer

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




[1m> Entering new ConversationalRetrievalChain chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mTest prompt for NLP Amazon sales chatbot.
    About Product: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to none. Pumping & carving have never been so satisfying! Flex 2 is recommended for people 120 to 170 pounds. | COREFLEX TECH: CoreFlex construction is water resistant, impact resistant, scratch resistant and has a flex like you won’t believe. These boards combine fiberglass, epoxy, HD plastic and bamboo to create a perfect blend of performance and strength. | INSPIRED BY THE NORTHWEST: Our founding ideal is chasing adventure & riding the best boards possible, inspired by the hills,

Uniq Id: 4c69b61db1fc16e7013b43fc926e502d
Product Name: 

{'question': 'Can you tell me what crossbows are available?',
 'chat_history': [],
 'answer': ' \n    Based on  _________\nIn other words,  _________\nX Previous image Next image Previous 3 remaining Images with http://stock.treasurer.com/item/4242X-ARB-STRONGBOOTS/ A search for "4241X-ARB-STRONGBOOTS" turned up no less than 36 box sets of the legendary Barbarian armor set! Now a whole lot of people are having fun plinking their way into our local',
 'source_documents': [Document(page_content='About Product: Make sure this fits by entering your model number. | RESPONSIVE FLEX: The Crossbow features a bamboo core encased in triaxial fiberglass and HD plastic for a responsive flex pattern that’s second to none. Pumping & carving have never been so satisfying! Flex 2 is recommended for people 120 to 170 pounds. | COREFLEX TECH: CoreFlex construction is water resistant, impact resistant, scratch resistant and has a flex like you won’t believe. These boards combine fiberglass, epoxy, HD pla