# Environment

### GPU

In [1]:
# Show the GPU attached to this runtime (model, driver / CUDA version, memory in use).
!nvidia-smi

Sat Nov 18 19:49:19 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Dependencies

In [2]:
!pip -q install langchain huggingface_hub google-search-results tiktoken wikipedia

In [3]:
!pip install bpemb sentence_transformers numpy



In [4]:
!pip install pinecone-client faiss-cpu



In [5]:
!pip install gitpython



### Libraries

In [6]:
import getpass
import json
import os
import shutil

import git
import langchain

### Home

In [7]:
class GitHome():
    """Publish the Git/GitHub settings of this notebook as environment variables.

    Instantiating the class is all that is needed: ``__init__`` writes every
    argument into ``os.environ`` and derives three composite values from them,
    so that Python cells (``os.environ[...]``) and shell cells (``$REPO_DIR``)
    read the same configuration.

    Variables written:
        WORK_DIR, BRANCH_NAME, REPO_NAME, GIT_HOME, USER_NAME, USER_TOKEN
            The constructor arguments, stored unchanged.
        GIT_REPO
            ``repo_home`` and ``repo_name`` joined by exactly one ``/``.
        REPO_DIR
            ``repo_name`` inside ``work_dir``.
        CLONE_FROM
            HTTPS clone URL with ``user_name`` and ``user_token`` embedded.
            It contains the secret token, so it must never be printed or logged.

    Args:
        work_dir: Directory in which the repository is checked out.
        branch_name: Name of the branch to work on.
        repo_name: Name of the GitHub repository.
        repo_home: Base URL of the repository owner, with or without a
            trailing slash.
        user_name: GitHub user name; also used as the repository owner in
            the clone URL.
        user_token: GitHub personal access token.
    """

    def __init__(self,
                 work_dir,
                 branch_name,
                 repo_name,
                 repo_home,
                 user_name,
                 user_token):
        ### Save
        os.environ["WORK_DIR"] = work_dir
        os.environ["BRANCH_NAME"] = branch_name
        os.environ["REPO_NAME"] = repo_name
        os.environ["GIT_HOME"] = repo_home
        os.environ["USER_NAME"] = user_name
        os.environ["USER_TOKEN"] = user_token # Github Personal Access Token
        ### Compose
        # Accept repo_home with or without a trailing slash; plain concatenation
        # produced "https://host/ownerrepo" when the slash was missing.
        os.environ["GIT_REPO"] = repo_home.rstrip("/") + "/" + repo_name
        # os.path.join avoids a doubled separator when work_dir ends with "/".
        os.environ["REPO_DIR"] = os.path.join(work_dir, repo_name)
        # Token-authenticated clone URL -- secret, do not display.
        os.environ["CLONE_FROM"] = f"https://{user_name}:{user_token}@github.com/{user_name}/{repo_name}.git"

In [8]:
# Never commit the personal access token: it is read from the GITHUB_TOKEN
# environment variable, or prompted for without echoing when that is not set.
# SECURITY: the token that used to be hardcoded in this cell is compromised
# and must be revoked in the GitHub account settings.
GitHome(work_dir="/content/drive/MyDrive/StanfordLLM/thought-distillation",
        repo_name="thought-distillation",
        repo_home="https://github.com/pablo-tech/",
        branch_name="main",
        user_name="pablo-tech",
        user_token=os.environ.get("GITHUB_TOKEN") or getpass.getpass("GitHub personal access token: "))

<__main__.GitHome at 0x7a8bb808b9d0>

### Git

In [9]:
# Sanity check: display the working directory that GitHome stored in the environment.
os.environ["WORK_DIR"]

'/content/drive/MyDrive/StanfordLLM/thought-distillation'

In [10]:
# Move to WORK_DIR before the old clone is deleted (a later cell changes into
# the clone). Failing to do so is tolerated, but the reason is reported
# instead of being silently swallowed by a bare `except`.
try:
    os.chdir(os.environ["WORK_DIR"])
except OSError as err:
    print(f"Could not change into WORK_DIR: {err}")

# Remove any previous clone so that the clone in the next cell starts from a
# clean directory. shutil.rmtree also copes with paths that contain spaces,
# unlike the unquoted shell expansion `rm -rf $REPO_DIR`.
shutil.rmtree(os.environ["REPO_DIR"], ignore_errors=True)

In [11]:
# Clone the repository from the token-authenticated URL into REPO_DIR
# (both values were composed by GitHome).
git.Repo.clone_from(url=os.environ["CLONE_FROM"],
                    to_path=os.environ["REPO_DIR"])

<git.repo.base.Repo '/content/drive/MyDrive/StanfordLLM/thought-distillation/thought-distillation/.git'>

In [12]:
# Work from the Python source folder of the clone; the project modules imported
# further down (vector_db, domain_info) presumably resolve from here -- confirm.
os.chdir(f'{os.environ["REPO_DIR"]}/source/main/py')

# Vector database

### Instantiate

In [13]:
from vector_db import PineconeDb

In [14]:
# Handle for the Pinecone index named "quickstart" (PineconeDb comes from the project's vector_db module).
pinecone_db = PineconeDb(index_name="quickstart")

In [15]:
# NOTE(review): is_create=True presumably creates the index when it does not exist yet -- confirm in vector_db.
pinecone_db.db_init(is_create=True)

### Load FAQ

In [16]:
from domain_info import FaqData

# Load the FAQ entries; the call prints the name of every spreadsheet it reads (see output).
# Each entry is a dict with the keys 'user_question', 'agent_answer', 'source' and 'row'.
faqs = FaqData().get_faq()

Cliq FAQs for use cases.xlsx
HDFC CVP FAQs.xlsx
PVA HDFC Referral FAQs test.xlsx
PVA Titan FAQs.xlsx
PVA TNGC FAQs for Croma.xlsx
PVA Qmin FAQs.xlsx
PVA_BGC_Tanishq_FAQs.xlsx
PVA Qik FAQs (2).xlsx
PVA_BGC_Titan_Eye_Plus_FAQs.xlsx
PVA HDFC FAQs.xlsx
PVA HDFC Updated FAQs.xlsx
PVA Qik FAQs.xlsx
PVA Bill Payments FAQs New.xlsx
UPI FAQs master sheet.xlsx
PVA Super Top up Insurance FAQs.xlsx
PVA AAI Updated FAQs.xlsx
PVA TataPay FAQs.xlsx
PVA Westside FAQs.xlsx
PVA Tata1mg FAQs.xlsx
PVA IHCL FAQs.xlsx
PVA ZipCare Extended Warranty FAQs.xlsx
PVA Qik Updated FAQs.xlsx
UPI FAQ.xlsx
PVA Cultfit FAQs.xlsx
Sample PVA DG FAQs.xlsx
PVA Cliq FAQs.xlsx
IPL Neupass FAQs.xlsx
UPI new FAQs.xlsx
PVA TNGC FAQs for Hotel.xlsx
PVA_BGC_MasterList_FAQs_v2.xlsx
KnowledgeBaseReplica.xlsx
PVA HDFC FAQs (2).xlsx
PVA TataPlay FAQs.xlsx
IIFL_FAQs.xlsx
PVA_BGC_FAQs.xlsx
PVA BB FAQs.xlsx
PVA AAI FAQs.xlsx
PVA Qik PL Updated FAQs.xlsx
PVA TNGC FAQs for BB.xlsx
PVA DG FAQs.xlsx
PVA Tata Pay Later FAQs.xlsx
TCap_FAQsV1.

In [17]:
# Preview only the first entries; displaying the whole list bloats the notebook output.
faqs[:3]

[{'user_question': 'How do I find invoices for my past orders?',
  'agent_answer': "Don't worry, a copy of your invoice will be emailed to your registered email ID. You can also visit 'Order History' in the 'My Account' section of the Tata CliQ app to find a copy of your invoice.",
  'source': 'Cliq FAQs for use cases.xlsx',
  'row': 0},
 {'user_question': 'How will I know that my refund has been initiated?',
  'agent_answer': 'You never have to worry about that. Tata CLiQ will send you an SMS and email confirming the initiation of your refund. Your refund should reach you within 3-4 working days. In the case of CLiQ Cash refund, the same will be processed in one business day.',
  'source': 'Cliq FAQs for use cases.xlsx',
  'row': 1},
 {'user_question': 'I still haven’t got my refund. Why?',
  'agent_answer': 'If you’ve received a confirmation on the approval of your refund, you will definitely get it. However, in rare cases, there are technical difficulties that can delay refund trans

In [18]:
# Export the collected FAQ entries.
# NOTE(review): presumably this writes the joined_faq.csv that the next cell reads -- confirm in domain_info.
FaqData().export_faq(faqs)

In [19]:
# Read the joined FAQ file from Drive as documents ...
pine_docs = pinecone_db.read_files(
    directory_path='/content/drive/MyDrive/StanfordLLM/qa_data/faq_qa/',
    file_names=['joined_faq.csv'],
)

# ... and hand all of them to the vector database.
# For a quick trial run, pass a slice instead, e.g. pine_docs[0:100].
pinecone_db.load_docs(pine_docs)

In [20]:
# Inspect the first FAQ document: page_content holds quoted "question? answer" lines, metadata the source file.
pine_docs[0]

Document(page_content='"How do I find invoices for my past orders?? Don\'t worry, a copy of your invoice will be emailed to your registered email ID. You can also visit \'Order History\' in the \'My Account\' section of the Tata CliQ app to find a copy of your invoice."\n"How will I know that my refund has been initiated?? You never have to worry about that. Tata CLiQ will send you an SMS and email confirming the initiation of your refund. Your refund should reach you within 3-4 working days. In the case of CLiQ Cash refund, the same will be processed in one business day."\n"I still haven’t got my refund. Why?? If you’ve received a confirmation on the approval of your refund, you will definitely get it. However, in rare cases, there are technical difficulties that can delay refund transfers. If the wait seems too long, you can always contact the Tata CLiQ Customer Care team for support."', metadata={'source': '/content/drive/MyDrive/StanfordLLM/qa_data/faq_qa/joined_faq.csv'})

### Load docs

In [21]:
# Read the legal / policy text files from Drive as documents ...
pine_docs = pinecone_db.read_files(
    directory_path='/content/drive/MyDrive/StanfordLLM/qa_data/legal_qa/',
    file_names=[
        f"{stem}.txt"
        for stem in (
            "pay-user",
            "payoff-sale",
            "refer-and-earn",
            "sale-parade",
            "savings-calculator",
            "seller-terms",
            "tdl-privacy",
            "terms-conditions",
            "tpl-privacy",
        )
    ],
)

# ... and hand all of them to the vector database.
# For a quick trial run, pass a slice instead, e.g. pine_docs[0:100].
pinecone_db.load_docs(pine_docs)

In [22]:
# Print the index description and show the first legal document.
# NOTE(review): per the recorded output, __str__() prints the description and returns None,
# so str(pinecone_db) / print(pinecone_db) would raise TypeError -- __str__ in vector_db should return the text.
pinecone_db.__str__(), pine_docs[0]

['quickstart']
IndexDescription(name='quickstart', metric='cosine', replicas=1, dimension=768.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')


(None,
 Document(page_content='TATA PAY USER AGREEMENT', metadata={'source': '/content/drive/MyDrive/StanfordLLM/qa_data/legal_qa/pay-user.txt'}))

### Query

In [23]:
# Sample user queries used to probe the index.
questions = [
    "What is NeuPass",
    "Tata Neu eligibility",
]

In [24]:
# For every question print the top match: its text, its source file and its score.
for q in questions:
    answer = pinecone_db.select_by_text(q)
    print("Q: " + q)
    matches = answer['matches']
    if matches:
        hit = matches[0]
        print(f"A: {hit['metadata']['text']} {hit['metadata']['source']} {hit['score']}")
    else:
        # An empty result list used to raise IndexError on matches[0].
        print("A: <no match found>")
    print("---")

Q: What is NeuPass
A: "What is NeuPass?? NeuPass is a rewards program from Tata Neu packed with benefits across brands. Earn minimum 5% NeuCoins* every time you shop and enjoy free deliveries when you shop on Tata Neu. Use your NeuCoins to save whenever you shop, book travel, dine, and more!\n\nFor more details, check https://www.tatadigital.com/v2/cdp/about-neupass"
"loyaltyprogram? NeuPass is a rewards program from Tata Neu packed with benefits across brands. Earn minimum 5% NeuCoins* every time you shop and enjoy free deliveries when you shop on Tata Neu. Use your NeuCoins to save whenever you shop, book travel, dine, and more!\n\nFor more details, check https://www.tatadigital.com/v2/cdp/about-neupass" /content/drive/MyDrive/StanfordLLM/qa_data/faq_qa/joined_faq.csv 0.671044767
---
Q: Tata Neu eligibility
A: "Am I eligible for the Tata NeuCard?? The Tata Neu HDFC Bank Credit Card is a Co-branded credit card launched in partnership with HDFC Bank. Currently, this is an invite-only p