In [60]:
import os
from dotenv import load_dotenv

# Populate the process environment from a local .env file, if one exists.
load_dotenv()

# Hugging Face access token used by the embedding client further down;
# evaluates to None when the variable is not defined.
token = os.environ.get('HUGGINGFACE_ACCESS_TOKEN')

## Explored Models



1. all-MiniLM-L6-v2

- Size: 22M parameters

- Embedding Dimensionality: 384

2. all-MiniLM-L12-v2

- Size: 33M parameters

- Embedding Dimensionality: 384


3. all-mpnet-base-v2


- Size: 110M parameters

- Embedding Dimensionality: 768

4. paraphrase-MiniLM-L6-v2

- Size: 22M parameters

- Embedding Dimensionality: 384

- Feature: Effective in tasks involving paraphrasing and semantic textual similarity.


For large-scale retrieval tasks, models 1 and 2 are recommended for speed and accuracy.

Other Explored Models

- thenlper/gte-small
-  mixedbread-ai/mxbai-embed-large-v1
- OpenAI Embeddings
- HuggingFaceH4/zephyr-7b-beta




## Implementation

In [63]:
#Embedding Model
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings

# Embedding client backed by the Hugging Face Inference API.
# `token` is the access token read from the environment in the first cell.
embedding_model = HuggingFaceInferenceAPIEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L12-v2",
    api_key=token,
)



# Import Data and Split into Chunks

In [64]:
from langchain_community.document_loaders import PyPDFLoader

import os

# Location of the PDF to index. Set the INDUSTRY_REPORT_PDF environment
# variable (or add it to .env) to point at your own copy; the original
# author's local path is kept as the fallback so existing runs are unchanged.
file_path = os.getenv(
    "INDUSTRY_REPORT_PDF",
    r"D:\Project\HDFC_Virtual_Vault\python files\210005026_Industry_report.pdf",
)
loader = PyPDFLoader(file_path)

# Load the PDF into a list of LangChain Document objects.
data = loader.load()


In [65]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators are tried in order, from coarsest to finest.
# NOTE(review): entries such as "\n---+\n" look like regex patterns, but
# `is_separator_regex` is not enabled below, so they are matched as literal
# text — confirm whether regex matching was intended.
MARKDOWN_SEPARATORS = [
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    ".",
    # The empty string must stay last: the splitter stops searching as soon as
    # it reaches "", so any separator listed after it is never used.
    "",
]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,  # upper bound on chunk length (characters, default length function)
    chunk_overlap=100,  # characters shared between consecutive chunks
    add_start_index=True,  # store each chunk's offset as `start_index` metadata
    strip_whitespace=True,  # trim leading/trailing whitespace from each chunk
    separators=MARKDOWN_SEPARATORS,
)

# split_documents handles the whole list in one call; each input document is
# still split independently, so the result matches a per-document loop.
docs_processed = text_splitter.split_documents(data)

# Embed all chunks and store them in a vector database

In [68]:
# FAISS comes from langchain_community, like the other integrations in this
# notebook; the `langchain.vectorstores` path is a deprecated re-export.
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

# Embed every chunk with `embedding_model` and build a FAISS index over the
# resulting vectors, using the cosine distance strategy.
KNOWLEDGE_VECTOR_DATABASE = FAISS.from_documents(
    docs_processed,
    embedding_model,
    distance_strategy=DistanceStrategy.COSINE,
)

# Retriever

In [69]:
# Question to look up in the vector store.
user_query = 'what is the company visited?'

# Retrieve the five chunks most similar to the question and show them.
retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(user_query, k=5)
print(retrieved_docs)

[Document(page_content='Nitheeshvar\n  210005026  \n \nIndustry Report  \n \nI had the privilege of touring Vacmet Industry, an organization dedicated to', metadata={'source': 'D:\\Project\\HDFC_Virtual_Vault\\python files\\210005026_Industry_report.pdf', 'page': 0, 'start_index': 2}), Document(page_content='Industry Report  \n \nI had the privilege of touring Vacmet Industry, an organization dedicated to \nprioritizing sustainability and efficiency in its production processes. The faci lity', metadata={'source': 'D:\\Project\\HDFC_Virtual_Vault\\python files\\210005026_Industry_report.pdf', 'page': 0, 'start_index': 30}), Document(page_content='showcases several key features that underscore its dedication to responsible \nmanufacturing:  \nWater Management: The facility demonstrates a robust commitment to water', metadata={'source': 'D:\\Project\\HDFC_Virtual_Vault\\python files\\210005026_Industry_report.pdf', 'page': 0, 'start_index': 215}), Document(page_content='packaging, among o

# Reader

## Models Tried
- flan-t5-large
- OpenAI's ChatGPT

## Google Gemini 1.5 Flash, calling through API

In [3]:
import google.generativeai as genai


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# API key for Google Generative AI.
# The key is read from the environment (a .env entry picked up by
# load_dotenv() works too) so the secret never lives in the notebook.
# SECURITY: the key that was previously hard-coded here has been exposed and
# should be revoked and rotated.
import os

GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')
if not GOOGLE_API_KEY:
    # Fail here with a clear message rather than with an opaque
    # authentication error on the first API call.
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; export it or add it to your .env file."
    )

genai.configure(api_key=GOOGLE_API_KEY)

In [5]:
# Handle to the Gemini model used as the reader (answer generator).
model = genai.GenerativeModel(model_name='gemini-1.5-flash')

In [6]:
# Hand-written stand-in for retrieved context, used to smoke-test the reader.
context = 'roles offered are ABC,DEF'
# Question answered against the context above (rebinds `user_query`).
user_query = 'what are the roles offered?'

In [7]:

from langchain.prompts import PromptTemplate


# Instruction text for the reader model; {context} and {question} are filled
# in by PromptTemplate.format below.
RAG_PROMPT_TEXT = """
    Using the information contained in the context, 
    give a comprehensive answer to the question.
    Respond only to the question asked, response should be concise and relevant to the question.
    Provide the number of the source document when relevant.
    If the answer cannot be deduced from the context, do not give an answer.

    Context:
    {context}
    ---
    Now here is the question you need to answer:

    {question}
    """

prompt_template = PromptTemplate(
    template=RAG_PROMPT_TEXT,
    input_variables=["context", "question"],
)

# Substitute the current context and question into the template.
formatted_prompt = prompt_template.format(context=context, question=user_query)

# Send the prompt to Gemini and print the full response object.
response = model.generate_content(formatted_prompt)
print(response)

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "ABC, DEF \n"
              }
            ],
            "role": "model"
          },
          "finish_reason": "STOP",
          "index": 0,
          "safety_ratings": [
            {
              "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HATE_SPEECH",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HARASSMENT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
              "probability": "NEGLIGIBLE"
            }
          ]
        }
      ],
      "usage_metadata": {
        "prompt_token_cou