In [10]:
# WHAT EMBEDDINGS LOOK LIKE
from sentence_transformers import SentenceTransformer

# Load the model by name, then encode a one-sentence batch.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
sentences = ["This is an example sentence"]
embeddings = model.encode(sentences)

# Header and array are emitted on separate lines, exactly as two
# consecutive print() calls would produce.
print("Sentence embeddings:", embeddings, sep="\n")

Sentence embeddings:
[[ 2.25025900e-02 -7.82918185e-02 -2.30307579e-02 -5.10000717e-03
  -8.03404152e-02  3.91321331e-02  1.13428524e-02  3.46482103e-03
  -2.94573922e-02 -1.88930500e-02  9.47433710e-02  2.92748269e-02
   3.94859463e-02 -4.63165864e-02  2.54245866e-02 -3.22000235e-02
   6.21928833e-02  1.55591909e-02 -4.67795469e-02  5.03901243e-02
   1.46113373e-02  2.31413450e-02  1.22066466e-02  2.50696056e-02
   2.93655344e-03 -4.19821963e-02 -4.01032111e-03 -2.27843802e-02
  -7.68595422e-03 -3.31091024e-02  3.22118886e-02 -2.09992398e-02
   1.16730640e-02 -9.85074118e-02  1.77932623e-06 -2.29931585e-02
  -1.31140519e-02 -2.80222502e-02 -6.99970201e-02  2.59314068e-02
  -2.89501827e-02  8.76335651e-02 -1.20919226e-02  3.98605093e-02
  -3.31381485e-02  3.59107666e-02  3.46099064e-02  6.49783835e-02
  -3.00817564e-02  6.98188469e-02 -3.99515918e-03 -1.01596548e-03
  -3.50185037e-02 -4.36567403e-02  5.08025736e-02  4.68757562e-02
   5.39663546e-02 -4.03008573e-02  3.20137292e-03  1.36

In [11]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
# Import from langchain_community, the package path the other cells in this
# notebook already use for these classes.
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS

import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
load_dotenv()

# Other cells in this notebook read OPENAI_API_KEY; prefer that name and keep
# the original OPENAI_KEY as a fallback so existing .env files still work.
openai_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("OPENAI_KEY")
if not openai_key:
    raise ValueError("API key not found. Make sure it's in your .env file.")

# OpenAI embedding client configured with the loaded key.
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

In [12]:
# FUNCTIONS TO SPLIT TEXT INTO CHUNKS AND TO LOAD PDFS
def split_paragraphs(rawText, chunk_size=200, chunk_overlap=20):
    """Split raw text into chunks using a newline-separated character splitter.

    Args:
        rawText: The text to split. ``None`` or an empty string yields ``[]``.
        chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``;
            lengths are measured with ``len``. Defaults to 200.
        chunk_overlap: ``chunk_overlap`` passed to ``CharacterTextSplitter``.
            Defaults to 20.

    Returns:
        The list returned by ``CharacterTextSplitter.split_text``.
    """
    if not rawText:
        # Nothing to split (for example a page with no extractable text).
        return []

    text_splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    return text_splitter.split_text(rawText)

def load_pdfs(pdfs):
    """Read every PDF in ``pdfs`` and return the text chunks of all pages.

    Each page's extracted text is passed to ``split_paragraphs`` and the
    resulting chunks are appended, in page order, to one flat list. The first
    five chunks are printed for inspection before the list is returned.
    """
    collected = []

    for pdf_path in pdfs:
        document = PdfReader(pdf_path)
        for pdf_page in document.pages:
            collected.extend(split_paragraphs(pdf_page.extract_text()))

    print(" HERE IS EXAMPLE OF WHAT WE HAVE IN TEXT CHUNKS")
    # Show only a short preview rather than the whole list.
    print(collected[:5])
    return collected


In [13]:
# PDF files to read; each path is handed to load_pdfs.
list_of_pdfs = [
    "1. dietary supplements - for whom.pdf",
]
text_chunks = load_pdfs(list_of_pdfs)

 HERE IS EXAMPLE OF WHAT WE HAVE IN TEXT CHUNKS
['International  Journal  of \nEnvironmental Research\nand Public Health\nReview\nDietary Supplements—For Whom? The Current State of\nKnowledge about the Health Effects of Selected\nSupplement Use', 'Supplement Use\nRegina Ewa Wierzejska\n/gid00030/gid00035/gid00032/gid00030/gid00038/gid00001/gid00033/gid00042/gid00045 /gid00001\n/gid00048/gid00043/gid00031/gid00028/gid00047/gid00032/gid00046', 'Citation: Wierzejska, R.E. Dietary\nSupplements—For Whom? The\nCurrent State of Knowledge about the\nHealth Effects of Selected Supplement\nUse. Int. J. Environ. Res. Public Health', '2021 ,18, 8897. https://doi.org/\n10.3390/ijerph18178897\nAcademic Editor: Paul B. Tchounwou\nReceived: 15 July 2021\nAccepted: 21 August 2021\nPublished: 24 August 2021', 'Publisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional afﬁl-\niations.\nCopyright: © 2021 by the author.\nLicensee MDPI, Basel, Switzerland

In [14]:
from sentence_transformers import SentenceTransformer
# Import from langchain_community, the package path the other cells in this
# notebook already use for embeddings and vector stores.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Fail with a clear message instead of an opaque error from inside FAISS
# when no text was extracted.
if not text_chunks:
    raise ValueError("text_chunks is empty; nothing to index.")

# Embed every chunk with the named model and index the vectors in FAISS.
# Any code that reloads this index must use the same embedding model.
embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
store = FAISS.from_texts(text_chunks, embed_model)


In [15]:
# Write the FAISS store to a folder on disk so it can be reloaded with FAISS.load_local.
store.save_local(folder_path="./myVectorStore")

In [None]:
import os

from langchain_community.chat_models import ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.chains import RetrievalQA

# The key is expected in the environment as OPENAI_API_KEY; OPENAI_KEY is kept
# as a fallback for .env files that use the older name.
openai_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("OPENAI_KEY")
if not openai_key:
    raise ValueError("API key not found. Make sure it's in your .env file.")

# Load the saved FAISS store from disk.
# The index was built with HuggingFaceEmbeddings("all-MiniLM-L6-v2"), so queries
# must be embedded with that same model. Loading it with OpenAIEmbeddings
# produces query vectors of a different size than the stored ones.
# NOTE: allow_dangerous_deserialization=True unpickles data from the folder;
# only load a store you created yourself.
store = FAISS.load_local(
    "myVectorStore",
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
    allow_dangerous_deserialization=True,
)

# Chat model used to answer questions over the retrieved chunks.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0, openai_api_key=openai_key)

# Build the retrieval QA chain on top of the store's retriever.
chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=store.as_retriever(),
)

# Ask the LLM a question. invoke() replaces the deprecated chain(...) call form.
result = chain.invoke({"query": "What is Nepolian pizza ?"})
print(result)

In [5]:
!pip install -U sentence-transformers

Requirement already up-to-date: sentence-transformers in c:\users\user\desktop\manish\.venv\lib\site-packages (5.0.0)


You should consider upgrading via the 'c:\users\user\desktop\manish\.venv\scripts\python.exe -m pip install --upgrade pip' command.


In [15]:
pip install PyPDF2

Collecting PyPDF2
  Using cached pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\USER\Desktop\MANISH\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [17]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp39-cp39-win_amd64.whl (14.9 MB)
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\USER\Desktop\MANISH\.venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [2]:
!pip install openai

Collecting openai
  Downloading openai-1.97.0-py3-none-any.whl (764 kB)
Collecting distro<2,>=1.7.0
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.10.0-cp39-cp39-win_amd64.whl (208 kB)
Installing collected packages: distro, jiter, openai
Successfully installed distro-1.9.0 jiter-0.10.0 openai-1.97.0


You should consider upgrading via the 'c:\users\user\desktop\manish\.venv\scripts\python.exe -m pip install --upgrade pip' command.


In [7]:
!pip install tiktoken


Collecting tiktoken
  Downloading tiktoken-0.9.0-cp39-cp39-win_amd64.whl (894 kB)
Installing collected packages: tiktoken
Successfully installed tiktoken-0.9.0


You should consider upgrading via the 'c:\users\user\desktop\manish\.venv\scripts\python.exe -m pip install --upgrade pip' command.


In [17]:
import os
from dotenv import load_dotenv
from langchain_community.embeddings import OpenAIEmbeddings

# Populate the environment from the .env file, then look up the key.
load_dotenv()
openai_key = os.environ.get("OPENAI_API_KEY")

# Stop with a clear message when the key is missing or empty.
if openai_key in (None, ""):
    raise ValueError("API key not found. Make sure it's in your .env file.")

# Embedding client configured with the loaded key.
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)


In [18]:
# Never echo the full secret: cell outputs are saved with the notebook and are
# exposed whenever the file is shared or committed. Show only enough to
# confirm which key was loaded.
# NOTE(review): a key that was ever printed in full should be rotated.
if not openai_key:
    masked_key = "<not set>"
elif len(openai_key) > 12:
    masked_key = f"{openai_key[:6]}...{openai_key[-4:]}"
else:
    masked_key = "***"
print("OpenAI API Key:", masked_key)


OpenAI API Key: sk-or-...c356
