langchain,
numpy,
streamlit,
langchain-commnunity,
pypdf2,
unstructured,
nltk,
sentence-huggingface,
daiss-cpu,
python-docx,

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [40]:
!pip install langchain numpy streamlit langchain-community streamlit PyPDF2 unstructured nltk sentence-transformers langchain-huggingface faiss-cpu python-docx



In [41]:
# web interface for the application
import streamlit as st

# facebooks library for efficient similarity search and clustering of dense vecrors
import faiss

# interact with the operating system for environment variable and
import os

# handle bytes stream: used file uploading and handling memory
from io import BytesIO

# handling array and mathmetical operations
import numpy as np

# RetrievalQA : Langchain utility to create a question answering system with retrievers
from langchain.chains import RetrievalQA

# CharacterTextSplitter: To split documents into smaller chunks based on characters
# https://python.langchain.com/docs/concepts/text_splitters/
from langchain.text_splitter import CharacterTextSplitter

# to generate embedding text
from langchain_huggingface import HuggingFaceEmbeddings

# used for vector storage and similarity search
from langchain_community.vectorstores import FAISS

# In-memory storage for documents (used for quick lookup without persistent storage)
from langchain_community.docstore.in_memory import InMemoryDocstore

# interface to  interact with hugging face models via endpoints
from langchain_huggingface import HuggingFaceEndpoint

In [42]:
from google.colab import userdata
from dotenv import load_dotenv
# load_dotenv(dotenv_path = "/content/drive/MyDrive/Thesis/env/.env")
# api_key = os.getenv("HUG_FACE_API_KEY")
# os.environ['HUGGINGFACEHUB_API_TOKEN'] = api_key

os.environ['HUGGINGFACEHUB_API_TOKEN'] = userdata.get('huggingface_key')

In [43]:
#print(os.environ['HUGGINGFACEHUB_API_TOKEN'])

In [44]:
# userdata.get('secretName')

In [45]:
# langchain_token : https://smith.langchain.com/o/a9bb362c-b44a-485d-9c5a-d7c037c7a461/
# langchain introduction: https://python.langchain.com/docs/introduction/f


In [46]:
documents = '''
Artificial Intelligence has experienced rapid advancements in the past few years, transforming industries, research, and everyday life. From powerful language models to AI-driven robotics and healthcare solutions, the period between 2020 and 2024 has seen significant innovation.

The release of OpenAI's GPT-3 in 2020 and its successor GPT-4 in 2023 marked a turning point in natural language processing. These models demonstrated capabilities such as translation, summarization, content creation, and even basic reasoning, all with minimal input examples. Other major contributions include Anthropic's Claude, Google's Gemini, and Meta’s LLaMA and Mistral models, offering more efficient and often open-source alternatives for developers and researchers. These language models have become core tools across sectors, enabling new applications in customer service, legal analysis, education, and software development.

Retrieval-Augmented Generation (RAG) architectures gained popularity for enhancing factual accuracy in AI-generated content. By combining large language models with external data retrieval systems, RAG pipelines ensure responses are grounded in real information, making them valuable for fact-checking, academic research, and enterprise knowledge access.

In healthcare and science, AI models like AlphaFold, developed by DeepMind, solved the long-standing challenge of protein structure prediction. This achievement accelerated advances in drug discovery, disease research, and synthetic biology. Generative models are also being used in chemistry and materials science to design novel molecules and simulate chemical reactions, drastically reducing the time required for scientific experimentation.

AI systems have also advanced in understanding and generating multimodal content. Models such as CLIP and DALL·E can process and generate both text and images, while newer tools like OpenAI’s Sora enable video generation from text descriptions. These capabilities are being used in art, design, marketing, and assistive technologies.

In response to growing concerns about misinformation and bias, the AI community has also made progress in responsible AI development. New tools and frameworks are emerging to detect AI-generated content, ensure fairness, and provide transparency in model behavior.

Overall, AI in recent years has evolved from specialized tools into powerful general-purpose technologies. As these systems continue to improve, they are becoming integral to the way people communicate, discover, and innovate.
'''

# split the documents into manageable chunks

In [47]:
# chunk_size : define the size of each chunk (300 characters in this case)
# chunk_overlap: defines the number of overlapping characters between chunks (50 characters)
# https://python.langchain.com/api_reference/text_splitters/character/langchain_text_splitters.character.CharacterTextSplitter.html
text_splitter  = CharacterTextSplitter(chunk_size = 300, chunk_overlap = 50)
# split the text form teh document into smaller, overlapping chunk
# this allows for better handling of large documents during embedding anf retrieval operations
texts = text_splitter.split_text(documents)



In [48]:
texts

['Artificial Intelligence has experienced rapid advancements in the past few years, transforming industries, research, and everyday life. From powerful language models to AI-driven robotics and healthcare solutions, the period between 2020 and 2024 has seen significant innovation.',
 "The release of OpenAI's GPT-3 in 2020 and its successor GPT-4 in 2023 marked a turning point in natural language processing. These models demonstrated capabilities such as translation, summarization, content creation, and even basic reasoning, all with minimal input examples. Other major contributions include Anthropic's Claude, Google's Gemini, and Meta’s LLaMA and Mistral models, offering more efficient and often open-source alternatives for developers and researchers. These language models have become core tools across sectors, enabling new applications in customer service, legal analysis, education, and software development.",
 'Retrieval-Augmented Generation (RAG) architectures gained popularity for 

# generate embeddings

### https://python.langchain.com/api_reference/huggingface/embeddings/langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings.html#langchain_huggingface.embeddings.huggingface.HuggingFaceEmbeddings

In [49]:
# sentence-transformation
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v1')
model_name = "sentence-transformers/all-mpnet-base-v1"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
# initialize the sentence transformers
# utilize Hugging Face models within the LangChain ecosystem : https://huggingface.co/blog/langchain
hf = HuggingFaceEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)



In [50]:
sentence = ["This is an exampe sentence", "Each sentence in converted", "My name is chandan"]

embeddings = model.encode(sentence)
print(embeddings)

[[ 0.00846218 -0.00343644 -0.04158461 ...  0.0038013   0.04360511
  -0.08415008]
 [ 0.03378892 -0.00156719 -0.0125569  ... -0.02279112 -0.05688161
  -0.00735118]
 [ 0.03496669  0.02725157 -0.015148   ...  0.01317524 -0.05194363
  -0.01112446]]


In [51]:
embeddings.shape

(3, 768)

# Vector datasset for similarity search : FAISS

https://python.langchain.com/docs/integrations/vectorstores/faiss/

In [54]:
# create FAISS index for efficient similarity search

# Generate a sample embedding to determine the dimensionality of the vector space
sample_embedding = np.array(hf.embed_query("my name is chandan"))
# print(sample_embedding)
# print("\n\n")
# print(len(sample_embedding))
# print("\n\n")

dimension = sample_embedding.shape[0]
# print(dimension)
# print("\n\n")


index = faiss.IndexFlatL2(dimension)
# print(index)
# print("\n\n")



In [33]:
vector_store = FAISS(
    embedding_function = hf.embed_query, # function that converts documents into embeddings (hf.embed_query)
    index = index, # the initialized FASIS index
    docstore = InMemoryDocstore(),
    index_to_docstore_id = {},
)