# Model Initiation

In [1]:
import sys

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


# Load the Necessary API Keys

In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

huggingface_api_key = os.getenv("hugging_face_key")
pinecone_key = os.getenv("pinecone_api_key")
mongo_uri = os.getenv("mongo_db_key")
open_ai_key = os.getenv("open_ai_api_key")


# Set-up/Connect to Pinecone

In [5]:
#create index
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=pinecone_key)
index_name = "rag-app-images"


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(cloud = "aws", region="us-east-1")
    )

index = pc.Index(index_name)

#vector_store = PineconeVectorStore(embedding=embeddings, index=index)

In [6]:
from huggingface_hub import login
from huggingface_hub import whoami

login(huggingface_api_key)
whoami()

{'type': 'user',
 'id': '66d5147ab005ad82ca47182f',
 'name': 'dorukozar',
 'fullname': 'Doruk Ozar',
 'email': 'dorukozar@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': None,
 'isPro': False,
 'avatarUrl': '/avatars/06335824f9a6991ec7b901b31802dd5b.svg',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'Presentation',
   'role': 'read',
   'createdAt': '2025-01-16T00:00:59.134Z'}}}

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [8]:
#create index
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=pinecone_key)
index_name = "rag-app"


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud = "aws", region="us-east-1")
    )

index_text = pc.Index(index_name)

vector_store_text = PineconeVectorStore(embedding=huggingface_embeddings, index=index_text)

In [9]:
user_query = "What is supervised and unsupervised machine learning?"
retriever = vector_store_text.as_retriever(search_kwargs={"k":5})
# retriever.get_relevant_documents(query)
retrieved_docs = retriever.invoke(user_query)
retrieved_docs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[Document(id='3fa65848-c8ee-43e9-8856-541127519cf5', metadata={'source_id': 'ML_Overview.txt'}, page_content="getting very specific into the models i mean there's there's another step right here it might be like semi-supervised or reinforcement but generally we have two really broad categories supervised and unsupervised the difference between these is very simple supervised has labels we know where the data comes from and we know what the target classes are and unsupervised does not all right and we're going to talk about this a little bit more detail here as i get into some examples but that's it i mean this is just categorized right this is not all right much of the advancements when we think of like in the ai systems or advances in technology all come from labeled data right so it's you have armies of intern somewhere labeling it whether it's a dog or a hot dog somewhere right we pass all those all those images in and a machine knows it or anything the same for ibm or the deep blue

In [10]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

print(format_docs(retrieved_docs))

getting very specific into the models i mean there's there's another step right here it might be like semi-supervised or reinforcement but generally we have two really broad categories supervised and unsupervised the difference between these is very simple supervised has labels we know where the data comes from and we know what the target classes are and unsupervised does not all right and we're going to talk about this a little bit more detail here as i get into some examples but that's it i mean this is just categorized right this is not all right much of the advancements when we think of like in the ai systems or advances in technology all come from labeled data right so it's you have armies of intern somewhere labeling it whether it's a dog or a hot dog somewhere right we pass all those all those images in and a machine knows it or anything the same for ibm or the deep blue or the other really famous uh kind of ai machines that uh that you think of right they all learn to play

of 

# Embed the query

In [11]:
inputs = clip_processor(text=[user_query], return_tensors="pt", padding=True)
text_embedding = clip_model.get_text_features(**inputs).detach().numpy().tolist()[0]

# Query the Pinecone vector db and return top 5 matches

In [12]:
query_results = index.query(vector=text_embedding, top_k=5, include_metadata=True)

In [13]:
query_results["matches"]

[{'id': '3001_ETA/page_9_img_2.png',
  'metadata': {'file_name': '3001_ETA', 'image_key': 'page_9_img_2.png'},
  'score': 0.334316969,
  'values': []},
 {'id': 'machine_learning_bootcamp_II/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_bootcamp_II',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387695,
  'values': []},
 {'id': 'machine_learning_overview/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_overview',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387695,
  'values': []},
 {'id': 'machine_learning_III/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_III',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387695,
  'values': []},
 {'id': 'machine_learning_bootcamp_II copy/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_bootcamp_II copy',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387695,
  'values': []}]

# Connect to Mongodb

In [14]:
from pymongo import MongoClient
import gridfs

client = MongoClient(mongo_uri)

#client = MongoClient(MONGO_URI)

db = client["images"]

collection = db["images_for_rag"]

print("Connected to MongoDB successfully!")

Connected to MongoDB successfully!


In [15]:
#file_names_list = os.listdir("data_processed")

fs = gridfs.GridFS(db)



# Query MongDB

In [17]:
# image_data = fs.find_one({"filename": "clustering.png"})
image_data_list = []
for i in query_results["matches"]:
    filename = i["id"]
    image_data = fs.find_one({"filename": filename})
    image_data_list.append(image_data)

KeyboardInterrupt: 

In [17]:
image_data_list

[<gridfs.synchronous.grid_file.GridOut at 0x1bb7c7d30>,
 <gridfs.synchronous.grid_file.GridOut at 0x1bda55e40>,
 <gridfs.synchronous.grid_file.GridOut at 0x1bda55990>,
 <gridfs.synchronous.grid_file.GridOut at 0x1bda55ed0>,
 <gridfs.synchronous.grid_file.GridOut at 0x1bda55e70>]

In [18]:
from PIL import Image, ImageChops
import io
import hashlib
import numpy as np
from skimage.metrics import structural_similarity as ssim

def images_are_equal(img1, img2):
    """Check if two images are identical pixel by pixel."""
    return ImageChops.difference(img1, img2).getbbox() is None

def hash_image(image):
    """Compute hash of an image."""
    hasher = hashlib.md5()
    hasher.update(image.tobytes())  # Convert image to bytes and hash
    return hasher.hexdigest()

def images_are_similar(img1, img2, threshold=0.7):
    """Compare two images using SSIM after resizing them to the same dimensions."""
    
    # Convert to grayscale
    img1_gray = img1.convert('L')
    img2_gray = img2.convert('L')

    # Resize images to the same size
    common_size = (min(img1_gray.width, img2_gray.width), min(img1_gray.height, img2_gray.height))
    img1_resized = img1_gray.resize(common_size, Image.LANCZOS)
    img2_resized = img2_gray.resize(common_size, Image.LANCZOS)

    # Convert to NumPy arrays
    img1_np = np.array(img1_resized)
    img2_np = np.array(img2_resized)

    # Compute SSIM similarity
    similarity = ssim(img1_np, img2_np)
    print(similarity)
    return similarity > threshold  # Return True if similar

In [19]:
from PIL import Image, ImageChops
import io
import hashlib
import numpy as np
from skimage.metrics import structural_similarity as ssim



if len(image_data_list)>0:
    local_image_list = []
    hashes = set()
    user_input = input("""
    To use similarity, enter the number 1
    To use hashing comparison, enter the number 2
    To use pixel to pixel comparison, enter the number 3
    """)
    
    for j in image_data_list:
        if j:
            # Convert binary data to a PIL Image
            image = Image.open(io.BytesIO(j.read()))
            
            if user_input.strip() == "1":
                duplicate_found = False
                for stored_image in local_image_list:
                    if images_are_similar(stored_image, image):
                        print(f"⚠️ Similar image found at index {j}")
                        duplicate_found = True
                        break
        
                if not duplicate_found:
                    local_image_list.append(image)
                    image.show()
            
            elif user_input.strip() == "2":
                img_hash = hash_image(image)
                if img_hash in hashes:
                    print(f"⚠️ Duplicate image found at index {j}")
                else:
                    hashes.add(img_hash)
                    image.show()
            
            elif user_input.strip() == "3":
                duplicate_found = False
                for stored_image in local_image_list:
                    if images_are_equal(stored_image, image):
                        print(f"⚠️ Duplicate image found at index {j}")
                        duplicate_found = True
                        break
    
                if not duplicate_found:
                    local_image_list.append(image)
                    image.show()
            else:
                print("Wrong input!")
                sys.exit(1)
            
            
            # Display the image
            # image.show()
        else:
            print("❌ Image not found")
            
            


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0.7365119420482111
⚠️ Similar image found at index <gridfs.synchronous.grid_file.GridOut object at 0x1bda55e40>
0.7365119420482111
⚠️ Similar image found at index <gridfs.synchronous.grid_file.GridOut object at 0x1bda55990>
0.7365119420482111
⚠️ Similar image found at index <gridfs.synchronous.grid_file.GridOut object at 0x1bda55ed0>
0.7365119420482111
⚠️ Similar image found at index <gridfs.synchronous.grid_file.GridOut object at 0x1bda55e70>


In [20]:
local_image_list

[<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1300x917>]

---
break
---

# Embedding text and image at the same time

In [21]:
import base64
from PIL import Image
import io


encoded_images = []
count = 0

if len(image_data_list) > 0:
    for j in local_image_list:
        if j:
            print(f"Processing image {count + 1}")
            
            try:
                # Ensure the file pointer is at the start
                j.seek(0)
                
                
                # Convert to PNG format and encode
                buffered = io.BytesIO()
                j.save(buffered, format="JPEG")
                
                encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
                encoded_images.append(encoded_image)
                
                count += 1
            except Exception as e:
                print(f"❌ Error processing image {count + 1}: {e}")
        else:
            print("❌ Image not found")

# Closing file handlers
for img in image_data_list:
    img.close()

# Displaying the number of successfully encoded images
print(f"✅ Successfully encoded {count} images.")


Processing image 1
✅ Successfully encoded 1 images.


In [22]:
# prompt = f"""You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question.
# 
# Guidelines:
# 1. Provide concise and informative answers.
# 2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
# 3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
# 4. Use examples where applicable to illustrate your answers.
# 5. Maintain a professional and helpful tone.
# 
# Question: {user_query}
# 
# Retrieved Information: {format_docs(retrieved_docs)}
# 
# Answer:
# """

prompt = f"""
Guidelines:
1. Provide extensive and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
4. Use examples where applicable to illustrate your answers.
5. Maintain a professional and helpful tone.

Question: {user_query}

Retrieved Information: {format_docs(retrieved_docs)}

Answer:
"""


In [23]:
content = [{"type":"text", "text":prompt}]
for img in encoded_images:
    content.append({
        "type":"image_url",
        "image_url":{
            "url":f"data:image/jpeg;base64, {img}"
        }
    })

In [24]:
from openai import OpenAI

client = OpenAI(api_key=open_ai_key)

try:
    chat = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
            "role":"system",
            "content":"You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question."
        },
            {
                "role": "user",
                "content": content
            }
        ]
    )
    
    print(chat.choices[0].message.content)
except Exception as e:
    print(f"An error occurred: {e}")

Supervised and unsupervised learning are two fundamental approaches in machine learning, each designed to address different types of tasks.

### Supervised Learning

**Definition:**
Supervised learning involves training a model on a labeled dataset, meaning that each training example is paired with an output label. The aim is to learn the mapping from inputs to outputs to predict labels for new, unseen data.

**How It Works:**
- **Input**: A set of input features \(X\).
- **Output**: A known label \(Y\), which is what the model attempts to predict.
- **Goal**: Learn a function \(f(X) = Y\) that can map inputs to outputs.
- **Examples**: 
  - **Classification**: Predicting a category. For example, email spam detection (spam or not spam).
  - **Regression**: Predicting a continuous value. For instance, predicting house prices based on features like size and location.

**Applications:**
Supervised learning is widely used in applications where past observations with known outcomes are avai

In [21]:
import numpy as np
from openai import OpenAI
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from transformers import CLIPProcessor, CLIPModel
from pymongo import MongoClient
import gridfs
import base64
from PIL import Image
import io

class RAGBot:
    def __init__(self, model="gpt-4o"):
        self.llm = OpenAI(api_key=open_ai_key)
        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
        self.doc_embeddings = None
        self.docs = None
        self.vector_store = PineconeVectorStore(embedding=self.embeddings, index=index_text)
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        self.mongo_client = client
        self.db = db
        self.fs = fs

    def load_documents(self, documents):
        """Load documents and compute their embeddings."""
        self.docs = documents
        self.doc_embeddings = self.embeddings.embed_documents(documents)

    def get_most_relevant_docs(self, query):
        """Find the most relevant document for a given query."""
        if not self.docs or not self.doc_embeddings:
            raise ValueError("Documents and their embeddings are not loaded.")

        query_embedding = self.embeddings.embed_query(query)
        similarities = [
            np.dot(query_embedding, doc_emb)
            / (np.linalg.norm(query_embedding) * np.linalg.norm(doc_emb))
            for doc_emb in self.doc_embeddings
        ]
        most_relevant_doc_index = np.argmax(similarities)
        return [self.docs[most_relevant_doc_index]]

    def generate_answer(self, query, relevant_doc):
        """Generate an answer for a given query based on the most relevant document."""
        prompt = f"question: {query}\n\nDocuments: {relevant_doc}"
        messages = [
            {"role": "system", "content": "You are a helpful assistant that answers questions based on given documents only."},
            {"role": "user", "content": prompt},
        ]
        try:
            chat = self.llm.chat.completions.create(
                model="gpt-4o",
                messages=messages
            )
            return chat.choices[0].message.content
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def embed_query(self, query):
        """Embed the query using CLIP model."""
        inputs = self.clip_processor(text=[query], return_tensors="pt", padding=True)
        text_embedding = self.clip_model.get_text_features(**inputs).detach().numpy().tolist()[0]
        return text_embedding

    def query_pinecone(self, text_embedding):
        """Query Pinecone vector db and return top 5 matches."""
        query_results = index.query(vector=text_embedding, top_k=5, include_metadata=True)
        return query_results["matches"]

    def retrieve_images(self, query_results):
        """Retrieve images from MongoDB based on Pinecone query results."""
        image_data_list = []
        for i in query_results:
            filename = i["id"]
            image_data = self.fs.find_one({"filename": filename})
            image_data_list.append(image_data)
        return image_data_list

    def encode_images(self, image_data_list):
        """Encode images to base64."""
        encoded_images = []
        for img_data in image_data_list:
            if img_data:
                img = Image.open(io.BytesIO(img_data.read()))
                buffered = io.BytesIO()
                img.save(buffered, format="JPEG")
                encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
                encoded_images.append(encoded_image)
        return encoded_images

In [None]:
sample_docs = [
    "What are the components of a decision tree?",
    "What is k-means clustering?",
    "What is the bias-variance tradeoff?",
    "What is overfitting in the context of machine learning?",
    "What does a decision boundary mean in the context of K-nearest neighbors?",
    "What are ensemble learning methods, and how do they improve model performance?"
]