# Model Initiation

In [28]:
import sys

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")




# Load the Necessary API Keys

In [29]:
from dotenv import load_dotenv
import os

load_dotenv()

huggingface_api_key = os.getenv("hugging_face_key")
pinecone_key = os.getenv("pinecone_api_key")
mongo_uri = os.getenv("mongo_db_key")
open_ai_key = os.getenv("open_ai_api_key")


# Set-up/Connect to Pinecone

In [30]:
#create index
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=pinecone_key)
index_name = "rag-app-images"


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(cloud = "aws", region="us-east-1")
    )

index = pc.Index(index_name)

#vector_store = PineconeVectorStore(embedding=embeddings, index=index)

In [31]:
from huggingface_hub import login
from huggingface_hub import whoami

login(huggingface_api_key)
whoami()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\vishg\.cache\huggingface\token
Login successful


{'type': 'user',
 'id': '66d5147ab005ad82ca47182f',
 'name': 'dorukozar',
 'fullname': 'Doruk Ozar',
 'email': 'dorukozar@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': None,
 'isPro': False,
 'avatarUrl': '/avatars/06335824f9a6991ec7b901b31802dd5b.svg',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'Presentation',
   'role': 'read',
   'createdAt': '2025-01-16T00:00:59.134Z'}}}

In [32]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [33]:
#create index
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=pinecone_key)
index_name = "rag-app"


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud = "aws", region="us-east-1")
    )

index_text = pc.Index(index_name)

vector_store_text = PineconeVectorStore(embedding=huggingface_embeddings, index=index_text)

In [34]:
user_query = "What is hyper parameter tuning?"
retriever = vector_store_text.as_retriever(search_kwargs={"k":5})
# retriever.get_relevant_documents(query)
retrieved_docs = retriever.invoke(user_query)
retrieved_docs

[Document(id='f3a35047-d5ad-4da6-80ad-d483e1e1c87a', metadata={'source_id': 'PhaseII-DataPrepAndProblemExploration.txt'}, page_content="foundational concept of machine learning is how we do this training tuning and test data sets and what that involves is dividing our data set into three partitions one that's fairly large that's probably the training set that might be eighty percent of the data and then two smaller data sets all right used for tuning and test all right so tuning is a way that we can modify our algorithm whether that is the hyper parameters or remember the hyper parameters are this particular features of an algorithm that can be adjusted or we do data engineering right which are the actual variables that are going into the model we might change those and then seeing how that affects the performance of the model all right so we do all sorts of things in the training and tuning phase to try and optimize this performance and when we think it's totally done we think we've g

In [35]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

print(format_docs(retrieved_docs))

foundational concept of machine learning is how we do this training tuning and test data sets and what that involves is dividing our data set into three partitions one that's fairly large that's probably the training set that might be eighty percent of the data and then two smaller data sets all right used for tuning and test all right so tuning is a way that we can modify our algorithm whether that is the hyper parameters or remember the hyper parameters are this particular features of an algorithm that can be adjusted or we do data engineering right which are the actual variables that are going into the model we might change those and then seeing how that affects the performance of the model all right so we do all sorts of things in the training and tuning phase to try and optimize this performance and when we think it's totally done we think we've got a really good model going based off the back and forth between training and tuning data set and I'll show you how to do this and

thi

# Embed the query

In [36]:
inputs = clip_processor(text=[user_query], return_tensors="pt", padding=True)
text_embedding = clip_model.get_text_features(**inputs).detach().numpy().tolist()[0]

# Query the Pinecone vector db and return top 5 matches

In [37]:
query_results = index.query(vector=text_embedding, top_k=5, include_metadata=True)

In [38]:
query_results["matches"]

[{'id': 'Clustering_InClass_9.28.21-1/page_33_img_1.png',
  'metadata': {'file_name': 'Clustering_InClass_9.28.21-1',
               'image_key': 'page_33_img_1.png'},
  'score': 0.280774266,
  'values': []},
 {'id': 'Clustering_InClass_9.28.21-2/page_33_img_1.png',
  'metadata': {'file_name': 'Clustering_InClass_9.28.21-2',
               'image_key': 'page_33_img_1.png'},
  'score': 0.280774266,
  'values': []},
 {'id': 'Clustering_InClass_9.28.21-1/page_39_img_1.png',
  'metadata': {'file_name': 'Clustering_InClass_9.28.21-1',
               'image_key': 'page_39_img_1.png'},
  'score': 0.280774266,
  'values': []},
 {'id': 'Clustering_InClass_9.28.21/page_36_img_1.png',
  'metadata': {'file_name': 'Clustering_InClass_9.28.21',
               'image_key': 'page_36_img_1.png'},
  'score': 0.280774266,
  'values': []},
 {'id': 'Clustering_InClass_9.28.21-2/page_39_img_1.png',
  'metadata': {'file_name': 'Clustering_InClass_9.28.21-2',
               'image_key': 'page_39_img_1.png'},


# Connect to Mongodb

In [39]:
from pymongo import MongoClient
import gridfs

client = MongoClient(mongo_uri)

#client = MongoClient(MONGO_URI)

db = client["images"]

collection = db["images_for_rag"]

print("Connected to MongoDB successfully!")

Connected to MongoDB successfully!


In [54]:
# file_names_list = os.listdir("data_processed")
fs = gridfs.GridFS(db)

collection


Collection(Database(MongoClient(host=['project-shard-00-00.6vv4e.mongodb.net:27017', 'project-shard-00-02.6vv4e.mongodb.net:27017', 'project-shard-00-01.6vv4e.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', appname='project', authsource='admin', replicaset='atlas-705ncd-shard-0', tls=True), 'images'), 'images_for_rag')

# Query MongDB

In [55]:
image_data = fs.find_one({"filename": "clustering.png"})
image_data_list = []
for i in query_results["matches"]:
    filename = i["id"]
    image_data = fs.find_one({"filename": filename})
    image_data_list.append(image_data)

ServerSelectionTimeoutError: SSL handshake failed: project-shard-00-00.6vv4e.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1000) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: project-shard-00-02.6vv4e.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1000) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms),SSL handshake failed: project-shard-00-01.6vv4e.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1000) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 67bca57f33277903ec11b3f6, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('project-shard-00-00.6vv4e.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: project-shard-00-00.6vv4e.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1000) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('project-shard-00-01.6vv4e.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: project-shard-00-01.6vv4e.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1000) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>, <ServerDescription ('project-shard-00-02.6vv4e.mongodb.net', 27017) server_type: Unknown, rtt: None, error=AutoReconnect('SSL handshake failed: project-shard-00-02.6vv4e.mongodb.net:27017: [SSL: TLSV1_ALERT_INTERNAL_ERROR] tlsv1 alert internal error (_ssl.c:1000) (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [42]:
image_data_list

[]

In [43]:
from PIL import Image, ImageChops
import io
import hashlib
import numpy as np
from skimage.metrics import structural_similarity as ssim

def images_are_equal(img1, img2):
    """Check if two images are identical pixel by pixel."""
    return ImageChops.difference(img1, img2).getbbox() is None

def hash_image(image):
    """Compute hash of an image."""
    hasher = hashlib.md5()
    hasher.update(image.tobytes())  # Convert image to bytes and hash
    return hasher.hexdigest()

def images_are_similar(img1, img2, threshold=0.7):
    """Compare two images using SSIM after resizing them to the same dimensions."""
    
    # Convert to grayscale
    img1_gray = img1.convert('L')
    img2_gray = img2.convert('L')

    # Resize images to the same size
    common_size = (min(img1_gray.width, img2_gray.width), min(img1_gray.height, img2_gray.height))
    img1_resized = img1_gray.resize(common_size, Image.LANCZOS)
    img2_resized = img2_gray.resize(common_size, Image.LANCZOS)

    # Convert to NumPy arrays
    img1_np = np.array(img1_resized)
    img2_np = np.array(img2_resized)

    # Compute SSIM similarity
    similarity = ssim(img1_np, img2_np)
    print(similarity)
    return similarity > threshold  # Return True if similar

In [46]:
from PIL import Image, ImageChops
import io
import hashlib
import numpy as np
from skimage.metrics import structural_similarity as ssim



if len(image_data_list)>0:
    local_image_list = []
    hashes = set()
    user_input = input("""
    To use similarity, enter the number 1
    To use hashing comparison, enter the number 2
    To use pixel to pixel comparison, enter the number 3
    """)
    
    for j in image_data_list:
        if j:
            # Convert binary data to a PIL Image
            image = Image.open(io.BytesIO(j.read()))
            
            if user_input.strip() == "1":
                duplicate_found = False
                for stored_image in local_image_list:
                    if images_are_similar(stored_image, image):
                        print(f"⚠️ Similar image found at index {j}")
                        duplicate_found = True
                        break
        
                if not duplicate_found:
                    local_image_list.append(image)
                    image.show()
            
            elif user_input.strip() == "2":
                img_hash = hash_image(image)
                if img_hash in hashes:
                    print(f"⚠️ Duplicate image found at index {j}")
                else:
                    hashes.add(img_hash)
                    image.show()
            
            elif user_input.strip() == "3":
                duplicate_found = False
                for stored_image in local_image_list:
                    if images_are_equal(stored_image, image):
                        print(f"⚠️ Duplicate image found at index {j}")
                        duplicate_found = True
                        break
    
                if not duplicate_found:
                    local_image_list.append(image)
                    image.show()
            else:
                print("Wrong input!")
                sys.exit(1)
            
            
            # Display the image
            # image.show()
        else:
            print("❌ Image not found")
            
            


In [47]:
local_image_list

NameError: name 'local_image_list' is not defined

---
break
---

# Embedding text and image at the same time

In [48]:
import base64
from PIL import Image
import io


encoded_images = []
count = 0

if len(image_data_list) > 0:
    for j in local_image_list:
        if j:
            print(f"Processing image {count + 1}")
            
            try:
                # Ensure the file pointer is at the start
                j.seek(0)
                
                
                # Convert to PNG format and encode
                buffered = io.BytesIO()
                j.save(buffered, format="JPEG")
                
                encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
                encoded_images.append(encoded_image)
                
                count += 1
            except Exception as e:
                print(f"❌ Error processing image {count + 1}: {e}")
        else:
            print("❌ Image not found")

# Closing file handlers
for img in image_data_list:
    img.close()

# Displaying the number of successfully encoded images
print(f"✅ Successfully encoded {count} images.")


✅ Successfully encoded 0 images.


In [49]:
# prompt = f"""You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question.
# 
# Guidelines:
# 1. Provide concise and informative answers.
# 2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
# 3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
# 4. Use examples where applicable to illustrate your answers.
# 5. Maintain a professional and helpful tone.
# 
# Question: {user_query}
# 
# Retrieved Information: {format_docs(retrieved_docs)}
# 
# Answer:
# """

prompt = f"""
Guidelines:
1. Provide extensive and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
4. Use examples where applicable to illustrate your answers.
5. Maintain a professional and helpful tone.

Question: {user_query}

Retrieved Information: {format_docs(retrieved_docs)}

Answer:
"""


In [50]:
content = [{"type":"text", "text":prompt}]
for img in encoded_images:
    content.append({
        "type":"image_url",
        "image_url":{
            "url":f"data:image/jpeg;base64, {img}"
        }
    })

In [51]:
from openai import OpenAI

client = OpenAI(api_key=open_ai_key)

try:
    chat = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
            "role":"system",
            "content":"You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question."
        },
            {
                "role": "user",
                "content": content
            }
        ]
    )
    
    print(chat.choices[0].message.content)
except Exception as e:
    print(f"An error occurred: {e}")

Hyperparameter tuning is a crucial process in machine learning that involves optimizing the parameters that govern the learning process of an algorithm, rather than the internal parameters that are derived during training. These are called hyperparameters, and they play a significant role in determining the performance of a machine learning model. 

Unlike model parameters, which are determined during the training process (e.g., weights in a linear regression model), hyperparameters are set before the actual training begins. They include settings such as the learning rate of a model, the number of layers in a neural network, the number of neighbors in a k-nearest neighbors (KNN) algorithm, or the maximum depth of a decision tree.

### The Importance of Hyperparameter Tuning

Hyperparameters can significantly affect the way a model learns and its subsequent accuracy and efficiency. Poorly chosen hyperparameters can lead to underfitting, overfitting, or even computational inefficiencies.