# Model Initiation

In [1]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


# Embedding Just images

In [2]:
import os
def image_embedding(path):
    result_dict = {}
    images_list = os.listdir(path)
    sorted_img_filenames = sorted(images_list, key=lambda x: int(x.split('_')[1].split('.')[0]))
    for i in sorted_img_filenames:
        image = Image.open(os.path.join(path, i))
        inputs = clip_processor(images=image, return_tensors="pt", padding=True)
        #outputs = model(**inputs)
        outputs = clip_model.get_image_features(**inputs)
        image_embeds = outputs
        result_dict[i] = image_embeds
        #print(image_embeds.shape)
        #break
    return result_dict


    

In [3]:
result_image_embeddings = {}
for i in os.listdir("data_processed"):
    
    image_embedding_dictionary = image_embedding(os.path.join("data_processed", i, "Images"))
    result_image_embeddings[i] = image_embedding_dictionary
result_image_embeddings

KeyboardInterrupt: 

# Load the Necessary API Keys

In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

huggingface_api_key = os.getenv("hugging_face_key")
pinecone_key = os.getenv("pinecone_api_key")
mongo_uri = os.getenv("mongo_db_key")
open_ai_key = os.getenv("open_ai_api_key")


# Set-up/Connect to Pinecone

In [4]:
#create index
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=pinecone_key)
index_name = "rag-app-images"


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension=512,
        metric="cosine",
        spec=ServerlessSpec(cloud = "aws", region="us-east-1")
    )

index = pc.Index(index_name)

#vector_store = PineconeVectorStore(embedding=embeddings, index=index)

# Delete all the content

In [43]:
# index.delete(delete_all=True)

{}

# Convert Image Embeddings to Vectors

In [5]:
import torch
embedding_vectors = {}
for file_name, image_embedding_dictionary in result_image_embeddings.items():
    image_embeddings = {k: v.squeeze().tolist() for k, v in image_embedding_dictionary.items()}
    vectors = [(k, v) for k, v in image_embeddings.items()]
    embedding_vectors[file_name] = vectors

NameError: name 'result_image_embeddings' is not defined

In [36]:
# import torch
# 
# image_embeddings = {k: v.squeeze().tolist() for k, v in image_embedding_dictionary.items()}
# vectors = [(k, v) for k, v in image_embeddings.items()]


# Push them to Pinecone

In [44]:
upsert_data = []
for file_name, vectors in embedding_vectors.items():
    for i, (key, vector) in enumerate(vectors):
        unique_id = f"{file_name}/{key}"  # Create a unique ID
        metadata = {"file_name": file_name, "image_key": key}  # Metadata including filename and key
        upsert_data.append((unique_id, vector, metadata))


In [25]:
len(upsert_data)

643

In [45]:
index.upsert(vectors=upsert_data)

#index.upsert(vectors)

{'upserted_count': 643}

# Embed the text

In [20]:
# def text_embedding(path):
#     text_embeddings = {}
#     text_list = os.listdir(path)
#     sorted_text_filenames = sorted(text_list, key=lambda x: int(x.split('_')[1].split('.')[0]))
#     for i in sorted_text_filenames:
#         file = os.path.join(path, i)
#         with open(file, "r") as f:
#             contents = f.read()
#         #print(type(contents))
#         inputs = clip_processor(text=[contents], return_tensors="pt", padding=True)
#         with torch.no_grad():
#             outputs = clip_model.get_text_features(**inputs)
#         embedding = outputs.squeeze().tolist()
#         text_embeddings[i] = embedding
#     return text_embeddings
#     


In [22]:
# data_processed_list = os.listdir("data_processed")
# result_text_embeddings = {}
# for i in data_processed_list:
#     print("name", i)
#     text_embedding_dictionary = text_embedding(os.path.join("data_processed", i, "Texts"))
#     result_text_embeddings[i] = text_embedding_dictionary

In [5]:
from huggingface_hub import login
from huggingface_hub import whoami

login(huggingface_api_key)
whoami()

{'type': 'user',
 'id': '66d5147ab005ad82ca47182f',
 'name': 'dorukozar',
 'fullname': 'Doruk Ozar',
 'email': 'dorukozar@gmail.com',
 'emailVerified': True,
 'canPay': False,
 'periodEnd': None,
 'isPro': False,
 'avatarUrl': '/avatars/06335824f9a6991ec7b901b31802dd5b.svg',
 'orgs': [],
 'auth': {'type': 'access_token',
  'accessToken': {'displayName': 'Presentation',
   'role': 'read',
   'createdAt': '2025-01-16T00:00:59.134Z'}}}

In [6]:
from langchain_huggingface import HuggingFaceEmbeddings

huggingface_embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [7]:
#create index
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=pinecone_key)
index_name = "rag-app"


if index_name not in pc.list_indexes().names():
    pc.create_index(
        name = index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud = "aws", region="us-east-1")
    )

index_text = pc.Index(index_name)

vector_store_text = PineconeVectorStore(embedding=huggingface_embeddings, index=index_text)

In [8]:
def concat_text(path):
    text_embeddings = {}
    text_list = os.listdir(path)
    sorted_text_filenames = sorted(text_list, key=lambda x: int(x.split('_')[1].split('.')[0]))
    resulting_str = ""
    for i in sorted_text_filenames:
        file = os.path.join(path, i)
        with open(file, "r") as f:
            contents = f.read()
            resulting_str += contents.strip()
            #for line in file:
            #    resulting_str += line.strip() + "\n"
        #print(type(contents))
    #print(resulting_str)
    return resulting_str

In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(combined_texts):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Define the maximum chunk size
        chunk_overlap=100  # Define the overlap between chunks
    )
    
    chunks_split = text_splitter.split_text(combined_texts)
    
    return chunks_split

In [10]:
from langchain.schema import Document
data_processed_list = os.listdir("data_processed")
result_text_embeddings = {}
documents = []
for i in data_processed_list:
    #print("Name:", i)
    concatenated_text = concat_text(os.path.join("data_processed", i, "Texts"))
    chunks = split_text(concatenated_text)
    #print(len(chunks))
    #documents = [Document(page_content=chunk) for chunk in chunks]
    for chunk in chunks:
        documents.append(Document(page_content=chunk, metadata={"source_id":i}))
    #print(len(documents))
    
    #result_text_embeddings[i] = text_embedding_dictionary
    

In [55]:
vector_store_text.add_documents(documents)

['2cf20eb1-81b7-419b-a853-b4bd2cdc66b8',
 'da4c378f-798b-46f4-9f4f-ecbb73cec7b0',
 '0731fc4a-0827-4d71-ac9e-5ffe6c2df642',
 'bd443e81-a377-48b1-ac4f-a82c4efe8e2f',
 'a07659c1-cfa2-4210-b85c-9bd0c463bbc4',
 '3b323f23-8480-4a6b-be53-339c4ec466a5',
 'ea5fdf1d-2621-45dc-93e7-a32181b680d6',
 '09674ca7-5a34-4d1d-a8ef-e6e1b9f437c2',
 '7f88bf93-59bb-4219-abe0-f13b194aff06',
 'dc07119d-1815-4e76-8307-10d570064ef3',
 '24ec12fc-3764-4e43-a2b9-7bc0f542a8b7',
 'd53edb23-33b9-4da6-a271-e931e92ca158',
 '3896d376-515d-4fb4-9f6f-085f0d6b54f9',
 '9dfaf1ab-d5c5-4f69-af3c-c0c1d7b7e955',
 '67a07167-5617-4eb6-939e-04a1a758f6bc',
 'b0fec8dc-a570-4ba3-b2f1-ae914161c877',
 'b506b42f-5999-4b8c-99dd-d95490331ce5',
 '081e5f75-b0e6-4e3e-b897-02e510207f32',
 '33a50344-bec1-463e-8fad-487162537dc5',
 '4405246a-20e6-49db-b55b-dcea45a7624b',
 'b656c779-aada-4993-b93e-4f2175872804',
 'b89a69e8-81de-4e73-b4cd-d6cd6700b109',
 '9f6cbb8f-bdf9-4ece-8441-e418a4aedd70',
 'a9ba9ac2-b104-4e13-935a-4dab0bd52415',
 '006299be-e795-

In [10]:
user_query = "What is supervised and unsupervised machine learning?"
retriever = vector_store_text.as_retriever(search_kwargs={"k":5})
# retriever.get_relevant_documents(query)
retrieved_docs = retriever.invoke(user_query)
retrieved_docs

[Document(id='ea6641c5-15a4-4697-8dd9-4706c958c30a', metadata={}, page_content='Machine Learning Overview, EDA and Clustering\nBrian Wright\nbrianwright@virginia.edu1. What is Machine Learning ?\n2. What is exploratory data analysis?\n3. k-means clustering\n– Does Congress vote in patterns?\n4. Multi-dimensional k-means clustering\n– Are NBA players compensated according to performance?\nOutline: Intro to Unsupervised ML\n2“A field of Computer Science that gives computers the ability to learn\nwithout being explicitly programmed.”\n-\nArthur Samuel (Coined the term in 1959 at IBM)\n“The ability [for systems] to acquire their own knowledge, by\nextracting patterns from raw data.”\n-\nDeep Learning, Goodfellow et alMachine vs. human\nMachine\nHuman\nUnderstanding context\n✔\nThinking through the problem\n✔\nAsking the right questions\n✔\nSelecting the right tools\n✔\nPerforming calculations quickly\n✔\nPerforming repetitive tasks\n✔\nFollowing pre-defined rules\n✔\nInterpreting results\n

In [11]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

print(format_docs(retrieved_docs))

Machine Learning Overview, EDA and Clustering
Brian Wright
brianwright@virginia.edu1. What is Machine Learning ?
2. What is exploratory data analysis?
3. k-means clustering
– Does Congress vote in patterns?
4. Multi-dimensional k-means clustering
– Are NBA players compensated according to performance?
Outline: Intro to Unsupervised ML
2“A field of Computer Science that gives computers the ability to learn
without being explicitly programmed.”
-
Arthur Samuel (Coined the term in 1959 at IBM)
“The ability [for systems] to acquire their own knowledge, by
extracting patterns from raw data.”
-
Deep Learning, Goodfellow et alMachine vs. human
Machine
Human
Understanding context
✔
Thinking through the problem
✔
Asking the right questions
✔
Selecting the right tools
✔
Performing calculations quickly
✔
Performing repetitive tasks
✔
Following pre-defined rules
✔
Interpreting results
✔5Pattern discovery when inputs (x) and outputs (y) are known
Supervised machine learning
Input x:
Voter
Output y:

# Embed the query

In [12]:
inputs = clip_processor(text=[user_query], return_tensors="pt", padding=True)
text_embedding = clip_model.get_text_features(**inputs).detach().numpy().tolist()[0]

# Query the Pinecone vector db and return top 5 matches

In [13]:
query_results = index.query(vector=text_embedding, top_k=5, include_metadata=True)

In [14]:
query_results["matches"]

[{'id': '3001_ETA/page_9_img_2.png',
  'metadata': {'file_name': '3001_ETA', 'image_key': 'page_9_img_2.png'},
  'score': 0.334317118,
  'values': []},
 {'id': 'machine_learning_bootcamp_II/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_bootcamp_II',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387815,
  'values': []},
 {'id': 'machine_learning_overview/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_overview',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387815,
  'values': []},
 {'id': 'machine_learning_III/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_III',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387815,
  'values': []},
 {'id': 'machine_learning_bootcamp_II copy/page_8_img_1.png',
  'metadata': {'file_name': 'machine_learning_bootcamp_II copy',
               'image_key': 'page_8_img_1.png'},
  'score': 0.322387815,
  'values': []}]

# Connect to Mongodb

In [15]:
from pymongo import MongoClient
import gridfs

client = MongoClient(mongo_uri)

#client = MongoClient(MONGO_URI)

db = client["images"]

collection = db["images_for_rag"]

print("Connected to MongoDB successfully!")

Connected to MongoDB successfully!


# Pushing all the images to mongodb

In [16]:
file_names_list = os.listdir("data_processed")

fs = gridfs.GridFS(db)



In [None]:
for i in file_names_list:
    images_list = os.listdir(os.path.join("data_processed", i, "Images"))
    for j in images_list:
        with open(os.path.join("data_processed", i, "Images", j), "rb") as image_file:
            image_id = fs.put(image_file, filename=i + "/" + j)

In [67]:
# pushing one image to mongodb 
# fs = gridfs.GridFS(db)
# 
# # Store an image
# image_path = "/Users/dorukozar/PycharmProjects/pythonProject4/clustering ss.png"
# 
# with open(image_path, "rb") as image_file:
#     image_id = fs.put(image_file, filename="clustering.png")
# 
# print(f"Image stored with ID: {image_id}")

Image stored with ID: 679abd90fc38751164dac1ef


# Query MongDB

In [17]:
# image_data = fs.find_one({"filename": "clustering.png"})
image_data_list = []
for i in query_results["matches"]:
    filename = i["id"]
    image_data = fs.find_one({"filename": filename})
    image_data_list.append(image_data)

In [18]:
image_data_list

[<gridfs.synchronous.grid_file.GridOut at 0x1b94a08e0>,
 <gridfs.synchronous.grid_file.GridOut at 0x1b73360b0>,
 <gridfs.synchronous.grid_file.GridOut at 0x1b7443730>,
 <gridfs.synchronous.grid_file.GridOut at 0x1b74438b0>,
 <gridfs.synchronous.grid_file.GridOut at 0x1b952d660>]

# Delete Multiple Files

In [35]:
# for file_data in fs.find():  # This retrieves all stored files
#     file_id = file_data._id
#     fs.delete(file_id)  # Deletes both metadata (fs.files) and chunks (fs.chunks)
#     print(f"✅ Deleted file: {file_data.filename}")
# 
# print("✅ All files deleted from GridFS!")

✅ Deleted file: page_14_img_1.png
✅ Deleted file: page_19_img_2.png
✅ Deleted file: page_21_img_1.png
✅ Deleted file: page_19_img_1.png
✅ Deleted file: page_42_img_1.png
✅ Deleted file: page_9_img_1.png
✅ Deleted file: page_56_img_1.png
✅ Deleted file: page_33_img_1.png
✅ Deleted file: page_49_img_1.png
✅ Deleted file: page_2_img_1.png
✅ Deleted file: page_27_img_1.png
✅ Deleted file: page_12_img_1.png
✅ Deleted file: page_49_img_2.png
✅ Deleted file: page_44_img_1.png
✅ Deleted file: page_50_img_1.png
✅ Deleted file: page_27_img_2.png
✅ Deleted file: page_15_img_1.png
✅ Deleted file: page_20_img_1.png
✅ Deleted file: page_34_img_1.png
✅ Deleted file: page_18_img_1.png
✅ Deleted file: page_8_img_1.png
✅ Deleted file: page_20_img_2.png
✅ Deleted file: page_43_img_1.png
✅ Deleted file: page_39_img_1.png
✅ Deleted file: page_26_img_1.png
✅ Deleted file: page_32_img_1.png
✅ Deleted file: page_48_img_1.png
✅ Deleted file: page_45_img_2.png
✅ Deleted file: page_45_img_3.png
✅ Deleted file: p

# Delete one specific file

In [75]:
# fs.delete(image_data._id)
# image_data._id

In [86]:
from PIL import Image
import io

if len(image_data_list)>0:
    for j in image_data_list:
        if j:
            # Convert binary data to a PIL Image
            image = Image.open(io.BytesIO(j.read()))
            
            # Display the image
            image.show()
        else:
            print("❌ Image not found")

UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x1e4698630>

<_io.BufferedWriter name='retrieved_image.png'>

---
break
---

# Embedding text and image at the same time

In [20]:
from PIL import Image
import io
images = []
count = 0
if len(image_data_list)>0:
    for j in image_data_list:
        if j:
            print(count)
            # Convert binary data to a PIL Image
            image = Image.open(io.BytesIO(j.read()))
            images.append(image)
            count += 1
            
        else:
            print("❌ Image not found")

0
1
2
3
4


In [95]:


#image = Image.open("/Users/dorukozar/PycharmProjects/pythonProject4/clustering ss.png")



inputs = clip_processor(text=user_query, images=images, return_tensors="pt", padding=True)
outputs = clip_model(**inputs)

image_embeds = outputs.image_embeds  # Image embeddings
text_embeds = outputs.text_embeds  # Text embeddings

In [21]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor, AutoTokenizer

# Specify the model
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load the model and set it to MPS (Apple Silicon GPU)
device = torch.device("mps")  # Set the device to MPS
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # Use float16 precision
    device_map=None  # Explicitly manage the device
)
model.to(device)  # Move the model to the MPS device

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.model_max_length = 2048  # Increase from 77 to 2048





Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [22]:
prompt = f"""You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question.

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
4. Use examples where applicable to illustrate your answers.
5. Maintain a professional and helpful tone.

Question: {user_query}

Retrieved Information: {format_docs(retrieved_docs)}

Answer:
"""

In [23]:
# Load the processor
processor = AutoProcessor.from_pretrained(model_id, tokenizer=tokenizer)

# Load and preprocess the image
#img_path = "/Users/dorukozar/PycharmProjects/pythonProject4/clustering ss.png"
#image = Image.open(img_path)

# Prepare the messages
# messages = [
#     {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": "describe what these images are trying to explain"}]}
# ]

messages = [
    {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": prompt}]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

# Preprocess the inputs
inputs = processor(
    images,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(device)  # Move inputs to MPS device

In [25]:
%%time
# Generate output

output = model.generate(**inputs, max_new_tokens=10000)

# Decode and print the result
decoded_response = processor.decode(output[0])
print(decoded_response)

  test_elements = torch.tensor(test_elements)

KeyboardInterrupt

KeyboardInterrupt



In [26]:
print(decoded_response)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|><|image|><|image|><|image|><|image|>You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question.

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
4. Use examples where applicable to illustrate your answers.
5. Maintain a professional and helpful tone.

Question: What is supervised and unsupervised machine learning?

In [40]:
prompt = f"""You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question.

Guidelines:
1. Provide concise and informative answers that (mostly undergrad) students can understand.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
4. Use examples where applicable to illustrate your answers.
5. Maintain a professional and helpful tone.

Question: {user_query}

Retrieved Information: {format_docs(retrieved_docs)}

Image Descriptions: {decoded_response}

Answer:
"""
#prompt = template.format(question = user_query, context =  format_docs(retrieved_docs))
print(prompt)

You are an expert LLM assistant specialized in answering questions related to computer science/data science/machine learning/LLM. Use the retrieved information from RAG (Retrieved information and Image Descriptions) and your knowledge to respond accurately and clearly to each question.

Guidelines:
1. Provide concise and informative answers.
2. If the question is beyond the scope of your knowledge or the provided information, state, "I don't know."
3. If the context section (the information that is returned by RAG pipeline) has no information about a part of the question, please express that "The retrieved information did not contain answer to this question" but if you can answer it based on your knowledge please do
4. Use examples where applicable to illustrate your answers.
5. Maintain a professional and helpful tone.

Question: What is supervised and unsupervised machine learning?

Retrieved Information: Machine Learning Overview, EDA and Clustering
Brian Wright
brianwright@virginia

In [41]:
import ollama
desired_model = "deepseek-r1:7b"

response = ollama.chat(model=desired_model, messages=[
    {
        'role':'user',
        'content':prompt,
    },
])


ollama_response = response['message']['content']

print(ollama_response)

<think>
Okay, so I need to understand what supervised and unsupervised machine learning are. From what I gather from the information provided, supervised learning involves models that learn from labeled data, where both input features (x) and corresponding outputs (y) are known. This is used for tasks like classification and regression. The examples given were predicting a voter's political affiliation based on their voting history.

Unsupervised learning, on the other hand, deals with unlabeled data. Here, the model tries to find patterns or inherent groupings in the input data without knowing the corresponding outputs. Clustering is mentioned as an example, such as grouping customers by purchasing behavior without any prior information about their preferences.

Looking at the image description, it confirms this by showing Supervised Learning split into Classification and Regression, while Unsupervised Learning includes Clustering and Association. Association seems to involve finding 