In [2]:
from openai import OpenAI
import base64
import os
from dotenv import load_dotenv

load_dotenv()

openai_client = OpenAI()

def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def process_image(image_path):
    base64_image = encode_image(image_path)
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": '''You are an expert in tagging images based on the following criteria:
                - Claim Stance Tags: Support, Refute, Neutral
                - Keyword Tags: Specific to each image, based on your analysis or OCR of any text in the image. These keywords should be specific to the image and not general as they will be used to search for evidence later using vector search. Make the keywords short and concise.
                
                Avoid saying "This is an image from Pew regarding..." or "This is an image from the Pew Research Center regarding..."
                '''
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What's in this image?"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        max_tokens=200,
    )
    return response



# Process all images in the evidence folder
from pymongo import MongoClient

# Connect to MongoDB
mongodb_client = MongoClient('mongodb://localhost:27017/')
db = mongodb_client['pew_image_metadata']
collection = db['image_analysis']

evidence_folder = "evidence"
for filename in os.listdir(evidence_folder):
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp')):
        image_path = os.path.join(evidence_folder, filename)
        print(f"Processing {filename}:")
        result = process_image(image_path)
        
        # Prepare data for MongoDB
        analysis_data = {
            'filename': filename,
            'image_path': image_path,
            'analysis_result': result.choices[0].message.content
        }
        
        # Insert data into MongoDB
        collection.insert_one(analysis_data)
        
        print(f"Analysis for {filename} saved to MongoDB")
        print("\n" + "-"*50 + "\n")

# Close the MongoDB connection
mongodb_client.close()

Processing youthSocialMedia_ev19.png:
Analysis for youthSocialMedia_ev19.png saved to MongoDB

--------------------------------------------------

Processing youthSocialMedia_ev25.png:
Analysis for youthSocialMedia_ev25.png saved to MongoDB

--------------------------------------------------

Processing youthSocialMedia_ev24.png:
Analysis for youthSocialMedia_ev24.png saved to MongoDB

--------------------------------------------------

Processing youthSocialMedia_ev18.png:
Analysis for youthSocialMedia_ev18.png saved to MongoDB

--------------------------------------------------

Processing youthSocialMedia_ev26.png:
Analysis for youthSocialMedia_ev26.png saved to MongoDB

--------------------------------------------------

Processing youthSocialMedia_ev23.png:
Analysis for youthSocialMedia_ev23.png saved to MongoDB

--------------------------------------------------

Processing youthSocialMedia_ev22.png:
Analysis for youthSocialMedia_ev22.png saved to MongoDB

-----------------------

In [6]:
import chromadb
from chromadb.config import Settings

# After processing images and storing in MongoDB
# Initialize ChromaDB
chroma_client = chromadb.Client(Settings(persist_directory="./chroma_db"))
mongodb_client = MongoClient('mongodb://localhost:27017/')
db = mongodb_client['pew_image_metadata']
collection = chroma_client.create_collection(name="image_vectors")

# Retrieve data from MongoDB and store in ChromaDB
mongodb_data = db['image_analysis'].find()

for item in mongodb_data:
    filename = item['filename']
    description = item['analysis_result']
    
    # Add the document to ChromaDB
    collection.add(
        documents=[description],
        metadatas=[{"filename": filename}],
        ids=[filename]
    )

print("Data has been transferred from MongoDB to ChromaDB")

/Users/mrityunjay/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 39.4MiB/s]


Data has been transferred from MongoDB to ChromaDB


In [39]:
# Example of how to query ChromaDB based on user story
def find_relevant_images(user_story, n=3):
    results = collection.query(
        query_texts=[user_story],
        n_results=n
    )
    return results

# Example usage:
user_story = "I want to know about the screen time of teenagers on social media."
relevant_images = find_relevant_images(user_story)

In [42]:
print(relevant_images['documents'])
print()
print(relevant_images['ids'])

[['The image presents statistics about U.S. teens aged 13 to 17 regarding their usage of various social media platforms. It shows the percentage of teens who have ever used each platform and those who visit or use them almost constantly. \n\n**Key Statistics:**\n- **YouTube**: \n  - Ever use: 93%\n  - Almost constantly: 16%\n  \n- **TikTok**: \n  - Ever use: 63%\n  - Almost constantly: 17%\n  \n- **Snapchat**: \n  - Ever use: 60%\n  - Almost constantly: 14%\n  \n- **Instagram**: \n  - Ever use: 59%\n  - Almost constantly: 8%\n  \n- **Facebook**: \n  - Ever use: 33%\n  - Almost constantly: 3%\n\n**Note**: A small disclaimer indicates that some responses are not shown.\n\n### Claim Stance Tags:\n- Neutral\n\n### Keyword Tags:\n- Teen usage', 'The image is a chart showing the percentage of U.S. teens aged 13 to 17 who use various social media platforms. The data is presented for the years 2014-2015, 2022, and 2023. \n\nKey points include:\n\n- **YouTube** is the most popular platform, wit

In [46]:
evidence_folder = "evidence"
relevant_image_ids = relevant_images['ids']
relevant_image_paths = [os.path.join(evidence_folder, img_id) for img_id in relevant_image_ids if isinstance(img_id, str)]

In [47]:
relevant_image_paths

[]

In [7]:
# # Close the MongoDB connection
# mongodb_client.close()