In [19]:
import pandas as pd
# Read each dataset's metadata and rewrite its image paths so they resolve
# relative to this notebook.
pew = pd.read_csv('../dataset/pew_dataset/metadata.csv').assign(
    imgPath=lambda frame: frame['imgPath'].str.replace('imgs', '../dataset/pew_dataset/pew_imgs')
)
statista = pd.read_csv('../dataset/statista_dataset/metadata.csv').assign(
    imgPath=lambda frame: frame['imgPath'].str.replace(
        'out/two_col/imgs', '../dataset/statista_dataset/statista_imgs'
    )
)

# Only these columns are carried into the combined table
columns = ['title', 'caption', 'imgPath']
pew_df = pew.loc[:, columns]
statista_df = statista.loc[:, columns]

# Stack both sources; ignore_index=True renumbers the rows 0..n-1
combined_df = pd.concat([pew_df, statista_df], ignore_index=True)

# Put a 1-based 'id' column in the first position, derived from the fresh row index
combined_df.insert(0, 'id', combined_df.index + 1)

combined_df.head()


Unnamed: 0,id,title,caption,imgPath
0,1,"Foreign-born population in the United States, ...",The foreign-born population residing in the U....,../dataset/pew_dataset/pew_imgs/1.png
1,2,"English proficiency among U.S. immigrants, 198...","Since 1980, the share of immigrants who are pr...",../dataset/pew_dataset/pew_imgs/2.png
2,3,"Languages spoken among U.S. immigrants, 2018","Among the nation’s immigrants, Spanish is by f...",../dataset/pew_dataset/pew_imgs/3.png
3,4,"Hispanic population in the U.S., 2000-2017",There were nearly 60 million Latinos in the Un...,../dataset/pew_dataset/pew_imgs/4.png
4,5,Weekly broadcast audience for top 20 NPR-affil...,The top 20 NPR-affiliated public radio station...,../dataset/pew_dataset/pew_imgs/5.png


In [20]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the CLIP processor and model from one shared pretrained checkpoint name
clip_checkpoint = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(clip_checkpoint)
model = CLIPModel.from_pretrained(clip_checkpoint)

In [21]:
def generate_embeddings(images, texts):
    """Embed images and texts with the notebook-level CLIP ``processor`` and ``model``.

    Args:
        images: A single image or a list of images, passed to the processor as ``images``.
        texts: A string or a list of strings, passed to the processor as ``text``.

    Returns:
        A ``(image_embeddings, text_embeddings)`` tuple taken from the model
        output's ``image_embeds`` and ``text_embeds`` fields. Callers in this
        notebook treat both as 2-D tensors (they normalise along ``dim=1``).
    """
    # truncation=True clips over-long text to the tokenizer's maximum length;
    # without it, text beyond CLIP's fixed context fails inside the model.
    inputs = processor(
        text=texts,
        images=images,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    # Inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
    image_embeddings = outputs.image_embeds  # Access image embeddings
    text_embeddings = outputs.text_embeds    # Access text embeddings
    return image_embeddings, text_embeddings

In [22]:
# Extract image paths from DataFrame and take only the first 10
max_images = 10
image_paths = combined_df['imgPath'].tolist()[:max_images]

# Load images. Image.open keeps the file handle open until the pixel data is
# read, so opening every path up front leaks descriptors as the slice grows.
# Read each file inside a `with` block and keep a detached in-memory copy.
images = []
for path in image_paths:
    with Image.open(path) as opened:
        images.append(opened.copy())

# generate_embeddings requires text alongside the images, so the same prompt
# is repeated once per image; the resulting text embeddings are discarded.
specific_text_prompt = ["Provide a statistical summary and analysis of the chart"] * len(images)

# Generate embeddings for images and text
image_embeddings, _ = generate_embeddings(images, specific_text_prompt)

In [24]:
from PIL import Image
# Placeholder image: a solid white RGB canvas.
# 224x224 is described in this notebook as a common input size for image models like CLIP.
image_size = (224, 224)
color = (255, 255, 255)  # pure white as an (R, G, B) triple

dummy_image = Image.new(mode="RGB", size=image_size, color=color)

In [29]:
# Example text query
text_query = ["hispanic people"]

# generate_embeddings always takes an image argument, so a placeholder image is
# supplied and its embedding is discarded.
_, query_embedding = generate_embeddings(dummy_image, text_query)

# Scale every row to unit L2 norm
image_embeddings = image_embeddings / torch.norm(image_embeddings, dim=1, keepdim=True)
query_embedding = query_embedding / torch.norm(query_embedding, dim=1, keepdim=True)

# Cosine similarities: one row per query, one column per image
similarities = cosine_similarity(query_embedding.cpu().numpy(), image_embeddings.cpu().numpy())  # Use .cpu() if on CUDA

# Pick the best image for the (single) query. Index row 0 explicitly: np.argmax
# on the whole 2-D matrix returns a position in the flattened array, which only
# matches the image index while there is exactly one query row.
most_similar_idx = np.argmax(similarities[0])

# Retrieve the most similar image
most_similar_image_path = image_paths[most_similar_idx]
print("Most similar image is:", most_similar_image_path)

Most similar image is: ../dataset/pew_dataset/pew_imgs/3.png


In [30]:
# Sort the single query's similarity row in ascending order, then keep the last
# three positions and flip them so the best match comes first.
ascending_order = np.argsort(similarities[0])
top_indices = ascending_order[-3:][::-1]

# Map the winning positions back to their image paths
top_images_paths = []
for idx in top_indices:
    top_images_paths.append(image_paths[idx])

# Report the ranking, numbered from 1
print("Top 3 most similar images are:")
for rank, image_path in enumerate(top_images_paths, start=1):
    print(f"{rank}: {image_path}")

Top 3 most similar images are:
1: ../dataset/pew_dataset/pew_imgs/3.png
2: ../dataset/pew_dataset/pew_imgs/2.png
3: ../dataset/pew_dataset/pew_imgs/1.png
