# Load data to MongoDB Database

In [25]:
import pandas as pd
import pandas as pd
from transformers import BertModel, BertTokenizer
import torch
import pymongo
from pymongo import MongoClient
from dotenv import load_dotenv
import os

## 1. Load Embeddings

### Create new CSV file that contains image descriptions and image attributes

In [8]:
# Read the image descriptions table and report how many rows it has.
image_descriptions = pd.read_csv('output/image_descriptions.csv')
print('Number of rows in image_descriptions:', len(image_descriptions))

# Read the photo metadata table and report how many rows it has.
photo_metadata = pd.read_csv('output/photo_metadata.csv')
print('Number of rows in photo_metadata:', len(photo_metadata))

Number of rows in image_descriptions: 556
Number of rows in photo_metadata: 555


In [9]:
# Add the columns from photo_metadata to image_descriptions, joined on 'filename'.
# Normalise the extension first so both frames agree (all as .jpeg). The pattern
# is anchored to the end of the name so that only a trailing ".jpg" extension is
# rewritten, never a ".jpg" that happens to appear in the middle of a filename.
photo_metadata['filename'] = photo_metadata['filename'].str.replace(r'\.jpg$', '.jpeg', regex=True)
image_descriptions['filename'] = image_descriptions['filename'].str.replace(r'\.jpg$', '.jpeg', regex=True)

# Inner join (pandas' default, spelled out here): rows whose filename is missing
# from either frame are dropped, so the merged row count can be smaller than
# both inputs.
merged = pd.merge(image_descriptions, photo_metadata, on='filename', how='inner')
print('Number of rows in merged dataframe:', merged.shape[0])

# Save the merged dataframe to a new csv file in the 'output' folder.
merged.to_csv('output/data.csv', index=False)

Number of rows in merged dataframe: 545


### Create embeddings using BERT

#### Load and prepare data

In [10]:
# Read the merged table back and blank out missing values so that every cell
# can be interpolated into text without showing up as "nan".
data = pd.read_csv('output/data.csv').fillna('')

data.head()

Unnamed: 0,filename,description,people,location,date
0,2662f4ba-ff02-4175-b562-f27b01d1062a.jpeg,The image shows three individuals posing toget...,"Juan Daniel (Man), Kika (Woman), Tia Icha (Woman)","Perímetro Urbano Barranquilla, Atlántico, Colo...",2024-01-01 15:39:42-05:00
1,fe01fa69-6df1-48b8-80a7-eee303558bdd.jpeg,"In the image, there are two people posing for ...","Graciela (Woman), Pipe (Man)","Seattle, Washington, United States",2022-07-04 17:54:15-07:00
2,a36c2180-eb5c-40a1-8dd7-a950c226a0dd.jpeg,"In the image, there are three young men close ...","Daniel (Man), Javier (Man), Santiago (Man)","Ann Arbor, Michigan, United States",2022-04-09 23:58:02-04:00
3,IMG_2008.jpeg,This image features a parking lot situated in ...,,"Washington, United States",2022-06-26 14:00:10.604301-07:00
4,IMG_0824.jpeg,"In this image, we see a man and a woman seated...",Will (Man),"Ann Arbor, Michigan, United States",2022-04-01 01:22:35.760919-04:00


In [11]:
# Build one text blob per filename that bundles people, location, date and
# description. Keys are filenames, so if a filename occurs more than once the
# last row wins.
text_columns = ('filename', 'people', 'location', 'date', 'description')
data_dict = {
    filename: f"People: {people}\n Location: {location}\n Date: {date}\n Description: {description}"
    for filename, people, location, date, description in zip(
        *(data[column] for column in text_columns)
    )
}

# Turn the mapping into a two-column dataframe (filename, text).
data_df = pd.DataFrame(list(data_dict.items()), columns=['filename', 'text'])

# Persist it to the 'output' folder.
data_df.to_csv('output/data_text.csv', index=False)

data_df.head()


Unnamed: 0,filename,text
0,2662f4ba-ff02-4175-b562-f27b01d1062a.jpeg,"People: Juan Daniel (Man), Kika (Woman), Tia I..."
1,fe01fa69-6df1-48b8-80a7-eee303558bdd.jpeg,"People: Graciela (Woman), Pipe (Man)\n Locatio..."
2,a36c2180-eb5c-40a1-8dd7-a950c226a0dd.jpeg,"People: Daniel (Man), Javier (Man), Santiago (..."
3,IMG_2008.jpeg,"People: \n Location: Washington, United States..."
4,IMG_0824.jpeg,"People: Will (Man)\n Location: Ann Arbor, Mich..."


#### Embed Data

In [12]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def tokenize_text(df, tokenizer, model=None):
    """Tokenize every entry of ``df['text']`` as one padded, truncated batch.

    Args:
        df: Table with a ``'text'`` column holding the strings to tokenize.
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, padding=True,
            truncation=True, return_tensors="pt")``.
        model: Unused. Kept (now optional) only so that existing three-argument
            calls keep working.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    texts = df['text'].tolist()
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Tokenize all texts in data_df as a single batch.
tokenized_text = tokenize_text(df=data_df, tokenizer=tokenizer, model=model)


In [13]:
# Inference only: run the whole tokenized batch through the model with gradient
# tracking switched off.
with torch.no_grad():
    outputs = model(**tokenized_text)
    # Keep the hidden states of the last layer, read from the model output's
    # `last_hidden_state` attribute.
    last_hidden_states = outputs.last_hidden_state

In [14]:
# Show the dimensions of the hidden-state tensor.
print(last_hidden_states.size())

torch.Size([545, 309, 768])


### Load embeddings to MongoDB Database

In [28]:
# Read the connection string from the environment (optionally via a .env file).
load_dotenv()
uri = os.getenv('MONGODB_URI')

# Fail fast when the variable is missing. Per the PyMongo documentation,
# MongoClient falls back to its default host when no URI is given, and later
# cells clear collections, so silently targeting the wrong server would be
# destructive.
if not uri:
    raise RuntimeError(
        "MONGODB_URI is not set; define it in the environment or in a .env file."
    )

# Connect to the MongoDB client
client = MongoClient(uri)

# Select the database
db = client['photo-rag-db']

# Select the collection that stores one embedding per photo
collection_emb = db['photo-embeddings']

In [20]:
filenames = data_df['filename'].to_list()
texts = data_df['text'].to_list()
documents = []

# Embed everything BEFORE touching the collection, so that a failure part-way
# through the model passes leaves the previously stored embeddings intact.
for filename, text in zip(filenames, texts):
    # Tokenize one text at a time.
    tokenized_text = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**tokenized_text)

    # Hidden states of the last layer for this text.
    last_hidden_states = outputs.last_hidden_state

    # Average over dim=1 to get one vector per input; [0] picks the single
    # input that was passed in, yielding a plain list that MongoDB can store.
    mean_embedding = torch.mean(last_hidden_states, dim=1).tolist()[0]

    documents.append({'filename': filename, 'embedding': mean_embedding})

# Remove the old documents. delete_many({}) is used instead of drop() because
# drop() removes the collection itself together with its indexes, whereas the
# intent here is only to replace the stored documents. It is a no-op on an
# empty collection, so no count check is needed.
collection_emb.delete_many({})

# Insert all documents in one batch (insert_many rejects an empty list).
if documents:
    collection_emb.insert_many(documents)

## 2. Load Image-Description data

In [29]:
# Open a client and pick the database and the collection that will hold the
# per-photo description text.
client = MongoClient(uri)
db = client['photo-rag-db']
collection_desc = db['photo-descriptions']

# Read the saved filename/text table and split it into two parallel lists.
data_text_df = pd.read_csv('output/data_text.csv')
filenames, descriptions = (
    data_text_df[column].to_list() for column in ('filename', 'text')
)

In [22]:
# Build every document up front so they can be written in a single round trip
# instead of one insert_one call per photo.
description_documents = [
    {'filename': filename, 'description': description}
    for filename, description in zip(filenames, descriptions)
]

# Remove the old documents. delete_many({}) is used instead of drop() because
# drop() removes the collection itself together with its indexes, whereas the
# intent here is only to replace the stored documents. It is a no-op on an
# empty collection, so no count check is needed.
collection_desc.delete_many({})

# Insert all documents in one batch (insert_many rejects an empty list).
if description_documents:
    collection_desc.insert_many(description_documents)