In [1]:
# import packages
from pymongo import MongoClient
from dotenv import load_dotenv
import pandas as pd
import os
import re

In [2]:
# Connect to MongoDB and retrieve image URLs and metadata

# Load variables from a .env file (if one is present) into the environment
load_dotenv()

# Grab the MONGO_URI from local or from Kaggle
mongo_uri = os.getenv('MONGO_URI')

# Fail fast when the URI is missing: MongoClient(None) would otherwise fall
# back to its default host (localhost) and either query the wrong server or
# stall until server selection times out.
if not mongo_uri:
    raise RuntimeError(
        "MONGO_URI is not set; define it in a .env file or in the environment"
    )

# Connect to the MongoDB client
client = MongoClient(mongo_uri)

# Access the database and collection
db = client['test']  # Replace with your database name
collection = db['cameratrapmedias']  # Replace with your collection name

# Query every document in the collection, keeping only the image URL, the
# timestamp, the file name, and the element at index 1 (the second entry) of
# 'relativePath', which is exposed as 'folderName'.
data = list(collection.aggregate([
    {
        '$project': {
            '_id': 0,
            'publicURL': 1,
            'timestamp': 1,
            'folderName': { '$arrayElemAt': ['$relativePath', 1] },
            'fileName': 1
        }
    },
    # No $limit stage is applied; re-enable the line below to cap the result size
    # { '$limit': 1500 }
]))

# Convert the data to a pandas DataFrame for exploration
df = pd.DataFrame(data)
print(df.head())

            timestamp                                          publicURL  \
0 2024-01-27 13:33:15  https://urbanriverrangers.s3.amazonaws.com/ima...   
1 2024-01-24 18:56:50  https://urbanriverrangers.s3.amazonaws.com/ima...   
2 2024-01-24 19:01:54  https://urbanriverrangers.s3.amazonaws.com/ima...   
3 2024-01-24 19:03:05  https://urbanriverrangers.s3.amazonaws.com/ima...   
4 2024-01-24 19:04:19  https://urbanriverrangers.s3.amazonaws.com/ima...   

       fileName                               folderName  
0  SYFW0060.JPG                   2024-01-30_prologis_02  
1  SYFW0001.JPG  2024-01-30_Learnin_platform_camera_test  
2  SYFW0002.JPG  2024-01-30_Learnin_platform_camera_test  
3  SYFW0004.JPG  2024-01-30_Learnin_platform_camera_test  
4  SYFW0006.JPG  2024-01-30_Learnin_platform_camera_test  


In [3]:
# Fetch a single, unprojected document so every stored field can be inspected
record = collection.find_one({})  # an empty filter matches any document

# Show the raw document
print(record)

{'_id': ObjectId('671a925b731cf4b5c4fd2203'), 'mediaID': 'fb04201b6417ea917fdd24e1a7415d8a', 'imageHash': '4f78f0835a9518c3f460bba2c29782b1', 'timestamp': datetime.datetime(2024, 1, 27, 13, 33, 15), 'publicURL': 'https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_prologis_02/DCIM/100MEDIA/SYFW0060.JPG', 'relativePath': ['2024', '2024-01-30_prologis_02', 'DCIM', '100MEDIA', 'SYFW0060.JPG'], 'filePath': 'images/2024/2024-01-30_prologis_02/DCIM/100MEDIA/SYFW0060.JPG', 'filePublic': True, 'fileName': 'SYFW0060.JPG', 'fileMediatype': 'image/jpeg', 'exifData': {'Make': 'TC', 'Model': 'XG2', 'Software': 'R2.3', 'DateTime': '2024:01:27 13:33:15', 'ExifVersion': '0220', 'DateTimeOriginal': '2024:01:27 13:33:15', 'FocalLength': 0.0, 'ExposureTime': 0.03333333333333333, 'FNumber': 2.8, 'ISOSpeedRatings': 200}, 'exifExtracted': True, 'createdAt': datetime.datetime(2024, 10, 24, 13, 39, 43, 872000), 'fileLocations': [{'publicURL': 'https://urbanriverrangers.s3.amazonaws.com/images/20

In [5]:


# Turn an arbitrary string into a filename-friendly one
def make_filename(s):
    """Return ``s`` reduced to filename-safe characters.

    Every character that is not a word character, whitespace, a period, or a
    dash is dropped; each remaining run of whitespace is then collapsed into
    a single underscore. Letter case is left untouched.
    """
    # Strip everything outside the allowed set first, so whitespace that was
    # only separated by removed characters is collapsed together afterwards.
    stripped = re.sub(r'[^\w\s.-]', '', s)
    return re.sub(r'\s+', '_', stripped)

# Build "<folderName>--<fileName>" for every row and sanitize it in one pass
df['imageName'] = (df['folderName'] + '--' + df['fileName']).apply(make_filename)

# Preview the frame with the new column
print(df.head())

# Write the frame to a CSV file (without the index column) for inspection
df.to_csv('ur_test_medias.csv', index=False)

            timestamp                                          publicURL  \
0 2024-01-27 13:33:15  https://urbanriverrangers.s3.amazonaws.com/ima...   
1 2024-01-24 18:56:50  https://urbanriverrangers.s3.amazonaws.com/ima...   
2 2024-01-24 19:01:54  https://urbanriverrangers.s3.amazonaws.com/ima...   
3 2024-01-24 19:03:05  https://urbanriverrangers.s3.amazonaws.com/ima...   
4 2024-01-24 19:04:19  https://urbanriverrangers.s3.amazonaws.com/ima...   

       fileName                               folderName  \
0  SYFW0060.JPG                   2024-01-30_prologis_02   
1  SYFW0001.JPG  2024-01-30_Learnin_platform_camera_test   
2  SYFW0002.JPG  2024-01-30_Learnin_platform_camera_test   
3  SYFW0004.JPG  2024-01-30_Learnin_platform_camera_test   
4  SYFW0006.JPG  2024-01-30_Learnin_platform_camera_test   

                                           imageName  
0               2024-01-30_prologis_02--SYFW0060.JPG  
1  2024-01-30_Learnin_platform_camera_test--SYFW0...  
2  2024-01-30_Lea