In [1]:
!pip install ultralytics pandas tqdm -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import csv
import ast
from pathlib import Path
from collections import defaultdict
from ultralytics import YOLO
from tqdm.auto import tqdm
import torch

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [3]:
# Input locations: the image folder and the mapping file both live under
# the same base directory.
BASE_INPUT_DIR = Path('/kaggle/input/part-1')
IMAGE_DIR = BASE_INPUT_DIR.joinpath('images_part1', 'images_part1')
MAPPING_FILE = BASE_INPUT_DIR.joinpath('p1.txt')

# CSV file that receives one row of detected objects per post.
OUTPUT_CSV = Path('/kaggle/working', 'part1_objects.csv')

In [4]:
def load_post_to_image_map(file_path):
    """
    Read the mapping file and link every post_id to its username and image files.

    A usable line holds exactly three tab-separated fields: the username, the
    post identifier (any '.info' text is removed from it) and a Python
    list/tuple literal with the image file names. Lines with a different
    number of fields are ignored; lines whose third field is not a valid
    list/tuple literal are skipped and counted instead of aborting the load.

    Args:
        file_path: Location of the mapping file (str or Path).

    Returns:
        A dict shaped like
        {post_id: {'username': 'user1', 'images': ['img1.jpg', 'img2.jpg']}},
        or None when the file does not exist. When a post_id occurs on several
        lines, the last line wins.
    """
    print(f"Loading post-to-image map from {file_path}...")
    path = Path(file_path)  # accept plain strings as well as Path objects
    if not path.exists():
        print(f"ERROR: Mapping file not found at {file_path}")
        return None

    mapping = {}
    malformed_lines = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 3:
                continue

            username, raw_post_id, raw_images = parts
            try:
                image_list = ast.literal_eval(raw_images)
            except (ValueError, SyntaxError, TypeError):
                # One corrupt line must not abort the whole load.
                malformed_lines += 1
                continue

            # A bare string would later be iterated character by character,
            # so only real sequences of file names are accepted.
            if not isinstance(image_list, (list, tuple)):
                malformed_lines += 1
                continue

            post_id = raw_post_id.replace('.info', '')
            mapping[post_id] = {'username': username, 'images': list(image_list)}

    if malformed_lines:
        print(f"WARNING: skipped {malformed_lines} line(s) with an unreadable image list.")
    print(f"Loaded {len(mapping)} unique posts.")
    return mapping

def load_processed_posts(csv_path):
    """
    Collect the post ids that are already recorded in the output CSV.

    The first row of the file is treated as a header and discarded; the first
    column of every following non-empty row is taken as a post id.

    Args:
        csv_path: Location of the output CSV (str or Path).

    Returns:
        A set of post-id strings. The set is empty when the file is missing,
        has zero bytes, or contains no rows after the header.
    """
    path = Path(csv_path)  # accept plain strings as well as Path objects
    if not path.exists() or os.path.getsize(path) == 0:
        return set()

    print(f"Loading previously processed posts from {csv_path}...")
    # newline='' lets the csv module do its own line-ending handling.
    with open(path, 'r', newline='') as f:
        reader = csv.reader(f)
        if next(reader, None) is None:  # nothing to read, not even a header
            return set()
        processed = {row[0] for row in reader if row}

    print(f"Found {len(processed)} posts already processed.")
    return processed

In [5]:
# Run inference on the GPU when torch reports one, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

post_to_image_map = load_post_to_image_map(MAPPING_FILE)

if post_to_image_map:
    # Posts already present in OUTPUT_CSV are skipped, so re-running this cell
    # resumes an earlier run instead of starting over.
    processed_posts = load_processed_posts(OUTPUT_CSV)

    print("Loading YOLOv8 model...")
    model = YOLO('yolov8m.pt')
    model.to(device)
    print("Model loaded.")

    # Outcome tallies: without them, a run in which every post is skipped
    # prints the same messages as a run that wrote results.
    posts_written = 0
    posts_already_done = 0
    posts_without_images = 0
    posts_failed = 0

    with open(OUTPUT_CSV, 'a', newline='') as f:
        writer = csv.writer(f)
        # Append mode keeps earlier rows; only a brand-new file needs the header.
        if os.path.getsize(OUTPUT_CSV) == 0:
            writer.writerow(['post_id', 'detected_objects'])

        for post_id, post_data in tqdm(post_to_image_map.items(), desc="Processing Posts"):
            if post_id in processed_posts:
                posts_already_done += 1
                continue

            username = post_data['username']
            image_files = post_data['images']

            # Images are looked up on disk as "<username>-<image file name>";
            # names that do not exist in IMAGE_DIR are dropped.
            candidate_paths = (IMAGE_DIR / f"{username}-{img_file}" for img_file in image_files)
            image_paths_to_process = [path for path in candidate_paths if path.exists()]

            if not image_paths_to_process:
                posts_without_images += 1
                continue

            # Union of the class names detected across all images of the post.
            all_objects_for_post = set()
            try:
                results = model(image_paths_to_process, verbose=False, device=device)
                for res in results:
                    class_names = res.names
                    for c in res.boxes.cls:
                        all_objects_for_post.add(class_names[int(c)])
            except Exception as e:
                # Best effort: report the failure and move on. Nothing is
                # written for this post, so a later run will try it again.
                print(f"ERROR processing post {post_id}. Error: {e}")
                posts_failed += 1
                continue

            writer.writerow([post_id, sorted(all_objects_for_post)])
            # Flush per row so finished posts reach the file before the next one starts.
            f.flush()
            posts_written += 1

    print("\nProcessing complete!")
    print(f"Results saved to {OUTPUT_CSV}")
    print(
        f"Summary: {posts_written} written, {posts_already_done} already processed, "
        f"{posts_without_images} without any image on disk, {posts_failed} failed."
    )
else:
    print("Could not load post-to-image map. Aborting.")

Using device: cuda
Loading post-to-image map from /kaggle/input/part-1/p1.txt...
Loaded 149950 unique posts.
Loading YOLOv8 model...
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8m.pt to 'yolov8m.pt': 100% ━━━━━━━━━━━━ 49.7MB 91.9MB/s 0.5s
Model loaded.


Processing Posts:   0%|          | 0/149950 [00:00<?, ?it/s]


Processing complete!
Results saved to /kaggle/working/part1_objects.csv
