## Downloading .parquet files

In [None]:
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00000-of-00330.parquet
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00001-of-00330.parquet
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00002-of-00330.parquet
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00003-of-00330.parquet
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00004-of-00330.parquet

In [2]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import os
import ollama
import tempfile
import time

## Downloading photos and classification using minicpm-v:8b (ollama)

### Setup (need to be run for every .parquet file)

In [3]:

model_name = "minicpm-v:8b"
parquet_file = "train-00002-of-00330.parquet"


# Ścieżki wyjściowe
INSIDE_DIR = './dataset/inside'
OUTSIDE_DIR = './dataset/outside'
not_known_DIR = './dataset/not_known'
os.makedirs(INSIDE_DIR, exist_ok=True)
os.makedirs(OUTSIDE_DIR, exist_ok=True)



# Wczytaj Parquet do DataFrame
df = pd.read_parquet(parquet_file)

headers = {
    "User-Agent": "IN/OUT DeepLearning Project (mieszkowskifff@gmail.com)"
}

### Function for asking model and saving file under proper directory

In [4]:
def classify_and_save(image_url, caption, idx):
    try:
        response = requests.get(image_url, headers = headers, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as e:
        print(f"[{idx}] Błąd pobierania obrazu: {e}")
        return

    try:
        prompt = f"""You are an image scene classifier. Based on the image and the following caption, classify the scene strictly as "inside", "outside" or "not known".

Caption: "{caption}"

Respond only with: "inside", "outside" or "not known".
"""
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            image.save(tmp.name)
            image_path = tmp.name

        result = ollama.chat(
            model = model_name,
            messages = [{
                "role": "user",
                "content": prompt,
                "images": [image_path]
            }]
        )

        reply = result["message"]["content"].strip().lower()
        if "inside" in reply:
            out_path = os.path.join(INSIDE_DIR, f"{idx}_{time.time()}.jpg")
        elif "outside" in reply:
            out_path = os.path.join(OUTSIDE_DIR, f"{idx}_{time.time()}.jpg")
        else:
            out_path = os.path.join(not_known_DIR, f"{idx}_{time.time()}.jpg")


        image.save(out_path)
        print(f"[{idx}] Zapisano: {reply} → {out_path}")

    except Exception as e:
        print(f"[{idx}] Błąd modelu: {e}")


### Run the downloading

In [None]:
for idx, row in df.iterrows():
    classify_and_save(row['image_url'], row['caption_attribution_description'], idx)


[0] Zapisano: outside → ./dataset/outside/0_1749140542.607868.jpg
[1] Zapisano: inside → ./dataset/inside/1_1749140543.161938.jpg
[2] Błąd modelu: [Errno 2] No such file or directory: './dataset/not_known/2_1749140544.429642.jpg'
[3] Zapisano: inside → ./dataset/inside/3_1749140546.6311276.jpg
[4] Zapisano: inside → ./dataset/inside/4_1749140547.6052117.jpg
[5] Zapisano: outside → ./dataset/outside/5_1749140548.3845892.jpg
[6] Zapisano: outside → ./dataset/outside/6_1749140550.7292733.jpg
[7] Zapisano: outside → ./dataset/outside/7_1749140553.2497263.jpg
[8] Zapisano: outside → ./dataset/outside/8_1749140554.5359519.jpg
[9] Zapisano: inside → ./dataset/inside/9_1749140556.5299706.jpg
[10] Zapisano: inside → ./dataset/inside/10_1749140558.8867617.jpg
[11] Zapisano: outside → ./dataset/outside/11_1749140561.761361.jpg
[12] Zapisano: outside → ./dataset/outside/12_1749140562.6752684.jpg
[13] Błąd modelu: [Errno 2] No such file or directory: './dataset/not_known/13_1749140563.681521.jpg'
[