## Downloading .parquet files

In [None]:
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00000-of-00330.parquet
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00001-of-00330.parquet
!wget https://huggingface.co/datasets/wikimedia/wit_base/resolve/main/data/train-00002-of-00330.parquet

In [1]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import os
import ollama
import tempfile
import time

## Downloading photos and classification using minicpm-v:8b (ollama)

### Setup (need to be run for every .parquet file)

In [2]:

model_name = "minicpm-v:8b" # also llava:7b was tested, but it often classified as "inside" images that were actually obviously "inside" images
parquet_file = "train-00002-of-00330.parquet"


INSIDE_DIR = './dataset/inside'
OUTSIDE_DIR = './dataset/outside'
not_known_DIR = './dataset/not_known'
os.makedirs(INSIDE_DIR, exist_ok=True)
os.makedirs(OUTSIDE_DIR, exist_ok=True)


df = pd.read_parquet(parquet_file)

headers = {
    "User-Agent": "IN/OUT DeepLearning Project (mieszkowskifff@gmail.com)"
}

### Function for asking model and saving file under proper directory

In [3]:
def classify_and_save(image_url, caption, idx):
    try:
        response = requests.get(image_url, headers = headers, timeout=10)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
    except Exception as e:
        print(f"[{idx}] Błąd pobierania obrazu: {e}")
        return

    try:
        prompt = f"""You are an image scene classifier. Based on the image and the following caption, classify the scene strictly as "inside", "outside" or "not known".

Caption: "{caption}"

Respond only with: "inside", "outside" or "not known".
"""
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp:
            image.save(tmp.name)
            image_path = tmp.name

        result = ollama.chat(
            model = model_name,
            messages = [{
                "role": "user",
                "content": prompt,
                "images": [image_path]
            }]
        )

        reply = result["message"]["content"].strip().lower()
        if "inside" in reply:
            out_path = os.path.join(INSIDE_DIR, f"{idx}_{round(time.time())}.jpg")
        elif "outside" in reply:
            out_path = os.path.join(OUTSIDE_DIR, f"{idx}_{round(time.time())}.jpg")
        else:
            out_path = os.path.join(not_known_DIR, f"{idx}_{round(time.time())}.jpg")


        image.save(out_path)
        print(f"[{idx}] Zapisano: {reply} → {out_path}")

    except Exception as e:
        print(f"[{idx}] Błąd modelu: {e}")


### Run the downloading

In [None]:
for idx, row in df.iterrows():
    classify_and_save(row['image_url'], row['caption_attribution_description'], idx)


[0] Zapisano: outside. → ./dataset/outside/0_1749140852.6780584.jpg
[1] Zapisano: inside → ./dataset/inside/1_1749140853.1726313.jpg
[2] Zapisano: not known. → ./dataset/not_known/2_1749140854.4906976.jpg
[3] Zapisano: inside → ./dataset/inside/3_1749140856.6994412.jpg
[4] Zapisano: inside. → ./dataset/inside/4_1749140857.695384.jpg
[5] Zapisano: outside → ./dataset/outside/5_1749140858.511278.jpg
[6] Zapisano: outside → ./dataset/outside/6_1749140860.9059908.jpg
[7] Zapisano: outside → ./dataset/outside/7_1749140863.462371.jpg
[8] Zapisano: outside → ./dataset/outside/8_1749140864.7028122.jpg
[9] Zapisano: inside → ./dataset/inside/9_1749140866.7470326.jpg
[10] Zapisano: inside → ./dataset/inside/10_1749140869.230453.jpg
[11] Zapisano: outside → ./dataset/outside/11_1749140872.1603081.jpg
[12] Zapisano: outside → ./dataset/outside/12_1749140873.1346273.jpg
[13] Zapisano: not known → ./dataset/not_known/13_1749140874.027256.jpg
[14] Błąd pobierania obrazu: 404 Client Error: Not Found f



[721] Błąd modelu: POST predict: Post "http://127.0.0.1:42297/completion": EOF (status code: 500)
[722] Zapisano: inside → ./dataset/inside/722_1749142380.0572155.jpg
[723] Zapisano: outside → ./dataset/outside/723_1749142383.391446.jpg
[724] Zapisano: outside → ./dataset/outside/724_1749142385.808962.jpg
[725] Zapisano: outside → ./dataset/outside/725_1749142388.522416.jpg
[726] Zapisano: outside. → ./dataset/outside/726_1749142389.905238.jpg
[727] Zapisano: outside → ./dataset/outside/727_1749142392.5726821.jpg
[728] Zapisano: inside → ./dataset/inside/728_1749142395.2813835.jpg
[729] Zapisano: outside → ./dataset/outside/729_1749142396.9812331.jpg
[730] Zapisano: outside → ./dataset/outside/730_1749142400.7660851.jpg
[731] Zapisano: outside → ./dataset/outside/731_1749142403.2548444.jpg
[732] Zapisano: outside → ./dataset/outside/732_1749142404.7848468.jpg
[733] Zapisano: outside → ./dataset/outside/733_1749142407.7249947.jpg
[734] Zapisano: outside → ./dataset/outside/734_174914240

### Some of the photos were broken. We gathered $1584$ inside and $2798$ outside photos. The majority class is ~$63.9$%.

## Training the model

### Imports

In [1]:
import torch
import torch.nn as nn
from torchvision.models import inception_v3, Inception_V3_Weights
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import torchsummary
from tqdm import tqdm

### Model (transfer learning)

In [2]:
class InceptionCustom(nn.Module):
    def __init__(self, num_classes: int):
        super(InceptionCustom, self).__init__()
        self.base = inception_v3(weights=Inception_V3_Weights.DEFAULT, aux_logits = True)
        self.base.fc = nn.Identity()
        self.pool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Linear(2048, num_classes)

    def freeze_base(self):
        for param in self.base.parameters():
            param.requires_grad = False
        
    def unfreeze_base(self):
        for param in self.base.parameters():
            param.requires_grad = True


    def forward(self, x):
        x = self.base(x)
        if isinstance(x, tuple):
            x = x[0]
        x = self.classifier(x)
        return x

### Hyperparameters

In [3]:
torch.manual_seed(42)

data_dir = "./dataset"
batch_size = 64
num_epochs = 10
learning_rate = 1e-4
num_classes = 2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Dataset

In [4]:
transform = transforms.Compose([
    transforms.Resize((320, 320)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean = [0.5] * 3, std = [0.5] * 3),
])

dataset = datasets.ImageFolder(root = data_dir, transform = transform)

### Splitting the dataset into train ($80$%) and validation ($20$%)

In [5]:
val_split = 0.2
total_size = len(dataset)
val_size = int(total_size * val_split)
train_size = total_size - val_size

In [6]:
train_dataset_split, val_dataset_split = random_split(dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset_split, batch_size = batch_size, shuffle = True, num_workers = 4)
val_loader = DataLoader(val_dataset_split, batch_size = batch_size, shuffle = False, num_workers = 4)

### Initializing the model

In [7]:
model = InceptionCustom(num_classes).to(device)
torchsummary.summary(model, input_size=(3, 320, 320), device = device.type)
model.freeze_base()

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 159, 159]             864
       BatchNorm2d-2         [-1, 32, 159, 159]              64
       BasicConv2d-3         [-1, 32, 159, 159]               0
            Conv2d-4         [-1, 32, 157, 157]           9,216
       BatchNorm2d-5         [-1, 32, 157, 157]              64
       BasicConv2d-6         [-1, 32, 157, 157]               0
            Conv2d-7         [-1, 64, 157, 157]          18,432
       BatchNorm2d-8         [-1, 64, 157, 157]             128
       BasicConv2d-9         [-1, 64, 157, 157]               0
        MaxPool2d-10           [-1, 64, 78, 78]               0
           Conv2d-11           [-1, 80, 78, 78]           5,120
      BatchNorm2d-12           [-1, 80, 78, 78]             160
      BasicConv2d-13           [-1, 80, 78, 78]               0
           Conv2d-14          [-1, 192,

In [8]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

### Training loop

In [9]:
def train(
        model,
        epochs,
        train_loader,
        val_loader,
        criterion,
        optimizer,
):
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        with torch.no_grad():
            model.eval()
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                loss = criterion(outputs, labels)
                running_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        accuracy = correct / total

        print(f"Epoch {epoch+1}: Valid Loss = {loss:.4f}, Valid Acc = {accuracy:.4f}")

In [None]:
train(
    model,
    num_epochs,
    train_loader,
    val_loader,
    criterion,
    optimizer
)

Epoch 1/10:  29%|██▉       | 16/55 [00:21<00:25,  1.52it/s]

### Without finetuning we've obtained $84.8$% accuracy.

In [16]:
learning_rate = 1e-6
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
model.unfreeze_base()
epochs = 5
train(
    model,
    epochs,
    train_loader,
    val_loader,
    criterion,
    optimizer
)

Epoch 1/10: 100%|██████████| 55/55 [01:06<00:00,  1.22s/it]


Epoch 1: Valid Loss = 0.4703, Valid Acc = 0.8516


Epoch 2/10: 100%|██████████| 55/55 [01:06<00:00,  1.20s/it]


Epoch 2: Valid Loss = 0.4811, Valid Acc = 0.8596


Epoch 3/10: 100%|██████████| 55/55 [01:06<00:00,  1.22s/it]


Epoch 3: Valid Loss = 0.4826, Valid Acc = 0.8584


Epoch 4/10: 100%|██████████| 55/55 [01:07<00:00,  1.23s/it]


Epoch 4: Valid Loss = 0.4521, Valid Acc = 0.8642


Epoch 5/10: 100%|██████████| 55/55 [01:10<00:00,  1.28s/it]


Epoch 5: Valid Loss = 0.4600, Valid Acc = 0.8619


Epoch 6/10:  24%|██▎       | 13/55 [00:22<01:14,  1.77s/it]


KeyboardInterrupt: 

In [None]:
model_name = "Finetuned_model"
torch.save(model.state_dict(), f"{model_name}.pth")

## 

## Training the whole model directly

In [None]:
model = InceptionCustom(num_classes).to(device)
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
model.unfreeze_base()
epochs = 10
train(
    model,
    epochs,
    train_loader,
    val_loader,
    criterion,
    optimizer
)

In [None]:
model_name = "Direct_model"
torch.save(model.state_dict(), f"{model_name}.pth")