In [8]:
%pip install torch pandas torchvision scikit-learn tqdm kaggle timm -q

Note: you may need to restart the kernel to use updated packages.


In [2]:
!apt update -qq
!apt install -qq unzip
!unzip -q data.zip -d data

[1;31mE: [0mCould not open lock file /var/lib/apt/lists/lock - open (13: Permission denied)[0m
[1;31mE: [0mUnable to lock directory /var/lib/apt/lists/[0m
[1;33mW: [0mProblem unlinking the file /var/cache/apt/pkgcache.bin - RemoveCaches (13: Permission denied)[0m
[1;33mW: [0mProblem unlinking the file /var/cache/apt/srcpkgcache.bin - RemoveCaches (13: Permission denied)[0m
[1;31mE: [0mCould not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)[0m
[1;31mE: [0mUnable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?[0m


In [4]:
import os
import shutil

import pandas as pd

# Define paths
data_root = "data"
images_dir = os.path.join(data_root, "images")

# Create images directory if it doesn't exist
os.makedirs(images_dir, exist_ok=True)

# List to store image paths and labels
dataset = []

# Loop through each subfolder
for subfolder in os.listdir(data_root):
    subfolder_path = os.path.join(data_root, subfolder)

    # Ensure it's a directory
    if os.path.isdir(subfolder_path) and subfolder != "images":
        # Loop through images inside the subfolder
        for image in os.listdir(subfolder_path):
            old_image_path = os.path.join(subfolder_path, image)

            # Ensure it's a file (image)
            if os.path.isfile(old_image_path):
                # Define new image path in "data/images" directory
                new_image_path = os.path.join(images_dir, image)

                # If filename already exists, rename it to avoid conflicts
                if os.path.exists(new_image_path):
                    base, ext = os.path.splitext(image)
                    counter = 1
                    while os.path.exists(new_image_path):
                        new_image_path = os.path.join(images_dir, f"{base}_{counter}{ext}")
                        counter += 1

                # Move image
                shutil.move(old_image_path, new_image_path)

                # Append to dataset with updated path and original label
                dataset.append({"image_path": new_image_path, "label": subfolder})

        # Optionally remove empty subfolder after moving images
        os.rmdir(subfolder_path)

df = pd.DataFrame(dataset)
df = df.rename(columns={"image_path": "image_id"})
df["image_id"] = df["image_id"].str.replace("data/images/", "", regex=False)

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

df.to_csv(os.path.join(data_root, "dataset.csv"), index=False)

label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [1]:
# To load the dataset again:
import pandas as pd
df = pd.read_csv('data/dataset.csv')

In [2]:
df["label"].value_counts()

1    1722
7    1194
6     663
5     652
0     471
3     346
8     316
2     314
4     297
Name: label, dtype: int64

In [3]:
import pandas as pd

# Assuming you already have the DataFrame `df`
# and that the column name is actually 'label'

min_count = df['label'].value_counts().min()

# Group by label and take a random sample of size min_count from each class
df_balanced = df.groupby('label').sample(n=min_count, random_state=42)

# Shuffle the resulting balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check new class distribution
print(df_balanced['label'].value_counts())


1    297
4    297
5    297
6    297
8    297
2    297
3    297
7    297
0    297
Name: label, dtype: int64


In [4]:
import os

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from dataset import Dataset


In [5]:
train_df, temp_df = train_test_split(df_balanced, test_size=0.2, random_state=42, stratify=df_balanced["label"])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

# Change the path to the directory where the images are stored
path = "data/images"
train_dataset = Dataset(train_df, path)
test_dataset = Dataset(test_df, path)
val_dataset = Dataset(val_df, path)

In [8]:
import itertools

from model import SoyaTrans
from train import Trainer

batch_size = 16
lr = 2e-4
num_epochs = 35
num_classes = 9


def run_experiment(batch_size, lr):
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = SoyaTrans(num_classes)
    trainer = Trainer(model, train_loader, val_loader, test_loader, lr, num_epochs, batch_size=batch_size)

    trainer.train()
    trainer.test()
    torch.save(trainer.model.state_dict(), 'soyatrans.pth')

In [9]:
# for batch_size, lr in itertools.product(batch_sizes, lrs):
#     print(f"\nRunning experiment with batch_size={batch_size}, lr={lr}")
run_experiment(batch_size, lr)

Downloading: "https://download.pytorch.org/models/vgg16-397923af.pth" to /home/ubuntu/.cache/torch/hub/checkpoints/vgg16-397923af.pth
100%|██████████| 528M/528M [00:01<00:00, 352MB/s] 
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Epoch 1/35: 100%|██████████| 134/134 [00:54<00:00,  2.47it/s, loss=1.8360]



Train Metrics:
--------------------------------------------------
Epoch: 0
Train Loss: 2.1097
Test Loss: 1.9835
Accuracy: 0.3895
Precision: 0.3331
Recall: 0.3895
F1: 0.3316
--------------------------------------------------


Epoch 2/35: 100%|██████████| 134/134 [00:52<00:00,  2.54it/s, loss=1.9321]



Train Metrics:
--------------------------------------------------
Running experiment with batch_size=16, lr=0.0002
Epoch: 1
Train Loss: 1.9124
Test Loss: 1.9049
Accuracy: 0.4607
Precision: 0.3953
Recall: 0.4607
F1: 0.3846
--------------------------------------------------


Epoch 3/35: 100%|██████████| 134/134 [00:53<00:00,  2.52it/s, loss=1.8726]



Train Metrics:
--------------------------------------------------
Epoch: 2
Train Loss: 1.8101
Test Loss: 1.7798
Accuracy: 0.5993
Precision: 0.5926
Recall: 0.5993
F1: 0.5660
--------------------------------------------------


Epoch 4/35: 100%|██████████| 134/134 [00:53<00:00,  2.52it/s, loss=1.8666]



Train Metrics:
--------------------------------------------------
Epoch: 3
Train Loss: 1.7507
Test Loss: 1.7447
Accuracy: 0.6292
Precision: 0.6654
Recall: 0.6292
F1: 0.6078
--------------------------------------------------


Epoch 5/35: 100%|██████████| 134/134 [00:53<00:00,  2.51it/s, loss=1.5797]



Train Metrics:
--------------------------------------------------
Epoch: 4
Train Loss: 1.7182
Test Loss: 1.7219
Accuracy: 0.6442
Precision: 0.6483
Recall: 0.6442
F1: 0.6261
--------------------------------------------------


Epoch 6/35: 100%|██████████| 134/134 [00:53<00:00,  2.51it/s, loss=1.8607]



Train Metrics:
--------------------------------------------------
Epoch: 5
Train Loss: 1.7029
Test Loss: 1.6966
Accuracy: 0.6742
Precision: 0.6822
Recall: 0.6742
F1: 0.6557
--------------------------------------------------


Epoch 7/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.9661]



Train Metrics:
--------------------------------------------------
Epoch: 6
Train Loss: 1.6943
Test Loss: 1.6861
Accuracy: 0.6854
Precision: 0.6901
Recall: 0.6854
F1: 0.6664
--------------------------------------------------


Epoch 8/35: 100%|██████████| 134/134 [00:53<00:00,  2.48it/s, loss=1.4117]



Train Metrics:
--------------------------------------------------
Epoch: 7
Train Loss: 1.6841
Test Loss: 1.6697
Accuracy: 0.7116
Precision: 0.7253
Recall: 0.7116
F1: 0.6948
--------------------------------------------------


Epoch 9/35: 100%|██████████| 134/134 [00:53<00:00,  2.48it/s, loss=1.5836]



Train Metrics:
--------------------------------------------------
Epoch: 8
Train Loss: 1.6644
Test Loss: 1.6549
Accuracy: 0.7154
Precision: 0.7221
Recall: 0.7154
F1: 0.6953
--------------------------------------------------


Epoch 10/35: 100%|██████████| 134/134 [00:53<00:00,  2.51it/s, loss=1.7823]



Train Metrics:
--------------------------------------------------
Epoch: 9
Train Loss: 1.6348
Test Loss: 1.6449
Accuracy: 0.7341
Precision: 0.7784
Recall: 0.7341
F1: 0.7156
--------------------------------------------------


Epoch 11/35: 100%|██████████| 134/134 [00:53<00:00,  2.50it/s, loss=1.3867]



Train Metrics:
--------------------------------------------------
Epoch: 10
Train Loss: 1.5941
Test Loss: 1.6311
Accuracy: 0.7528
Precision: 0.7848
Recall: 0.7528
F1: 0.7364
--------------------------------------------------


Epoch 12/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.8781]



Train Metrics:
--------------------------------------------------
Epoch: 11
Train Loss: 1.5695
Test Loss: 1.5713
Accuracy: 0.8052
Precision: 0.8274
Recall: 0.8052
F1: 0.7875
--------------------------------------------------


Epoch 13/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.7735]



Train Metrics:
--------------------------------------------------
Epoch: 12
Train Loss: 1.5521
Test Loss: 1.5668
Accuracy: 0.8165
Precision: 0.8384
Recall: 0.8165
F1: 0.8009
--------------------------------------------------


Epoch 14/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.5267]



Train Metrics:
--------------------------------------------------
Epoch: 13
Train Loss: 1.5385
Test Loss: 1.5539
Accuracy: 0.8390
Precision: 0.8428
Recall: 0.8390
F1: 0.8347
--------------------------------------------------


Epoch 15/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.3808]



Train Metrics:
--------------------------------------------------
Epoch: 14
Train Loss: 1.5298
Test Loss: 1.5405
Accuracy: 0.8390
Precision: 0.8504
Recall: 0.8390
F1: 0.8336
--------------------------------------------------


Epoch 16/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.5500]



Train Metrics:
--------------------------------------------------
Epoch: 15
Train Loss: 1.5246
Test Loss: 1.5363
Accuracy: 0.8464
Precision: 0.8569
Recall: 0.8464
F1: 0.8393
--------------------------------------------------


IOPub message rate exceeded.| 78/134 [00:31<00:22,  2.49it/s, loss=1.4341]
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Epoch 23/35: 100%|██████████| 134/134 [00:54<00:00,  2.48it/s, loss=1.5730]



Train Metrics:
--------------------------------------------------
Epoch: 22
Train Loss: 1.5092
Test Loss: 1.5153
Accuracy: 0.8689
Precision: 0.8765
Recall: 0.8689
F1: 0.8635
--------------------------------------------------


Epoch 24/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.3760]



Train Metrics:
--------------------------------------------------
Epoch: 23
Train Loss: 1.5063
Test Loss: 1.5122
Accuracy: 0.8689
Precision: 0.8684
Recall: 0.8689
F1: 0.8669
--------------------------------------------------


Epoch 25/35: 100%|██████████| 134/134 [00:53<00:00,  2.48it/s, loss=1.4825]



Train Metrics:
--------------------------------------------------
Epoch: 24
Train Loss: 1.5078
Test Loss: 1.5099
Accuracy: 0.8689
Precision: 0.8705
Recall: 0.8689
F1: 0.8681
--------------------------------------------------


Epoch 26/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.7666]



Train Metrics:
--------------------------------------------------
Epoch: 25
Train Loss: 1.5083
Test Loss: 1.5152
Accuracy: 0.8652
Precision: 0.8706
Recall: 0.8652
F1: 0.8639
--------------------------------------------------


Epoch 27/35: 100%|██████████| 134/134 [00:53<00:00,  2.50it/s, loss=1.3813]



Train Metrics:
--------------------------------------------------
Epoch: 26
Train Loss: 1.5036
Test Loss: 1.5151
Accuracy: 0.8614
Precision: 0.8619
Recall: 0.8614
F1: 0.8600
--------------------------------------------------


Epoch 28/35: 100%|██████████| 134/134 [00:53<00:00,  2.51it/s, loss=1.8161]



Train Metrics:
--------------------------------------------------
Epoch: 27
Train Loss: 1.5040
Test Loss: 1.5117
Accuracy: 0.8689
Precision: 0.8686
Recall: 0.8689
F1: 0.8672
--------------------------------------------------


Epoch 29/35: 100%|██████████| 134/134 [00:53<00:00,  2.50it/s, loss=1.4785]



Train Metrics:
--------------------------------------------------
Epoch: 28
Train Loss: 1.5003
Test Loss: 1.5104
Accuracy: 0.8689
Precision: 0.8710
Recall: 0.8689
F1: 0.8677
--------------------------------------------------


Epoch 30/35: 100%|██████████| 134/134 [00:54<00:00,  2.47it/s, loss=1.5136]



Train Metrics:
--------------------------------------------------
Epoch: 29
Train Loss: 1.4961
Test Loss: 1.5124
Accuracy: 0.8652
Precision: 0.8661
Recall: 0.8652
F1: 0.8638
--------------------------------------------------


Epoch 31/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.5079]



Train Metrics:
--------------------------------------------------
Epoch: 30
Train Loss: 1.4898
Test Loss: 1.5109
Accuracy: 0.8689
Precision: 0.8703
Recall: 0.8689
F1: 0.8679
--------------------------------------------------


Epoch 32/35: 100%|██████████| 134/134 [00:53<00:00,  2.51it/s, loss=1.4855]



Train Metrics:
--------------------------------------------------
Epoch: 31
Train Loss: 1.4846
Test Loss: 1.5049
Accuracy: 0.8801
Precision: 0.8845
Recall: 0.8801
F1: 0.8793
--------------------------------------------------


Epoch 33/35: 100%|██████████| 134/134 [00:53<00:00,  2.49it/s, loss=1.6751]



Train Metrics:
--------------------------------------------------
Epoch: 32
Train Loss: 1.4842
Test Loss: 1.5088
Accuracy: 0.8614
Precision: 0.8611
Recall: 0.8614
F1: 0.8598
--------------------------------------------------


Epoch 34/35: 100%|██████████| 134/134 [00:53<00:00,  2.50it/s, loss=1.4342]



Train Metrics:
--------------------------------------------------
Epoch: 33
Train Loss: 1.4796
Test Loss: 1.5062
Accuracy: 0.8689
Precision: 0.8679
Recall: 0.8689
F1: 0.8678
--------------------------------------------------


Epoch 35/35: 100%|██████████| 134/134 [00:53<00:00,  2.50it/s, loss=1.3856]



Train Metrics:
--------------------------------------------------
Epoch: 34
Train Loss: 1.4770
Test Loss: 1.5035
Accuracy: 0.8839
Precision: 0.8826
Recall: 0.8839
F1: 0.8823
--------------------------------------------------

Test Metrics:
--------------------------------------------------
Test Loss: 1.5035
Accuracy: 0.8839
Precision: 0.8826
Recall: 0.8839
F1: 0.8823
--------------------------------------------------
