In [1]:
import pandas as pd
import numpy as np

import PIL
from tqdm.notebook import tqdm

from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
import evaluate

from datasets import Dataset, Image

import torch




In [2]:
torch.cuda.is_available()

True

In [3]:
model_checkpoint = "microsoft/resnet-50"
# model_checkpoint = "google/vit-base-patch16-224"
batch_size = 64 # batch size for training and evaluation

In [4]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,unified_class,class_id,image_name
0,Оленевые,5,3cf4207b958eade893a2f1618cf062b8.JPG
1,Кошки,2,37698901280c871f426d40afe5c373cd.JPG
2,Заяц,0,20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,Кошки,2,a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,Оленевые,5,54eb76914b84db8a0d56f98125abf588.JPG
...,...,...,...
28010,Оленевые,5,07b420b4fe265b4ed918b46435c025d7.JPG
28011,Пантеры,6,2d1c5918357bbdd729bf79085e55d35e.JPG
28012,Заяц,0,1531efa9f8687e390adf780355acd606.JPG
28013,Кабан,1,2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [5]:
df.drop(columns=['unified_class'], inplace = True)
df

Unnamed: 0,class_id,image_name
0,5,3cf4207b958eade893a2f1618cf062b8.JPG
1,2,37698901280c871f426d40afe5c373cd.JPG
2,0,20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28010,5,07b420b4fe265b4ed918b46435c025d7.JPG
28011,6,2d1c5918357bbdd729bf79085e55d35e.JPG
28012,0,1531efa9f8687e390adf780355acd606.JPG
28013,1,2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [6]:
df['image_name'] = df['image_name'].apply(lambda x: 'train/' + x)
df

Unnamed: 0,class_id,image_name
0,5,train/3cf4207b958eade893a2f1618cf062b8.JPG
1,2,train/37698901280c871f426d40afe5c373cd.JPG
2,0,train/20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,train/a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,train/54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28010,5,train/07b420b4fe265b4ed918b46435c025d7.JPG
28011,6,train/2d1c5918357bbdd729bf79085e55d35e.JPG
28012,0,train/1531efa9f8687e390adf780355acd606.JPG
28013,1,train/2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [7]:
df = df[df['image_name'].apply(lambda x: len(np.array(PIL.Image.open(x)).shape)) == 3]
df = df.reset_index(drop=True)
df

Unnamed: 0,class_id,image_name
0,5,train/3cf4207b958eade893a2f1618cf062b8.JPG
1,2,train/37698901280c871f426d40afe5c373cd.JPG
2,0,train/20e7b30026001cbfe0b5c0ee16c9ff56.JPG
3,2,train/a1bc8ea546206ee8fc0f1836fda9a5c1.JPG
4,5,train/54eb76914b84db8a0d56f98125abf588.JPG
...,...,...
28009,5,train/07b420b4fe265b4ed918b46435c025d7.JPG
28010,6,train/2d1c5918357bbdd729bf79085e55d35e.JPG
28011,0,train/1531efa9f8687e390adf780355acd606.JPG
28012,1,train/2b15eaef0ce9b57b6570709f95a4bea4.JPG


In [8]:
sample = pd.read_csv('sample_submission.csv')
sample['image_name'] = sample['image_name'].apply(lambda x: 'test/' + x)
sample

Unnamed: 0,image_name,predicted_class
0,test/cc27b9b56583a615fb8501e352402eb9.JPG,0
1,test/87872711fe672676fd34a97e997f9c47.JPG,0
2,test/424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,test/c5537eaa60525efd7bad4a5560607e83.JPG,0
4,test/e9f15b67ca49453e281b2b4f245eac13.JPG,0
...,...,...
12953,test/028668e733cd17ec9b9f1c7e2c657b36.JPG,0
12954,test/eb1f1152941fdfdd50ff9954010e622a.JPG,0
12955,test/bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,0
12956,test/2eaf9c794958a93bb9984441fd5d7f61.JPG,0


In [9]:
image_processor  = AutoImageProcessor.from_pretrained(model_checkpoint)
image_processor 

ConvNextImageProcessor {
  "crop_pct": 0.875,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "ConvNextImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}

In [10]:
dataset = Dataset.from_pandas(df)
dataset

Dataset({
    features: ['class_id', 'image_name'],
    num_rows: 28014
})

In [11]:
dataset['image_name'][:5]

['train/3cf4207b958eade893a2f1618cf062b8.JPG',
 'train/37698901280c871f426d40afe5c373cd.JPG',
 'train/20e7b30026001cbfe0b5c0ee16c9ff56.JPG',
 'train/a1bc8ea546206ee8fc0f1836fda9a5c1.JPG',
 'train/54eb76914b84db8a0d56f98125abf588.JPG']

In [12]:
# def load_image(image_file):
#     return image_processor(Image.open('train/'+ image_file).convert('RGB'), return_tensors="pt")

# dataset = dataset.map(lambda example: {'image': load_image(example['image_name'])})

# dataset

In [13]:
dataset = dataset.cast_column('image_name', Image())
dataset

Dataset({
    features: ['class_id', 'image_name'],
    num_rows: 28014
})

In [14]:
dataset = dataset.class_encode_column("class_id")

Stringifying the column:   0%|          | 0/28014 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/28014 [00:00<?, ? examples/s]

In [15]:
splits = dataset.train_test_split(test_size=0.1, stratify_by_column='class_id')
# splits = dataset.train_test_split(test_size=0.1)
train_ds = splits['train']
val_ds = splits['test']

In [16]:
def load_image(example_batch):
    # Process each image individually and collect pixel values as tensors
    di = [image_processor(image.convert("RGB"), return_tensors="pt")['pixel_values'].squeeze(0) 
          for image in example_batch["image_name"]]
    # Stack the list of tensors into a single tensor with batch dimension
    example_batch["pixel_values"] = torch.stack(di)  
    example_batch["labels"] = torch.tensor(example_batch["class_id"])

    # Remove original columns no longer needed
    del example_batch["image_name"]
    del example_batch["class_id"]

    return example_batch

In [17]:
train_ds.set_transform(load_image)
val_ds.set_transform(load_image)

In [18]:
train_ds[0]

{'pixel_values': tensor([[[-1.6384, -1.5699, -1.5528,  ..., -1.6042, -1.6898, -1.5870],
          [-1.5699, -1.5185, -1.4500,  ..., -1.6727, -1.6898, -1.6042],
          [-1.5870, -1.5528, -1.4329,  ..., -1.6213, -1.5870, -1.6384],
          ...,
          [-1.4500, -1.4500, -1.2103,  ..., -1.0390, -1.0048, -1.0904],
          [-1.3644, -1.4158, -1.3473,  ..., -1.0733, -1.1247, -1.1932],
          [-1.4672, -1.4672, -1.3130,  ..., -1.1932, -1.2274, -1.1932]],
 
         [[-1.5455, -1.4755, -1.4580,  ..., -1.5105, -1.5980, -1.4930],
          [-1.4755, -1.4230, -1.3529,  ..., -1.5805, -1.5980, -1.5105],
          [-1.4930, -1.4580, -1.3354,  ..., -1.5280, -1.4930, -1.5455],
          ...,
          [-1.3529, -1.3529, -1.1078,  ..., -0.9328, -0.8978, -0.9853],
          [-1.2654, -1.3179, -1.2479,  ..., -0.9678, -1.0203, -1.0903],
          [-1.3704, -1.3704, -1.2129,  ..., -1.0903, -1.1253, -1.0903]],
 
         [[-1.3164, -1.2467, -1.2293,  ..., -1.2816, -1.3687, -1.2641],
          [-

In [None]:
# from collections import Counter
# from tqdm.notebook import tqdm

# lst = []

# for i in tqdm(range(len(train_ds))):
#     lst.append(train_ds[i]['pixel_values'].shape)

# Counter(lst)

In [19]:
model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint, 
    # label2id=label2id,
    # id2label=id2label,
    num_labels=len(set(df.class_id)),
    ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([10, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
training_args = TrainingArguments(
    output_dir="results",
    remove_unused_columns=False,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    # logging_steps=10,
    # load_best_model_at_end=True,
    # metric_for_best_model="f1"
)

In [21]:
accuracy = evaluate.load("f1")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels, average="macro")

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    # tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

In [23]:
torch.cuda.is_available()

True

In [24]:
trainer.train()
torch.cuda.empty_cache()

  0%|          | 0/490 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def pred_load_image(example_batch):

    di = [image_processor(image.convert("RGB"), return_tensors="pt") for image in example_batch["image_name"]]
    example_batch["pixel_values"] = di[0]['pixel_values']

    del example_batch["image_name"]

    return example_batch

In [None]:
pred_dataset = Dataset.from_pandas(pd.DataFrame(sample['image_name']))
pred_dataset = pred_dataset.cast_column('image_name', Image())
pred_dataset.set_transform(pred_load_image)

In [None]:
# predictions = trainer.predict(pred_dataset)
# predictions

In [None]:
lst=[]
with torch.no_grad():
    for input in tqdm(pred_dataset):
        outputs = model(input['pixel_values'].to('cuda').reshape(1,3,224,224)).logits.argmax(-1)
        lst.append(int(outputs.cpu()))
lst[:5]

  0%|          | 0/12958 [00:00<?, ?it/s]

[6,
 5,
 0,
 1,
 6,
 6,
 5,
 5,
 6,
 6,
 7,
 4,
 5,
 5,
 9,
 7,
 8,
 6,
 4,
 6,
 5,
 5,
 5,
 6,
 5,
 6,
 5,
 6,
 5,
 7,
 1,
 5,
 6,
 4,
 6,
 4,
 5,
 6,
 8,
 4,
 6,
 8,
 5,
 0,
 5,
 7,
 5,
 4,
 6,
 6,
 6,
 8,
 2,
 6,
 9,
 5,
 7,
 7,
 5,
 1,
 5,
 1,
 6,
 0,
 7,
 4,
 9,
 6,
 6,
 7,
 5,
 5,
 6,
 5,
 5,
 7,
 6,
 5,
 4,
 5,
 8,
 5,
 2,
 5,
 5,
 5,
 9,
 6,
 1,
 5,
 5,
 5,
 5,
 5,
 5,
 6,
 2,
 5,
 5,
 1,
 6,
 5,
 6,
 6,
 8,
 7,
 5,
 5,
 6,
 4,
 6,
 1,
 6,
 6,
 8,
 5,
 5,
 6,
 1,
 6,
 1,
 7,
 6,
 6,
 0,
 4,
 6,
 7,
 7,
 5,
 5,
 6,
 5,
 3,
 5,
 6,
 6,
 5,
 2,
 6,
 1,
 5,
 4,
 5,
 6,
 6,
 5,
 8,
 6,
 5,
 6,
 4,
 8,
 5,
 5,
 5,
 5,
 5,
 8,
 5,
 6,
 7,
 4,
 5,
 6,
 0,
 5,
 7,
 5,
 6,
 5,
 5,
 5,
 2,
 1,
 6,
 5,
 5,
 1,
 6,
 5,
 5,
 6,
 3,
 5,
 5,
 5,
 7,
 5,
 6,
 0,
 5,
 5,
 6,
 6,
 5,
 6,
 7,
 6,
 6,
 4,
 5,
 5,
 6,
 6,
 1,
 8,
 4,
 5,
 1,
 0,
 6,
 6,
 9,
 6,
 5,
 6,
 6,
 1,
 5,
 2,
 6,
 1,
 5,
 6,
 6,
 1,
 6,
 6,
 5,
 1,
 5,
 8,
 5,
 5,
 3,
 1,
 7,
 6,
 5,
 5,
 2,
 5,
 5,
 8,
 1,
 6,
 1,
 7,
 5,


In [None]:
# sample['predicted_class'] = predictions
sample['predicted_class'] = lst
sample

Unnamed: 0,image_name,predicted_class
0,test/cc27b9b56583a615fb8501e352402eb9.JPG,6
1,test/87872711fe672676fd34a97e997f9c47.JPG,5
2,test/424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,test/c5537eaa60525efd7bad4a5560607e83.JPG,1
4,test/e9f15b67ca49453e281b2b4f245eac13.JPG,6
...,...,...
12953,test/028668e733cd17ec9b9f1c7e2c657b36.JPG,1
12954,test/eb1f1152941fdfdd50ff9954010e622a.JPG,4
12955,test/bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,5
12956,test/2eaf9c794958a93bb9984441fd5d7f61.JPG,6


In [None]:
sample['image_name'] = sample['image_name'].apply(lambda x: x[5:])
sample

Unnamed: 0,image_name,predicted_class
0,cc27b9b56583a615fb8501e352402eb9.JPG,6
1,87872711fe672676fd34a97e997f9c47.JPG,5
2,424aa1aa8eb5bbdd07275f88077bc86c.JPG,0
3,c5537eaa60525efd7bad4a5560607e83.JPG,1
4,e9f15b67ca49453e281b2b4f245eac13.JPG,6
...,...,...
12953,028668e733cd17ec9b9f1c7e2c657b36.JPG,1
12954,eb1f1152941fdfdd50ff9954010e622a.JPG,4
12955,bfd2dde9f4a5753c9f85b2a93bee9c03.JPG,5
12956,2eaf9c794958a93bb9984441fd5d7f61.JPG,6


In [None]:
sample.to_csv('sub.csv', index=False)