## Fine-tuning the ViT (Vision Transformer) model used as our main computer-vision model to detect offensive Clash of Clans bases

In [None]:
!pip install transformers
!pip install datasets
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m111.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, https://u

In [None]:
import os

import numpy as np
import torch
from datasets import load_dataset
from datasets import load_metric
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    RandomVerticalFlip,
    RandomGrayscale,
    GaussianBlur,
    Resize,
    ToTensor,
)
from transformers import Trainer
from transformers import TrainingArguments
from transformers import ViTFeatureExtractor
from transformers import ViTForImageClassification

In [None]:
# Identifier of the pretrained checkpoint; it is reused below when the
# classification model itself is loaded.
model_name_or_path = 'google/vit-base-patch16-224-in21k'

# Image preprocessor built from that checkpoint's published configuration.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
)

In [None]:
def process_example(example):
    """Preprocess a single dataset example.

    Runs the notebook-level ``feature_extractor`` on ``example['image']``
    (requesting PyTorch tensors) and attaches ``example['labels']`` to the
    result under the ``'labels'`` key.

    Returns the feature extractor's output with the label added.
    """
    image = example['image']
    encoded = feature_extractor(image, return_tensors='pt')
    encoded['labels'] = example['labels']
    return encoded

In [None]:
def transform(example_batch):
    """Convert a batch of examples into model inputs.

    ``example_batch['image']`` is materialised as a list of images and passed
    through the notebook-level ``feature_extractor`` (PyTorch tensors are
    requested). The batch's ``'labels'`` entry is copied onto the result so the
    targets travel together with the pixel values.
    """
    images = list(example_batch['image'])
    batch_inputs = feature_extractor(images, return_tensors='pt')

    # The labels must be carried along with the pixel values.
    batch_inputs['labels'] = example_batch['labels']
    return batch_inputs


# Load the dataset and rename its "label" column to "labels", the key that
# `transform` above reads.
ds = load_dataset('ogimgio/starthack-supercell-dataset').rename_column("label", "labels")

# Attach `transform` to the dataset.
prepared_ds = ds.with_transform(transform)



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def collate_fn(batch):
    """Merge a list of per-example dicts into a single batch dict.

    Each element of ``batch`` must provide a ``'pixel_values'`` tensor and a
    ``'labels'`` value. The pixel tensors are stacked along a new leading
    dimension and the labels are gathered into one tensor.

    Returns a dict with the keys ``'pixel_values'`` and ``'labels'``.
    """
    pixel_values = []
    labels = []
    for sample in batch:
        pixel_values.append(sample['pixel_values'])
        labels.append(sample['labels'])

    return {
        'pixel_values': torch.stack(pixel_values),
        'labels': torch.tensor(labels),
    }

# Accuracy metric object shared by every evaluation pass.
metric = load_metric("accuracy")


def compute_metrics(p):
    """Score a prediction object with the accuracy metric.

    The predicted class of each row is the argmax of ``p.predictions`` along
    axis 1; those classes are compared against ``p.label_ids``.

    Returns whatever ``metric.compute`` returns for those inputs.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predicted_classes, references=p.label_ids)

In [None]:
# Class names as declared by the training split's 'labels' feature.
labels = ds['train'].features['labels'].names

# Two lookup tables between class index (stored as a string) and class name.
id2label = {}
label2id = {}
for idx, class_name in enumerate(labels):
    id2label[str(idx)] = class_name
    label2id[class_name] = str(idx)

# Load the pretrained checkpoint with a classification head sized to the
# number of classes in this dataset.
model = ViTForImageClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of the model checkpoint at google/vit-base-patch16-224-in21k were not used when initializing ViTForImageClassification: ['pooler.dense.weight', 'pooler.dense.bias']
- This IS expected if you are initializing ViTForImageClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTForImageClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Freeze every encoder layer except the last one.
# Only parameters whose name contains 'encoder' are considered, so anything
# outside the encoder stack (e.g. the classifier head) is left trainable.
# The index of the last layer is derived from the model config instead of
# being hard-coded, so the cell keeps working for checkpoints of other depths.
last_layer_idx = model.config.num_hidden_layers - 1
for name, param in model.named_parameters():
    # The layer index is read from the 4th dot-separated component of the
    # parameter name (presumably 'vit.encoder.layer.<i>....' - verify if the
    # model class changes).
    if 'encoder' in name and int(name.split('.')[3]) < last_layer_idx:
        param.requires_grad = False

In [None]:
# Collect (element count, trainable flag) for every parameter once, then
# report the total and the trainable subset.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]

print('Number of parameters: ', sum(count for count, _ in param_sizes))
print('Number of trainable parameters: ',
      sum(count for count, trainable in param_sizes if trainable))

7833602

In [None]:
# Training configuration for the fine-tuning run.
#
# SECURITY: a Hugging Face access token used to be hard-coded in this cell.
# It is now read from the HF_TOKEN environment variable so the secret never
# lives in the notebook. The previously committed token must be treated as
# leaked and revoked in the Hugging Face account settings.
# When HF_TOKEN is unset, `hub_token` is None; the library is then expected to
# fall back to the locally cached login token - confirm against the
# TrainingArguments documentation.
training_args = TrainingArguments(
  "ogimgio/start-hack-supercell",  # output directory, also used as the Hub repo name
  per_device_train_batch_size=8,
  evaluation_strategy="steps",
  num_train_epochs=10,
  weight_decay=0.001,
  #fp16=True,
  save_steps=50,   # checkpoint cadence matches the evaluation cadence below
  eval_steps=50,
  logging_steps=10,
  learning_rate=2e-4,
  save_total_limit=2,
  # Keep the 'image' column: the dataset transform reads it when batches are built.
  remove_unused_columns=False,
  push_to_hub=True,
  hub_token=os.environ.get("HF_TOKEN"),
  report_to='tensorboard',
  load_best_model_at_end=True,
)

# Wire the model, the training configuration, the data and the helper
# callables into a single Trainer instance.
trainer = Trainer(
    model=model,
    args=training_args,
    # Data: the train / validation splits of the transformed dataset.
    train_dataset=prepared_ds["train"],
    eval_dataset=prepared_ds["validation"],
    # Batching and scoring helpers defined earlier in the notebook.
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    # The image preprocessor is handed over through the `tokenizer` slot
    # (presumably so it is saved alongside the model - confirm).
    tokenizer=feature_extractor,
)


In [None]:
# Run the fine-tuning loop, then log and persist the resulting metrics under
# the "train" split name (logged first, saved second).
train_results = trainer.train()
train_metrics = train_results.metrics
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("train", train_metrics)



Step,Training Loss,Validation Loss,Accuracy
50,0.3171,0.130185,0.942857
100,0.058,0.056171,0.971429
150,0.0186,0.088794,0.971429
200,0.0114,0.11497,0.971429
250,0.0053,0.119831,0.971429


***** train metrics *****
  epoch                    =        10.0
  total_flos               = 148670280GF
  train_loss               =      0.1132
  train_runtime            =  0:02:04.31
  train_samples_per_second =       16.57
  train_steps_per_second   =       2.091


In [None]:
# Evaluate on the validation split, then log and persist the metrics under the
# "eval" split name (logged first, saved second).
# NOTE(review): the saved output of this cell reports epoch = 20.0 although
# training is configured for 10 epochs, so it appears to come from a different
# session - re-run the notebook top to bottom to refresh it.
metrics = trainer.evaluate(eval_dataset=prepared_ds['validation'])
for record_metrics in (trainer.log_metrics, trainer.save_metrics):
    record_metrics("eval", metrics)

***** eval metrics *****
  epoch                   =       20.0
  eval_accuracy           =     0.9143
  eval_loss               =     0.4061
  eval_runtime            = 0:00:01.37
  eval_samples_per_second =     25.516
  eval_steps_per_second   =      3.645


In [None]:
# Upload the trainer's model and artifacts to the Hugging Face Hub repository.
# This is the cell's last expression, so its return value is what the notebook
# displays as the cell result.
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/327M [00:00<?, ?B/s]

Upload file runs/Mar23_16-26-40_212cdeca7be7/events.out.tfevents.1679588811.212cdeca7be7.21238.13:   0%|      …

To https://huggingface.co/ogimgio/start-hack-supercell
   8919879..708c958  main -> main

   8919879..708c958  main -> main

To https://huggingface.co/ogimgio/start-hack-supercell
   708c958..41aa2dc  main -> main

   708c958..41aa2dc  main -> main



'https://huggingface.co/ogimgio/start-hack-supercell/commit/708c95883f556600f841fdf026a1d6aa077d5e26'

## Test the model with an image loaded from a URL

In [None]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch
from PIL import Image
import requests

# Load the fine-tuned model and its matching feature extractor from the Hub.
model_name = 'ogimgio/start-hack-supercell'
model = ViTForImageClassification.from_pretrained(model_name)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)

# Download the input image and preprocess it.
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
url = 'https://i.pinimg.com/474x/45/15/51/4515510d22e256932f896490801859ee.jpg'
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
# Force three channels so grayscale / palette / RGBA images are handled uniformly.
image = Image.open(response.raw).convert('RGB')
inputs = feature_extractor(images=image, return_tensors='pt')

# Run inference without building an autograd graph, and take the argmax over
# the class dimension explicitly rather than over the flattened logits.
with torch.no_grad():
    outputs = model(**inputs)
predicted_label = outputs.logits.argmax(dim=-1).item()

# NOTE(review): the hand-written mapping is kept byte-identical because it is
# emitted output ('Offencive' is misspelled). Confirm the index order against
# model.config.id2label, which is the authoritative mapping for the checkpoint.
mapping = {0: 'Normal', 1: 'Offencive'}
print("Predicted label:", mapping[predicted_label])

tensor([[ 2.3080, -2.1956]], grad_fn=<AddmmBackward0>)
Predicted label: 0
