In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip -q install transformers datasets evaluate wandb

See the image classification [task page](https://huggingface.co/tasks/image-classification) for more information about its associated models, datasets, and metrics.

In [3]:
from datasets import load_dataset, Image, Dataset, DatasetDict
from transformers import DefaultDataCollator, AutoImageProcessor
from pathlib import Path

import torch.nn.functional as f
import torch

import wandb
import pandas as pd
import numpy as np
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mporg[0m ([33mmassive-texts[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

## Prep Dataset

In [4]:
wd = Path('/content/drive/MyDrive/Projects/motes-figural')
project_data_dir = wd / 'data'
img_dir = wd / 'extracted_content'
gt_dir = wd / 'img_ground_truth'

seed = 1234 #@param {type:"integer"}
name = "motes_figural" #@param {type:"string"}
classification_model = "microsoft/beit-base-patch16-224-pt22k-ft22k" #@param ["google/vit-large-patch16-224", "microsoft/beit-large-patch16-224-pt22k-ft22k", "facebook/convnext-base-224", "microsoft/resnet-50", "google/vit-base-patch16-384", "microsoft/beit-base-patch16-224-pt22k-ft22k", "google/vit-base-patch16-224"]
%env WANDB_PROJECT=$name

modeldir = project_data_dir / 'models' / (name+'_'+classification_model.replace('/','-'))
modeldir.mkdir(exist_ok=True)


env: WANDB_PROJECT=motes_figural


### General Prep

In [5]:
rng = np.random.default_rng(seed=seed)
data = pd.read_csv(project_data_dir / 'figural_ground_truth.csv', index_col=0)
data = data[['img_path', 'booklet', 'id', 'pdf_path', 'titlepage', 'F', 'O', 'T', 'E', 'R', 'C', 'Name']]
# 10% for test set
test_prop = 0.1
val_prop = 0.04

data['split'] = 'train'
randv = rng.random(size=len(data))
data.loc[(randv < test_prop+val_prop), 'split'] = 'val'
data.loc[(randv < test_prop), 'split'] = 'test'
print("Data split sizes")
display((data.split.value_counts() / len(data)).round(2))
display(data.split.value_counts())

data['img_path'] = data.img_path.apply(lambda x: Path(x))
data['testset'] = (rng.random(size=len(data)) < test_prop)
data['activity'] = data.img_path.apply(lambda x: x.parent.stem)
# id-encode activity
activities = data['activity'].unique().tolist()
id2activity = {i:x for i, x in enumerate(activities)}
activity2id = {x:i for i, x in enumerate(activities)}
data['activity_id'] = data.activity.replace(activity2id)

for measure_data in ['sims_to_blank.parquet', 'avg_sims.parquet', 'elaboration.parquet', 'zlist_sims_sketch_of.parquet']:
    x = pd.read_parquet(project_data_dir / measure_data)
    x = x.drop(columns=[y for y in ['path', 'cropped', 'contrast'] if y in x.columns])
    data = data.merge(x)
# remove some data errors
data.loc[data['F'] > 1, 'F'] = np.NaN
data.loc[data['T'] > 3, 'T'] = np.NaN
data.loc[data['R'] > 2, 'R'] = np.NaN

def fix_path(x):
    y= (project_data_dir / 'outputs' / x.booklet.lower() / x.img_path.parent.name / x.img_path.name)
    return y
data.img_path = data.apply(fix_path, axis=1)

# ignore missing data
data = data[~data.O.isna()]
data.O = data.O.astype(int)

data.sample(1)

Data split sizes


train    0.86
test     0.10
val      0.04
Name: split, dtype: float64

train    4255
test      472
val       208
Name: split, dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,img_path,booklet,id,pdf_path,titlepage,F,O,T,E,R,...,activity,activity_id,blank_sim,blank_sim_uncropped,avg_sim,avg_sim_uncropped,elaboration_raw,min_zlist,mean_zlist,lowest3_zlist
2105,/content/drive/MyDrive/Projects/motes-figural/...,BOOKLETA,bb3e3-RD-SL-4811,../data/ttct_figural/REDACTED/TTCT Post ASS SS...,3,1.0,0,1.0,1.0,0.0,...,activity2d,4,0.769583,0.761269,0.786432,0.774703,0.115088,0.223252,0.251545,0.231451


TODO add activity to data (or train separate classifiers)

For a single classifier, I would use the first few values of the image input as special characters which encode the activity. VIT uses a transformer architecture, maybe I could grab a 'tail' value.

*although... if no cropping is done, the 'activity type' is just the consistently dark pixels.*

In [6]:
# Create dataset
labels = ["Not Original", "Original"]
id2label = {str(i):x for i, x in enumerate(labels)}
label2id = {v:k for k,v in id2label.items()}

# Create dataset
datasets = {}
for split in ['test', 'train', 'val']:
    subset = data[(data.split == split)]
    datasets[split] = Dataset.from_dict({'image': subset.img_path.astype(str),
                                         'activity': subset.activity_id.astype(int),
                                         'label': subset.O}).cast_column("image", Image()).shuffle(seed=seed)

rawdataset = DatasetDict(datasets)
rawdataset

DatasetDict({
    test: Dataset({
        features: ['image', 'activity', 'label'],
        num_rows: 348
    })
    train: Dataset({
        features: ['image', 'activity', 'label'],
        num_rows: 3016
    })
    val: Dataset({
        features: ['image', 'activity', 'label'],
        num_rows: 142
    })
})

### Model-Specific Prep

Transformations and normalization

In [7]:
from torchvision.transforms import RandomResizedCrop, RandomHorizontalFlip, Resize, Compose, Normalize, ToTensor, RandomInvert
import warnings

one_hot = False #@param {type:"boolean"}
if one_hot:
    warnings.warn("The current one-hot strategy likely won't make a difference because of the patches in VITs. A better strategy would be to change the token in the feature extraction")
invert_prob = 0 #@param {type:"slider", min:0, max:1, step:0.5}
random_crop = False #@param {type:"boolean"}
random_hflip = False #@param {type:'boolean'}

image_processor = AutoImageProcessor.from_pretrained(classification_model)
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])
print("image size:", size)

pipeline = []
normalize = Normalize(mean=image_processor.image_mean,
                      std=image_processor.image_std)

if random_crop:
    pipeline.append(RandomResizedCrop(size))
else:
    pipeline.append(Resize(size))

if random_hflip:
    pipeline.append(RandomHorizontalFlip(p=0.5))

if invert_prob > 0:
    pipeline.append(RandomInvert(p=invert_prob)) # possibly force it to look for signal beyond ink

pipeline += [ToTensor(), normalize]
_transforms = Compose(pipeline)

def one_hot_pad(input):
    pixel_values, activity = input
    pixel_values[:, :len(activities), 0] = f.one_hot(torch.tensor(activity), num_classes=len(activities))
    return pixel_values

one_hot_transform = Compose([one_hot_pad])

def transforms(examples, one_hot=False):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    if one_hot:
        examples["pixel_values"] = [one_hot_transform((px, act)) for px, act in zip(examples['pixel_values'], examples['activity'])]
    del examples["image"]
    del examples['activity']
    return examples

dataset = rawdataset.with_transform(lambda x: transforms(x, one_hot))

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


image size: (224, 224)


## Train

In [8]:
import numpy as np
import evaluate
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer, EarlyStoppingCallback

accuracy = evaluate.load("accuracy")
f1 = evaluate.load('f1')
pearson = evaluate.load("pearsonr")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # this is a classification problem, so use predictions rather than logits
    predictions = np.argmax(logits, axis=1)
    metrics = {}
    for metric in [accuracy, f1, pearson]:
        metrics.update(metric.compute(predictions=predictions, references=labels))
    for metric in [pearson]:
        softmax = torch.tensor(logits).softmax(dim=-1)[:, 1] # make a single 'probability of class 2' number, bound [0,1]
        r_logit = metric.compute(predictions=softmax.numpy(), references=labels.astype(np.float32))
        metrics['pearsonr_soft'] = r_logit['pearsonr']

    return metrics

Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.

In [9]:
# Init model
model = AutoModelForImageClassification.from_pretrained(
    classification_model,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True # without this, it has the original classifier's n(classes) from ImageNet
)

Some weights of BeitForImageClassification were not initialized from the model checkpoint at microsoft/beit-base-patch16-224-pt22k-ft22k and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([21841, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([21841]) in the checkpoint and torch.Size([2]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
epochs = 25
log_times_per_epoch = 5 # approx. used to set logging_steps

# this is based off the observed limit with my own gpu
if ('384' in classification_model) or ("large" in classification_model):
    # beit-large-224 and vit-large-224 have 307m params, whereas their base models have 86m
    # https://arxiv.org/pdf/2106.08254.pdf
    batch_size = 20
    gradient_accumulation_steps = 8
elif 'convnext-base-224' in classification_model:
    batch_size = 48
    gradient_accumulation_steps = 4
elif '224' in classification_model:
    batch_size = 64
    gradient_accumulation_steps = 4
else:
    batch_size = 16
    gradient_accumulation_steps = 4

# init logging. No pretty names here, I prefer changing names in wandb console
wandb.init(project=name)
wandb.config.update({'one-hot':one_hot,
                     'classification_model':classification_model,
                     'seed':seed,
                     'invert_prob':invert_prob,
                     'random_crop':random_crop,
                     'random_hflip': random_hflip})

data_collator = DefaultDataCollator() 
training_args = TrainingArguments(
    output_dir=str(modeldir),
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3, #deletes oldest checkpoints
    
    learning_rate= 5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    warmup_ratio=0.2, #% of total steps to warm up (higher is slower)
    
    num_train_epochs=epochs,
    # log {log_times_per_epoch}, for easier comparison across different runs with different steps configs
    logging_steps = int(dataset['train'].num_rows/batch_size/gradient_accumulation_steps//log_times_per_epoch),
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True, 
    push_to_hub=False, # true if you want to share
    report_to='wandb',
    run_name=modeldir.name
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 5)]
)

trainer.train()

***** Running training *****
  Num examples = 3016
  Num Epochs = 25
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 4
  Total optimization steps = 300
  Number of trainable parameters = 85763522
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Accuracy,F1,Pearsonr,Pearsonr Soft
1,0.6333,0.56509,0.732394,0.819048,0.330771,0.366065
2,0.5177,0.544242,0.753521,0.820513,0.427345,0.44493
3,0.4628,0.510737,0.816901,0.87,0.564314,0.531016
4,0.3678,0.49961,0.760563,0.819149,0.46813,0.552677
5,0.2814,0.541519,0.788732,0.852941,0.488483,0.556378
6,0.1659,0.527205,0.802817,0.862745,0.523604,0.555676
7,0.122,0.62357,0.725352,0.786885,0.410473,0.530394
8,0.0721,0.754464,0.760563,0.824742,0.446964,0.509206


***** Running Evaluation *****
  Num examples = 142
  Batch size = 64
Saving model checkpoint to /content/drive/MyDrive/Projects/motes-figural/data/models/motes_figural_microsoft-beit-base-patch16-224-pt22k-ft22k/checkpoint-12
Configuration saved in /content/drive/MyDrive/Projects/motes-figural/data/models/motes_figural_microsoft-beit-base-patch16-224-pt22k-ft22k/checkpoint-12/config.json
Model weights saved in /content/drive/MyDrive/Projects/motes-figural/data/models/motes_figural_microsoft-beit-base-patch16-224-pt22k-ft22k/checkpoint-12/pytorch_model.bin
Image processor saved in /content/drive/MyDrive/Projects/motes-figural/data/models/motes_figural_microsoft-beit-base-patch16-224-pt22k-ft22k/checkpoint-12/preprocessor_config.json
Deleting older checkpoint [/content/drive/MyDrive/Projects/motes-figural/data/models/motes_figural_microsoft-beit-base-patch16-224-pt22k-ft22k/checkpoint-108] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 142
  Batch size = 64

TrainOutput(global_step=96, training_loss=0.3496219921701898, metrics={'train_runtime': 935.8159, 'train_samples_per_second': 80.571, 'train_steps_per_second': 0.321, 'total_flos': 1.868927582490329e+18, 'train_loss': 0.3496219921701898, 'epoch': 8.0})

In [11]:
x[:3].round(3)

Unnamed: 0,min_zlist,mean_zlist,lowest3_zlist,booklet,activity,id
0,0.222,0.267,0.238,BOOKLETA,activity2b,94fd1-71561
1,0.21,0.254,0.232,BOOKLETA,activity2b,9cf64-79099
2,0.239,0.27,0.247,BOOKLETA,activity2b,c6e72-74881


In [12]:
predictions, label_ids, metrics = trainer.predict(dataset['test'], metric_key_prefix='test')
metrics = {k.replace('test_', 'test/'):v for k,v in metrics.items()}
wandb.log(metrics)
print(metrics)

***** Running Prediction *****
  Num examples = 348
  Batch size = 64


{'test/loss': 0.5087286233901978, 'test/accuracy': 0.7614942528735632, 'test/f1': 0.8151447661469934, 'test/pearsonr': 0.4792395758975373, 'test/pearsonr_soft': 0.5239505471435327, 'test/runtime': 6.934, 'test/samples_per_second': 50.187, 'test/steps_per_second': 0.865}


Accuracy after 15 epoches, early stopping=3:
- microsoft/resnet-50: 0.511 (best was epoch 1, wtf?)
- google/vit-base-patch16-384: 0.911 (best, stopped at epoch 10)
- microsoft/beit-base-patch16-224-pt22k-ft22k: 0.8555566 (best was 0.86, stopped at epoch 6, epoch 1 was already 0.83)
- google/vit-base-patch16-224: 0.867 (best was 0.87, stopped at epoch 10)


<Tip>

For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).

</Tip>

## Inference

Example of using the model, in a `pipeline`:

In [13]:
# load example image
image = dataset['test']
image[0]

{'label': 1,
 'pixel_values': tensor([[[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.9843, 0.9922, 0.9059,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          ...,
          [0.9843, 0.9922, 0.9059,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
 
         [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
          [

In [14]:
from transformers import pipeline
path_to_model = "test_model"

classifier = pipeline("image-classification", model=path_to_model)
classifier(image)

OSError: ignored

Or manually, rather than via pipeline:

In [None]:
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch

image_processor = AutoImageProcessor.from_pretrained(path_to_model)
inputs = image_processor(image, return_tensors="pt") # return as pytorch

model = AutoModelForImageClassification.from_pretrained(path_to_model)
with torch.no_grad():
    logits = model(**inputs).logits

predicted_label = logits.argmax(-1).item()
model.config.id2label[predicted_label]