In [1]:
import pandas as pd
import os
import torch
import numpy as np
import PIL

from datasets import load_dataset, Image, Dataset, concatenate_datasets
from transformers import AutoFeatureExtractor, ViTFeatureExtractor,ViTForImageClassification,TrainingArguments, Trainer, BeitFeatureExtractor, TrainerCallback
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    RandomVerticalFlip,
    Resize,
    ToTensor)

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

# Data exploration

In [2]:
data = pd.read_csv('labels_train.csv')
data.head()

Unnamed: 0,filename,FISSURE,REPARATION,FISSURE LONGITUDINALE,FAÏENCAGE,MISE EN DALLE
0,BDCAEROD0000000017183099_runway_3_gridsize_512...,0,0,1,1,0
1,BDCAEROD0000000017183055_runway_1_gridsize_512...,0,0,1,0,0
2,BDCAEROD0000000017183118_runway_1_gridsize_512...,1,0,1,0,0
3,BDCAEROD0000000017183028_runway_1_gridsize_512...,1,0,0,0,0
4,BDCAEROD0000000017183088_runway_1_gridsize_512...,0,0,0,0,0


# Converting images to dataset object

In [3]:
# Storing all the paths to images in a dict

path_start = os.getcwd() + "\\dataset\\train\\"
list_path = [path_start + filename for filename in os.listdir('dataset/train') ]
path_dict = {"image":list_path}

# Converting the dict to a dataset object

dataset = Dataset.from_dict(path_dict).cast_column("image", Image())
dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x23AB7A9C130>}

### Label management

In [4]:
data['label'] = data.apply(lambda x: [x.FISSURE, x.REPARATION, x['FISSURE LONGITUDINALE'], x.FAÏENCAGE, x['MISE EN DALLE']] , axis=1)
data['label']

0      [0, 0, 1, 1, 0]
1      [0, 0, 1, 0, 0]
2      [1, 0, 1, 0, 0]
3      [1, 0, 0, 0, 0]
4      [0, 0, 0, 0, 0]
            ...       
825    [0, 0, 0, 0, 0]
826    [1, 1, 1, 0, 1]
827    [0, 1, 0, 1, 0]
828    [0, 1, 1, 1, 0]
829    [0, 0, 0, 1, 0]
Name: label, Length: 830, dtype: object

In [5]:
data.filename[0]

'BDCAEROD0000000017183099_runway_3_gridsize_512_idx_7_idy_0.jpg'

### Creating the column of labels to be added to the dataset

In [6]:
column_of_labels = []

for i in range(dataset.shape[0]):
    filename = list_path[i][65:]
    row_dataset = data[data.filename == filename]
    list_label = list(row_dataset['label'])[0]
    list_label = np.array(list_label, dtype = np.float32).tolist()
    column_of_labels.append(list_label)
    
column_of_labels[:15]

[[0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 0.0, 1.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 0.0, 1.0],
 [1.0, 0.0, 1.0, 1.0, 1.0],
 [1.0, 0.0, 1.0, 1.0, 1.0],
 [0.0, 0.0, 0.0, 0.0, 0.0]]

In [7]:
dataset = dataset.add_column(name="label", column=column_of_labels)

In [8]:
dataset[0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x23AB7B3D940>,
 'label': [0.0, 0.0, 0.0, 0.0, 0.0]}

In [9]:
ALL_LABELS = ['FISSURE','REPARATION','FISSURE LONGITUDINALE','FAÏENCAGE','MISE EN DALLE']

In [10]:
id2label = {k:l for k, l in enumerate(ALL_LABELS)}
label2id = {l:k for k, l in enumerate(ALL_LABELS)}

In [11]:
id2label

{0: 'FISSURE',
 1: 'REPARATION',
 2: 'FISSURE LONGITUDINALE',
 3: 'FAÏENCAGE',
 4: 'MISE EN DALLE'}

In [12]:
label2id

{'FISSURE': 0,
 'REPARATION': 1,
 'FISSURE LONGITUDINALE': 2,
 'FAÏENCAGE': 3,
 'MISE EN DALLE': 4}

# Extracting pixel data from our dataset

In [13]:
model_checkpoint = "google/vit-base-patch16-224" # pre-trained model from which to fine-tune
batch_size = 4 # batch size for training and evaluation

In [14]:
feature_extractor = ViTFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor

ViTFeatureExtractor {
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "size": 224
}

In [15]:
# Creating copies of original dataset for data augmentation

dataset2 = dataset.flatten_indices()

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

### Defining data augmentation fonctions and creating dataset

In [16]:
### Calling function for normalizing image size

normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)

### Creating pipeline objects for image preparation
### One different pipeline for each copy of the dataset (so that we don't have duplicate images)

dataset_transforms = Compose(
        [
            Resize(feature_extractor.size),
            ToTensor(),
            normalize,
        ]
    )

dataset2_transforms = Compose(
        [
            Resize(feature_extractor.size),
            RandomHorizontalFlip(),
            ToTensor(),
            normalize,
        ]
    )

### Creating transformation functions based on pipelines

def preprocess_dataset(example_batch):
    example_batch["pixel_values"] = [dataset_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch 


def preprocess_dataset2(example_batch):
    example_batch["pixel_values"] = [dataset2_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch

### Applying transformation functions

dataset.set_transform(preprocess_dataset)
dataset2.set_transform(preprocess_dataset2)

### Renaming

In [17]:
train_ds = dataset
val_ds = dataset2

## Model building

In [18]:
model = ViTForImageClassification.from_pretrained(
    model_checkpoint,
    num_labels = 5,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes = True, # provide this in case you're planning to fine-tune an already fine-tuned checkpoint,
    use_auth_token='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'
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
model.save_pretrained(model_checkpoint)
feature_extractor.save_pretrained(model_checkpoint)

In [20]:
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}

# Using default trainer

In [None]:
args = TrainingArguments(
    model_checkpoint,
    remove_unused_columns=False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=7,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=feature_extractor,
    data_collator=collate_fn,
    #hub_token = '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'
)

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

In [None]:
train_results = trainer.train()
# rest is optional but nice to have
#trainer.save_model()
#trainer.log_metrics("train", train_results.metrics)
#trainer.save_metrics("train", train_results.metrics)
#trainer.save_state()

In [None]:
trainer.evaluate()

# Using custom trainer

### Defining precisely the loss we want

In [21]:
GLOBAL_SCORE_INDICES = range(0, 5)

def get_preds_from_logits(logits):
    ret = np.zeros(logits.shape)
    
    # We fill 1 to every class whose score is higher than some threshold
    # In this example, we choose that threshold = 0.0
    ret[:, GLOBAL_SCORE_INDICES] = np.array(logits[:, GLOBAL_SCORE_INDICES] >= 0.0).astype(int)
    
    return ret

In [22]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    final_metrics = {}
    
    # Deduce predictions from logits
    predictions = get_preds_from_logits(logits)

    # The global f1_metrics
    final_metrics["f1_micro"] = f1_score(labels, predictions, average="micro")
    final_metrics["f1_macro"] = f1_score(labels, predictions, average="macro")
    final_metrics["f1_weight"] = f1_score(labels, predictions, average="weighted")
    
    # Classification report
    print("Classification report for global scores: ")
    print(classification_report(labels[:, GLOBAL_SCORE_INDICES], predictions[:, GLOBAL_SCORE_INDICES], zero_division=0))
    return final_metrics

### Adapting the Huggingface trainer class

In [23]:
class MultiTaskClassificationTrainer(Trainer):
    def __init__(self, group_weights=None, **kwargs):
        super().__init__(**kwargs)
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs[0]
        
        loss = torch.nn.functional.binary_cross_entropy_with_logits(logits[:, GLOBAL_SCORE_INDICES], labels[:, GLOBAL_SCORE_INDICES])
        
        return (loss, outputs) if return_outputs else loss

In [24]:
class PrinterCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        print(f"Epoch {state.epoch}: ")

### Running trainer

In [25]:
training_args = TrainingArguments(
    model_checkpoint,
    remove_unused_columns=False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=6,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False,
    metric_for_best_model="f1_macro"
)

trainer = MultiTaskClassificationTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    callbacks=[PrinterCallback]
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 830
  Num Epochs = 6
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 312


Epoch,Training Loss,Validation Loss


# Trying to predict

In [None]:
# Encode the image

image_test = test_ds[6]['image']
encoding = feature_extractor(image_test.convert("RGB"), return_tensors="pt")

# Call the model to get predictions

outputs = model(**encoding)
logits = outputs.logits

# Decode the result

preds = get_preds_from_logits(logits)
decoded_preds = [[id2label[i] for i, l in enumerate(row) if l == 1] for row in preds]
decoded_preds

In [None]:
train_ds[19]

# Defining submission

In [None]:
### Creating test dataset

path_start_test = os.getcwd() + "\\dataset\\test\\"
list_path_test = [path_start_test + filename for filename in os.listdir('dataset/test') ]
path_dict_test = {"image":list_path_test}

# Converting the dict to a dataset object

dataset_test = Dataset.from_dict(path_dict_test).cast_column("image", Image())
dataset_test

In [None]:
template_test = pd.read_csv('template_test.csv')
template_test.head()

In [None]:
template_test['filepath'] = template_test.apply(lambda x: os.getcwd() + "\\dataset\\test\\" + x.filename,axis=1)
template_test.filepath[0]

In [None]:
def return_pred(filepath):
    image_to_decode = PIL.Image.open(filepath)
    encoding = feature_extractor(image_to_decode.convert("RGB"), return_tensors="pt")
    outputs = model(**encoding)
    logits = outputs.logits
    preds = get_preds_from_logits(logits)
    return(preds)

In [None]:
template_test['predictions'] = template_test.apply(lambda x: return_pred(x.filepath),axis=1)

In [None]:
template_test['predictions']

In [None]:
template_test["FISSURE"] = template_test.apply(lambda x: int(x.predictions.tolist()[0][0]) ,axis=1)
template_test["REPARATION"] = template_test.apply(lambda x: int(x.predictions.tolist()[0][1]),axis=1)
template_test["FISSURE LONGITUDINALE"] = template_test.apply(lambda x: int(x.predictions.tolist()[0][2]),axis=1)
template_test["FAÏENCAGE"] = template_test.apply(lambda x: int(x.predictions.tolist()[0][3]),axis=1)
template_test["MISE EN DALLE"] = template_test.apply(lambda x: int(x.predictions.tolist()[0][4]),axis=1)

In [None]:
template_test

In [None]:
submission = template_test.drop(columns=['filepath','predictions'], axis=1)
submission

In [None]:
submission.to_csv('submission.csv', index=False)