In [2]:
from transformers import SiglipForImageClassification, AutoImageProcessor
import os
import pandas as pd
from PIL import Image
from datasets import Dataset
from transformers import Trainer, TrainingArguments
import numpy as np
import torch
import evaluate
from torch import nn

In [3]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Class-index <-> class-name mappings handed to the model config.
# label2id is derived from id2label so the two can never drift apart.
id2label = {0: "not_taking_medication", 1: "taking_medication"}
label2id = {name: idx for idx, name in id2label.items()}

# Annotation table: an `image` column (later passed to Image.open) and a
# `label` column that is expected to hold the class names above.
df = pd.read_csv("frames_annotations.csv")

# Series.map turns every value that is not a key of label2id (including a
# missing label) into NaN. Fail here with a readable message instead of letting
# NaN labels reach preprocessing, where int(NaN) raises an opaque error.
encoded_labels = df["label"].map(label2id)
unmapped = df.loc[encoded_labels.isna(), "label"]
if not unmapped.empty:
    raise ValueError(
        f"{len(unmapped)} row(s) in frames_annotations.csv have a label outside "
        f"{sorted(label2id)}: {unmapped.unique()[:5].tolist()}"
    )
df["label"] = encoded_labels.astype("int64")

dataset = Dataset.from_dict({
    "image": df["image"].values,
    "label": df["label"].values,
})

In [6]:
# Load the Human-Action-Recognition SigLIP checkpoint, but with a 2-class head.
# The checkpoint's classifier was trained for 15 classes, so its shape does not
# match; ignore_mismatched_sizes=True lets that layer be freshly initialised.
# The "newly initialized" warning printed by this cell is therefore expected and
# can be ignored: the new head is trained during fine-tuning.
model = SiglipForImageClassification.from_pretrained(
    "prithivMLmods/Human-Action-Recognition",
    problem_type="single_label_classification",
    num_labels=2,  # binary classifier instead of the original 15 classes
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Image processor saved alongside the same checkpoint.
processor = AutoImageProcessor.from_pretrained("prithivMLmods/Human-Action-Recognition")

Some weights of SiglipForImageClassification were not initialized from the model checkpoint at prithivMLmods/Human-Action-Recognition and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([15]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.weight: found shape torch.Size([15, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [7]:
def preprocess(example):
    """Turn one annotation row into the model input plus its integer label.

    Args:
        example: Mapping with an "image" entry accepted by ``Image.open`` and a
            "label" entry convertible with ``int()``.

    Returns:
        dict with "pixel_values" (entry 0 of the processor's ``pixel_values``
        output for this single image) and "label" (an ``int``).
    """
    # The context manager closes the source file deterministically once the
    # RGB copy exists, instead of leaving that to garbage collection.
    with Image.open(example["image"]) as source_image:
        image = source_image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt")
    return {
        # Only one image was passed in, so take its entry at index 0.
        "pixel_values": inputs["pixel_values"][0],
        "label": int(example["label"]),
    }

# Run `preprocess` on every row so each example carries `pixel_values`.
dataset = dataset.map(preprocess)

# Hold out 10% of the examples for evaluation. The fixed seed keeps the split,
# and therefore the reported metrics, the same across Restart & Run All.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

Map:   0%|          | 0/850 [00:00<?, ? examples/s]

In [8]:
# Load the four `evaluate` metrics that compute_metrics reports, in one pass.
accuracy, precision, recall, f1 = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    shared = {"predictions": logits.argmax(-1), "references": labels}

    scores = {"accuracy": accuracy.compute(**shared)["accuracy"]}
    # The remaining metrics share one call shape and report under their own name.
    for key, metric in (("precision", precision), ("recall", recall), ("f1", f1)):
        scores[key] = metric.compute(**shared, average="binary")[key]
    return scores

In [9]:
training_args = TrainingArguments(
    # Output locations and logging cadence.
    output_dir="./HAR-medication-finetuned_v2",
    logging_dir="./logs",
    logging_steps=10,
    # Training schedule.
    num_train_epochs=5,
    per_device_train_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint that
    # scored best on accuracy when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # Target Hub repo id; push_to_hub is not set in this call.
    hub_model_id="Adekiii/HAR-medication-finetuned_v2",
)

# Train on the "train" split and evaluate on the held-out "test" split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [10]:
# Run fine-tuning. The call is left as the last expression so that its return
# value (a TrainOutput summary) is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4064,0.260571,0.894118,0.714286,0.416667,0.526316
2,0.463,0.398692,0.858824,0.0,0.0,0.0
3,0.2051,0.186737,0.917647,0.647059,0.916667,0.758621
4,0.2234,0.346711,0.894118,0.615385,0.666667,0.64
5,0.0316,0.434357,0.905882,0.611111,0.916667,0.733333


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=240, training_loss=0.3138491948445638, metrics={'train_runtime': 394.231, 'train_samples_per_second': 9.702, 'train_steps_per_second': 0.609, 'total_flos': 3.203647715165184e+17, 'train_loss': 0.3138491948445638, 'epoch': 5.0})

In [11]:
# Save the trainer's model and the image processor into the same folder, so the
# folder can be loaded or uploaded as one unit.
trainer.save_model("./HAR-med-finetuned_v2")
processor.save_pretrained("./HAR-med-finetuned_v2")

['./HAR-med-finetuned_v2\\preprocessor_config.json']

In [None]:
from huggingface_hub import HfApi

# Read the token from the environment instead of pasting it into the notebook,
# where it would be saved and shared together with the file. When HF_TOKEN is
# unset this is None, and HfApi then falls back to the locally saved login.
access_token = os.environ.get("HF_TOKEN")

api = HfApi(token=access_token)
# Upload the folder written by the save cell to the model repo on the Hub.
api.upload_folder(
    folder_path="HAR-med-finetuned_v2",
    repo_id="Adekiii/HAR-medication-finetuned_v2",  # replace with your own repo
    repo_type="model",
)

model.safetensors:   0%|          | 0.00/372M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.71k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Adekiii/HAR-medication-finetuned_v2/commit/c346152e4d9d2f6bbabbee131a8a1b390f0e39a6', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c346152e4d9d2f6bbabbee131a8a1b390f0e39a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Adekiii/HAR-medication-finetuned_v2', endpoint='https://huggingface.co', repo_type='model', repo_id='Adekiii/HAR-medication-finetuned_v2'), pr_revision=None, pr_num=None)