In [12]:
#Name : Manahil Sarwar
#Section : AI-K
#Roll No : 21I-0293

In [None]:
#Importing Libraries
import os
import cv2
import torch
from torchvision import transforms
from transformers import ViTFeatureExtractor
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from PIL import Image
import random
from transformers import ViTForImageClassification, ViTFeatureExtractor
import torch.optim as optim
from transformers import get_linear_schedule_with_warmup
from torch import tensor
from tqdm import tqdm 

### Data Preprocessing

In [1]:
#Define paths
dataset_path='/kaggle/input/hmdb-human-activity-recognition/HMDB_dataset'
frame_output_path='/kaggle/working/frames'

#Extract frames
def extract_frames(video_path,output_dir,frame_interval=5):
    video_capture=cv2.VideoCapture(video_path)
    success,frame_count=True,0
    while success:
        success,frame=video_capture.read()
        if frame_count % frame_interval==0 and success:
            frame_resized=cv2.resize(frame,(224,224))
            cv2.imwrite(f"{output_dir}/frame_{frame_count}.jpg",frame_resized)
        frame_count+=1
    video_capture.release()

#Loop through HMDB videos and extract frames
for action_class in os.listdir(dataset_path):
    class_dir=os.path.join(dataset_path,action_class)
    print("Extracting Frames from Class : ",class_dir)
    for video_file in os.listdir(class_dir):
        video_path=os.path.join(class_dir,video_file)
        output_dir=os.path.join(frame_output_path,action_class,video_file.split('.')[0])
        os.makedirs(output_dir,exist_ok=True)
        extract_frames(video_path,output_dir)

#Define image transformations for data augmentation
data_transforms=transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5,0.5,0.5],std=[0.5,0.5,0.5])
])

Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/kick_ball
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/catch
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/shoot_ball
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/climb_stairs
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/punch
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/stand
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/smoke
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/clap
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/somersault
Extracting Frames from Class :  /kaggle/input/hmdb-human-activity-recognition/HMDB_dataset/drink
Extracting

In [7]:
#Create a dictionary to map each action class to an integer
class_names=sorted(os.listdir(frame_output_path))
label_mapping={class_name:idx for idx, class_name in enumerate(class_names)}

#Dataset Classes
class HMDBDataset(Dataset):
    def __init__(self,frame_paths,labels,transform=None):
        self.frame_paths=frame_paths
        self.labels=labels
        self.transform=transform
    def __len__(self):
        return len(self.frame_paths)
    def __getitem__(self,idx):
        img_path=self.frame_paths[idx]
        image=Image.open(img_path).convert("RGB")
        label=self.labels[idx]
        label=label_mapping[label]
        if self.transform:
            image=self.transform(image)
        return image,label


#Load image paths and labels
frame_paths,labels=[],[]
for action_class in os.listdir(frame_output_path):
    label=action_class
    class_dir=os.path.join(frame_output_path,action_class)
    for video_folder in os.listdir(class_dir):
        frames=os.listdir(os.path.join(class_dir,video_folder))
        frame_paths.extend([os.path.join(class_dir,video_folder,frame) for frame in frames])
        labels.extend([label]*len(frames))

#Split into train and validation
train_paths,val_paths,train_labels,val_labels=train_test_split(frame_paths,labels,test_size=0.2,stratify=labels)
#Dataloaders
train_dataset=HMDBDataset(train_paths,train_labels,transform=data_transforms)
val_dataset=HMDBDataset(val_paths,val_labels,transform=data_transforms)
train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
val_loader=DataLoader(val_dataset,batch_size=16,shuffle=False)

### Load the Vision Tranformer Model

In [8]:
#Load ViT model and feature extractor
feature_extractor=ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
model=ViTForImageClassification.from_pretrained("google/vit-base-patch16-224",num_labels=51,ignore_mismatched_sizes=True)
print(model)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([51]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([51, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_fe

### Train the Model

In [10]:
#Training configurations
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion=torch.nn.CrossEntropyLoss()
optimizer=optim.AdamW(model.parameters(),lr=3e-5)
scheduler=get_linear_schedule_with_warmup(optimizer,num_warmup_steps=100,num_training_steps=len(train_loader)*10)

#Training loop
best_accuracy=0
early_stopping_count=0
for epoch in range(10):
    model.train()
    print(f"Epoch {epoch + 1}/{10}")
    train_progress_bar=tqdm(train_loader,desc="Training",leave=False)
    for images,labels in train_progress_bar:
        images,labels=images.to(device),labels.to(device)
        optimizer.zero_grad()
        outputs=model(images).logits
        loss=criterion(outputs,labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_progress_bar.set_postfix(loss=loss.item())
    #Validation
    model.eval()
    correct,total=0,0
    val_progress_bar=tqdm(val_loader,desc="Validating",leave=False)
    with torch.no_grad():
        for images,labels in val_progress_bar:
            images,labels=images.to(device),labels.to(device)
            outputs=model(images).logits
            _,predicted=torch.max(outputs,1)
            total+=labels.size(0)
            correct+=(predicted==labels).sum().item()
            #Update validation progress bar
            val_progress_bar.set_postfix(accuracy=(correct/total)*100)
    accuracy=correct/total
    print(f"Epoch {epoch+1}, Accuracy: {accuracy*100:.2f}%")

    #Early Stopping & Checkpointing
    if accuracy>best_accuracy:
        best_accuracy=accuracy
        early_stopping_count=0
        torch.save(model.state_dict(),"best_model.pth")
    else:
        early_stopping_count+=1
        if early_stopping_count>=3:
            print("Early stopping triggered")
            break

Epoch 1/10


                                                                              

Epoch 1, Accuracy: 88.53%
Epoch 2/10


                                                                              

Epoch 2, Accuracy: 91.92%
Epoch 3/10


                                                                              

Epoch 3, Accuracy: 93.25%
Epoch 4/10


                                                                              

Epoch 4, Accuracy: 94.74%
Epoch 5/10


                                                                              

Epoch 5, Accuracy: 95.38%
Epoch 6/10


                                                                              

Epoch 6, Accuracy: 95.93%
Epoch 7/10


                                                                              

Epoch 7, Accuracy: 96.28%
Epoch 8/10


                                                                              

Epoch 8, Accuracy: 96.56%
Epoch 9/10


                                                                              

Epoch 9, Accuracy: 96.80%
Epoch 10/10


                                                                              

Epoch 10, Accuracy: 97.04%


### Test the Model

In [11]:
#Load best model for testing
model.load_state_dict(torch.load("best_model.pth"))
model.eval()
#Test on validation set
correct,total=0,0
with torch.no_grad():
    for images,labels in val_loader:
        images,labels=images.to(device),labels.to(device)
        outputs=model(images).logits
        _,predicted=torch.max(outputs,1)
        total+=labels.size(0)
        correct+=(predicted==labels).sum().item()
accuracy=correct/total
print(f"Final Test Accuracy: {accuracy*100:.2f}%")


  model.load_state_dict(torch.load("best_model.pth"))


Final Test Accuracy: 97.06%
