In [None]:
!pip install torchinfo

In [None]:
import time
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import numpy as np
import cv2
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision
from torchvision import transforms
from torchinfo import summary
import torchvision.models as models
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score
import PIL
import matplotlib.pyplot as plt
import seaborn as sns
import time
from collections import OrderedDict
import platform
import psutil
import random
import glob
from tqdm import tqdm
from PIL import Image
from torchvision import transforms
from torchvision.transforms import ColorJitter, RandomRotation, RandomResizedCrop
from torchvision.transforms.functional import gaussian_blur
from PIL import ImageOps
from tabulate import tabulate

In [None]:
import os
import shutil

# Source directory on the read-only Kaggle input mount
old_dir = "/kaggle/input/fishdataset/Fish Data/Aair/Raw Data"

# Legacy intermediate location ("Raw Data"); no longer written to, only cleaned
# up in case an earlier or interrupted run left it behind
copy_dir = "/kaggle/working/Raw Data"

# Final writable copy, stored under the shorter name "Raw"
new_dir = "/kaggle/working/Raw"

# NOTE(review): no code visible in this notebook reads from new_dir (the
# data-collection cell scans /kaggle/input) -- confirm this copy is still needed.

# Step 1: Remove leftovers from previous runs so a partial copy is never reused
for stale_dir in (new_dir, copy_dir):
    if os.path.exists(stale_dir):
        shutil.rmtree(stale_dir)  # Removes the directory and everything inside it

# Step 2: Copy the original directory straight to its final name
shutil.copytree(old_dir, new_dir)

# Step 3: Verify
print("Directories in /kaggle/working/:")
print(os.listdir("/kaggle/working/"))

In [None]:
# Seed the Python, NumPy and PyTorch random number generators with one shared value
seed = 1
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [None]:
import os
import pandas as pd

# Root of the dataset: each entry directly below it is used as a class label
data_dir = "/kaggle/input/fishdataset/Fish Data"

# Image file extensions to collect (compared case-insensitively)
image_extensions = (".jpg", ".jpeg")

# Collect file paths and labels
file_paths = []
labels = []

# NOTE(review): the copy cell earlier reads from ".../Aair/Raw Data"; if class
# folders are named "Raw Data" rather than "Raw", their raw images are skipped
# by the sub-folder list below -- confirm the folder names in the dataset.

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the shuffle and the splits below) identical across runs.
for class_name in sorted(os.listdir(data_dir)):
    class_dir = os.path.join(data_dir, class_name)
    for subdir in ["Augmented", "Raw"]:
        subdir_path = os.path.join(class_dir, subdir)
        if os.path.isdir(subdir_path):  # Ensure the directory exists
            for img_file in sorted(os.listdir(subdir_path)):
                # Case-insensitive match so files such as "x.JPG" are not dropped
                if img_file.lower().endswith(image_extensions):
                    file_paths.append(os.path.join(subdir_path, img_file))
                    labels.append(class_name)

# Fail early with a clear message instead of an opaque error inside train_test_split
if not file_paths:
    raise FileNotFoundError(f"No image files found under {data_dir}")

# Create DataFrame
df = pd.DataFrame({"file_path": file_paths, "label": labels})
# Shuffle with an explicit random_state (same value as the splits below) so that
# re-running this cell does not depend on the global NumPy RNG state
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split dataset into train (70%), validation (15%) and test (15%) sets, stratified by label
train_dataframe, temp_dataframe = train_test_split(df, test_size=0.30, stratify=df['label'], random_state=42)
valid_dataframe, test_df = train_test_split(temp_dataframe, test_size=0.50, stratify=temp_dataframe['label'], random_state=42)

print("Training Data:", len(train_dataframe))
print("Validation Data:", len(valid_dataframe))
print("Test Data:", len(test_df))
print("-")
print("Total amounts of data in the dataset:", len(df))

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Directory that receives the training checkpoints; created on first use
save_path_checkpoints = "/kaggle/working/checkpoints"
os.makedirs(save_path_checkpoints, exist_ok=True)

In [None]:
from torchvision import transforms

# Preprocessing pipelines: both resize to 224x224, convert to a tensor and
# normalize per channel; only the training pipeline adds a random horizontal flip.
train_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Evaluation pipeline (validation and test): deterministic, no augmentation
val_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
from torch.utils.data import Dataset
from PIL import Image

class FishDataset(Dataset):
    """Image dataset backed by a DataFrame with 'file_path' and 'label' columns.

    Args:
        dataframe: rows describing the samples; its index is reset internally.
        transform: optional callable applied to the loaded RGB PIL image.
        class_to_idx: optional mapping from label name to integer index. Pass the
            same mapping to every split so label indices agree between them.
            When omitted, the mapping is built from the sorted unique labels of
            ``dataframe``.

    Raises:
        ValueError: if ``dataframe`` contains a label missing from ``class_to_idx``.
    """

    def __init__(self, dataframe, transform=None, class_to_idx=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.transform = transform
        if class_to_idx is None:
            # Default: derive the mapping from the labels present in this dataframe
            class_to_idx = {
                cls_name: idx
                for idx, cls_name in enumerate(sorted(dataframe['label'].unique()))
            }
        # Copy so later changes to the caller's dict do not affect this dataset
        self.class_to_idx = dict(class_to_idx)

        # Surface unmapped labels now rather than as a KeyError during iteration
        unknown = set(self.dataframe['label'].unique()) - set(self.class_to_idx)
        if unknown:
            raise ValueError(f"Labels missing from class_to_idx: {sorted(unknown)}")

    def __len__(self):
        """Return the number of samples."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for row ``idx``; the label is a long tensor."""
        row = self.dataframe.iloc[idx]  # single positional lookup for both columns
        label = self.class_to_idx[row['label']]  # Convert label to integer index

        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of waiting for garbage collection.
        with Image.open(row['file_path']) as img:
            image = img.convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.long)

In [None]:
from torch.utils.data import DataLoader
# Batch sizes: one for training, one shared by validation and test
train_batch = 32
val_batch = 8

# Create datasets (augmentation only on the training split)
train_dataset = FishDataset(train_dataframe, transform=train_transform)
valid_dataset = FishDataset(valid_dataframe, transform=val_transform)
test_dataset = FishDataset(test_df, transform=val_transform)

# Create dataloaders; batch sizes come from the variables above so that they
# cannot drift from the values used elsewhere (e.g. the model summary)
dataloader_train_dataset = DataLoader(train_dataset, batch_size=train_batch, shuffle=True, num_workers=4)
dataloader_valid_dataset = DataLoader(valid_dataset, batch_size=val_batch, shuffle=False, num_workers=4)
dataloader_test_dataset = DataLoader(test_dataset, batch_size=val_batch, shuffle=False, num_workers=4)

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models

# Load pretrained ResNet50
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Replace the final fully connected layer; the number of outputs is taken from
# the training dataset's label mapping instead of a hard-coded class count
num_classes = len(train_dataset.class_to_idx)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, num_classes)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define loss function and optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()
summary(model, input_size=(train_batch, 3, 224, 224))

In [None]:
import time
import os
import pandas as pd
import torch
from tqdm.notebook import tqdm

def train_model(model, criterion, optimizer, dataloader_train_dataset, dataloader_valid_dataset, num_epochs=20, early_stop_patience=5, save_path_checkpoints="checkpoints"):
    """Train ``model`` with per-epoch validation, best-model checkpointing and early stopping.

    Args:
        model: torch module to train; batches are moved to the device of its
            first parameter (CPU if it has none).
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer updating the model's parameters.
        dataloader_train_dataset: sized iterable of ``(inputs, labels)`` training batches.
        dataloader_valid_dataset: iterable of ``(inputs, labels)`` validation batches.
        num_epochs: maximum number of epochs to run.
        early_stop_patience: stop once this many consecutive epochs pass either
            without a validation-accuracy improvement or with validation loss
            above training loss.
        save_path_checkpoints: directory that receives ``model.pt``; created if missing.

    Returns:
        Tuple ``(train_loss_history, train_acc_history, val_loss_history,
        val_acc_history)``, each a list with one entry per completed epoch.

    Raises:
        ValueError: if the training or validation dataloader yields no samples.
    """
    # Follow the model's own device rather than relying on a notebook-level global
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # torch.save does not create missing directories
    os.makedirs(save_path_checkpoints, exist_ok=True)
    filepath = os.path.join(save_path_checkpoints, "model.pt")

    train_loss_history = []
    train_acc_history = []
    val_loss_history = []
    val_acc_history = []

    # Start below any reachable accuracy so the first epoch always writes a checkpoint
    best_val_acc = float("-inf")
    consecutive_no_improvement = 0
    num_epochs_loss_greater = 0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        progress_bar = tqdm(enumerate(dataloader_train_dataset), total=len(dataloader_train_dataset))
        for i, (inputs, labels) in progress_bar:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            _, predicted = torch.max(outputs, 1)
            # Weight the batch loss by batch size to get a per-sample average
            # (presumes the criterion returns a batch mean -- confirm for custom losses)
            running_loss += loss.item() * inputs.size(0)
            total_train += labels.size(0)
            correct_train += (predicted == labels).sum().item()

            progress_bar.set_description(f"Epoch {epoch+1}/{num_epochs}")
            progress_bar.set_postfix(loss=running_loss / total_train, acc=correct_train / total_train)

        if total_train == 0:
            raise ValueError("dataloader_train_dataset yielded no samples")

        epoch_train_loss = running_loss / total_train
        epoch_train_acc = correct_train / total_train
        train_loss_history.append(epoch_train_loss)
        train_acc_history.append(epoch_train_acc)
        print('Training Loss: {:.3f} Acc: {:.3f}'.format(epoch_train_loss, epoch_train_acc))

        # Validation phase
        model.eval()
        running_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for inputs, labels in dataloader_valid_dataset:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                _, predicted = torch.max(outputs, 1)
                running_loss += loss.item() * inputs.size(0)
                total_val += labels.size(0)
                correct_val += (predicted == labels).sum().item()

        if total_val == 0:
            raise ValueError("dataloader_valid_dataset yielded no samples")

        epoch_val_loss = running_loss / total_val
        epoch_val_acc = correct_val / total_val
        val_loss_history.append(epoch_val_loss)
        val_acc_history.append(epoch_val_acc)
        print('Validation Loss: {:.3f} Acc: {:.3f}'.format(epoch_val_loss, epoch_val_acc))

        # Checkpoint on every new best validation accuracy
        if epoch_val_acc > best_val_acc:
            best_val_acc = epoch_val_acc
            best_epoch = epoch + 1
            checkpoint = {
                "epoch": epoch + 1,
                "model_weight": model.state_dict(),
                "optimizer_state": optimizer.state_dict()
            }
            torch.save(checkpoint, filepath)
            print(f"Best model saved at epoch {best_epoch} with validation accuracy: {best_val_acc:.3f}")
            consecutive_no_improvement = 0
        else:
            consecutive_no_improvement += 1

        # Second stopping signal: consecutive epochs with validation loss above training loss
        if epoch_val_loss > epoch_train_loss:
            num_epochs_loss_greater += 1
        else:
            num_epochs_loss_greater = 0

        if consecutive_no_improvement >= early_stop_patience or num_epochs_loss_greater >= early_stop_patience:
            print(f"Early stopping criteria met. Training stopped at epoch {epoch + 1}.")
            break

    return train_loss_history, train_acc_history, val_loss_history, val_acc_history


# Run the training loop and measure its wall-clock duration
start_time = time.time()
train_loss_history, train_acc_history, val_loss_history, val_acc_history = train_model(
    model,
    criterion,
    optimizer,
    dataloader_train_dataset,
    dataloader_valid_dataset,
    save_path_checkpoints=save_path_checkpoints,
)
end_time = time.time()
training_time = end_time - start_time
print(f"Training Time: {training_time:.2f} seconds ---> {training_time/60:.2f} minutes")

# Collect the per-epoch metrics into one table (epochs numbered from 1) and
# export it as an Excel sheet in the working directory
data = dict([
    ('Epoch', [number + 1 for number in range(len(train_loss_history))]),
    ('Train Loss', train_loss_history),
    ('Train Accuracy', train_acc_history),
    ('Validation Loss', val_loss_history),
    ('Validation Accuracy', val_acc_history),
])
history = pd.DataFrame(data)
history.to_excel('/kaggle/working/training_data.xlsx', index=False)