# Login to Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset folders and TensorBoard logs used below are reachable as files.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Switch the working directory to the mounted Drive root. The path is relative,
# so this presumably relies on the notebook starting in /content - TODO confirm.
%cd drive/MyDrive

# Teacher Student model

## Code

In [None]:
import IPython.display as display

import glob
from collections import Counter

import math
import pandas as pd

import librosa
import librosa.display
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import torch
import torchaudio
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from pathlib import Path
from PIL import Image
import soundfile as sf
from torch.utils.data import Dataset
from torchvision import models, transforms

import tensorflow as tf
from torch.utils.tensorboard import SummaryWriter
import datetime

from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd

import torch.optim.lr_scheduler as lr_scheduler

def model_size(model):
    """Return the memory footprint of ``model`` in MiB.

    The footprint is the byte size (element count times bytes per element)
    summed over every parameter and every registered buffer of the model.
    """
    def _total_bytes(tensors):
        # Bytes occupied by an iterable of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    occupied = _total_bytes(model.parameters()) + _total_bytes(model.buffers())
    return occupied / 1024**2

def evaluate(model, test_loader, device="cpu"):
    """Return the top-1 accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module mapping a batch of inputs to per-class scores of shape
            (batch, classes).
        test_loader: Iterable yielding ``(inputs, targets)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Fraction of correctly classified samples as a float, or ``0.0`` when
        the loader yields no samples (instead of dividing by zero).

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    num_correct = 0
    num_examples = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            # Softmax is monotonic, so the arg-max of the raw scores is the
            # predicted class; no need to normalise first.
            predicted = torch.argmax(model(inputs), dim=1)
            num_correct += (predicted == targets).sum().item()
            num_examples += targets.numel()

    if num_examples == 0:
        return 0.0
    return num_correct / num_examples

class FrequencyMask(object):
    """
    Overwrite a random horizontal band (a run of rows along dim 1) of an image
    tensor, either with the tensor's mean or with zeros.

      Example:
        >>> transforms.Compose([
        >>>     transforms.ToTensor(),
        >>>     FrequencyMask(max_width=10, use_mean=False),
        >>> ])

    """

    def __init__(self, max_width, use_mean=True):
        """
        Args:
            max_width (int): Largest number of rows a mask may cover; must be
                at least 1.
            use_mean (bool): Fill with ``tensor.mean()`` when True, with 0
                otherwise.

        Raises:
            ValueError: If ``max_width`` is smaller than 1.
        """
        if max_width < 1:
            raise ValueError("max_width must be at least 1")
        self.max_width = max_width
        self.use_mean = use_mean

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor image of
            size (C, H, W) where the frequency
            mask is to be applied.

        Returns:
            Tensor: Transformed image with Frequency Mask. The input tensor is
            modified in place and returned.
        """
        num_rows = tensor.shape[1]
        if num_rows == 0:
            return tensor
        # The band runs along dim 1, so the start row must be drawn from that
        # dimension (drawing it from the width could start the band beyond
        # the last row and leave the image unmasked).
        start = random.randrange(0, num_rows)
        # Width is uniform over 1..max_width inclusive; slicing clamps a band
        # that would run past the last row.
        end = start + random.randint(1, self.max_width)
        # The fill value is computed before the assignment overwrites data.
        fill = tensor.mean() if self.use_mean else 0
        tensor[:, start:end, :] = fill
        return tensor

    def __repr__(self):
        return (f"{self.__class__.__name__}(max_width={self.max_width}, "
                f"use_mean={self.use_mean})")


class TimeMask(object):
    """
    Overwrite a random vertical band (a run of columns along dim 2) of an
    image tensor, either with the tensor's mean or with zeros.

      Example:
        >>> transforms.Compose([
        >>>     transforms.ToTensor(),
        >>>     TimeMask(max_width=10, use_mean=False),
        >>> ])

    """

    def __init__(self, max_width, use_mean=True):
        """
        Args:
            max_width (int): Largest number of columns a mask may cover; must
                be at least 1.
            use_mean (bool): Fill with ``tensor.mean()`` when True, with 0
                otherwise.

        Raises:
            ValueError: If ``max_width`` is smaller than 1.
        """
        if max_width < 1:
            raise ValueError("max_width must be at least 1")
        self.max_width = max_width
        self.use_mean = use_mean

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Tensor image of
            size (C, H, W) where the time mask
            is to be applied.

        Returns:
            Tensor: Transformed image with Time Mask. The input tensor is
            modified in place and returned.
        """
        num_cols = tensor.shape[2]
        if num_cols == 0:
            return tensor
        # The band runs along dim 2, so the start column must be drawn from
        # that dimension rather than from the height.
        start = random.randrange(0, num_cols)
        # Width is uniform over 1..max_width inclusive, so an applied mask
        # always covers at least one column; slicing clamps any overshoot.
        end = start + random.randint(1, self.max_width)
        # The fill value is computed before the assignment overwrites data.
        fill = tensor.mean() if self.use_mean else 0
        tensor[:, :, start:end] = fill
        return tensor

    def __repr__(self):
        return (f"{self.__class__.__name__}(max_width={self.max_width}, "
                f"use_mean={self.use_mean})")


class PrecomputedESC50(Dataset):
    """Dataset over a folder of pre-rendered ``*.png`` images.

    The integer label of each image is parsed from its file name: the text
    after the last ``-``, with the ``.wav.png`` suffix removed. Every sample
    is converted to RGB, turned into a tensor, normalised with fixed
    per-channel statistics, and then randomly frequency- and time-masked
    (each mask applied with probability 0.5).

    NOTE(review): the ``dpi`` argument is accepted but never used here.
    """

    def __init__(self,path, max_freqmask_width, max_timemask_width, use_mean=True, dpi=50):
        self.items = []
        for png in Path(path).glob('*.png'):
            label_text = png.name.split("-")[-1].replace(".wav.png","")
            self.items.append((png, int(label_text)))
        self.length = len(self.items)

        self.max_freqmask_width = max_freqmask_width
        self.max_timemask_width = max_timemask_width
        self.use_mean = use_mean

        freq_mask = FrequencyMask(self.max_freqmask_width, self.use_mean)
        time_mask = TimeMask(self.max_timemask_width, self.use_mean)
        pipeline = [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]),
            transforms.RandomApply([freq_mask], p=0.5),
            transforms.RandomApply([time_mask], p=0.5),
        ]
        self.img_transforms = transforms.Compose(pipeline)

    def __getitem__(self, index):
        """Return ``(transformed_image, label)`` for the sample at ``index``."""
        image_path, label = self.items[index]
        rgb_image = Image.open(image_path).convert('RGB')
        return (self.img_transforms(rgb_image), label)

    def __len__(self):
        """Number of PNG files found when the dataset was constructed."""
        return self.length

# Define a function to plot and log confusion matrix to TensorBoard
def plot_confusion_matrix(model, test_loader, device="cpu"):
    """Draw the confusion matrix of ``model`` over ``test_loader`` as a heatmap.

    The model is put in eval mode, every batch is classified without gradient
    tracking, and the resulting matrix is rendered with seaborn.

    Returns:
        The current matplotlib figure containing the heatmap.
    """
    model.eval()
    predicted_classes, true_classes = [], []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            class_probs = F.softmax(model(inputs), dim=1)
            batch_predictions = torch.max(class_probs, dim=1)[1].cpu().numpy()
            predicted_classes.extend(batch_predictions)
            true_classes.extend(targets.cpu().numpy())

    # Rows follow the true labels, columns the predictions.
    cm = confusion_matrix(true_classes, predicted_classes)

    plt.figure(figsize=(20, 16))
    sn.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=True, yticklabels=True)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')

    return plt.gcf()

# Define a function to log predictions vs. actuals as images to TensorBoard
def log_predictions_vs_actuals(model, data_loader, device="cpu", num_batches=5):
    """Render a 4x4 grid of samples annotated with actual vs. predicted labels.

    Up to ``num_batches`` batches are drawn from ``data_loader``; a new grid is
    built for each one and only the grid of the last processed batch is
    returned (earlier figures are closed so they do not accumulate).

    Args:
        model: Classifier producing per-class scores of shape (batch, classes).
        data_loader: Iterable yielding ``(inputs, targets)`` batches whose
            inputs are laid out as (batch, channels, height, width).
        device: Device the batches are moved to.
        num_batches: Maximum number of batches to process.

    Returns:
        The matplotlib figure for the last processed batch, or ``None`` when no
        batch was processed (empty loader or ``num_batches <= 0``).
    """
    model.eval()

    fig = None
    with torch.no_grad():
        for batch_index, (inputs, targets) in enumerate(data_loader):
            if batch_index >= num_batches:
                break

            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            probabilities, predicted_labels = torch.max(F.softmax(output, dim=1), dim=1)

            # Channels-last layout is what imshow expects.
            inputs_np = inputs.permute(0, 2, 3, 1).cpu().numpy()

            # Drop the previous batch's figure; only the last one is returned.
            if fig is not None:
                plt.close(fig)
            fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(12, 12))

            for i, ax in enumerate(axes.flat):
                ax.axis("off")
                # A batch may hold fewer than 16 samples (e.g. the last one);
                # leave the remaining cells empty instead of indexing past it.
                if i >= len(inputs_np):
                    continue

                ax.imshow(inputs_np[i])

                actual_label = targets[i].item()
                predicted_label = predicted_labels[i].item()
                probability = probabilities[i].item()

                # Color the title based on correctness
                title_color = 'green' if actual_label == predicted_label else 'red'

                ax.set_title(f"Actual: {actual_label}\nPredicted: {predicted_label}\nProb: {probability:.2f}", color=title_color)

            plt.tight_layout()
    return fig

class EarlyStopping:
    """Signal when validation accuracy has stopped improving.

    ``step`` is called once per epoch. Each call that does not strictly beat
    the best accuracy seen so far increments a counter; once the counter
    exceeds ``patience`` the ``early_stop`` flag is set and stays set.
    """

    def __init__(self, patience, verbose=False):
        self.patience, self.verbose = patience, verbose
        self.counter = 0
        self.best_valid_accuracy = 0.0
        self.early_stop = False

    def step(self, valid_accuracy):
        """Record this epoch's accuracy and return the current stop flag."""
        if valid_accuracy > self.best_valid_accuracy:
            # New best: remember it and restart the countdown.
            self.best_valid_accuracy = valid_accuracy
            self.counter = 0
            return self.early_stop

        self.counter += 1
        if self.counter <= self.patience:
            return self.early_stop

        self.early_stop = True
        if self.verbose:
            print("Early stopping activated.")
        return self.early_stop

class LearningRateScheduler(lr_scheduler._LRScheduler):
    """Reduce the learning rate when validation accuracy plateaus.

    Thin wrapper around ``ReduceLROnPlateau``. The monitored quantity is an
    accuracy (higher is better), so the wrapped scheduler runs in ``'max'``
    mode; in the default ``'min'`` mode every improving epoch would count as
    a bad one and the rate would be cut while the model is still improving.

    Note:
        The base-class ``__init__`` is intentionally not invoked; all
        scheduling is delegated to the wrapped ``ReduceLROnPlateau``.

    Args:
        optimizer: Optimizer whose parameter-group learning rates are managed.
        patience: Number of non-improving ``step`` calls tolerated before the
            rates are multiplied by ``factor``.
        factor: Multiplicative reduction applied on a plateau.
        verbose: When True, print a line for every parameter group whose
            learning rate changed.
    """

    def __init__(self, optimizer, patience, factor=0.1, verbose=False):
        self.optimizer = optimizer
        self.patience = patience
        self.factor = factor
        self.verbose = verbose
        # ReduceLROnPlateau's own ``verbose`` flag is deprecated in recent
        # PyTorch releases, so reductions are reported by ``step`` instead.
        self.lr_scheduler = lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='max', patience=self.patience, factor=self.factor)

    def step(self, valid_accuracy):
        """Feed this epoch's validation accuracy to the scheduler.

        Returns:
            The learning rate of the first parameter group after the update.
        """
        previous_lrs = [group['lr'] for group in self.optimizer.param_groups]
        self.lr_scheduler.step(valid_accuracy)
        if self.verbose:
            for index, (old_lr, group) in enumerate(zip(previous_lrs, self.optimizer.param_groups)):
                if group['lr'] != old_lr:
                    print(f"Reducing learning rate of group {index} to {group['lr']:.4e}.")
        return self.optimizer.param_groups[0]['lr']

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Folders with the pre-rendered PNGs, relative to the working directory.
PATH_ESC50_TRAIN = "./train1/"
PATH_ESC50_VALID = "./valid1/"
PATH_ESC50_TEST = "./test/"

bs = 16

# NOTE(review): all three datasets share the same random-masking pipeline, so
# validation and test samples are also augmented - confirm this is intended.
esc50pre_train = PrecomputedESC50(PATH_ESC50_TRAIN, max_freqmask_width=10, max_timemask_width=10)
esc50pre_valid = PrecomputedESC50(PATH_ESC50_VALID, max_freqmask_width=10, max_timemask_width=10)
esc50pre_test = PrecomputedESC50(PATH_ESC50_TEST, max_freqmask_width=10, max_timemask_width=10)

# Every loader shuffles, including the validation and test ones.
esc50_train_loader = torch.utils.data.DataLoader(esc50pre_train, batch_size=bs, shuffle=True)
esc50_val_loader = torch.utils.data.DataLoader(esc50pre_valid, batch_size=bs, shuffle=True)
esc50_test_loader = torch.utils.data.DataLoader(esc50pre_test, batch_size=bs, shuffle=True)

In [None]:
# Rough single-sample latency estimate for a model on a dummy input.
def estimate_latency(model, device="cpu"):
    """Time one forward pass of ``model`` on a random (1, 3, 224, 224) input.

    This is a simplified estimate (a single timed pass after ten warm-up
    passes) and may not be accurate for all models.

    Args:
        model: Module accepting a (1, 3, 224, 224) float tensor. It is moved
            to ``device``.
        device: Target device, given as a string or ``torch.device``.

    Returns:
        Latency of the timed forward pass in milliseconds (float). CUDA
        events are used on CUDA devices and a wall-clock timer elsewhere, so
        the function also works on machines without a GPU.

    Note:
        All passes run in eval mode without gradient tracking so the dummy
        inputs cannot alter batch-norm statistics; the model's previous
        train/eval mode is restored before returning.
    """
    import time  # local import: only this function needs it

    target = torch.device(device)
    model = model.to(target)
    input_tensor = torch.randn(1, 3, 224, 224).to(target)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Warm-up to reduce variability
            for _ in range(10):
                _ = model(input_tensor)

            if target.type == "cuda":
                # Kernels launch asynchronously; drain the warm-up work
                # first and wait for the timed pass before reading the events.
                torch.cuda.synchronize()
                start_time = torch.cuda.Event(enable_timing=True)
                end_time = torch.cuda.Event(enable_timing=True)
                start_time.record()
                _ = model(input_tensor)
                end_time.record()
                torch.cuda.synchronize()
                latency_ms = start_time.elapsed_time(end_time)
            else:
                began = time.perf_counter()
                _ = model(input_tensor)
                latency_ms = (time.perf_counter() - began) * 1000.0
    finally:
        model.train(was_training)

    return latency_ms


def train(model, optimizer, loss_fn, train_loader, val_loader, test_loader, load_model_name, epochs=20, device="cpu", log_dir='tensorboard_logs', patience=5, early_stopping_patience=10, classifier_type='MLP'):
    """Train ``model``, checkpoint its best epoch, and report test accuracy.

    Each run writes into its own ``<log_dir>/<timestamp>_<load_model_name>``
    folder, which receives the TensorBoard event files and the checkpoint
    ``best_model_<load_model_name>.pth``.

    Per epoch: one pass over ``train_loader`` with optimisation, one pass over
    ``val_loader`` without gradient tracking, a learning-rate scheduler step
    and an early-stopping check (both driven by validation accuracy), scalar
    logging, and a checkpoint whenever validation accuracy improves.

    After training the best checkpoint is loaded back into ``model`` so that
    the logged graph, confusion matrix, latency, reported accuracy and file
    size all describe the same weights, and accuracy on ``test_loader`` is
    printed.

    Args:
        model: Network to train; assumed to already live on ``device``.
        optimizer: Optimizer over the model's parameters.
        loss_fn: Callable ``loss_fn(output, targets)`` returning a scalar loss.
        train_loader, val_loader, test_loader: Loaders yielding
            ``(inputs, targets)`` batches; the first two must expose
            ``.dataset`` with a length.
        load_model_name: Name used in the run folder, checkpoint file name and
            hyperparameter table.
        epochs: Maximum number of epochs.
        device: Device the batches are moved to.
        log_dir: Parent folder for the run folders.
        patience: Plateau patience of the learning-rate scheduler.
        early_stopping_patience: Patience of the early-stopping criterion.
        classifier_type: Free-form label stored with the hyperparameters.

    Returns:
        None. Results are printed and written to TensorBoard.
    """
    ## Create folders and writer
    model_name = "best_model_" + load_model_name + ".pth"

    current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    unique_folder_name = f"{current_datetime}_{load_model_name}"
    unique_log_dir = os.path.join(log_dir, unique_folder_name)
    model_path = os.path.join(unique_log_dir, model_name)

    # Custom-scalar layout: train and validation curves share one chart.
    layout = {
        "Train and validation at same time": {
            "Loss": ["Multiline", ["Loss/Train", "Loss/Validation"]],
            "Accuracy": ["Multiline", ["Accuracy/Train", "Accuracy/Validation"]],
        },
    }

    writer = SummaryWriter(log_dir=unique_log_dir)
    writer.add_custom_scalars(layout)

    ## use early_stopping and scheduler learning rate
    early_stopping = EarlyStopping(patience=early_stopping_patience, verbose=True)
    scheduler = LearningRateScheduler(optimizer, patience=patience, factor=0.1, verbose=True)

    ## start training
    best_valid_accuracy = 0.0
    best_model_state = None

    for epoch in range(1, epochs + 1):
        training_loss = 0.0
        valid_loss = 0.0
        model.train()

        # Initialize variables for train accuracy calculation
        num_correct_train = 0
        num_examples_train = 0

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()
            # Weight by batch size so the epoch loss is a per-sample average.
            training_loss += loss.item() * inputs.size(0)

            # Calculate the number of correct predictions in the current batch
            correct = torch.eq(torch.max(F.softmax(output, dim=1), dim=1)[1], targets).view(-1)
            num_correct_train += torch.sum(correct).item()
            num_examples_train += correct.shape[0]

        training_loss /= len(train_loader.dataset)
        train_accuracy = num_correct_train / num_examples_train

        model.eval()
        num_correct = 0
        num_examples = 0

        # No gradients are needed for validation; skipping autograd avoids
        # building a graph for every validation batch.
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs = inputs.to(device)
                output = model(inputs)
                targets = targets.to(device)
                loss = loss_fn(output, targets)
                valid_loss += loss.item() * inputs.size(0)
                correct = torch.eq(torch.max(F.softmax(output, dim=1), dim=1)[1], targets).view(-1)

                num_correct += torch.sum(correct).item()
                num_examples += correct.shape[0]

        valid_loss /= len(val_loader.dataset)
        valid_accuracy = num_correct / num_examples

        # Step the scheduler on validation accuracy; it returns the learning
        # rate of the first parameter group.
        current_lr = scheduler.step(valid_accuracy)

        print('Epoch: {}, Learning Rate: {}, Training Loss: {:.2f}, Training Accuracy: {:.4f}, Validation Loss: {:.2f}, Validation Accuracy: {:.4f}'.format(epoch, current_lr, training_loss, train_accuracy, valid_loss, valid_accuracy))

        # Log the epoch's scalars to TensorBoard
        writer.add_scalar('Learning Rate', current_lr, epoch)
        writer.add_scalar('Loss/Train', training_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_accuracy, epoch)
        writer.add_scalar('Loss/Validation', valid_loss, epoch)
        writer.add_scalar('Accuracy/Validation', valid_accuracy, epoch)

        early_stop = early_stopping.step(valid_accuracy)
        if early_stop:
            break  # Stop training if early stopping is activated

        # Save the best model based on validation accuracy
        if valid_accuracy > best_valid_accuracy:
            best_valid_accuracy = valid_accuracy
            best_model_state = model.state_dict()
            # Save the best model state to a file
            torch.save(best_model_state, model_path)

    if best_model_state is None:
        # No epoch ever beat the initial best accuracy of 0.0, so nothing was
        # checkpointed; save the current weights so the steps below that read
        # model_path do not fail on a missing file.
        torch.save(model.state_dict(), model_path)

    print(f"\n Model has been saved to {model_path} \n")

    # Report on the checkpointed weights rather than on the last epoch's.
    model.load_state_dict(torch.load(model_path))

    # Inspect the model
    writer.add_graph(model, inputs)
    writer.add_figure('Confusion Matrix', plot_confusion_matrix(model, val_loader, device))
    # writer.add_figure(f"Predictions vs. Actuals", log_predictions_vs_actuals(model, val_loader, device=device, num_batches=1))

    # Add hyperparameters to TensorBoard
    hyperparameters = {
        'Feature Extractor': load_model_name,
        'Model Accuracy': best_valid_accuracy,
        'Params (M)': sum(p.numel() for p in model.parameters()) / 1e6,  # Convert to million parameters
        'Size of model (MB)': os.path.getsize(model_path) / (1024 * 1024),  # Size in MB
        'Latency of model (ms)': estimate_latency(model, device),  # Calculate latency with a dummy input
        'Classifier type': classifier_type,
        'Training type': 'Normal',
    }

    writer.add_hparams(hparam_dict=hyperparameters, metric_dict={})
    # Print hyperparameters with .4f
    for key, value in hyperparameters.items():
        if isinstance(value, float):
            print(f'{key}: {value:.4f}')
        else:
            print(f'{key}: {value}')

    # Close the TensorBoard SummaryWriter
    writer.close()

    test_accuracy = evaluate(model, test_loader, device=device)
    print(f"\n Test Accuracy: {test_accuracy * 100:.2f}%")

In [None]:
def train_knowledge_distillation(teacher, student, optimizer, ce_loss, train_loader, val_loader, test_loader, load_model_name, epochs, T, soft_target_loss_weight, ce_loss_weight, device, log_dir='tensorboard_logs', patience=5, early_stopping_patience=10, classifier_type='MLP'):
    """Train ``student`` against hard labels and a frozen ``teacher``.

    The training loss is
    ``soft_target_loss_weight * soft_loss + ce_loss_weight * ce_loss(student_logits, labels)``
    where ``soft_loss`` is the cross-entropy between the temperature-softened
    teacher and student distributions, averaged over the batch and scaled by
    ``T**2``. Validation uses the hard-label loss only.

    Each run writes into its own ``<log_dir>/<timestamp>_<load_model_name>``
    folder (TensorBoard events plus ``best_model_<load_model_name>.pth``).
    The student is checkpointed whenever validation accuracy improves; after
    training the best checkpoint is loaded back so that the logged graph,
    confusion matrix, latency, reported accuracy and file size all describe
    the same weights, and accuracy on ``test_loader`` is printed.

    Args:
        teacher: Trained network producing logits; kept in eval mode and never
            updated.
        student: Network being trained; assumed to already live on ``device``.
        optimizer: Optimizer over the student's parameters.
        ce_loss: Callable ``ce_loss(logits, labels)`` returning a scalar loss.
        train_loader, val_loader, test_loader: Loaders yielding
            ``(inputs, labels)`` batches; the first two must expose
            ``.dataset`` with a length.
        load_model_name: Name used in the run folder, checkpoint file name and
            hyperparameter table.
        epochs: Maximum number of epochs.
        T: Softmax temperature applied to both sets of logits.
        soft_target_loss_weight: Weight of the distillation term.
        ce_loss_weight: Weight of the hard-label term.
        device: Device the batches are moved to.
        log_dir: Parent folder for the run folders.
        patience: Plateau patience of the learning-rate scheduler.
        early_stopping_patience: Patience of the early-stopping criterion.
        classifier_type: Free-form label stored with the hyperparameters.

    Returns:
        None. Results are printed and written to TensorBoard.
    """
    ## Create folders and writer
    model_name = "best_model_" + load_model_name + ".pth"

    current_datetime = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    unique_folder_name = f"{current_datetime}_{load_model_name}"
    unique_log_dir = os.path.join(log_dir, unique_folder_name)
    model_path = os.path.join(unique_log_dir, model_name)

    # Custom-scalar layout: train and validation curves share one chart.
    layout = {
        "Train and validation at same time": {
            "Loss": ["Multiline", ["Loss/Train", "Loss/Validation"]],
            "Accuracy": ["Multiline", ["Accuracy/Train", "Accuracy/Validation"]],
        },
    }

    writer = SummaryWriter(log_dir=unique_log_dir)
    writer.add_custom_scalars(layout)

    ## use early_stopping and scheduler learning rate
    early_stopping = EarlyStopping(patience=early_stopping_patience, verbose=True)
    scheduler = LearningRateScheduler(optimizer, patience=patience, factor=0.1, verbose=True)

    ## start training
    best_valid_accuracy = 0.0
    best_model_state = None
    teacher.eval()  # Teacher set to evaluation mode

    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        num_correct_train = 0
        num_examples_train = 0
        student.train()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()

            # Forward pass with the teacher model - do not save gradients here as we do not change the teacher's weights
            with torch.no_grad():
                teacher_logits = teacher(inputs)

            # Forward pass with the student model
            student_logits = student(inputs)

            # Temperature-softened teacher probabilities and student
            # log-probabilities.
            soft_targets = nn.functional.softmax(teacher_logits / T, dim=-1)
            soft_prob = nn.functional.log_softmax(student_logits / T, dim=-1)

            # Calculate the soft targets loss. Scaled by T**2 as suggested by the authors of the paper "Distilling the knowledge in a neural network"
            soft_targets_loss = -torch.sum(soft_targets * soft_prob) / soft_prob.size()[0] * (T**2)

            # Calculate the true label loss
            label_loss = ce_loss(student_logits, labels)

            # Weighted sum of the two losses
            loss = soft_target_loss_weight * soft_targets_loss + ce_loss_weight * label_loss

            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch loss is a per-sample average,
            # computed the same way as the validation loss below.
            running_loss += loss.item() * inputs.size(0)

            # Calculate the number of correct predictions in the current batch
            correct = torch.eq(torch.max(F.softmax(student_logits, dim=1), dim=1)[1], labels).view(-1)
            num_correct_train += torch.sum(correct).item()
            num_examples_train += correct.shape[0]

        # Calculate training loss and accuracy
        training_loss = running_loss / len(train_loader.dataset)
        train_accuracy = num_correct_train / num_examples_train

        # Validation
        student.eval()
        num_correct_val = 0
        num_examples_val = 0
        valid_loss = 0.0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)

                student_logits = student(inputs)

                # Calculate the true label loss
                label_loss = ce_loss(student_logits, labels)

                valid_loss += label_loss.item() * inputs.size(0)

                # Calculate the number of correct predictions in the current batch
                correct = torch.eq(torch.max(F.softmax(student_logits, dim=1), dim=1)[1], labels).view(-1)
                num_correct_val += torch.sum(correct).item()
                num_examples_val += correct.shape[0]

        # Calculate validation accuracy
        valid_accuracy = num_correct_val / num_examples_val
        valid_loss /= len(val_loader.dataset)

        # Step the scheduler on validation accuracy; it returns the learning
        # rate of the first parameter group.
        current_lr = scheduler.step(valid_accuracy)

        # Print metrics
        print(f"Epoch {epoch}, Learning Rate: {current_lr}, Training Loss: {training_loss:.2f}, Training Accuracy: {train_accuracy:.4f}, Validation Loss: {valid_loss:.2f}, Validation Accuracy: {valid_accuracy:.4f}")

        # Log the epoch's scalars to TensorBoard
        writer.add_scalar('Learning Rate', current_lr, epoch)
        writer.add_scalar('Loss/Train', training_loss, epoch)
        writer.add_scalar('Accuracy/Train', train_accuracy, epoch)
        writer.add_scalar('Loss/Validation', valid_loss, epoch)
        writer.add_scalar('Accuracy/Validation', valid_accuracy, epoch)

        early_stop = early_stopping.step(valid_accuracy)
        if early_stop:
            break  # Stop training if early stopping is activated

        # Save the best model based on validation accuracy
        if valid_accuracy > best_valid_accuracy:
            best_valid_accuracy = valid_accuracy
            best_model_state = student.state_dict()
            # Save the best model state to a file
            torch.save(best_model_state, model_path)

    if best_model_state is None:
        # No epoch ever beat the initial best accuracy of 0.0, so nothing was
        # checkpointed; save the current weights so the steps below that read
        # model_path do not fail on a missing file.
        torch.save(student.state_dict(), model_path)

    print(f"\n Model has been saved to {model_path}\n")

    # Report on the checkpointed weights rather than on the last epoch's.
    student.load_state_dict(torch.load(model_path))

    # Inspect the model
    writer.add_graph(student, inputs)
    writer.add_figure('Confusion Matrix', plot_confusion_matrix(student, val_loader, device))
    # writer.add_figure(f"Predictions vs. Actuals", log_predictions_vs_actuals(student, val_loader, device=device, num_batches=1))

    # Add hyperparameters to TensorBoard
    hyperparameters = {
        'Feature Extractor': load_model_name,
        'Model Accuracy': best_valid_accuracy,
        'Params (M)': sum(p.numel() for p in student.parameters()) / 1e6,  # Convert to million parameters
        'Size of model (MB)': os.path.getsize(model_path) / (1024 * 1024),  # Size in MB
        'Latency of model (ms)': estimate_latency(student, device),  # Calculate latency with a dummy input
        'Classifier type': classifier_type,
        'Training type': 'Teacher Student Model',
    }

    writer.add_hparams(hparam_dict=hyperparameters, metric_dict={})
    # Print hyperparameters with .4f
    for key, value in hyperparameters.items():
        if isinstance(value, float):
            print(f'{key}: {value:.4f}')
        else:
            print(f'{key}: {value}')

    # Close the TensorBoard SummaryWriter
    writer.close()

    test_accuracy = evaluate(student, test_loader, device=device)
    print(f"\n Test Accuracy: {test_accuracy * 100:.2f}%")

## Load model

In [None]:
# Rebuild the teacher architecture (ResNet50 with the MLP head used during
# training) and restore its weights from a saved checkpoint.
# Alternative backbone: models.resnext101_32x8d(pretrained=False)
model = models.resnet50(pretrained=False)

# The stock classifier is swapped for a 500-unit MLP ending in 50 outputs.
num_features = model.fc.in_features
classifier_head = [
    nn.Linear(num_features, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50),
]
model.fc = nn.Sequential(*classifier_head)

# Other checkpoints recorded in this notebook:
#   tensorboard_logs/2023-11-27_062728_resnet50/best_model_resnet50.pth
#   tensorboard_logs/2023-12-01_104615_resnet50/best_model_resnet50.pth   (86.5 %)
#   tensorboard_logs/2023-11-13_075020_resnext101_32x8d/best_model_resnext101_32x8d.pth
#   tensorboard_logs/2023-11-28_071915_resnext101_32x8d/best_model_resnext101_32x8d.pth
#   tensorboard_logs/2023-11-29_060228_resnext101_32x8d/best_model_resnext101_32x8d.pth
model_path = 'tensorboard_logs/2023-12-01_130404_resnet50/best_model_resnet50.pth' # 87.25%

# map_location lets a checkpoint saved on another device load onto this one.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
print(f'model from {model_path} loaded')



model from tensorboard_logs/2023-12-01_130404_resnet50/best_model_resnet50.pth loaded


0.8575

In [None]:
# Accuracy of the current `model` (presumably the teacher loaded above) on the
# test loader; as the cell's last expression, the value is displayed.
evaluate(model, esc50_test_loader, device)

## ResNet18 raw

In [None]:
# Baseline run: fine-tune a pretrained ResNet18 without a teacher (20 epochs).
load_model_name = "resnet18"
classifier_type = 'MLP'
epoches = 20
patience = 5
early_stopping_patience = 10

model_raw1 = models.resnet18(pretrained=True)

# Swap the stock classifier for a 500-unit MLP ending in 50 outputs.
num_features = model_raw1.fc.in_features
model_raw1.fc = nn.Sequential(
    nn.Linear(num_features, 500), nn.ReLU(), nn.Dropout(), nn.Linear(500, 50),
)

model_raw1.to(device)
loss_fn = nn.CrossEntropyLoss()

# One parameter group per backbone stage at 1e-4, plus the new head.
# NOTE(review): the head's learning rate is 1e-8, far below the backbone's,
# and bn1 is in no group (so it is never updated) - confirm both are intended.
backbone_stages = (
    model_raw1.conv1,
    model_raw1.layer1,
    model_raw1.layer2,
    model_raw1.layer3,
    model_raw1.layer4,
)
param_groups = [{'params': stage.parameters(), 'lr': 1e-4} for stage in backbone_stages]
param_groups.append({'params': model_raw1.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(param_groups, lr=1e-2)

train(
    model_raw1, optimizer, loss_fn,
    esc50_train_loader, esc50_val_loader, esc50_test_loader,
    load_model_name,
    epochs=epoches,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 51.0MB/s]


Epoch: 1, Learning Rate: 0.0001, Training Loss: 3.60, Training Accuracy: 0.14, Validation Loss: 3.17, Validation Accuracy: 0.38
Epoch: 2, Learning Rate: 0.0001, Training Loss: 2.99, Training Accuracy: 0.40, Validation Loss: 2.73, Validation Accuracy: 0.49
Epoch: 3, Learning Rate: 0.0001, Training Loss: 2.55, Training Accuracy: 0.58, Validation Loss: 2.31, Validation Accuracy: 0.58
Epoch: 4, Learning Rate: 0.0001, Training Loss: 2.19, Training Accuracy: 0.69, Validation Loss: 2.09, Validation Accuracy: 0.67
Epoch: 5, Learning Rate: 0.0001, Training Loss: 1.90, Training Accuracy: 0.79, Validation Loss: 1.90, Validation Accuracy: 0.70
Epoch: 6, Learning Rate: 0.0001, Training Loss: 1.63, Training Accuracy: 0.86, Validation Loss: 1.71, Validation Accuracy: 0.78
Epoch 00007: reducing learning rate of group 0 to 1.0000e-05.
Epoch 00007: reducing learning rate of group 1 to 1.0000e-05.
Epoch 00007: reducing learning rate of group 2 to 1.0000e-05.
Epoch 00007: reducing learning rate of group 3

In [None]:
# Baseline run repeated with a longer schedule (50 epochs, larger patiences).
load_model_name = "resnet18"
classifier_type = 'MLP'
epoches = 50
patience = 10
early_stopping_patience = 15

model_raw1 = models.resnet18(pretrained=True)

# Swap the stock classifier for a 500-unit MLP ending in 50 outputs.
num_features = model_raw1.fc.in_features
model_raw1.fc = nn.Sequential(
    nn.Linear(num_features, 500), nn.ReLU(), nn.Dropout(), nn.Linear(500, 50),
)

model_raw1.to(device)
loss_fn = nn.CrossEntropyLoss()

# One parameter group per backbone stage at 1e-4, plus the new head.
# NOTE(review): the head's learning rate is 1e-8, far below the backbone's,
# and bn1 is in no group (so it is never updated) - confirm both are intended.
backbone_stages = (
    model_raw1.conv1,
    model_raw1.layer1,
    model_raw1.layer2,
    model_raw1.layer3,
    model_raw1.layer4,
)
param_groups = [{'params': stage.parameters(), 'lr': 1e-4} for stage in backbone_stages]
param_groups.append({'params': model_raw1.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(param_groups, lr=1e-2)

train(
    model_raw1, optimizer, loss_fn,
    esc50_train_loader, esc50_val_loader, esc50_test_loader,
    load_model_name,
    epochs=epoches,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 67.9MB/s]


Epoch: 1, Learning Rate: 0.0001, Training Loss: 3.60, Training Accuracy: 0.1313, Validation Loss: 3.12, Validation Accuracy: 0.3450
Epoch: 2, Learning Rate: 0.0001, Training Loss: 2.97, Training Accuracy: 0.4081, Validation Loss: 2.68, Validation Accuracy: 0.5675
Epoch: 3, Learning Rate: 0.0001, Training Loss: 2.51, Training Accuracy: 0.5962, Validation Loss: 2.29, Validation Accuracy: 0.6450
Epoch: 4, Learning Rate: 0.0001, Training Loss: 2.15, Training Accuracy: 0.7281, Validation Loss: 2.01, Validation Accuracy: 0.7075
Epoch: 5, Learning Rate: 0.0001, Training Loss: 1.83, Training Accuracy: 0.8087, Validation Loss: 1.87, Validation Accuracy: 0.7400
Epoch: 6, Learning Rate: 0.0001, Training Loss: 1.58, Training Accuracy: 0.8681, Validation Loss: 1.74, Validation Accuracy: 0.7700
Epoch: 7, Learning Rate: 0.0001, Training Loss: 1.37, Training Accuracy: 0.9113, Validation Loss: 1.58, Validation Accuracy: 0.7900
Epoch: 8, Learning Rate: 0.0001, Training Loss: 1.20, Training Accuracy: 0.9

## ResNet50 - Teacher

In [None]:
# Teacher candidate: fine-tune a pretrained ResNet50 for up to 50 epochs.
load_model_name = "resnet50"
classifier_type = 'MLP'
epochs = 50
patience = 10
early_stopping_patience = 15

model = models.resnet50(pretrained=True)

# Swap the stock classifier for a 500-unit MLP ending in 50 outputs.
num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_features, 500), nn.ReLU(), nn.Dropout(), nn.Linear(500, 50),
)

model.to(device)
loss_fn = nn.CrossEntropyLoss()

# One parameter group per backbone stage at 1e-4, plus the new head.
# NOTE(review): the head's learning rate is 1e-8, far below the backbone's,
# and bn1 is in no group (so it is never updated) - confirm both are intended.
backbone_stages = (
    model.conv1,
    model.layer1,
    model.layer2,
    model.layer3,
    model.layer4,
)
param_groups = [{'params': stage.parameters(), 'lr': 1e-4} for stage in backbone_stages]
param_groups.append({'params': model.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(param_groups, lr=1e-2)

train(
    model, optimizer, loss_fn,
    esc50_train_loader, esc50_val_loader, esc50_test_loader,
    load_model_name,
    epochs=epochs,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)




Epoch: 1, Learning Rate: 0.0001, Training Loss: 3.62, Training Accuracy: 0.1631, Validation Loss: 3.13, Validation Accuracy: 0.3700
Epoch: 2, Learning Rate: 0.0001, Training Loss: 3.01, Training Accuracy: 0.4113, Validation Loss: 2.56, Validation Accuracy: 0.5275
Epoch: 3, Learning Rate: 0.0001, Training Loss: 2.50, Training Accuracy: 0.5519, Validation Loss: 2.15, Validation Accuracy: 0.6075
Epoch: 4, Learning Rate: 0.0001, Training Loss: 2.00, Training Accuracy: 0.7262, Validation Loss: 1.74, Validation Accuracy: 0.6900
Epoch: 5, Learning Rate: 0.0001, Training Loss: 1.65, Training Accuracy: 0.8013, Validation Loss: 1.57, Validation Accuracy: 0.7125
Epoch: 6, Learning Rate: 0.0001, Training Loss: 1.32, Training Accuracy: 0.8606, Validation Loss: 1.34, Validation Accuracy: 0.7500
Epoch: 7, Learning Rate: 0.0001, Training Loss: 1.06, Training Accuracy: 0.9169, Validation Loss: 1.23, Validation Accuracy: 0.7675
Epoch: 8, Learning Rate: 0.0001, Training Loss: 0.85, Training Accuracy: 0.9

## resnext101_32x8d - Teacher

In [None]:
# Teacher candidate: fine-tune a pretrained ResNeXt101-32x8d for up to 50 epochs.
load_model_name = "resnext101_32x8d"
classifier_type = 'MLP'
epochs = 50
patience = 5
early_stopping_patience = 10

model = models.resnext101_32x8d(pretrained=True)

# Swap the stock classifier for a 500-unit MLP ending in 50 outputs.
num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_features, 500), nn.ReLU(), nn.Dropout(), nn.Linear(500, 50),
)

model.to(device)
loss_fn = nn.CrossEntropyLoss()

# One parameter group per backbone stage at 1e-4, plus the new head.
# NOTE(review): the head's learning rate is 1e-8, far below the backbone's,
# and bn1 is in no group (so it is never updated) - confirm both are intended.
backbone_stages = (
    model.conv1,
    model.layer1,
    model.layer2,
    model.layer3,
    model.layer4,
)
param_groups = [{'params': stage.parameters(), 'lr': 1e-4} for stage in backbone_stages]
param_groups.append({'params': model.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(param_groups, lr=1e-2)

train(
    model, optimizer, loss_fn,
    esc50_train_loader, esc50_val_loader, esc50_test_loader,
    load_model_name,
    epochs=epochs,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)


Downloading: "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth" to /root/.cache/torch/hub/checkpoints/resnext101_32x8d-8ba56ff5.pth
100%|██████████| 340M/340M [00:05<00:00, 65.4MB/s]


Epoch: 1, Learning Rate: 0.0001, Training Loss: 3.36, Training Accuracy: 0.2544, Validation Loss: 2.69, Validation Accuracy: 0.4800
Epoch: 2, Learning Rate: 0.0001, Training Loss: 2.43, Training Accuracy: 0.5775, Validation Loss: 1.85, Validation Accuracy: 0.6750
Epoch: 3, Learning Rate: 0.0001, Training Loss: 1.74, Training Accuracy: 0.7619, Validation Loss: 1.53, Validation Accuracy: 0.6825
Epoch: 4, Learning Rate: 0.0001, Training Loss: 1.26, Training Accuracy: 0.8494, Validation Loss: 1.17, Validation Accuracy: 0.7800
Epoch: 5, Learning Rate: 0.0001, Training Loss: 0.91, Training Accuracy: 0.9169, Validation Loss: 1.06, Validation Accuracy: 0.7825
Epoch: 6, Learning Rate: 0.0001, Training Loss: 0.71, Training Accuracy: 0.9387, Validation Loss: 0.95, Validation Accuracy: 0.8300
Epoch 00007: reducing learning rate of group 0 to 1.0000e-05.
Epoch 00007: reducing learning rate of group 1 to 1.0000e-05.
Epoch 00007: reducing learning rate of group 2 to 1.0000e-05.
Epoch 00007: reducing 

## ResNet18 - Student

In [None]:
# Distillation setup: a ResNet18 student trained from scratch and a ResNeXt101
# teacher restored from a checkpoint.
epochs = 50
patience = 15
early_stopping_patience=20
load_model_name = "resnet18"
classifier_type='MLP'

student_model= models.resnet18(pretrained=False)

# Same 500-unit MLP head (50 outputs) as the teacher.
student_model.fc = nn.Sequential(nn.Linear(student_model.fc.in_features,500),
                               nn.ReLU(),
                               nn.Dropout(),
                               nn.Linear(500,50))
student_model.to(device)

loss_fn = nn.CrossEntropyLoss()

# NOTE(review): the head's learning rate is 1e-8, far below the backbone's
# 1e-4, and bn1 is in no parameter group (so it is never updated) - confirm
# both are intended.
optimizer = optim.Adam([
                        {'params': student_model.conv1.parameters(), 'lr': 1e-4},
                        {'params': student_model.layer1.parameters(), 'lr': 1e-4},
                        {'params': student_model.layer2.parameters(), 'lr': 1e-4},
                        {'params': student_model.layer3.parameters(), 'lr': 1e-4},
                        {'params': student_model.layer4.parameters(), 'lr': 1e-4},
                        {'params': student_model.fc.parameters(), 'lr': 1e-8}
                        ], lr=1e-2)

# load teacher model
model = models.resnext101_32x8d(pretrained=False)
# Replace the last fully connected layer
num_features = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(num_features, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50)
)
# Other recorded checkpoints:
#   tensorboard_logs/2023-11-27_062728_resnet50/best_model_resnet50.pth (needs a ResNet50 teacher)
#   tensorboard_logs/2023-11-13_075020_resnext101_32x8d/best_model_resnext101_32x8d.pth
# The path previously ended in a stray space inside the quotes, which does not
# match the "best_model_<name>.pth" file name written by the training code.
model_path = 'tensorboard_logs/2023-11-28_071915_resnext101_32x8d/best_model_resnext101_32x8d.pth'
# map_location lets a checkpoint saved on another device load onto this one.
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)

nn_deep = model

new_nn_light = student_model

train_knowledge_distillation(teacher=nn_deep, student=new_nn_light, optimizer=optimizer, ce_loss=loss_fn, train_loader=esc50_train_loader,
                             val_loader=esc50_val_loader, test_loader=esc50_test_loader, load_model_name=load_model_name,
                             epochs=epochs, T=2, soft_target_loss_weight=0.25, ce_loss_weight=0.75, device=device,
                             log_dir = 'tensorboard_logs', patience=patience, early_stopping_patience=early_stopping_patience,
                             classifier_type=classifier_type)


Epoch 1, Learning Rate: 0.0001, Training Loss: 6.69, Training Accuracy: 0.0606, Validation Loss: 3.53, Validation Accuracy: 0.1075
Epoch 2, Learning Rate: 0.0001, Training Loss: 6.48, Training Accuracy: 0.1044, Validation Loss: 3.37, Validation Accuracy: 0.1575
Epoch 3, Learning Rate: 0.0001, Training Loss: 6.35, Training Accuracy: 0.1556, Validation Loss: 3.25, Validation Accuracy: 0.2175
Epoch 4, Learning Rate: 0.0001, Training Loss: 6.20, Training Accuracy: 0.2181, Validation Loss: 3.12, Validation Accuracy: 0.2600
Epoch 5, Learning Rate: 0.0001, Training Loss: 6.10, Training Accuracy: 0.2406, Validation Loss: 3.03, Validation Accuracy: 0.2525
Epoch 6, Learning Rate: 0.0001, Training Loss: 5.96, Training Accuracy: 0.3056, Validation Loss: 2.90, Validation Accuracy: 0.3125
Epoch 7, Learning Rate: 0.0001, Training Loss: 5.82, Training Accuracy: 0.3613, Validation Loss: 2.76, Validation Accuracy: 0.3675
Epoch 8, Learning Rate: 0.0001, Training Loss: 5.68, Training Accuracy: 0.4175, Val

## Custom bottleneck ResNet (Res Block) - Student

In [None]:
import torch
import torch.nn as nn

def model_size(model):
    """Return the size of ``model`` in MiB.

    The figure is the byte count of every parameter plus every registered
    buffer (element count times element size), divided by 1024**2.
    """
    def _total_bytes(tensors):
        # Bytes occupied by an iterable of tensors.
        return sum(t.nelement() * t.element_size() for t in tensors)

    occupied = _total_bytes(model.parameters()) + _total_bytes(model.buffers())
    return occupied / 1024**2


class block(nn.Module):
    """Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv plus a skip path.

    The last 1x1 convolution widens the feature map to
    ``intermediate_channels * expansion`` channels. The block input (optionally
    passed through ``identity_downsample``) is added to that result before the
    final ReLU.

    Args:
        in_channels: number of channels of the incoming feature map.
        intermediate_channels: width of the two inner convolutions.
        identity_downsample: optional module applied to the block input so it
            can be added to the main-path output; required whenever the
            input's shape differs from that output (stride != 1 or a channel
            change).
        stride: stride of the 3x3 convolution.
    """

    # Channel multiplier of the final 1x1 convolution. Declared on the class so
    # it can be read without instantiating; instances still see it as
    # ``self.expansion``.
    expansion = 4

    def __init__(
        self, in_channels, intermediate_channels, identity_downsample=None, stride=1
    ):
        super().__init__()
        out_channels = intermediate_channels * self.expansion
        self.conv1 = nn.Conv2d(
            in_channels,
            intermediate_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.bn1 = nn.BatchNorm2d(intermediate_channels)
        self.conv2 = nn.Conv2d(
            intermediate_channels,
            intermediate_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            bias=False,
        )
        self.bn2 = nn.BatchNorm2d(intermediate_channels)
        self.conv3 = nn.Conv2d(
            intermediate_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False,
        )
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample
        self.stride = stride

    def forward(self, x):
        """Run the bottleneck path on ``x`` and add the (possibly projected) input."""
        # No copy is needed: the input is only read below, never written in place.
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        # In-place add touches only the tensor produced by bn3, not the input.
        out += identity
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet backbone assembled from four stages of residual blocks.

    Args:
        block: block class, called as
            ``block(in_channels, intermediate_channels, identity_downsample, stride)``.
            If the class exposes an ``expansion`` attribute, it is used as the
            ratio between a block's output channels and its intermediate
            channels; otherwise 4 is assumed.
        layers: sequence of four ints, the number of blocks in each stage.
        image_channels: number of channels of the input images.
        num_classes: size of the final linear layer's output.
    """

    def __init__(self, block, layers, image_channels, num_classes):
        super().__init__()
        # Take the widening factor from the block type rather than hard-coding
        # it; 4 (the bottleneck value used by ResNet-50/101/152) is the fallback.
        self.expansion = getattr(block, "expansion", 4)
        self.in_channels = 64
        self.conv1 = nn.Conv2d(
            image_channels, 64, kernel_size=7, stride=2, padding=3, bias=False
        )
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # The four stages below make up essentially the whole architecture.
        self.layer1 = self._make_layer(
            block, layers[0], intermediate_channels=64, stride=1
        )
        self.layer2 = self._make_layer(
            block, layers[1], intermediate_channels=128, stride=2
        )
        self.layer3 = self._make_layer(
            block, layers[2], intermediate_channels=256, stride=2
        )
        self.layer4 = self._make_layer(
            block, layers[3], intermediate_channels=512, stride=2
        )

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * self.expansion, num_classes)

    def forward(self, x):
        """Map a batch of images to ``num_classes`` scores per sample."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # Flatten everything but the batch dimension for the linear layer.
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)

        return x

    def _make_layer(self, block, num_residual_blocks, intermediate_channels, stride):
        """Build one stage of ``num_residual_blocks`` blocks and update ``self.in_channels``."""
        out_channels = intermediate_channels * self.expansion
        identity_downsample = None

        # When the stage halves the spatial size (e.g. 56x56 -> 28x28 with
        # stride=2) or changes the channel count, the skip connection has to be
        # projected so that it can be added to the block's output.
        if stride != 1 or self.in_channels != out_channels:
            identity_downsample = nn.Sequential(
                nn.Conv2d(
                    self.in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,
                    bias=False,
                ),
                nn.BatchNorm2d(out_channels),
            )

        # Only the first block of a stage strides / projects.
        layers = [
            block(self.in_channels, intermediate_channels, identity_downsample, stride)
        ]

        self.in_channels = out_channels

        # The remaining blocks map out_channels -> intermediate -> out_channels
        # with stride 1 (e.g. 256 -> 64 -> 256 in the first stage), so they need
        # no projection on the skip path.
        for _ in range(num_residual_blocks - 1):
            layers.append(block(self.in_channels, intermediate_channels))

        return nn.Sequential(*layers)


def ResNet50(img_channel=3, num_classes=1000):
    """Build a ``ResNet`` of ``block`` units with stage depths 3, 4, 6 and 3."""
    stage_depths = [3, 4, 6, 3]
    network = ResNet(block, stage_depths, img_channel, num_classes)
    return network

def ResNetRaw(img_channel=3, num_classes=50):
    """Build a shallow ``ResNet`` of ``block`` units with stage depths 1, 2, 2 and 1."""
    stage_depths = [1, 2, 2, 1]
    network = ResNet(block, stage_depths, img_channel, num_classes)
    return network

def ResNetRaw1(img_channel=3, num_classes=50):
    """Build the smallest ``ResNet`` variant: a single ``block`` in each of the four stages."""
    stage_depths = [1] * 4
    network = ResNet(block, stage_depths, img_channel, num_classes)
    return network

# Quick check: build the one-block-per-stage variant on the target device and
# print its parameter + buffer size in MiB.
net = ResNetRaw1(img_channel=3, num_classes=50)
net = net.to(device)
print(model_size(net))

31.043045043945312


In [None]:
# --- Knowledge distillation into the custom ResNetRaw student ---------------
# Settings forwarded to train_knowledge_distillation().
epochs = 50
patience = 20
early_stopping_patience = 30
load_model_name = "ResNetRaw"
classifier_type = 'MLP'

# Student: custom bottleneck ResNet with stage depths [1, 2, 2, 1].
student_model = ResNetRaw(img_channel=3, num_classes=50)

# Swap the single linear classifier for an MLP head (hidden width 500, 50 outputs).
_head_inputs = student_model.fc.in_features
student_model.fc = nn.Sequential(
    nn.Linear(_head_inputs, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50),
)
student_model.to(device)
print(model_size(student_model))

loss_fn = nn.CrossEntropyLoss()

# One parameter group per backbone part at lr=1e-4, then the head at lr=1e-8.
# Each group sets its own lr, so the optimizer-level default (1e-2) is unused.
# NOTE(review): `student_model.bn1` is not included in any group -- confirm.
_param_groups = [
    {'params': getattr(student_model, _part).parameters(), 'lr': 1e-4}
    for _part in ('conv1', 'layer1', 'layer2', 'layer3', 'layer4')
]
_param_groups.append({'params': student_model.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(_param_groups, lr=1e-2)

# The teacher is the `model` object created by an earlier cell of the notebook.
nn_deep = model
new_nn_light = student_model

train_knowledge_distillation(
    teacher=nn_deep,
    student=new_nn_light,
    optimizer=optimizer,
    ce_loss=loss_fn,
    train_loader=esc50_train_loader,
    val_loader=esc50_val_loader,
    test_loader=esc50_test_loader,
    load_model_name=load_model_name,
    epochs=epochs,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)


40.00364685058594
Epoch 1, Learning Rate: 0.0001, Training Loss: 6.69, Training Accuracy: 0.0737, Validation Loss: 3.53, Validation Accuracy: 0.1000
Epoch 2, Learning Rate: 0.0001, Training Loss: 6.45, Training Accuracy: 0.1250, Validation Loss: 3.33, Validation Accuracy: 0.1450
Epoch 3, Learning Rate: 0.0001, Training Loss: 6.24, Training Accuracy: 0.1819, Validation Loss: 3.17, Validation Accuracy: 0.1950
Epoch 4, Learning Rate: 0.0001, Training Loss: 6.10, Training Accuracy: 0.2263, Validation Loss: 3.17, Validation Accuracy: 0.2200
Epoch 5, Learning Rate: 0.0001, Training Loss: 5.95, Training Accuracy: 0.2769, Validation Loss: 2.85, Validation Accuracy: 0.2850
Epoch 6, Learning Rate: 0.0001, Training Loss: 5.84, Training Accuracy: 0.3162, Validation Loss: 2.93, Validation Accuracy: 0.2925
Epoch 7, Learning Rate: 0.0001, Training Loss: 5.70, Training Accuracy: 0.3731, Validation Loss: 2.69, Validation Accuracy: 0.3600
Epoch 8, Learning Rate: 0.0001, Training Loss: 5.59, Training Acc

In [None]:
# --- Repeat run: knowledge distillation into a fresh ResNetRaw student ------
# Settings forwarded to train_knowledge_distillation().
epochs = 50
patience = 20
early_stopping_patience = 30
load_model_name = "ResNetRaw"
classifier_type = 'MLP'

# A new, randomly initialised student (stage depths [1, 2, 2, 1]).
student_model = ResNetRaw(img_channel=3, num_classes=50)

# MLP head: backbone features -> 500 -> 50, with ReLU and Dropout in between.
_head_layers = [
    nn.Linear(student_model.fc.in_features, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50),
]
student_model.fc = nn.Sequential(*_head_layers)
student_model.to(device)
print(model_size(student_model))

loss_fn = nn.CrossEntropyLoss()

# Backbone parts train at lr=1e-4 and the head at lr=1e-8; because every group
# specifies its own lr, the default passed to Adam (1e-2) never takes effect.
_backbone_parts = (
    student_model.conv1,
    student_model.layer1,
    student_model.layer2,
    student_model.layer3,
    student_model.layer4,
)
_param_groups = [{'params': _m.parameters(), 'lr': 1e-4} for _m in _backbone_parts]
_param_groups.append({'params': student_model.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(_param_groups, lr=1e-2)

# Teacher (`model`) comes from an earlier cell of the notebook.
nn_deep = model
new_nn_light = student_model

train_knowledge_distillation(
    teacher=nn_deep,
    student=new_nn_light,
    optimizer=optimizer,
    ce_loss=loss_fn,
    train_loader=esc50_train_loader,
    val_loader=esc50_val_loader,
    test_loader=esc50_test_loader,
    load_model_name=load_model_name,
    epochs=epochs,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)


40.00364685058594
Epoch 1, Learning Rate: 0.0001, Training Loss: 6.67, Training Accuracy: 0.0688, Validation Loss: 3.51, Validation Accuracy: 0.1100
Epoch 2, Learning Rate: 0.0001, Training Loss: 6.41, Training Accuracy: 0.1150, Validation Loss: 3.30, Validation Accuracy: 0.2250
Epoch 3, Learning Rate: 0.0001, Training Loss: 6.23, Training Accuracy: 0.1819, Validation Loss: 3.10, Validation Accuracy: 0.2100
Epoch 4, Learning Rate: 0.0001, Training Loss: 6.06, Training Accuracy: 0.2469, Validation Loss: 2.98, Validation Accuracy: 0.2075
Epoch 5, Learning Rate: 0.0001, Training Loss: 5.92, Training Accuracy: 0.2994, Validation Loss: 2.82, Validation Accuracy: 0.3250
Epoch 6, Learning Rate: 0.0001, Training Loss: 5.78, Training Accuracy: 0.3450, Validation Loss: 2.82, Validation Accuracy: 0.2925
Epoch 7, Learning Rate: 0.0001, Training Loss: 5.66, Training Accuracy: 0.3869, Validation Loss: 2.70, Validation Accuracy: 0.3200
Epoch 8, Learning Rate: 0.0001, Training Loss: 5.55, Training Acc

In [None]:
# --- Another ResNetRaw distillation run (model-size print disabled) ---------
# Settings forwarded to train_knowledge_distillation().
epochs = 50
patience = 20
early_stopping_patience = 30
load_model_name = "ResNetRaw"
classifier_type = 'MLP'

# Fresh student with stage depths [1, 2, 2, 1].
student_model = ResNetRaw(img_channel=3, num_classes=50)

# Replace the linear classifier with an MLP head (hidden width 500, 50 outputs).
_head_inputs = student_model.fc.in_features
student_model.fc = nn.Sequential(
    nn.Linear(_head_inputs, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50),
)
student_model.to(device)

loss_fn = nn.CrossEntropyLoss()

# Per-group learning rates: 1e-4 for each backbone part, 1e-8 for the head.
# The lr=1e-2 default is overridden by every group.
_param_groups = []
for _part in ('conv1', 'layer1', 'layer2', 'layer3', 'layer4'):
    _param_groups.append(
        {'params': getattr(student_model, _part).parameters(), 'lr': 1e-4}
    )
_param_groups.append({'params': student_model.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(_param_groups, lr=1e-2)

# Teacher (`model`) comes from an earlier cell of the notebook.
nn_deep = model
new_nn_light = student_model

train_knowledge_distillation(
    teacher=nn_deep,
    student=new_nn_light,
    optimizer=optimizer,
    ce_loss=loss_fn,
    train_loader=esc50_train_loader,
    val_loader=esc50_val_loader,
    test_loader=esc50_test_loader,
    load_model_name=load_model_name,
    epochs=epochs,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)


Epoch 1, Learning Rate: 0.0001, Training Loss: 6.69, Training Accuracy: 0.0663, Validation Loss: 3.51, Validation Accuracy: 0.1250
Epoch 2, Learning Rate: 0.0001, Training Loss: 6.45, Training Accuracy: 0.1263, Validation Loss: 3.36, Validation Accuracy: 0.1900
Epoch 3, Learning Rate: 0.0001, Training Loss: 6.27, Training Accuracy: 0.2037, Validation Loss: 3.18, Validation Accuracy: 0.2225
Epoch 4, Learning Rate: 0.0001, Training Loss: 6.09, Training Accuracy: 0.2313, Validation Loss: 3.05, Validation Accuracy: 0.2425
Epoch 5, Learning Rate: 0.0001, Training Loss: 5.99, Training Accuracy: 0.2706, Validation Loss: 2.93, Validation Accuracy: 0.2925
Epoch 6, Learning Rate: 0.0001, Training Loss: 5.84, Training Accuracy: 0.3281, Validation Loss: 2.85, Validation Accuracy: 0.2750
Epoch 7, Learning Rate: 0.0001, Training Loss: 5.72, Training Accuracy: 0.3731, Validation Loss: 2.76, Validation Accuracy: 0.3400
Epoch 8, Learning Rate: 0.0001, Training Loss: 5.61, Training Accuracy: 0.4100, Val

In [None]:
# --- Knowledge distillation into the smallest student (ResNetRaw1) ----------
# Settings forwarded to train_knowledge_distillation().
epochs = 50
patience = 20
early_stopping_patience = 30
load_model_name = "ResNetRaw1"
classifier_type = 'MLP'

# Student: custom bottleneck ResNet with a single block in each stage.
student_model = ResNetRaw1(img_channel=3, num_classes=50)

# MLP head: backbone features -> 500 -> 50, with ReLU and Dropout in between.
_head_inputs = student_model.fc.in_features
student_model.fc = nn.Sequential(
    nn.Linear(_head_inputs, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50),
)
student_model.to(device)
print(model_size(student_model), '\n')

loss_fn = nn.CrossEntropyLoss()

# Backbone parts at lr=1e-4, head at lr=1e-8. The optimizer-level lr=1e-2 is
# shadowed by the per-group values.
_param_groups = [
    {'params': getattr(student_model, _part).parameters(), 'lr': 1e-4}
    for _part in ('conv1', 'layer1', 'layer2', 'layer3', 'layer4')
]
_param_groups.append({'params': student_model.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(_param_groups, lr=1e-2)

# Teacher (`model`) comes from an earlier cell of the notebook.
nn_deep = model
new_nn_light = student_model

train_knowledge_distillation(
    teacher=nn_deep,
    student=new_nn_light,
    optimizer=optimizer,
    ce_loss=loss_fn,
    train_loader=esc50_train_loader,
    val_loader=esc50_val_loader,
    test_loader=esc50_test_loader,
    load_model_name=load_model_name,
    epochs=epochs,
    T=2,
    soft_target_loss_weight=0.25,
    ce_loss_weight=0.75,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)


34.65594482421875 

Epoch 1, Learning Rate: 0.0001, Training Loss: 6.68, Training Accuracy: 0.0600, Validation Loss: 3.53, Validation Accuracy: 0.1375
Epoch 2, Learning Rate: 0.0001, Training Loss: 6.45, Training Accuracy: 0.1212, Validation Loss: 3.38, Validation Accuracy: 0.1975
Epoch 3, Learning Rate: 0.0001, Training Loss: 6.30, Training Accuracy: 0.1794, Validation Loss: 3.24, Validation Accuracy: 0.2375
Epoch 4, Learning Rate: 0.0001, Training Loss: 6.18, Training Accuracy: 0.2181, Validation Loss: 3.13, Validation Accuracy: 0.2450
Epoch 5, Learning Rate: 0.0001, Training Loss: 6.07, Training Accuracy: 0.2500, Validation Loss: 3.03, Validation Accuracy: 0.2825
Epoch 6, Learning Rate: 0.0001, Training Loss: 5.95, Training Accuracy: 0.3231, Validation Loss: 2.94, Validation Accuracy: 0.3125
Epoch 7, Learning Rate: 0.0001, Training Loss: 5.84, Training Accuracy: 0.3563, Validation Loss: 2.90, Validation Accuracy: 0.3300
Epoch 8, Learning Rate: 0.0001, Training Loss: 5.76, Training A

### ResNetRaw1 trained without knowledge distillation (baseline)

In [None]:
# --- Baseline: train ResNetRaw1 directly with train(), no teacher -----------
# Settings forwarded to train(). The epoch count lives in `epoches` here
# (note the spelling), unlike the other cells, which use `epochs`.
epoches = 50
patience = 20
early_stopping_patience = 30
load_model_name = "ResNetRaw1"
classifier_type = 'MLP'

# Same architecture as the distilled ResNetRaw1 student: one block per stage.
model_raw = ResNetRaw1(img_channel=3, num_classes=50)

# Replace the last fully connected layer with an MLP head (500 hidden, 50 outputs).
num_features = model_raw.fc.in_features
_head_layers = [
    nn.Linear(num_features, 500),
    nn.ReLU(),
    nn.Dropout(),
    nn.Linear(500, 50),
]
model_raw.fc = nn.Sequential(*_head_layers)

model_raw.to(device)
loss_fn = nn.CrossEntropyLoss()

# Backbone parts at lr=1e-4, head at lr=1e-8. The optimizer-level lr=1e-2 is
# shadowed by the per-group values.
_param_groups = [
    {'params': getattr(model_raw, _part).parameters(), 'lr': 1e-4}
    for _part in ('conv1', 'layer1', 'layer2', 'layer3', 'layer4')
]
_param_groups.append({'params': model_raw.fc.parameters(), 'lr': 1e-8})
optimizer = optim.Adam(_param_groups, lr=1e-2)


train(
    model_raw,
    optimizer,
    loss_fn,
    esc50_train_loader,
    esc50_val_loader,
    esc50_test_loader,
    load_model_name,
    epochs=epoches,
    device=device,
    log_dir='tensorboard_logs',
    patience=patience,
    early_stopping_patience=early_stopping_patience,
    classifier_type=classifier_type,
)


Epoch: 1, Learning Rate: 0.0001, Training Loss: 3.72, Training Accuracy: 0.0663, Validation Loss: 3.49, Validation Accuracy: 0.1200
Epoch: 2, Learning Rate: 0.0001, Training Loss: 3.47, Training Accuracy: 0.1219, Validation Loss: 3.34, Validation Accuracy: 0.1950
Epoch: 3, Learning Rate: 0.0001, Training Loss: 3.32, Training Accuracy: 0.1769, Validation Loss: 3.20, Validation Accuracy: 0.2250
Epoch: 4, Learning Rate: 0.0001, Training Loss: 3.14, Training Accuracy: 0.2225, Validation Loss: 3.07, Validation Accuracy: 0.2525
Epoch: 5, Learning Rate: 0.0001, Training Loss: 3.02, Training Accuracy: 0.2762, Validation Loss: 3.03, Validation Accuracy: 0.2650
Epoch: 6, Learning Rate: 0.0001, Training Loss: 2.91, Training Accuracy: 0.3100, Validation Loss: 2.88, Validation Accuracy: 0.3400
Epoch: 7, Learning Rate: 0.0001, Training Loss: 2.80, Training Accuracy: 0.3581, Validation Loss: 2.83, Validation Accuracy: 0.3625
Epoch: 8, Learning Rate: 0.0001, Training Loss: 2.66, Training Accuracy: 0.4